aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Silvio Rhatto <rhatto@riseup.net>, 2021-01-28 16:13:42 -0300
committer: Silvio Rhatto <rhatto@riseup.net>, 2021-01-28 16:13:42 -0300
commit: 4b075c2096d0e464c848e9c894071330c68dcd73 (patch)
tree: aeb23ffd790c71de1151ea90940b718a8005d55e
parent: 2b343942870441b1c0f83cc6afdb030056d45c2e (diff)
download: csv-hasher-4b075c2096d0e464c848e9c894071330c68dcd73.tar.gz
download: csv-hasher-4b075c2096d0e464c848e9c894071330c68dcd73.tar.bz2
Feat: support for human-readable notation at --chunksize
-rw-r--r-- Pipfile 1
-rw-r--r-- Pipfile.lock 10
-rw-r--r-- README.md 1
-rwxr-xr-x csv-hasher.py 9
4 files changed, 17 insertions, 4 deletions
diff --git a/Pipfile b/Pipfile
index b143e89..30be324 100644
--- a/Pipfile
+++ b/Pipfile
@@ -6,6 +6,7 @@ name = "pypi"
[packages]
pandas = "*"
tqdm = "*"
+humanfriendly = "*"
[dev-packages]
diff --git a/Pipfile.lock b/Pipfile.lock
index 194341f..cd7b1b2 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "19ab6829f09294559ac6466b24082f8537cb5c7be2d6aec8bbe7b18814d3d587"
+ "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,14 @@
]
},
"default": {
+ "humanfriendly": {
+ "hashes": [
+ "sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d",
+ "sha256:d5c731705114b9ad673754f3317d9fa4c23212f36b29bdc4272a892eafc9bc72"
+ ],
+ "index": "pypi",
+ "version": "==9.1"
+ },
"numpy": {
"hashes": [
"sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94",
diff --git a/README.md b/README.md
index 0e9531a..a8b4aaa 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Running:
* [Python 3](https://python.org).
* [Pandas](https://pandas.pydata.org).
+* [python-humanfriendly](https://github.com/xolox/python-humanfriendly).
Testing:
diff --git a/csv-hasher.py b/csv-hasher.py
index 71c3593..090b226 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -23,6 +23,7 @@ import argparse
import pandas as pd
import hashlib
import subprocess
+import humanfriendly
from sys import exit
from tqdm import tqdm
@@ -31,7 +32,8 @@ class CsvHasher:
def __init__(self, args):
# Save arguments
- self.args = args
+ self.args = args
+ self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))
# Check if source file exists
if not os.path.exists(args.infile[0]):
@@ -43,6 +45,7 @@ class CsvHasher:
exit (1)
def apply_hash(self, df):
+
return df[self.args.colname[0]].apply(lambda x: \
getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
@@ -138,7 +141,7 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', type=int, help='Read chunks at a time, defaults to 1000')
+ parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256')
@@ -150,7 +153,7 @@ def cmdline():
# Add default values and get args
parser.set_defaults(sep=',')
- parser.set_defaults(chunksize=1000)
+ parser.set_defaults(chunksize='1M')
parser.set_defaults(hashfunc='sha256')
parser.set_defaults(progress=True)
args = parser.parse_args()