about | summary | refs | log | tree | commit | diff
path: root/csv-hasher.py
diff options
context:
space:
mode:
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x csv-hasher.py 9
1 files changed, 6 insertions, 3 deletions
diff --git a/csv-hasher.py b/csv-hasher.py
index 71c3593..090b226 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -23,6 +23,7 @@ import argparse
import pandas as pd
import hashlib
import subprocess
+import humanfriendly
from sys import exit
from tqdm import tqdm
@@ -31,7 +32,8 @@ class CsvHasher:
def __init__(self, args):
# Save arguments
- self.args = args
+ self.args = args
+ self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))
# Check if source file exists
if not os.path.exists(args.infile[0]):
@@ -43,6 +45,7 @@ class CsvHasher:
exit (1)
def apply_hash(self, df):
+
return df[self.args.colname[0]].apply(lambda x: \
getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
@@ -138,7 +141,7 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', type=int, help='Read chunks at a time, defaults to 1000')
+ parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256')
@@ -150,7 +153,7 @@ def cmdline():
# Add default values and get args
parser.set_defaults(sep=',')
- parser.set_defaults(chunksize=1000)
+ parser.set_defaults(chunksize='1M')
parser.set_defaults(hashfunc='sha256')
parser.set_defaults(progress=True)
args = parser.parse_args()