diff options
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x | csv-hasher.py | 9 |
1 files changed, 6 insertions, 3 deletions
diff --git a/csv-hasher.py b/csv-hasher.py index 71c3593..090b226 100755 --- a/csv-hasher.py +++ b/csv-hasher.py @@ -23,6 +23,7 @@ import argparse import pandas as pd import hashlib import subprocess +import humanfriendly from sys import exit from tqdm import tqdm @@ -31,7 +32,8 @@ class CsvHasher: def __init__(self, args): # Save arguments - self.args = args + self.args = args + self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize)) # Check if source file exists if not os.path.exists(args.infile[0]): @@ -43,6 +45,7 @@ class CsvHasher: exit (1) def apply_hash(self, df): + return df[self.args.colname[0]].apply(lambda x: \ getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest()) @@ -138,7 +141,7 @@ def cmdline(): parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","') - parser.add_argument('--chunksize', dest='chunksize', type=int, help='Read chunks at a time, defaults to 1000') + parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation') parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256') @@ -150,7 +153,7 @@ def cmdline(): # Add default values and get args parser.set_defaults(sep=',') - parser.set_defaults(chunksize=1000) + parser.set_defaults(chunksize='1M') parser.set_defaults(hashfunc='sha256') parser.set_defaults(progress=True) args = parser.parse_args() |