From 4b075c2096d0e464c848e9c894071330c68dcd73 Mon Sep 17 00:00:00 2001 From: Silvio Rhatto Date: Thu, 28 Jan 2021 16:13:42 -0300 Subject: Feat: support for human-readable notation at --chunksize --- Pipfile | 1 + Pipfile.lock | 10 +++++++++- README.md | 1 + csv-hasher.py | 9 ++++++--- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/Pipfile b/Pipfile index b143e89..30be324 100644 --- a/Pipfile +++ b/Pipfile @@ -6,6 +6,7 @@ name = "pypi" [packages] pandas = "*" tqdm = "*" +humanfriendly = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 194341f..cd7b1b2 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "19ab6829f09294559ac6466b24082f8537cb5c7be2d6aec8bbe7b18814d3d587" + "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,14 @@ ] }, "default": { + "humanfriendly": { + "hashes": [ + "sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d", + "sha256:d5c731705114b9ad673754f3317d9fa4c23212f36b29bdc4272a892eafc9bc72" + ], + "index": "pypi", + "version": "==9.1" + }, "numpy": { "hashes": [ "sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94", diff --git a/README.md b/README.md index 0e9531a..a8b4aaa 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Running: * [Python 3](https://python.org). * [Pandas](https://pandas.pydata.org). +* [python-humanfriendly](https://github.com/xolox/python-humanfriendly). Testing: diff --git a/csv-hasher.py b/csv-hasher.py index 71c3593..090b226 100755 --- a/csv-hasher.py +++ b/csv-hasher.py @@ -23,6 +23,7 @@ import argparse import pandas as pd import hashlib import subprocess +import humanfriendly from sys import exit from tqdm import tqdm @@ -31,7 +32,8 @@ class CsvHasher: def __init__(self, args): # Save arguments - self.args = args + self.args = args + self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize)) # Check if source file exists if not os.path.exists(args.infile[0]): @@ -43,6 +45,7 @@ class CsvHasher: exit (1) def apply_hash(self, df): + return df[self.args.colname[0]].apply(lambda x: \ getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest()) @@ -138,7 +141,7 @@ def cmdline(): parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","') - parser.add_argument('--chunksize', dest='chunksize', type=int, help='Read chunks at a time, defaults to 1000') + parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation') parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256') @@ -150,7 +153,7 @@ def cmdline(): # Add default values and get args parser.set_defaults(sep=',') - parser.set_defaults(chunksize=1000) + parser.set_defaults(chunksize='1M') parser.set_defaults(hashfunc='sha256') parser.set_defaults(progress=True) args = parser.parse_args() -- cgit v1.2.3