diff options
-rw-r--r-- | Makefile | 8 | ||||
-rwxr-xr-x | bin/provision | 2 | ||||
-rwxr-xr-x | csv-hasher.py | 30 | ||||
-rwxr-xr-x | csv-sampler.py | 18 |
4 files changed, 34 insertions, 24 deletions
@@ -2,10 +2,10 @@ # Makefile for csv-hasher # -CHUNKSIZE = 10000 -CHECK_LINES = 20 -SAMPLE_ITERATIONS = 1000 -SAMPLE_ROWS_PER_ITERATION = 1000 +CHUNKSIZE = 64K +CHECK_LINES = 16 +SAMPLE_ITERATIONS = 1024 +SAMPLE_ROWS_PER_ITERATION = 1024 TESTS = tests COLNAME = id SAMPLE = $(TESTS)/sample.csv diff --git a/bin/provision b/bin/provision index 89da228..df1ef5a 100755 --- a/bin/provision +++ b/bin/provision @@ -18,4 +18,4 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # Setuo pipenv -sudo apt install pipenv +sudo apt install -y pipenv diff --git a/csv-hasher.py b/csv-hasher.py index c07adb0..e76c7b0 100755 --- a/csv-hasher.py +++ b/csv-hasher.py @@ -90,7 +90,7 @@ class CsvHasher: # Check the input file if nlines < 2: print('CSV file is too small.') - exit (1) + return False # Holds columns definition columns = None @@ -104,7 +104,7 @@ class CsvHasher: # Check for the column if self.args.colname[0] not in columns: print('Column not found: ' + self.args.colname[0]) - exit (1) + return False # Start with an empty file try: @@ -112,7 +112,7 @@ class CsvHasher: f.truncate(0) except IOError: print('Error writing to ' + outfile) - exit(1) + return False # Initialize progress bar progress_bar = tqdm(total=nlines) if self.args.progress else False @@ -136,7 +136,7 @@ class CsvHasher: df[self.args.colname[0]] = self.apply_hash(df) except KeyError as e: print('Column not found: ' + self.args.colname[0]) - exit (1) + return False # Writing the new CSV output df.to_csv(outfile, index=False, mode='a', header=write_header) @@ -173,7 +173,11 @@ def cmdline(): :return: Command line arguments. """ - basename = os.path.basename(__file__) + # Defaults + basename = os.path.basename(__file__) + chunksize = '1M' + hashfunc = 'sha256' + progress = True # Parse CLI #examples = "Examples:\n\t" + basename + " --no-progress \n" @@ -190,23 +194,23 @@ def cmdline(): parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","') parser.add_argument('--chunksize', dest='chunksize', - help='Read chunks at a time, defaults to 1M, supports human-readable notation') + help='Read chunks at a time, supports human-readable notation, defaults to ' + chunksize) - parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256') + parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do ' + hashfunc) parser.add_argument('--progress', dest='progress', action='store_true', - help='Enable progress bar.') + help='Enable progress bar, defaults to ' + str(progress)) parser.add_argument('--no-progress', dest='progress', action='store_false', help='Disable progress bar.') parser.add_argument('--check', dest='check', action='store_true', - help='Check both files for differences (test suite), defaults to false.') + help='Check both files for differences (test suite), defaults to ' + str(not progress)) # Add default values and get args parser.set_defaults(sep=',') - parser.set_defaults(chunksize='1M') - parser.set_defaults(hashfunc='sha256') + parser.set_defaults(chunksize=chunksize) + parser.set_defaults(hashfunc=hashfunc) parser.set_defaults(progress=True) parser.set_defaults(check=False) args = parser.parse_args() @@ -216,8 +220,10 @@ def cmdline(): if __name__ == "__main__": args = cmdline() instance = CsvHasher(args) + status = instance.run() - instance.run() + if status is False: + exit(1) if args.check == True: instance.check() diff --git a/csv-sampler.py b/csv-sampler.py index 35d82db..fa861a8 100755 --- a/csv-sampler.py +++ b/csv-sampler.py @@ -66,7 +66,11 @@ def cmdline(): :return: Command line arguments. """ - basename = os.path.basename(__file__) + # Defaults + basename = os.path.basename(__file__) + rows_per_iteration = 1024 + iterations = 1024 + progress = True # Parse CLI #examples = "Examples:\n\t" + basename + " --no-progress \n" @@ -76,23 +80,23 @@ def cmdline(): epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter,) - parser.add_argument('outfile', nargs=1, help='CSV output file name') + parser.add_argument('outfile', nargs=1, help='CSV output file name') parser.add_argument('--rows_per_iteration', dest='rows_per_iteration', - type=int, help='Rows per iteration, defaults to 1000') + type=int, help='Rows per iteration, defaults to ' + str(rows_per_iteration)) parser.add_argument('--iterations', dest='iterations', - help='Number of iterations, defaults to 1000') + help='Number of iterations, defaults to ' + str(iterations)) parser.add_argument('--progress', dest='progress', action='store_true', - help='Enable progress bar.') + help='Enable progress bar, defaults to ' + str(progress)) parser.add_argument('--no-progress', dest='progress', action='store_false', help='Disable progress bar.') # Add default values and get args - parser.set_defaults(rows_per_iteration=1000) - parser.set_defaults(iterations=1000) + parser.set_defaults(rows_per_iteration=rows_per_iteration) + parser.set_defaults(iterations=iterations) parser.set_defaults(progress=True) args = parser.parse_args() |