aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- Makefile 8
-rwxr-xr-x bin/provision 2
-rwxr-xr-x csv-hasher.py 30
-rwxr-xr-x csv-sampler.py 18
4 files changed, 34 insertions, 24 deletions
diff --git a/Makefile b/Makefile
index 8732717..df79e73 100644
--- a/Makefile
+++ b/Makefile
@@ -2,10 +2,10 @@
# Makefile for csv-hasher
#
-CHUNKSIZE = 10000
-CHECK_LINES = 20
-SAMPLE_ITERATIONS = 1000
-SAMPLE_ROWS_PER_ITERATION = 1000
+CHUNKSIZE = 64K
+CHECK_LINES = 16
+SAMPLE_ITERATIONS = 1024
+SAMPLE_ROWS_PER_ITERATION = 1024
TESTS = tests
COLNAME = id
SAMPLE = $(TESTS)/sample.csv
diff --git a/bin/provision b/bin/provision
index 89da228..df1ef5a 100755
--- a/bin/provision
+++ b/bin/provision
@@ -18,4 +18,4 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Setup pipenv
-sudo apt install pipenv
+sudo apt install -y pipenv
diff --git a/csv-hasher.py b/csv-hasher.py
index c07adb0..e76c7b0 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -90,7 +90,7 @@ class CsvHasher:
# Check the input file
if nlines < 2:
print('CSV file is too small.')
- exit (1)
+ return False
# Holds columns definition
columns = None
@@ -104,7 +104,7 @@ class CsvHasher:
# Check for the column
if self.args.colname[0] not in columns:
print('Column not found: ' + self.args.colname[0])
- exit (1)
+ return False
# Start with an empty file
try:
@@ -112,7 +112,7 @@ class CsvHasher:
f.truncate(0)
except IOError:
print('Error writing to ' + outfile)
- exit(1)
+ return False
# Initialize progress bar
progress_bar = tqdm(total=nlines) if self.args.progress else False
@@ -136,7 +136,7 @@ class CsvHasher:
df[self.args.colname[0]] = self.apply_hash(df)
except KeyError as e:
print('Column not found: ' + self.args.colname[0])
- exit (1)
+ return False
# Writing the new CSV output
df.to_csv(outfile, index=False, mode='a', header=write_header)
@@ -173,7 +173,11 @@ def cmdline():
:return: Command line arguments.
"""
- basename = os.path.basename(__file__)
+ # Defaults
+ basename = os.path.basename(__file__)
+ chunksize = '1M'
+ hashfunc = 'sha256'
+ progress = True
# Parse CLI
#examples = "Examples:\n\t" + basename + " --no-progress \n"
@@ -190,23 +194,23 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
parser.add_argument('--chunksize', dest='chunksize',
- help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+ help='Read chunks at a time, supports human-readable notation, defaults to ' + chunksize)
- parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256')
+ parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do ' + hashfunc)
parser.add_argument('--progress', dest='progress', action='store_true',
- help='Enable progress bar.')
+ help='Enable progress bar, defaults to ' + str(progress))
parser.add_argument('--no-progress', dest='progress', action='store_false',
help='Disable progress bar.')
parser.add_argument('--check', dest='check', action='store_true',
- help='Check both files for differences (test suite), defaults to false.')
+ help='Check both files for differences (test suite), defaults to ' + str(not progress))
# Add default values and get args
parser.set_defaults(sep=',')
- parser.set_defaults(chunksize='1M')
- parser.set_defaults(hashfunc='sha256')
+ parser.set_defaults(chunksize=chunksize)
+ parser.set_defaults(hashfunc=hashfunc)
parser.set_defaults(progress=True)
parser.set_defaults(check=False)
args = parser.parse_args()
@@ -216,8 +220,10 @@ def cmdline():
if __name__ == "__main__":
args = cmdline()
instance = CsvHasher(args)
+ status = instance.run()
- instance.run()
+ if status is False:
+ exit(1)
if args.check == True:
instance.check()
diff --git a/csv-sampler.py b/csv-sampler.py
index 35d82db..fa861a8 100755
--- a/csv-sampler.py
+++ b/csv-sampler.py
@@ -66,7 +66,11 @@ def cmdline():
:return: Command line arguments.
"""
- basename = os.path.basename(__file__)
+ # Defaults
+ basename = os.path.basename(__file__)
+ rows_per_iteration = 1024
+ iterations = 1024
+ progress = True
# Parse CLI
#examples = "Examples:\n\t" + basename + " --no-progress \n"
@@ -76,23 +80,23 @@ def cmdline():
epilog=epilog,
formatter_class=argparse.RawDescriptionHelpFormatter,)
- parser.add_argument('outfile', nargs=1, help='CSV output file name')
+ parser.add_argument('outfile', nargs=1, help='CSV output file name')
parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
- type=int, help='Rows per iteration, defaults to 1000')
+ type=int, help='Rows per iteration, defaults to ' + str(rows_per_iteration))
parser.add_argument('--iterations', dest='iterations',
- help='Number of iterations, defaults to 1000')
+ help='Number of iterations, defaults to ' + str(iterations))
parser.add_argument('--progress', dest='progress', action='store_true',
- help='Enable progress bar.')
+ help='Enable progress bar, defaults to ' + str(progress))
parser.add_argument('--no-progress', dest='progress', action='store_false',
help='Disable progress bar.')
# Add default values and get args
- parser.set_defaults(rows_per_iteration=1000)
- parser.set_defaults(iterations=1000)
+ parser.set_defaults(rows_per_iteration=rows_per_iteration)
+ parser.set_defaults(iterations=iterations)
parser.set_defaults(progress=True)
args = parser.parse_args()