aboutsummaryrefslogtreecommitdiff
path: root/csv-hasher.py
diff options
context:
space:
mode:
authorSilvio Rhatto <rhatto@riseup.net>2021-01-28 15:50:11 -0300
committerSilvio Rhatto <rhatto@riseup.net>2021-01-28 15:50:11 -0300
commit2b343942870441b1c0f83cc6afdb030056d45c2e (patch)
tree8773f01a5f8d1cf711e7bcf91f915ba47991b493 /csv-hasher.py
parent8f381d2dd5af97f3663449a5ffc7ed76d11976fd (diff)
downloadcsv-hasher-2b343942870441b1c0f83cc6afdb030056d45c2e.tar.gz
csv-hasher-2b343942870441b1c0f83cc6afdb030056d45c2e.tar.bz2
Feat: initial version
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-xcsv-hasher.py164
1 files changed, 164 insertions, 0 deletions
diff --git a/csv-hasher.py b/csv-hasher.py
new file mode 100755
index 0000000..71c3593
--- /dev/null
+++ b/csv-hasher.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Hash a given column from a CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import pandas as pd
+import hashlib
+import subprocess
+from sys import exit
+from tqdm import tqdm
+
class CsvHasher:
    """Hash a given column from a CSV file, writing the result to a new CSV."""

    def __init__(self, args):
        """
        Validate the command line arguments and store them.

        :param args: parsed command line arguments; must provide infile,
                     outfile, colname, sep, chunksize, hashfunc and progress.
        """
        # Save arguments
        self.args = args

        # Refuse to start if the input file is missing.
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        # The hash function name must match a constructor in hashlib
        # (e.g. sha256, sha512, blake2b).
        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df):
        """
        Return the selected column of df with every value replaced by the
        hex digest of the configured hash of its UTF-8 string representation.
        """
        hashfunc = getattr(hashlib, self.args.hashfunc)
        return df[self.args.colname[0]].apply(
            lambda value: hashfunc(str(value).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and write
        the output in a single step.  This won't work with CSVs larger than
        the available memory in the system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        # BUG FIX: the original concatenated an undefined name `tp`
        # (NameError); concatenate the chunk iterator returned by
        # read_csv instead.
        chunks = pd.read_csv(self.args.infile[0], sep=self.args.sep,
                             iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(chunks, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files: the input is read and
        written one chunk at a time, so memory use is bounded by chunksize.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        infile = self.args.infile[0]

        # Get number of lines in the CSV file (for the progress bar).
        # The list form of check_output avoids passing the file name
        # through a shell, so odd characters in it cannot be interpreted
        # as shell syntax.
        nlines = int(subprocess.check_output(['wc', '-l', infile]).split()[0])

        if nlines < 2:
            print('CSV file is too small.')
            exit(1)

        # Initialize progress bar (None when disabled).
        progress_bar = tqdm(total=nlines) if self.args.progress else None

        # BUG FIX: the original re-read the whole file into memory just to
        # learn the column names, and its manual skiprows arithmetic treated
        # the header line as a data row in the first chunk.  Letting pandas
        # iterate with chunksize parses the header exactly once and hands
        # back correctly-labelled chunks.
        write_header = True

        for chunk in pd.read_csv(infile, sep=self.args.sep,
                                 chunksize=self.args.chunksize):
            # Hashing the column
            chunk[self.args.colname[0]] = self.apply_hash(chunk)

            # Truncate on the first chunk, append afterwards, so re-running
            # the tool does not concatenate onto a stale output file.
            chunk.to_csv(self.args.outfile[0], index=False,
                         mode='w' if write_header else 'a',
                         header=write_header)

            # Write the header only in the first iteration
            write_header = False

            if progress_bar is not None:
                progress_bar.update(len(chunk))

        # Teardown
        if progress_bar is not None:
            progress_bar.close()
+
def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments (argparse.Namespace).
    """
    epilog = ''
    parser = argparse.ArgumentParser(description='Hashes a column from a CSV file.',
                                     epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # Positional arguments: input file, output file and the column to hash.
    parser.add_argument('infile', nargs=1, help='CSV input file name')
    parser.add_argument('outfile', nargs=1, help='CSV output file name')
    parser.add_argument('colname', nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize', type=int,
                        help='Read chunks at a time, defaults to 1000')

    # Typo fix: help text read "defaults do sha256".
    parser.add_argument('--hashfunc', dest='hashfunc',
                        help='Hash function, defaults to sha256')

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    # Add default values and get args
    parser.set_defaults(sep=',', chunksize=1000, hashfunc='sha256', progress=True)

    return parser.parse_args()
+
if __name__ == "__main__":
    # Entry point: parse the command line and hash the requested column.
    cli_args = cmdline()
    CsvHasher(cli_args).run()