diff options
author | Silvio Rhatto <rhatto@riseup.net> | 2021-01-28 15:50:11 -0300 |
---|---|---|
committer | Silvio Rhatto <rhatto@riseup.net> | 2021-01-28 15:50:11 -0300 |
commit | 2b343942870441b1c0f83cc6afdb030056d45c2e (patch) | |
tree | 8773f01a5f8d1cf711e7bcf91f915ba47991b493 /csv-hasher.py | |
parent | 8f381d2dd5af97f3663449a5ffc7ed76d11976fd (diff) | |
download | csv-hasher-2b343942870441b1c0f83cc6afdb030056d45c2e.tar.gz csv-hasher-2b343942870441b1c0f83cc6afdb030056d45c2e.tar.bz2 |
Feat: initial version
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x | csv-hasher.py | 164 |
1 files changed, 164 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import argparse
import hashlib

import pandas as pd

from sys import exit

try:
    # Optional dependency: only required when the progress bar is enabled.
    from tqdm import tqdm
except ImportError:
    tqdm = None


class CsvHasher:
    """Hashes a column from a CSV file."""

    def __init__(self, args):
        """
        Validate the command line arguments and store them.

        :param args: parsed command line arguments (argparse.Namespace)
                     with infile, outfile, colname, sep, chunksize,
                     hashfunc and progress attributes.
        """
        self.args = args

        # Check if the source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        # Ensure the requested digest is actually provided by hashlib
        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df):
        """
        Return the configured column of `df` with every value replaced by
        its hex digest, computed with the configured hash function.

        :param df: pandas.DataFrame containing the column to hash.
        :return: pandas.Series of hex digest strings.
        """
        # Resolve the digest constructor once instead of per row
        hashfunc = getattr(hashlib, self.args.hashfunc)
        return df[self.args.colname[0]].apply(
            lambda value: hashfunc(str(value).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and
        write the output in a single step.

        This won't work with CSVs larger than the available memory in
        the system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        # Read the CSV in chunks, then concatenate them into one frame.
        # Fix: the chunk iterator was previously bound to `df` but the
        # concat referenced an undefined name `tp`, raising NameError.
        reader = pd.read_csv(self.args.infile[0], sep=self.args.sep,
                             iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(reader, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files: read, hash and write one
        chunk at a time so memory use stays bounded by the chunk size.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        infile = self.args.infile[0]

        # Count lines in pure Python.
        # Fix: the previous `wc -l` invocation through a shell string was
        # non-portable and open to shell injection via the file name.
        with open(infile, 'rb') as handle:
            nlines = sum(1 for _ in handle)

        if nlines < 2:
            print('CSV file is too small.')
            exit(1)

        # Read just the header row to get the column names.
        # Fix: the whole file used to be loaded into memory only to
        # discover the columns, defeating the chunked design.
        columns = pd.read_csv(infile, sep=self.args.sep, nrows=0).columns

        # Initialize progress bar (only if requested and tqdm is available)
        progress_bar = tqdm(total=nlines) if self.args.progress and tqdm else None

        write_header = True

        # Fix: start at row 1 so the header line is never hashed as data
        # (the old loop started at 0 with header=None, corrupting the
        # first chunk and misaligning every subsequent one).
        for offset in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                             sep=self.args.sep,
                             header=None,               # header handled manually below
                             nrows=self.args.chunksize, # rows to read this iteration
                             skiprows=offset)           # rows already processed

            # Add column information
            df.columns = columns

            # Hashing the column
            df[self.args.colname[0]] = self.apply_hash(df)

            # Writing the new CSV output.
            # Fix: truncate on the first chunk; mode was always 'a', so
            # output from a previous run was silently kept and appended to.
            df.to_csv(self.args.outfile[0], index=False,
                      mode='w' if write_header else 'a',
                      header=write_header)

            # Write the header only in the first iteration
            write_header = False

            if progress_bar is not None:
                progress_bar.update(self.args.chunksize)

        # Teardown
        if progress_bar is not None:
            progress_bar.close()


def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments (argparse.Namespace).
    """
    parser = argparse.ArgumentParser(
        description='Hashes a column from a CSV file.',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('infile', nargs=1, help='CSV input file name')
    parser.add_argument('outfile', nargs=1, help='CSV output file name')
    parser.add_argument('colname', nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', default=',',
                        help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize', type=int, default=1000,
                        help='Read chunks at a time, defaults to 1000')

    parser.add_argument('--hashfunc', dest='hashfunc', default='sha256',
                        help='Hash function, defaults to sha256')

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    parser.set_defaults(progress=True)

    return parser.parse_args()


if __name__ == "__main__":
    instance = CsvHasher(cmdline())
    instance.run()