author    Silvio Rhatto <rhatto@riseup.net>    2021-01-28 21:41:53 -0300
committer Silvio Rhatto <rhatto@riseup.net>    2021-01-28 21:41:53 -0300
commit    caac6a103f1a76a4ec4a096bb569cd7820a0ff14 (patch)
tree      1c41a36ed49e85afb95b64722632818503c2e77f /csv-hasher.py
parent    ae3abe5a4c14a2e1b50aaf1f41e3225a5c34140b (diff)
Fix: improvements and tests for large files
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x  csv-hasher.py  46
1 file changed, 34 insertions(+), 12 deletions(-)
diff --git a/csv-hasher.py b/csv-hasher.py
index fe206f8..6415c7b 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -44,16 +44,19 @@ class CsvHasher:
print('Invalid hash function ' + self.args.hashfunc)
exit (1)
- def apply_hash(self, df):
+ def apply_hash(self, df, skip=0):
"""Apply the hash function into a column from a dataframe"""
- return df[self.args.colname[0]].apply(lambda x: \
+ return df[self.args.colname[0]][skip:].apply(lambda x: \
getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
def run_legacy(self):
"""
- Process CSV in "legacy" mode: open the input file, process and write the output in a single step.
- This won't work with CSVs larger than the available memory in the system.
+ Process CSV in "legacy" mode: open the input file, process and write
+ the output in a single step.
+
+ This won't work with CSVs larger than the available memory in the
+ system.
Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
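For reference, the apply_hash() change at the top of this hunk slices the column before hashing so that already-processed rows can be skipped. The underlying pattern is a hashlib digest applied row-wise to one pandas column; a minimal, self-contained sketch (the column name and sample data below are illustrative, not taken from the script):

    import hashlib
    import pandas as pd

    def hash_column(df, colname, hashfunc='sha256', skip=0):
        """Hash one column of a DataFrame, skipping the first `skip` rows."""
        # getattr(hashlib, 'sha256') resolves the digest constructor by name,
        # mirroring the lookup done in apply_hash() above.
        return df[colname][skip:].apply(
            lambda x: getattr(hashlib, hashfunc)(str(x).encode('utf-8')).hexdigest())

    df = pd.DataFrame({'email': ['a@example.org', 'b@example.org']})
    print(hash_column(df, 'email'))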
@@ -76,26 +79,44 @@ class CsvHasher:
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
"""
- infile = self.args.infile[0]
+ # Shorthands
+ infile = self.args.infile[0]
+ outfile = self.args.outfile[0]
# Get number of lines in the CSV file
nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
nlines = int(nlines.split()[0])
+ # Check the input file
if nlines < 2:
print('CSV file is too small.')
exit (1)
- # Read the just to get the column names
- sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
- sample = pd.concat(sample_tp, ignore_index=True)
+ # Start with an empty file
+ try:
+ with open(outfile, 'w') as f:
+ f.truncate(0)
+ except IOError:
+ print('Error writing to ' + outfile)
+ exit(1)
+
+ # Holds columns definition
+ columns = None
+
+ # Read a chunk just to get the column names
+ with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
+ for chunk in sample:
+ columns = chunk.columns
+ break
# Initialize progress bar
progress_bar = tqdm(total=nlines) if self.args.progress else False
+ # Controls if the header should be included
write_header = True
- for i in range(0, nlines, self.args.chunksize):
+ # Start iteration from 1 so the CSV header is skipped
+ for i in range(1, nlines, self.args.chunksize):
df = pd.read_csv(infile,
sep=self.args.sep,
header=None, # no header, define column header manually later
@@ -103,7 +124,7 @@ class CsvHasher:
skiprows=i) # skip rows that were already read
# Add column information
- df.columns = sample.columns
+ df.columns = columns
# Hashing the column
try:
@@ -113,7 +134,7 @@ class CsvHasher:
exit (1)
# Writing the new CSV output
- df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header)
+ df.to_csv(outfile, index=False, mode='a', header=write_header)
# Write the header only in the first iteration
write_header = False
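Taken together, the hunks above implement the chunked pipeline: count the input lines with wc -l, truncate the output file, read one chunk only to learn the column names, then loop over fixed-size slices with skiprows and append each hashed chunk, writing the header only once. A condensed sketch of that flow, assuming each read is capped with nrows=chunksize (that line sits outside the hunk context shown here); file names, column name and chunk size are illustrative:

    import hashlib
    import subprocess
    import pandas as pd

    infile, outfile, colname = 'input.csv', 'output.csv', 'email'   # illustrative
    sep, chunksize = ',', 1_000_000

    # Count data lines up front so the loop knows when to stop.
    nlines = int(subprocess.check_output(['wc', '-l', infile]).split()[0])

    # Start from an empty output file, since each chunk is appended with mode='a'.
    open(outfile, 'w').close()

    # Read a single chunk just to learn the column names.
    with pd.read_csv(infile, sep=sep, iterator=True, chunksize=chunksize) as sample:
        columns = next(sample).columns

    write_header = True
    for i in range(1, nlines, chunksize):        # start at 1 so the CSV header row is skipped
        df = pd.read_csv(infile, sep=sep, header=None, nrows=chunksize, skiprows=i)
        df.columns = columns                     # restore the names stripped by header=None
        df[colname] = df[colname].apply(
            lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
        df.to_csv(outfile, index=False, mode='a', header=write_header)
        write_header = False                     # header only on the first chunk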
@@ -148,7 +169,8 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+ parser.add_argument('--chunksize', dest='chunksize',
+ help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to sha256')
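The --chunksize help text mentions human-readable notation (e.g. '1M'). The diff does not show how that string is converted to a row count; purely as an illustration, the hypothetical helper below (its name and suffix table are not from csv-hasher.py) shows one way such a value could be parsed:

    # Hypothetical helper, not part of csv-hasher.py: convert '1M' -> 1000000.
    SUFFIXES = {'k': 10**3, 'K': 10**3, 'M': 10**6, 'G': 10**9}

    def parse_chunksize(value):
        value = value.strip()
        if value and value[-1] in SUFFIXES:
            return int(float(value[:-1]) * SUFFIXES[value[-1]])
        return int(value)

    assert parse_chunksize('1M') == 1_000_000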