diff options
author | Silvio Rhatto <rhatto@riseup.net> | 2021-01-28 21:41:53 -0300 |
---|---|---|
committer | Silvio Rhatto <rhatto@riseup.net> | 2021-01-28 21:41:53 -0300 |
commit | caac6a103f1a76a4ec4a096bb569cd7820a0ff14 (patch) | |
tree | 1c41a36ed49e85afb95b64722632818503c2e77f /csv-hasher.py | |
parent | ae3abe5a4c14a2e1b50aaf1f41e3225a5c34140b (diff) | |
download | csv-hasher-caac6a103f1a76a4ec4a096bb569cd7820a0ff14.tar.gz csv-hasher-caac6a103f1a76a4ec4a096bb569cd7820a0ff14.tar.bz2 |
Fix: improvements and tests for large files
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x | csv-hasher.py | 46 |
1 files changed, 34 insertions, 12 deletions
diff --git a/csv-hasher.py b/csv-hasher.py index fe206f8..6415c7b 100755 --- a/csv-hasher.py +++ b/csv-hasher.py @@ -44,16 +44,19 @@ class CsvHasher: print('Invalid hash function ' + self.args.hashfunc) exit (1) - def apply_hash(self, df): + def apply_hash(self, df, skip=0): """Apply the hash function into a column from a dataframe""" - return df[self.args.colname[0]].apply(lambda x: \ + return df[self.args.colname[0]][skip:].apply(lambda x: \ getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest()) def run_legacy(self): """ - Process CSV in "legacy" mode: open the input file, process and write the output in a single step. - This won't work with CSVs larger than the available memory in the system. + Process CSV in "legacy" mode: open the input file, process and write + the output in a single step. + + This won't work with CSVs larger than the available memory in the + system. Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309 @@ -76,26 +79,44 @@ class CsvHasher: Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309 """ - infile = self.args.infile[0] + # Shorthands + infile = self.args.infile[0] + outfile = self.args.outfile[0] # Get number of lines in the CSV file nlines = subprocess.check_output('wc -l %s' % infile, shell=True) nlines = int(nlines.split()[0]) + # Check the input file if nlines < 2: print('CSV file is too small.') exit (1) - # Read the just to get the column names - sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) - sample = pd.concat(sample_tp, ignore_index=True) + # Start with and empty file + try: + with open(outfile, 'w') as f: + f.truncate(0) + except IOError: + print('Error writing to ' + outfile) + exit(1) + + # Holds columns definition + columns = None + + # Read a chunk just to get the column names + with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample: + for chunk in sample: + columns = chunk.columns + break # Initialize progress bar progress_bar = tqdm(total=nlines) if self.args.progress else False + # Controls if the header should be included write_header = True - for i in range(0, nlines, self.args.chunksize): + # Start iteration from 1 so the CSV header is skipped + for i in range(1, nlines, self.args.chunksize): df = pd.read_csv(infile, sep=self.args.sep, header=None, # no header, define column header manually later @@ -103,7 +124,7 @@ class CsvHasher: skiprows=i) # skip rows that were already read # Add column information - df.columns = sample.columns + df.columns = columns # Hashing the column try: @@ -113,7 +134,7 @@ class CsvHasher: exit (1) # Writing the new CSV output - df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header) + df.to_csv(outfile, index=False, mode='a', header=write_header) # Write the header only in the first iteration write_header = False @@ -148,7 +169,8 @@ def cmdline(): parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","') - parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation') + parser.add_argument('--chunksize', dest='chunksize', + help='Read chunks at a time, defaults to 1M, supports human-readable notation') parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do sha256') |