diff options
author | Silvio Rhatto <rhatto@riseup.net> | 2021-01-28 15:50:11 -0300 |
---|---|---|
committer | Silvio Rhatto <rhatto@riseup.net> | 2021-01-28 15:50:11 -0300 |
commit | 2b343942870441b1c0f83cc6afdb030056d45c2e (patch) | |
tree | 8773f01a5f8d1cf711e7bcf91f915ba47991b493 /csv-hasher.py | |
parent | 8f381d2dd5af97f3663449a5ffc7ed76d11976fd (diff) | |
download | csv-hasher-2b343942870441b1c0f83cc6afdb030056d45c2e.tar.gz csv-hasher-2b343942870441b1c0f83cc6afdb030056d45c2e.tar.bz2 |
Feat: initial version
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x | csv-hasher.py | 164 |
1 files changed, 164 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import argparse
import hashlib

import pandas as pd

from sys import exit

try:
    # Optional dependency: only required when the progress bar is enabled.
    from tqdm import tqdm
except ImportError:
    tqdm = None


class CsvHasher:
    """Hashes a column from a CSV file."""

    def __init__(self, args):
        """
        Validate the command line arguments and store them.

        :param args: parsed command line arguments (argparse.Namespace)
                     with infile, outfile, colname, sep, chunksize,
                     hashfunc and progress attributes.
        """
        self.args = args

        # Check if the source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        # Ensure the requested digest is actually provided by hashlib
        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df):
        """
        Return the configured column of `df` with every value replaced by
        its hex digest, computed with the configured hash function.

        :param df: pandas.DataFrame containing the column to hash.
        :return: pandas.Series of hex digest strings.
        """
        # Resolve the digest constructor once instead of per row
        hashfunc = getattr(hashlib, self.args.hashfunc)
        return df[self.args.colname[0]].apply(
            lambda value: hashfunc(str(value).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and
        write the output in a single step.

        This won't work with CSVs larger than the available memory in
        the system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        # Read the CSV in chunks, then concatenate them into one frame.
        # Fix: the chunk iterator was previously bound to `df` but the
        # concat referenced an undefined name `tp`, raising NameError.
        reader = pd.read_csv(self.args.infile[0], sep=self.args.sep,
                             iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(reader, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files: read, hash and write one
        chunk at a time so memory use stays bounded by the chunk size.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        infile = self.args.infile[0]

        # Count lines in pure Python.
        # Fix: the previous `wc -l` invocation through a shell string was
        # non-portable and open to shell injection via the file name.
        with open(infile, 'rb') as handle:
            nlines = sum(1 for _ in handle)

        if nlines < 2:
            print('CSV file is too small.')
            exit(1)

        # Read just the header row to get the column names.
        # Fix: the whole file used to be loaded into memory only to
        # discover the columns, defeating the chunked design.
        columns = pd.read_csv(infile, sep=self.args.sep, nrows=0).columns

        # Initialize progress bar (only if requested and tqdm is available)
        progress_bar = tqdm(total=nlines) if self.args.progress and tqdm else None

        write_header = True

        # Fix: start at row 1 so the header line is never hashed as data
        # (the old loop started at 0 with header=None, corrupting the
        # first chunk and misaligning every subsequent one).
        for offset in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                             sep=self.args.sep,
                             header=None,               # header handled manually below
                             nrows=self.args.chunksize, # rows to read this iteration
                             skiprows=offset)           # rows already processed

            # Add column information
            df.columns = columns

            # Hashing the column
            df[self.args.colname[0]] = self.apply_hash(df)

            # Writing the new CSV output.
            # Fix: truncate on the first chunk; mode was always 'a', so
            # output from a previous run was silently kept and appended to.
            df.to_csv(self.args.outfile[0], index=False,
                      mode='w' if write_header else 'a',
                      header=write_header)

            # Write the header only in the first iteration
            write_header = False

            if progress_bar is not None:
                progress_bar.update(self.args.chunksize)

        # Teardown
        if progress_bar is not None:
            progress_bar.close()


def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments (argparse.Namespace).
    """
    parser = argparse.ArgumentParser(
        description='Hashes a column from a CSV file.',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('infile', nargs=1, help='CSV input file name')
    parser.add_argument('outfile', nargs=1, help='CSV output file name')
    parser.add_argument('colname', nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', default=',',
                        help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize', type=int, default=1000,
                        help='Read chunks at a time, defaults to 1000')

    parser.add_argument('--hashfunc', dest='hashfunc', default='sha256',
                        help='Hash function, defaults to sha256')

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    parser.set_defaults(progress=True)

    return parser.parse_args()


if __name__ == "__main__":
    instance = CsvHasher(cmdline())
    instance.run()