#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Hash a given column from a CSV file. # # Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . import os import argparse import pandas as pd import hashlib import subprocess import humanfriendly from sys import exit from tqdm import tqdm class CsvHasher: """Hashes a column from a CSV file""" def __init__(self, args): # Save arguments self.args = args self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize)) # Check if source file exists if not os.path.exists(args.infile[0]): print('File not found: ' + args.infile[0]) exit (1) if hasattr(hashlib, self.args.hashfunc) is False: print('Invalid hash function ' + self.args.hashfunc) exit (1) def apply_hash(self, df, skip=0): """Apply the hash function into a column from a dataframe""" return df[self.args.colname[0]][skip:].apply(lambda x: \ getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest()) def run_legacy(self): """ Process CSV in "legacy" mode: open the input file, process and write the output in a single step. This won't work with CSVs larger than the available memory in the system. Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309 """ # Read the CSV df = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) df = pd.concat(tp, ignore_index=True) # Hashing the column df[self.args.colname[0]] = self.apply_hash(df) # Writing the new CSV output df.to_csv(self.args.outfile[0], index=False) def run(self): """ Improved CSV processor for large files. Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309 """ # Shorthands infile = self.args.infile[0] outfile = self.args.outfile[0] # Get number of lines in the CSV file nlines = subprocess.check_output('wc -l %s' % infile, shell=True) nlines = int(nlines.split()[0]) # Check the input file if nlines < 2: print('CSV file is too small.') return False # Holds columns definition columns = None # Read a chunk just to get the column names with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample: for chunk in sample: columns = chunk.columns break # Check for the column if self.args.colname[0] not in columns: print('Column not found: ' + self.args.colname[0]) return False # Start with an empty file try: with open(outfile, 'w') as f: f.truncate(0) except IOError: print('Error writing to ' + outfile) return False # Initialize progress bar progress_bar = tqdm(total=nlines) if self.args.progress else False # Controls if the header should be included write_header = True # Start iteration from 1 so the CSV header is skipped for i in range(1, nlines, self.args.chunksize): df = pd.read_csv(infile, sep=self.args.sep, header=None, # no header, define column header manually later nrows=self.args.chunksize, # number of rows to read at each iteration skiprows=i) # skip rows that were already read # Add column information df.columns = columns # Hashing the column try: df[self.args.colname[0]] = self.apply_hash(df) except KeyError as e: print('Column not found: ' + self.args.colname[0]) return False # Writing the new CSV output df.to_csv(outfile, index=False, mode='a', header=write_header) # Write the header only in the first iteration write_header = False if hasattr(progress_bar, 'update'): progress_bar.update(self.args.chunksize) # Teardown if hasattr(progress_bar, 'close'): progress_bar.close() def check(self): """Check both files for differences""" df_infile = pd.read_csv(self.args.infile[0], sep=self.args.sep) df_outfile = pd.read_csv(self.args.outfile[0], sep=self.args.sep) print('Comparing both files without excluding the ' + self.args.colname[0] + ' column:') print(df_infile.compare(df_outfile)) del df_infile[self.args.colname[0]] del df_outfile[self.args.colname[0]] print('Comparing both files excluding the ' + self.args.colname[0] + ' column:') print(df_infile.compare(df_outfile)) def cmdline(): """ Evalutate the command line. :return: Command line arguments. """ # Defaults basename = os.path.basename(__file__) chunksize = '1M' hashfunc = 'sha256' progress = True # Parse CLI #examples = "Examples:\n\t" + basename + " --no-progress \n" epilog = '' parser = argparse.ArgumentParser(description='Hashes a column from a CSV file.', epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter,) parser.add_argument('infile', nargs=1, help='CSV input file name') parser.add_argument('outfile', nargs=1, help='CSV output file name') parser.add_argument('colname', nargs=1, help='Column name') parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","') parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, supports human-readable notation, defaults to ' + chunksize) parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do ' + hashfunc) parser.add_argument('--progress', dest='progress', action='store_true', help='Enable progress bar, defaults to ' + str(progress)) parser.add_argument('--no-progress', dest='progress', action='store_false', help='Disable progress bar.') parser.add_argument('--check', dest='check', action='store_true', help='Check both files for differences (test suite), defaults to ' + str(not progress)) # Add default values and get args parser.set_defaults(sep=',') parser.set_defaults(chunksize=chunksize) parser.set_defaults(hashfunc=hashfunc) parser.set_defaults(progress=True) parser.set_defaults(check=False) args = parser.parse_args() return args if __name__ == "__main__": args = cmdline() instance = CsvHasher(args) status = instance.run() if status is False: exit(1) if args.check == True: instance.check()