#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import argparse
import pandas as pd
import hashlib
import subprocess
import humanfriendly

from sys import exit
from tqdm import tqdm

class CsvHasher:
    """Hashes a column from a CSV file"""

    def __init__(self, args):
        # Save arguments
        self.args = args
        self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))

        # Check if the source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        # Check that the requested hash function exists in hashlib
        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df, skip=0):
        """Apply the hash function to a column from a dataframe"""
        return df[self.args.colname[0]][skip:].apply(lambda x: \
                getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process the CSV in "legacy" mode: open the input file, process it and
        write the output in a single step. This won't work with CSVs larger
        than the available memory in the system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Read the CSV in chunks, then assemble a single dataframe
        tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(tp, ignore_index=True)

        # Hash the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Write the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Shorthands
        infile = self.args.infile[0]
        outfile = self.args.outfile[0]

        # Get the number of lines in the CSV file; the list form avoids
        # shell interpolation, so file names with spaces work
        nlines = subprocess.check_output(['wc', '-l', infile])
        nlines = int(nlines.split()[0])

        # Check the input file
        if nlines < 2:
            print('CSV file is too small.')
            exit(1)

        # Holds the columns definition
        columns = None

        # Read a single chunk just to get the column names
        with pd.read_csv(infile, sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
            for chunk in sample:
                columns = chunk.columns
                break

        # Check for the column
        if self.args.colname[0] not in columns:
            print('Column not found: ' + self.args.colname[0])
            exit(1)

        # Start with an empty file
        try:
            with open(outfile, 'w') as f:
                f.truncate(0)
        except IOError:
            print('Error writing to ' + outfile)
            exit(1)

        # Initialize the progress bar
        progress_bar = tqdm(total=nlines) if self.args.progress else False

        # Controls whether the header should be included
        write_header = True

        # Start the iteration from 1 so the CSV header is skipped
        for i in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                    sep=self.args.sep,
                    header=None,                # no header: column names are set manually below
                    nrows=self.args.chunksize,  # number of rows to read at each iteration
                    skiprows=i)                 # skip rows that were already read

            # Add the column information
            df.columns = columns

            # Hash the column
            try:
                df[self.args.colname[0]] = self.apply_hash(df)
            except KeyError:
                print('Column not found: ' + self.args.colname[0])
                exit(1)

            # Write the new CSV output, appending each processed chunk
            df.to_csv(outfile, index=False, mode='a', header=write_header)

            # Write the header only in the first iteration
            write_header = False

            if hasattr(progress_bar, 'update'):
                progress_bar.update(self.args.chunksize)

        # Teardown
        if hasattr(progress_bar, 'close'):
            progress_bar.close()

def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments.
    """

    basename = os.path.basename(__file__)

    # Parse the CLI
    #examples = "Examples:\n\t" + basename + " --no-progress \n"
    epilog = ''
    parser = argparse.ArgumentParser(description='Hashes a column from a CSV file.',
                                     epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,)

    parser.add_argument('infile', nargs=1, help='CSV input file name')
    parser.add_argument('outfile', nargs=1, help='CSV output file name')
    parser.add_argument('colname', nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep',
                        help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize',
                        help='Chunk size (rows read per iteration), defaults to 1M, supports human-readable notation')

    parser.add_argument('--hashfunc', dest='hashfunc',
                        help='Hash function, defaults to sha256')

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    # Add default values and get args
    parser.set_defaults(sep=',')
    parser.set_defaults(chunksize='1M')
    parser.set_defaults(hashfunc='sha256')
    parser.set_defaults(progress=True)

    args = parser.parse_args()

    return args

if __name__ == "__main__":
    args = cmdline()
    instance = CsvHasher(args)
    instance.run()
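
# A minimal usage sketch (the script name, file names and column name below
# are hypothetical): hash the "email" column of input.csv with the default
# sha256 function, writing the result to output.csv:
#
#   ./csv-hasher.py input.csv output.csv email
#
# Equivalently, the class can be driven from Python; the Namespace fields
# mirror the arguments and defaults set in cmdline():
#
#   from argparse import Namespace
#   CsvHasher(Namespace(infile=['input.csv'], outfile=['output.csv'],
#                       colname=['email'], sep=',', chunksize='1M',
#                       hashfunc='sha256', progress=False)).run()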