author    Silvio Rhatto <rhatto@riseup.net>    2021-01-28 21:41:53 -0300
committer Silvio Rhatto <rhatto@riseup.net>    2021-01-28 21:41:53 -0300
commit    caac6a103f1a76a4ec4a096bb569cd7820a0ff14 (patch)
tree      1c41a36ed49e85afb95b64722632818503c2e77f /csv-hasher.py
parent    ae3abe5a4c14a2e1b50aaf1f41e3225a5c34140b (diff)
Fix: improvements and tests for large files
Diffstat (limited to 'csv-hasher.py')
-rwxr-xr-x  csv-hasher.py  46
1 file changed, 34 insertions(+), 12 deletions(-)
diff --git a/csv-hasher.py b/csv-hasher.py
index fe206f8..6415c7b 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -44,16 +44,19 @@ class CsvHasher:
print('Invalid hash function ' + self.args.hashfunc)
exit (1)
- def apply_hash(self, df):
+ def apply_hash(self, df, skip=0):
"""Apply the hash function into a column from a dataframe"""
- return df[self.args.colname[0]].apply(lambda x: \
+ return df[self.args.colname[0]][skip:].apply(lambda x: \
getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
def run_legacy(self):
"""
- Process CSV in "legacy" mode: open the input file, process and write the output in a single step.
- This won't work with CSVs larger than the available memory in the system.
+ Process CSV in "legacy" mode: open the input file, process and write
+ the output in a single step.
+
+ This won't work with CSVs larger than the available memory in the
+ system.
Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
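For reference, the apply_hash() change at the top of this hunk slices the column before hashing so that already-processed rows can be skipped. The underlying pattern is a hashlib digest applied row-wise to one pandas column; a minimal, self-contained sketch (the column name and sample data below are illustrative, not taken from the script):

    import hashlib
    import pandas as pd

    def hash_column(df, colname, hashfunc='sha256', skip=0):
        """Hash one column of a DataFrame, skipping the first `skip` rows."""
        # getattr(hashlib, 'sha256') resolves the digest constructor by name,
        # mirroring the lookup done in apply_hash() above.
        return df[colname][skip:].apply(
            lambda x: getattr(hashlib, hashfunc)(str(x).encode('utf-8')).hexdigest())

    df = pd.DataFrame({'email': ['a@example.org', 'b@example.org']})
    print(hash_column(df, 'email'))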
@@ -76,26 +79,44 @@ class CsvHasher:
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
"""
- infile = self.args.infile[0]
+ # Shorthands
+ infile = self.args.infile[0]
+ outfile = self.args.outfile[0]
# Get number of lines in the CSV file
nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
nlines = int(nlines.split()[0])
+ # Check the input file
if nlines < 2:
print('CSV file is too small.')
exit (1)
- # Read the just to get the column names
- sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
- sample = pd.concat(sample_tp, ignore_index=True)
+ # Start with an empty file
+ try:
+ with open(outfile, 'w') as f:
+ f.truncate(0)
+ except IOError:
+ print('Error writing to ' + outfile)
+ exit(1)
+
+ # Holds columns definition
+ columns = None
+
+ # Read a chunk just to get the column names
+ with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
+ for chunk in sample:
+ columns = chunk.columns
+ break
# Initialize progress bar
progress_bar = tqdm(total=nlines) if self.args.progress else False
+ # Controls if the header should be included
write_header = True
- for i in range(0, nlines, self.args.chunksize):
+ # Start iteration from 1 so the CSV header is skipped
+ for i in range(1, nlines, self.args.chunksize):
df = pd.read_csv(infile,
sep=self.args.sep,
header=None, # no header, define column header manually later
@@ -103,7 +124,7 @@ class CsvHasher:
skiprows=i) # skip rows that were already read
# Add column information
- df.columns = sample.columns
+ df.columns = columns
# Hashing the column
try:
@@ -113,7 +134,7 @@ class CsvHasher:
exit (1)
# Writing the new CSV output
- df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header)
+ df.to_csv(outfile, index=False, mode='a', header=write_header)
# Write the header only in the first iteration
write_header = False
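Taken together, the hunks above implement the chunked pipeline: count the input lines with wc -l, truncate the output file, read one chunk only to learn the column names, then loop over fixed-size slices with skiprows and append each hashed chunk, writing the header only once. A condensed sketch of that flow, assuming each read is capped with nrows=chunksize (that line sits outside the hunk context shown here); file names, column name and chunk size are illustrative:

    import hashlib
    import subprocess
    import pandas as pd

    infile, outfile, colname = 'input.csv', 'output.csv', 'email'   # illustrative
    sep, chunksize = ',', 1_000_000

    # Count data lines up front so the loop knows when to stop.
    nlines = int(subprocess.check_output(['wc', '-l', infile]).split()[0])

    # Start from an empty output file, since each chunk is appended with mode='a'.
    open(outfile, 'w').close()

    # Read a single chunk just to learn the column names.
    with pd.read_csv(infile, sep=sep, iterator=True, chunksize=chunksize) as sample:
        columns = next(sample).columns

    write_header = True
    for i in range(1, nlines, chunksize):        # start at 1 so the CSV header row is skipped
        df = pd.read_csv(infile, sep=sep, header=None, nrows=chunksize, skiprows=i)
        df.columns = columns                     # restore the names stripped by header=None
        df[colname] = df[colname].apply(
            lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
        df.to_csv(outfile, index=False, mode='a', header=write_header)
        write_header = False                     # header only on the first chunk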
@@ -148,7 +169,8 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+ parser.add_argument('--chunksize', dest='chunksize',
+ help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to sha256')
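The --chunksize help text mentions human-readable notation (e.g. '1M'). The diff does not show how that string is converted to a row count; purely as an illustration, the hypothetical helper below (its name and suffix table are not from csv-hasher.py) shows one way such a value could be parsed:

    # Hypothetical helper, not part of csv-hasher.py: convert '1M' -> 1000000.
    SUFFIXES = {'k': 10**3, 'K': 10**3, 'M': 10**6, 'G': 10**9}

    def parse_chunksize(value):
        value = value.strip()
        if value and value[-1] in SUFFIXES:
            return int(float(value[:-1]) * SUFFIXES[value[-1]])
        return int(value)

    assert parse_chunksize('1M') == 1_000_000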