#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import argparse
import pandas as pd
import hashlib
import subprocess
import humanfriendly
from sys import exit
from tqdm import tqdm
class CsvHasher:
    """Hashes a column from a CSV file.

    The configured column is replaced in-place by the hexadecimal digest
    of each value, using a configurable hashlib algorithm.
    """

    def __init__(self, args):
        """Store CLI arguments and validate the input file and hash function.

        :param args: argparse.Namespace with infile, outfile, colname,
                     sep, chunksize, hashfunc, progress and check attributes.
        """
        # Save arguments
        self.args = args

        # Accept human-friendly sizes such as "1M" for the chunk size
        self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))

        # Check if the source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        # Fail early if the requested hash function is not in hashlib
        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df, skip=0):
        """Apply the hash function to a column from a dataframe.

        :param df:   DataFrame holding the column to hash.
        :param skip: number of leading rows to leave out, defaults to 0.
        :return:     Series of hex digests for the remaining rows.
        """
        # Hoist the attribute lookup out of the per-row lambda
        hashfunc = getattr(hashlib, self.args.hashfunc)

        return df[self.args.colname[0]][skip:].apply(
                lambda x: hashfunc(str(x).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and write
        the output in a single step.

        This won't work with CSVs larger than the available memory in the
        system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        # Read the CSV through a chunked iterator, then materialize it.
        reader = pd.read_csv(self.args.infile[0], sep=self.args.sep,
                             iterator=True, chunksize=self.args.chunksize)

        # Bugfix: the original code concatenated an undefined name ("tp"),
        # which raised NameError; concatenate the reader's chunks instead.
        df = pd.concat(reader, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files: read, hash and append one
        chunk at a time so memory usage stays bounded.

        :return: True on success, False on error.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """
        # Shorthands
        infile = self.args.infile[0]
        outfile = self.args.outfile[0]

        # Get the number of lines in the CSV file. Pass an argument list
        # with shell=False so filenames containing spaces or shell
        # metacharacters cannot break the command or inject shell code.
        nlines = int(subprocess.check_output(['wc', '-l', infile]).split()[0])

        # Check the input file: need at least a header plus one data row
        if nlines < 2:
            print('CSV file is too small.')
            return False

        # Holds columns definition
        columns = None

        # Read a chunk just to get the column names
        with pd.read_csv(infile, sep=self.args.sep, iterator=True,
                         chunksize=self.args.chunksize) as sample:
            for chunk in sample:
                columns = chunk.columns
                break

        # Check for the column
        if self.args.colname[0] not in columns:
            print('Column not found: ' + self.args.colname[0])
            return False

        # Start with an empty output file
        try:
            with open(outfile, 'w') as f:
                f.truncate(0)
        except IOError:
            print('Error writing to ' + outfile)
            return False

        # Initialize the progress bar (None when disabled)
        progress_bar = tqdm(total=nlines) if self.args.progress else None

        # Controls if the header should be included
        write_header = True

        # Start iteration from 1 so the CSV header is skipped
        for i in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                             sep=self.args.sep,
                             header=None,                # header set manually below
                             nrows=self.args.chunksize,  # rows read per iteration
                             skiprows=i)                 # skip rows already read

            # Add column information
            df.columns = columns

            # Hashing the column
            try:
                df[self.args.colname[0]] = self.apply_hash(df)
            except KeyError:
                print('Column not found: ' + self.args.colname[0])
                return False

            # Append this chunk to the CSV output; the header is written
            # only on the first iteration.
            df.to_csv(outfile, index=False, mode='a', header=write_header)
            write_header = False

            if progress_bar is not None:
                progress_bar.update(self.args.chunksize)

        # Teardown
        if progress_bar is not None:
            progress_bar.close()

        return True

    def check(self):
        """Check both files for differences, with and without the hashed
        column, printing the comparison (test-suite helper)."""
        df_infile = pd.read_csv(self.args.infile[0], sep=self.args.sep)
        df_outfile = pd.read_csv(self.args.outfile[0], sep=self.args.sep)

        print('Comparing both files without excluding the ' + self.args.colname[0] + ' column:')
        print(df_infile.compare(df_outfile))

        del df_infile[self.args.colname[0]]
        del df_outfile[self.args.colname[0]]

        print('Comparing both files excluding the ' + self.args.colname[0] + ' column:')
        print(df_infile.compare(df_outfile))
def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments (argparse.Namespace).
    """
    # Defaults
    chunksize = '1M'
    hashfunc = 'sha256'
    progress = True

    # Parse CLI
    epilog = ''
    parser = argparse.ArgumentParser(
            description='Hashes a column from a CSV file.',
            epilog=epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('infile', nargs=1, help='CSV input file name')
    parser.add_argument('outfile', nargs=1, help='CSV output file name')
    parser.add_argument('colname', nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize',
                        help='Read chunks at a time, supports human-readable notation, defaults to ' + chunksize)

    # Typo fix in the help string: "defaults do" -> "defaults to"
    parser.add_argument('--hashfunc', dest='hashfunc',
                        help='Hash function, defaults to ' + hashfunc)

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar, defaults to ' + str(progress))

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    parser.add_argument('--check', dest='check', action='store_true',
                        help='Check both files for differences (test suite), defaults to False')

    # Add default values and get args
    parser.set_defaults(sep=',')
    parser.set_defaults(chunksize=chunksize)
    parser.set_defaults(hashfunc=hashfunc)
    parser.set_defaults(progress=progress)
    parser.set_defaults(check=False)

    return parser.parse_args()
if __name__ == "__main__":
    # Parse arguments and build the hasher (exits on invalid input)
    args = cmdline()
    instance = CsvHasher(args)

    # run() returns False on error
    if instance.run() is False:
        exit(1)

    # Optionally compare input and output (test-suite mode);
    # truthiness test instead of "== True"
    if args.check:
        instance.check()