path: root/csv-hasher.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
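#
# Example invocations (file and column names here are hypothetical):
#
#   ./csv-hasher.py input.csv output.csv email
#   ./csv-hasher.py --hashfunc md5 --chunksize 500K input.csv output.csv email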

import os
import argparse
import pandas as pd
import hashlib
import subprocess
import humanfriendly
from sys  import exit
from tqdm import tqdm

class CsvHasher:
    """Hashes a column from a CSV file"""

    def __init__(self, args):
        # Save arguments
        self.args           = args
        self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))
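        # e.g. humanfriendly.parse_size('1M') == 1000000 (decimal SI units),
        # so the default processes one million rows per chunk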

        # Check if source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df, skip=0):
        """Apply the hash function into a column from a dataframe"""

        return df[self.args.colname[0]][skip:].apply(lambda x: \
                getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and write
        the output in a single step.

        This won't work with CSVs larger than the available memory in the
        system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Read the CSV in chunks, then assemble a single dataframe in memory
        tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(tp, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Shorthands
        infile  = self.args.infile[0]
        outfile = self.args.outfile[0]

        # Get the number of lines in the CSV file; passing the file name as a
        # separate list argument avoids shell quoting issues
        nlines = subprocess.check_output(['wc', '-l', infile])
        nlines = int(nlines.split()[0])

        # Check the input file
        if nlines < 2:
            print('CSV file is too small.')
            return False

        # Read just the header row to get the column names
        columns = pd.read_csv(infile, sep=self.args.sep, nrows=0).columns

        # Check for the column
        if self.args.colname[0] not in columns:
            print('Column not found: ' + self.args.colname[0])
            return False

        # Start with an empty file
        try:
            with open(outfile, 'w') as f:
                f.truncate(0)
        except IOError:
            print('Error writing to ' + outfile)
            return False

        # Initialize the progress bar (None when disabled)
        progress_bar = tqdm(total=nlines) if self.args.progress else None

        # Controls if the header should be included
        write_header = True

        # Start iteration from 1 so the CSV header is skipped
        for i in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                    sep=self.args.sep,
                    header=None,               # no header, define column header manually later
                    nrows=self.args.chunksize, # number of rows to read at each iteration
                    skiprows=i)                # skip rows that were already read
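            # For example, with chunksize=3 this reads rows 1-3, then rows 4-6,
            # and so on; row 0 (the header) was handled separately above.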

            # Add column information
            df.columns = columns

            # Hashing the column
            try:
                df[self.args.colname[0]] = self.apply_hash(df)
            except KeyError:
                print('Column not found: ' + self.args.colname[0])
                return False

            # Writing the new CSV output
            df.to_csv(outfile, index=False, mode='a', header=write_header)

            # Write the header only in the first iteration
            write_header = False

            if progress_bar is not None:
                progress_bar.update(self.args.chunksize)

        # Teardown
        if progress_bar is not None:
            progress_bar.close()

        return True

    def check(self):
        """Check both files for differences"""

        df_infile  = pd.read_csv(self.args.infile[0],  sep=self.args.sep)
        df_outfile = pd.read_csv(self.args.outfile[0], sep=self.args.sep)

        print('Comparing both files without excluding the ' + self.args.colname[0] + ' column:')
        print(df_infile.compare(df_outfile))

        del df_infile[self.args.colname[0]]
        del df_outfile[self.args.colname[0]]

        print('Comparing both files excluding the ' + self.args.colname[0] + ' column:')
        print(df_infile.compare(df_outfile))

def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments.
    """

    # Defaults
    basename  = os.path.basename(__file__)
    chunksize = '1M'
    hashfunc  = 'sha256'
    progress  = True
    check     = False

    # Parse CLI
    #examples  = "Examples:\n\t" + basename + " --no-progress \n"

    epilog = ''
    parser = argparse.ArgumentParser(description='Hashes a column from a CSV file.',
                                     epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,)

    parser.add_argument('infile',   nargs=1, help='CSV input file name')
    parser.add_argument('outfile',  nargs=1, help='CSV output file name')
    parser.add_argument('colname',  nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize',
            help='Read chunks at a time, supports human-readable notation, defaults to ' + chunksize)

    parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to ' + hashfunc)

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar, defaults to ' + str(progress))

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    parser.add_argument('--check', dest='check', action='store_true',
                        help='Check both files for differences (test suite), defaults to ' + str(check))

    # Add default values and get args
    parser.set_defaults(sep=',')
    parser.set_defaults(chunksize=chunksize)
    parser.set_defaults(hashfunc=hashfunc)
    parser.set_defaults(progress=progress)
    parser.set_defaults(check=check)
    args = parser.parse_args()

    return args

if __name__ == "__main__":
    args     = cmdline()
    instance = CsvHasher(args)
    status   = instance.run()

    if status is False:
        exit(1)

    if args.check:
        instance.check()
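
# The hasher can also be driven programmatically; a minimal sketch, assuming
# an input file with an 'id' column (all names here are hypothetical):
#
#   from argparse import Namespace
#   hasher = CsvHasher(Namespace(infile=['in.csv'], outfile=['out.csv'],
#                                colname=['id'], sep=',', chunksize='1M',
#                                hashfunc='sha256', progress=False))
#   hasher.run()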