#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import argparse
import pandas as pd
import hashlib
import subprocess
import humanfriendly
from sys import exit
from tqdm import tqdm

class CsvHasher:
    """Hashes a column from a CSV file"""

    def __init__(self, args):
        # Save arguments
        self.args           = args
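        # humanfriendly lets the chunk size be given in human-readable
        # notation: e.g. parse_size('1M') -> 1000000, parse_size('1MiB') -> 1048576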
        self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))

        # Check if source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit(1)

        if not hasattr(hashlib, self.args.hashfunc):
            print('Invalid hash function: ' + self.args.hashfunc)
            exit(1)

    def apply_hash(self, df, skip=0):
        """Apply the hash function into a column from a dataframe"""

        return df[self.args.colname[0]][skip:].apply(
                lambda x: getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
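
    # For reference, with the default sha256 a single value is hashed like
    # this (hypothetical input 'foo'):
    #
    #   hashlib.sha256(str('foo').encode('utf-8')).hexdigest()
    #   # -> '2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae'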

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and write
        the output in a single step.

        This won't work with CSVs larger than the available memory in the
        system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Read the CSV in chunks and concatenate into a single dataframe
        tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(tp, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Shorthands
        infile  = self.args.infile[0]
        outfile = self.args.outfile[0]

        # Get the number of lines in the CSV file; pass the arguments as a
        # list instead of using shell=True so the file name is not subject
        # to shell interpretation
        nlines = subprocess.check_output(['wc', '-l', infile])
        nlines = int(nlines.split()[0])
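        # For example, for a hypothetical input.csv with a header row and one
        # million data rows, `wc -l` prints b'1000001 input.csv', which the
        # line above parses to 1000001.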

        # Check the input file
        if nlines < 2:
            print('CSV file is too small.')
            exit(1)

        # Start with an empty file
        try:
            with open(outfile, 'w') as f:
                f.truncate(0)
        except IOError:
            print('Error writing to ' + outfile)
            exit(1)

        # Read just the header row to get the column names
        columns = pd.read_csv(infile, sep=self.args.sep, nrows=0).columns

        # Initialize the progress bar (None when progress is disabled)
        progress_bar = tqdm(total=nlines) if self.args.progress else None

        # Controls whether the header should be included
        write_header = True

        # Start iteration from 1 so the CSV header is skipped
        for i in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                    sep=self.args.sep,
                    header=None,               # no header, define column header manually later
                    nrows=self.args.chunksize, # number of rows to read at each iteration
                    skiprows=i)                # skip rows that were already read
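            # Sketch of the windowing: with a chunksize of 3, iteration i=1
            # reads data rows 1-3, i=4 reads rows 4-6, and so on (row 0 in
            # the file is the header).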

            # Add column information
            df.columns = columns

            # Hashing the column
            try:
                df[self.args.colname[0]] = self.apply_hash(df)
            except KeyError:
                print('Column not found: ' + self.args.colname[0])
                exit(1)

            # Writing the new CSV output
            df.to_csv(outfile, index=False, mode='a', header=write_header)

            # Only the first chunk carries the header
            write_header = False

            if progress_bar is not None:
                progress_bar.update(self.args.chunksize)

        # Teardown
        if progress_bar is not None:
            progress_bar.close()

def cmdline():
    """
    Evaluate the command line.

    :return: Command line arguments.
    """

    basename = os.path.basename(__file__)

    # Parse CLI
    #examples  = "Examples:\n\t" + basename + " --no-progress \n"

    epilog = ''
    parser = argparse.ArgumentParser(description='Hashes a column from a CSV file.',
                                     epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,)

    parser.add_argument('infile',   nargs=1, help='CSV input file name')
    parser.add_argument('outfile',  nargs=1, help='CSV output file name')
    parser.add_argument('colname',  nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize',
            help='Chunk size used when reading, defaults to 1M, supports human-readable notation')

    parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to sha256')

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    # Add default values and get args
    parser.set_defaults(sep=',')
    parser.set_defaults(chunksize='1M')
    parser.set_defaults(hashfunc='sha256')
    parser.set_defaults(progress=True)
    args = parser.parse_args()

    return args

if __name__ == "__main__":
    args     = cmdline()
    instance = CsvHasher(args)

    instance.run()
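
# A minimal usage sketch (file and column names are hypothetical):
#
#   ./csv-hasher.py input.csv output.csv email
#   ./csv-hasher.py --hashfunc sha512 --chunksize 500K --no-progress in.csv out.csv id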