#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Hash a given column from a CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import argparse
import pandas as pd
import hashlib
import subprocess
import humanfriendly
from sys  import exit
from tqdm import tqdm

class CsvHasher:
    """Hashes a column from a CSV file"""

    def __init__(self, args):
        # Save arguments
        self.args           = args
        self.args.chunksize = int(humanfriendly.parse_size(self.args.chunksize))

        # Check if source file exists
        if not os.path.exists(args.infile[0]):
            print('File not found: ' + args.infile[0])
            exit (1)

        if hasattr(hashlib, self.args.hashfunc) is False:
            print('Invalid hash function ' + self.args.hashfunc)
            exit (1)

    def apply_hash(self, df, skip=0):
        """Apply the hash function into a column from a dataframe"""

        return df[self.args.colname[0]][skip:].apply(lambda x: \
                getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())

    def run_legacy(self):
        """
        Process CSV in "legacy" mode: open the input file, process and write
        the output in a single step.

        This won't work with CSVs larger than the available memory in the
        system.

        Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Read the CSV
        df = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
        df = pd.concat(tp, ignore_index=True)

        # Hashing the column
        df[self.args.colname[0]] = self.apply_hash(df)

        # Writing the new CSV output
        df.to_csv(self.args.outfile[0], index=False)

    def run(self):
        """
        Improved CSV processor for large files.

        Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
        """

        # Shorthands
        infile  = self.args.infile[0]
        outfile = self.args.outfile[0]

        # Get number of lines in the CSV file
        nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
        nlines = int(nlines.split()[0])

        # Check the input file
        if nlines < 2:
            print('CSV file is too small.')
            return False

        # Holds columns definition
        columns = None

        # Read a chunk just to get the column names
        with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
            for chunk in sample:
                columns = chunk.columns
                break

        # Check for the column
        if self.args.colname[0] not in columns:
            print('Column not found: ' + self.args.colname[0])
            return False

        # Start with an empty file
        try:
            with open(outfile, 'w') as f:
                f.truncate(0)
        except IOError:
            print('Error writing to ' + outfile)
            return False

        # Initialize progress bar
        progress_bar = tqdm(total=nlines) if self.args.progress else False

        # Controls if the header should be included
        write_header = True

        # Start iteration from 1 so the CSV header is skipped
        for i in range(1, nlines, self.args.chunksize):
            df = pd.read_csv(infile,
                    sep=self.args.sep,
                    header=None,               # no header, define column header manually later
                    nrows=self.args.chunksize, # number of rows to read at each iteration
                    skiprows=i)                # skip rows that were already read

            # Add column information
            df.columns = columns

            # Hashing the column
            try:
                df[self.args.colname[0]] = self.apply_hash(df)
            except KeyError as e:
                print('Column not found: ' + self.args.colname[0])
                return False

            # Writing the new CSV output
            df.to_csv(outfile, index=False, mode='a', header=write_header)

            # Write the header only in the first iteration
            write_header = False

            if hasattr(progress_bar, 'update'):
                progress_bar.update(self.args.chunksize)

        # Teardown
        if hasattr(progress_bar, 'close'):
            progress_bar.close()

    def check(self):
        """Check both files for differences"""

        df_infile  = pd.read_csv(self.args.infile[0],  sep=self.args.sep)
        df_outfile = pd.read_csv(self.args.outfile[0], sep=self.args.sep)

        print('Comparing both files without excluding the ' + self.args.colname[0] + ' column:')
        print(df_infile.compare(df_outfile))

        del df_infile[self.args.colname[0]]
        del df_outfile[self.args.colname[0]]

        print('Comparing both files excluding the ' + self.args.colname[0] + ' column:')
        print(df_infile.compare(df_outfile))

def cmdline():
    """
    Evalutate the command line.

    :return: Command line arguments.
    """

    # Defaults
    basename  = os.path.basename(__file__)
    chunksize = '1M'
    hashfunc  = 'sha256'
    progress  = True

    # Parse CLI
    #examples  = "Examples:\n\t" + basename + " --no-progress \n"

    epilog = ''
    parser = argparse.ArgumentParser(description='Hashes a column from a CSV file.',
                                     epilog=epilog,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,)

    parser.add_argument('infile',   nargs=1, help='CSV input file name')
    parser.add_argument('outfile',  nargs=1, help='CSV output file name')
    parser.add_argument('colname',  nargs=1, help='Column name')

    parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')

    parser.add_argument('--chunksize', dest='chunksize',
            help='Read chunks at a time, supports human-readable notation, defaults to ' + chunksize)

    parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults do ' + hashfunc)

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar, defaults to ' + str(progress))

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    parser.add_argument('--check', dest='check', action='store_true',
                        help='Check both files for differences (test suite), defaults to ' + str(not progress))

    # Add default values and get args
    parser.set_defaults(sep=',')
    parser.set_defaults(chunksize=chunksize)
    parser.set_defaults(hashfunc=hashfunc)
    parser.set_defaults(progress=True)
    parser.set_defaults(check=False)
    args = parser.parse_args()

    return args

if __name__ == "__main__":
    args     = cmdline()
    instance = CsvHasher(args)
    status   = instance.run()

    if status is False:
        exit(1)

    if args.check == True:
        instance.check()