author     Silvio Rhatto <rhatto@riseup.net>  2021-01-28 21:41:53 -0300
committer  Silvio Rhatto <rhatto@riseup.net>  2021-01-28 21:41:53 -0300
commit     caac6a103f1a76a4ec4a096bb569cd7820a0ff14 (patch)
tree       1c41a36ed49e85afb95b64722632818503c2e77f
parent     ae3abe5a4c14a2e1b50aaf1f41e3225a5c34140b (diff)
Fix: improvements and tests for large files
-rw-r--r--  Makefile         |  29
-rw-r--r--  Pipfile          |   2
-rw-r--r--  Pipfile.lock     |  18
-rwxr-xr-x  bin/make-sample  |   2
-rwxr-xr-x  csv-hasher.py    |  46
-rwxr-xr-x  csv-sampler.py   | 105
6 files changed, 182 insertions(+), 20 deletions(-)
diff --git a/Makefile b/Makefile
index 16311a1..1a13c56 100644
--- a/Makefile
+++ b/Makefile
@@ -2,20 +2,37 @@
# Makefile for csv-hasher
#
+CHUNKSIZE = 10000
+CHECK_LINES = 20
+SAMPLE_ITERATIONS = 1000
+SAMPLE_ROWS_PER_ITERATION = 1000
+TESTS = tests
+COLNAME = id
+SAMPLE = $(TESTS)/sample.csv
+OUTPUT = $(TESTS)/output.csv
+
vendor:
pipenv install
sample:
- bin/make-sample 200
+ @#bin/make-sample $(SAMPLE_ITERATIONS)
+ pipenv run ./csv-sampler.py --iterations $(SAMPLE_ITERATIONS) --rows_per_iteration $(SAMPLE_ROWS_PER_ITERATION) $(SAMPLE)
test-sample:
- pipenv run ./csv-hasher.py --chunksize 5 tests/sample.csv tests/output.csv id
+ pipenv run ./csv-hasher.py --chunksize $(CHUNKSIZE) $(SAMPLE) $(OUTPUT) $(COLNAME)
show-test-output:
- head -20 tests/sample.csv
- head -20 tests/output.csv
+ head -$(CHECK_LINES) $(SAMPLE)
+ head -$(CHECK_LINES) $(OUTPUT)
+ tail -$(CHECK_LINES) $(SAMPLE)
+ tail -$(CHECK_LINES) $(OUTPUT)
+ wc -l $(SAMPLE)
+ wc -l $(OUTPUT)
+ ls -lh $(TESTS)
clean-sample:
- rm tests/*.csv
+ rm -f tests/*.csv
+
+clean: clean-sample
-test: clean-sample sample test-sample show-test-output clean-sample
+test: clean-sample sample test-sample show-test-output
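(Since the new knobs are ordinary Make variables, they can be overridden per invocation without editing the Makefile, e.g. "make test CHUNKSIZE=50000 SAMPLE_ITERATIONS=10" for a quicker run; that is standard Make behavior, not something this patch adds.)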
diff --git a/Pipfile b/Pipfile
index 30be324..756589a 100644
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,8 @@ name = "pypi"
pandas = "*"
tqdm = "*"
humanfriendly = "*"
+numpy = "*"
+Faker = "*"
[dev-packages]
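(The two new entries back the csv-sampler.py script introduced below; note the script only imports numpy directly. Faker 5.8.0 pulls in text-unidecode, which is why that package appears in the Pipfile.lock hunk that follows, and numpy gains an "index": "pypi" marker there because it is now a declared direct dependency rather than a transitive one.)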
diff --git a/Pipfile.lock b/Pipfile.lock
index cd7b1b2..56b7acc 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e"
+ "sha256": "48063038d08edcb167714b008be15beec84ab053a15d4504cea5d4c7ca6ed321"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,14 @@
]
},
"default": {
+ "faker": {
+ "hashes": [
+ "sha256:0783729c61501d52efea2967aff6e6fcb8370f0f6b5a558f2a81233642ae529a",
+ "sha256:6b2995ffff6c2b02bc5daad96f8c24c021e5bd491d9d53d31bcbd66f348181d4"
+ ],
+ "index": "pypi",
+ "version": "==5.8.0"
+ },
"humanfriendly": {
"hashes": [
"sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d",
@@ -61,6 +69,7 @@
"sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827",
"sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60"
],
+ "index": "pypi",
"version": "==1.19.5"
},
"pandas": {
@@ -107,6 +116,13 @@
],
"version": "==1.15.0"
},
+ "text-unidecode": {
+ "hashes": [
+ "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
+ "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
+ ],
+ "version": "==1.3"
+ },
"tqdm": {
"hashes": [
"sha256:4621f6823bab46a9cc33d48105753ccbea671b68bab2c50a9f0be23d4065cb5a",
diff --git a/bin/make-sample b/bin/make-sample
index c282a30..7d405b5 100755
--- a/bin/make-sample
+++ b/bin/make-sample
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Build a sample dataset.
+# Build a sample dataset, shell script version.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
diff --git a/csv-hasher.py b/csv-hasher.py
index fe206f8..6415c7b 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -44,16 +44,19 @@ class CsvHasher:
print('Invalid hash function ' + self.args.hashfunc)
exit (1)
- def apply_hash(self, df):
+ def apply_hash(self, df, skip=0):
"""Apply the hash function into a column from a dataframe"""
- return df[self.args.colname[0]].apply(lambda x: \
+ return df[self.args.colname[0]][skip:].apply(lambda x: \
getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
def run_legacy(self):
"""
- Process CSV in "legacy" mode: open the input file, process and write the output in a single step.
- This won't work with CSVs larger than the available memory in the system.
+ Process CSV in "legacy" mode: open the input file, process and write
+ the output in a single step.
+
+ This won't work with CSVs larger than the available memory in the
+ system.
Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
@@ -76,26 +79,44 @@ class CsvHasher:
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
"""
- infile = self.args.infile[0]
+ # Shorthands
+ infile = self.args.infile[0]
+ outfile = self.args.outfile[0]
# Get number of lines in the CSV file
nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
nlines = int(nlines.split()[0])
+ # Check the input file
if nlines < 2:
print('CSV file is too small.')
exit (1)
- # Read the just to get the column names
- sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
- sample = pd.concat(sample_tp, ignore_index=True)
+ # Start with an empty file
+ try:
+ with open(outfile, 'w') as f:
+ f.truncate(0)
+ except IOError:
+ print('Error writing to ' + outfile)
+ exit(1)
+
+ # Holds columns definition
+ columns = None
+
+ # Read a chunk just to get the column names
+ with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
+ for chunk in sample:
+ columns = chunk.columns
+ break
# Initialize progress bar
progress_bar = tqdm(total=nlines) if self.args.progress else False
+ # Controls if the header should be included
write_header = True
- for i in range(0, nlines, self.args.chunksize):
+ # Start iteration from 1 so the CSV header is skipped
+ for i in range(1, nlines, self.args.chunksize):
df = pd.read_csv(infile,
sep=self.args.sep,
header=None, # no header, define column header manually later
@@ -103,7 +124,7 @@ class CsvHasher:
skiprows=i) # skip rows that were already read
# Add column information
- df.columns = sample.columns
+ df.columns = columns
# Hashing the column
try:
@@ -113,7 +134,7 @@ class CsvHasher:
exit (1)
# Writing the new CSV output
- df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header)
+ df.to_csv(outfile, index=False, mode='a', header=write_header)
# Write the header only in the first iteration
write_header = False
@@ -148,7 +169,8 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+ parser.add_argument('--chunksize', dest='chunksize',
+ help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to sha256')
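(Stepping back from the patch context: the loop above re-reads the input at a moving skiprows offset so that only one chunk is ever resident in memory, hashing the target column and appending to the output as it goes. Below is a minimal sketch of the same idea using pandas' built-in chunk iterator instead of the manual skiprows bookkeeping; the paths, column name, and chunk size are the test defaults from the Makefile above, not part of csv-hasher.py itself.)

    # Sketch: hash one column of a large CSV chunk-by-chunk.
    import hashlib
    import pandas as pd

    infile, outfile, colname = 'tests/sample.csv', 'tests/output.csv', 'id'
    write_header = True

    for chunk in pd.read_csv(infile, chunksize=10000):
        # Replace the column in place with its sha256 hex digest
        chunk[colname] = chunk[colname].apply(
            lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
        # 'w' truncates the output on the first chunk, 'a' appends afterwards
        chunk.to_csv(outfile, index=False, mode='w' if write_header else 'a',
                     header=write_header)
        write_header = False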
diff --git a/csv-sampler.py b/csv-sampler.py
new file mode 100755
index 0000000..35d82db
--- /dev/null
+++ b/csv-sampler.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate a sample CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+class CsvSampler:
+ def __init__(self, args):
+ self.args = args
+ self.args.iterations = int(self.args.iterations)
+ self.args.rows_per_iteration = int(self.args.rows_per_iteration)
+
+ @staticmethod
+ def random_col(size, low=1):
+ return np.random.randint(low, size, size=size)
+
+ # Inspired by
+ # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
+ def write_csv(self, write_header=True, mode='w'):
+ df = pd.DataFrame(columns=['id', 'a', 'b', 'c', 'd'])
+ df['id'] = self.random_col(self.args.rows_per_iteration)
+ df['a'] = self.random_col(self.args.rows_per_iteration)
+ df['b'] = self.random_col(self.args.rows_per_iteration)
+ df['c'] = self.random_col(self.args.rows_per_iteration)
+ df['d'] = self.random_col(self.args.rows_per_iteration)
+
+ df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)
+
+ def run(self):
+ progress_bar = tqdm(total=self.args.iterations) if self.args.progress else False
+
+ # Write the first portion
+ self.write_csv()
+
+ # Dispatch
+ for i in range(0, self.args.iterations):
+ self.write_csv(write_header=False, mode='a')
+
+ if hasattr(progress_bar, 'update'):
+ progress_bar.update(1)
+
+def cmdline():
+ """
+ Evaluate the command line.
+
+ :return: Command line arguments.
+ """
+
+ basename = os.path.basename(__file__)
+
+ # Parse CLI
+ #examples = "Examples:\n\t" + basename + " --no-progress \n"
+
+ epilog = ''
+ parser = argparse.ArgumentParser(description='Generates a sample CSV file.',
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter,)
+
+ parser.add_argument('outfile', nargs=1, help='CSV output file name')
+
+ parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
+ type=int, help='Rows per iteration, defaults to 1000')
+
+ parser.add_argument('--iterations', dest='iterations',
+ help='Number of iterations, defaults to 1000')
+
+ parser.add_argument('--progress', dest='progress', action='store_true',
+ help='Enable progress bar.')
+
+ parser.add_argument('--no-progress', dest='progress', action='store_false',
+ help='Disable progress bar.')
+
+ # Add default values and get args
+ parser.set_defaults(rows_per_iteration=1000)
+ parser.set_defaults(iterations=1000)
+ parser.set_defaults(progress=True)
+ args = parser.parse_args()
+
+ return args
+
+if __name__ == "__main__":
+ args = cmdline()
+ instance = CsvSampler(args)
+
+ instance.run()
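(With the Makefile defaults above, the generator is invoked as

    pipenv run ./csv-sampler.py --iterations 1000 --rows_per_iteration 1000 tests/sample.csv

which writes one initial batch plus 1000 appended batches of 1000 rows each, i.e. roughly 1,001,000 data rows, large enough to exercise the chunked code path in csv-hasher.py.)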