-rw-r--r--   Makefile         |  29
-rw-r--r--   Pipfile          |   2
-rw-r--r--   Pipfile.lock     |  18
-rwxr-xr-x   bin/make-sample  |   2
-rwxr-xr-x   csv-hasher.py    |  46
-rwxr-xr-x   csv-sampler.py   | 105
6 files changed, 182 insertions, 20 deletions
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -2,20 +2,37 @@
 # Makefile for csv-hasher
 #
+CHUNKSIZE                 = 10000
+CHECK_LINES               = 20
+SAMPLE_ITERATIONS         = 1000
+SAMPLE_ROWS_PER_ITERATION = 1000
+TESTS                     = tests
+COLNAME                   = id
+SAMPLE                    = $(TESTS)/sample.csv
+OUTPUT                    = $(TESTS)/output.csv
+
 vendor:
 	pipenv install

 sample:
-	bin/make-sample 200
+	@#bin/make-sample $(ITERATIONS)
+	pipenv run ./csv-sampler.py --iterations $(SAMPLE_ITERATIONS) --rows_per_iteration $(SAMPLE_ROWS_PER_ITERATION) $(SAMPLE)

 test-sample:
-	pipenv run ./csv-hasher.py --chunksize 5 tests/sample.csv tests/output.csv id
+	pipenv run ./csv-hasher.py --chunksize $(CHUNKSIZE) $(SAMPLE) $(OUTPUT) $(COLNAME)

 show-test-output:
-	head -20 tests/sample.csv
-	head -20 tests/output.csv
+	head -$(CHECK_LINES) $(SAMPLE)
+	head -$(CHECK_LINES) $(OUTPUT)
+	tail -$(CHECK_LINES) $(SAMPLE)
+	tail -$(CHECK_LINES) $(OUTPUT)
+	wc -l $(SAMPLE)
+	wc -l $(OUTPUT)
+	ls -lh $(TESTS)

 clean-sample:
-	rm tests/*.csv
+	rm -f tests/*.csv
+
+clean: clean-sample

-test: clean-sample sample test-sample show-test-output clean-sample
+test: clean-sample sample test-sample show-test-output
diff --git a/Pipfile b/Pipfile
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,8 @@ name = "pypi"
 pandas = "*"
 tqdm = "*"
 humanfriendly = "*"
+numpy = "*"
+Faker = "*"

 [dev-packages]
diff --git a/Pipfile.lock b/Pipfile.lock
index cd7b1b2..56b7acc 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e"
+            "sha256": "48063038d08edcb167714b008be15beec84ab053a15d4504cea5d4c7ca6ed321"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -16,6 +16,14 @@
         ]
     },
     "default": {
+        "faker": {
+            "hashes": [
+                "sha256:0783729c61501d52efea2967aff6e6fcb8370f0f6b5a558f2a81233642ae529a",
+                "sha256:6b2995ffff6c2b02bc5daad96f8c24c021e5bd491d9d53d31bcbd66f348181d4"
+            ],
+            "index": "pypi",
+            "version": "==5.8.0"
+        },
         "humanfriendly": {
             "hashes": [
                 "sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d",
@@ -61,6 +69,7 @@
                 "sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827",
                 "sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60"
             ],
+            "index": "pypi",
             "version": "==1.19.5"
         },
         "pandas": {
@@ -107,6 +116,13 @@
             ],
             "version": "==1.15.0"
         },
+        "text-unidecode": {
+            "hashes": [
+                "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
+                "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
+            ],
+            "version": "==1.3"
+        },
         "tqdm": {
             "hashes": [
                 "sha256:4621f6823bab46a9cc33d48105753ccbea671b68bab2c50a9f0be23d4065cb5a",
diff --git a/bin/make-sample b/bin/make-sample
index c282a30..7d405b5 100755
--- a/bin/make-sample
+++ b/bin/make-sample
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Build a sample dataset.
+# Build a sample dataset, shell script version.
 #
 # Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
 #
diff --git a/csv-hasher.py b/csv-hasher.py
index fe206f8..6415c7b 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -44,16 +44,19 @@ class CsvHasher:
             print('Invalid hash function ' + self.args.hashfunc)
             exit (1)

-    def apply_hash(self, df):
+    def apply_hash(self, df, skip=0):
         """Apply the hash function into a column from a dataframe"""
-        return df[self.args.colname[0]].apply(lambda x: \
+        return df[self.args.colname[0]][skip:].apply(lambda x: \
                 getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())

     def run_legacy(self):
         """
-        Process CSV in "legacy" mode: open the input file, process and write the output in a single step.
-        This won't work with CSVs larger than the available memory in the system.
+        Process CSV in "legacy" mode: open the input file, process and write
+        the output in a single step.
+
+        This won't work with CSVs larger than the available memory in the
+        system.

         Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
         Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
@@ -76,26 +79,44 @@ class CsvHasher:
         Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
         """

-        infile = self.args.infile[0]
+        # Shorthands
+        infile = self.args.infile[0]
+        outfile = self.args.outfile[0]

         # Get number of lines in the CSV file
         nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
         nlines = int(nlines.split()[0])

+        # Check the input file
         if nlines < 2:
             print('CSV file is too small.')
             exit (1)

-        # Read the just to get the column names
-        sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
-        sample = pd.concat(sample_tp, ignore_index=True)
+        # Start with an empty file
+        try:
+            with open(outfile, 'w') as f:
+                f.truncate(0)
+        except IOError:
+            print('Error writing to ' + outfile)
+            exit(1)
+
+        # Holds columns definition
+        columns = None
+
+        # Read a chunk just to get the column names
+        with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
+            for chunk in sample:
+                columns = chunk.columns
+                break

         # Initialize progress bar
         progress_bar = tqdm(total=nlines) if self.args.progress else False

+        # Controls if the header should be included
         write_header = True

-        for i in range(0, nlines, self.args.chunksize):
+        # Start iteration from 1 so the CSV header is skipped
+        for i in range(1, nlines, self.args.chunksize):
             df = pd.read_csv(infile,
                     sep=self.args.sep,
                     header=None, # no header, define column header manually later
@@ -103,7 +124,7 @@ class CsvHasher:
                     skiprows=i)  # skip rows that were already read

             # Add column information
-            df.columns = sample.columns
+            df.columns = columns

             # Hashing the column
             try:
@@ -113,7 +134,7 @@ class CsvHasher:
                 exit (1)

             # Writing the new CSV output
-            df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header)
+            df.to_csv(outfile, index=False, mode='a', header=write_header)

             # Write the header only in the first iteration
             write_header = False
@@ -148,7 +169,8 @@ def cmdline():
     parser.add_argument('--sep', dest='sep',
                         help='Separator, defaults to ","')

-    parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+    parser.add_argument('--chunksize', dest='chunksize',
+                        help='Read chunks at a time, defaults to 1M, supports human-readable notation')

     parser.add_argument('--hashfunc', dest='hashfunc',
                         help='Hash function, defaults to sha256')
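
The rewritten loop above keeps manual skiprows bookkeeping so tqdm can track progress by line count, re-reading the file once per chunk. For comparison, a minimal sketch of the same hash-a-column-in-chunks idea written against the plain chunksize iterator that pandas provides; the function name hash_column_in_chunks and the fixed sha256 choice are illustrative assumptions, not part of the commit:

    import hashlib
    import pandas as pd

    def hash_column_in_chunks(infile, outfile, colname, chunksize=10000, sep=','):
        # Truncate the output so each run starts from an empty file
        open(outfile, 'w').close()

        write_header = True

        # pandas parses the header once and yields plain DataFrames,
        # so no skiprows arithmetic is needed
        for chunk in pd.read_csv(infile, sep=sep, chunksize=chunksize):
            chunk[colname] = chunk[colname].apply(
                lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
            chunk.to_csv(outfile, index=False, mode='a', header=write_header)
            write_header = False

    hash_column_in_chunks('tests/sample.csv', 'tests/output.csv', 'id')

The skiprows variant in the commit pays for its accurate progress total with one pd.read_csv call per chunk; the iterator variant opens the input only once.
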
diff --git a/csv-sampler.py b/csv-sampler.py
new file mode 100755
index 0000000..35d82db
--- /dev/null
+++ b/csv-sampler.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate a sample CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+class CsvSampler:
+    def __init__(self, args):
+        self.args = args
+        self.args.iterations = int(self.args.iterations)
+        self.args.rows_per_iteration = int(self.args.rows_per_iteration)
+
+    @staticmethod
+    def random_col(size, low=1):
+        return np.random.randint(low, size, size=size)
+
+    # Inspired by
+    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
+    def write_csv(self, write_header=True, mode='w'):
+        df = pd.DataFrame(columns=['id', 'a', 'b', 'c', 'd'])
+        df['id'] = self.random_col(self.args.rows_per_iteration)
+        df['a'] = self.random_col(self.args.rows_per_iteration)
+        df['b'] = self.random_col(self.args.rows_per_iteration)
+        df['c'] = self.random_col(self.args.rows_per_iteration)
+        df['d'] = self.random_col(self.args.rows_per_iteration)
+
+        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)
+
+    def run(self):
+        progress_bar = tqdm(total=self.args.iterations) if self.args.progress else False
+
+        # Write the first portion, including the CSV header
+        self.write_csv()
+
+        # Dispatch the remaining iterations (the first write counts as one)
+        for i in range(1, self.args.iterations):
+            self.write_csv(write_header=False, mode='a')
+
+            if hasattr(progress_bar, 'update'):
+                progress_bar.update(1)
+
+def cmdline():
+    """
+    Evaluate the command line.
+
+    :return: Command line arguments.
+    """
+
+    basename = os.path.basename(__file__)
+
+    # Parse CLI
+    #examples = "Examples:\n\t" + basename + " --no-progress \n"
+
+    epilog = ''
+    parser = argparse.ArgumentParser(description='Generate a sample CSV file.',
+                                     epilog=epilog,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,)
+
+    parser.add_argument('outfile', nargs=1, help='CSV output file name')
+
+    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
+                        type=int, help='Rows per iteration, defaults to 1000')
+
+    parser.add_argument('--iterations', dest='iterations',
+                        help='Number of iterations, defaults to 1000')
+
+    parser.add_argument('--progress', dest='progress', action='store_true',
+                        help='Enable progress bar.')
+
+    parser.add_argument('--no-progress', dest='progress', action='store_false',
+                        help='Disable progress bar.')
+
+    # Add default values and get args
+    parser.set_defaults(rows_per_iteration=1000)
+    parser.set_defaults(iterations=1000)
+    parser.set_defaults(progress=True)
+    args = parser.parse_args()
+
+    return args
+
+if __name__ == "__main__":
+    args = cmdline()
+    instance = CsvSampler(args)
+
+    instance.run()
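
For quick experiments, the generation loop above boils down to the following pattern, with the same column layout and value range as write_csv; make_sample is a hypothetical condensed helper, not part of the commit:

    import numpy as np
    import pandas as pd

    def make_sample(outfile, iterations=1000, rows_per_iteration=1000):
        for i in range(iterations):
            # One portion of random integers in [1, rows_per_iteration),
            # mirroring random_col() above
            df = pd.DataFrame({col: np.random.randint(1, rows_per_iteration,
                                                      size=rows_per_iteration)
                               for col in ['id', 'a', 'b', 'c', 'd']})
            # Header only on the first portion, then append
            df.to_csv(outfile, index=False,
                      header=(i == 0), mode='w' if i == 0 else 'a')

    make_sample('tests/sample.csv')

Writing in portions keeps each DataFrame small, which is what lets make sample build test inputs larger than available memory for exercising the chunked hasher.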