author     Silvio Rhatto <rhatto@riseup.net>  2021-01-28 21:41:53 -0300
committer  Silvio Rhatto <rhatto@riseup.net>  2021-01-28 21:41:53 -0300
commit     caac6a103f1a76a4ec4a096bb569cd7820a0ff14 (patch)
tree       1c41a36ed49e85afb95b64722632818503c2e77f
parent     ae3abe5a4c14a2e1b50aaf1f41e3225a5c34140b (diff)
Fix: improvements and tests for large files
-rw-r--r--  Makefile         |  29
-rw-r--r--  Pipfile          |   2
-rw-r--r--  Pipfile.lock     |  18
-rwxr-xr-x  bin/make-sample  |   2
-rwxr-xr-x  csv-hasher.py    |  46
-rwxr-xr-x  csv-sampler.py   | 105
6 files changed, 182 insertions(+), 20 deletions(-)
diff --git a/Makefile b/Makefile
index 16311a1..1a13c56 100644
--- a/Makefile
+++ b/Makefile
@@ -2,20 +2,37 @@
# Makefile for csv-hasher
#
+CHUNKSIZE = 10000
+CHECK_LINES = 20
+SAMPLE_ITERATIONS = 1000
+SAMPLE_ROWS_PER_ITERATION = 1000
+TESTS = tests
+COLNAME = id
+SAMPLE = $(TESTS)/sample.csv
+OUTPUT = $(TESTS)/output.csv
+
vendor:
pipenv install
sample:
- bin/make-sample 200
+ @#bin/make-sample $(SAMPLE_ITERATIONS)
+ pipenv run ./csv-sampler.py --iterations $(SAMPLE_ITERATIONS) --rows_per_iteration $(SAMPLE_ROWS_PER_ITERATION) $(SAMPLE)
test-sample:
- pipenv run ./csv-hasher.py --chunksize 5 tests/sample.csv tests/output.csv id
+ pipenv run ./csv-hasher.py --chunksize $(CHUNKSIZE) $(SAMPLE) $(OUTPUT) $(COLNAME)
show-test-output:
- head -20 tests/sample.csv
- head -20 tests/output.csv
+ head -$(CHECK_LINES) $(SAMPLE)
+ head -$(CHECK_LINES) $(OUTPUT)
+ tail -$(CHECK_LINES) $(SAMPLE)
+ tail -$(CHECK_LINES) $(OUTPUT)
+ wc -l $(SAMPLE)
+ wc -l $(OUTPUT)
+ ls -lh $(TESTS)
clean-sample:
- rm tests/*.csv
+ rm -f tests/*.csv
+
+clean: clean-sample
-test: clean-sample sample test-sample show-test-output clean-sample
+test: clean-sample sample test-sample show-test-output
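(Since the new knobs are ordinary Make variables, they can be overridden per invocation without editing the Makefile, e.g. "make test CHUNKSIZE=50000 SAMPLE_ITERATIONS=10" for a quicker run; that is standard Make behavior, not something this patch adds.)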
diff --git a/Pipfile b/Pipfile
index 30be324..756589a 100644
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,8 @@ name = "pypi"
pandas = "*"
tqdm = "*"
humanfriendly = "*"
+numpy = "*"
+Faker = "*"
[dev-packages]
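(The two new entries back the csv-sampler.py script introduced below; note the script only imports numpy directly. Faker 5.8.0 pulls in text-unidecode, which is why that package appears in the Pipfile.lock hunk that follows, and numpy gains an "index": "pypi" marker there because it is now a declared direct dependency rather than a transitive one.)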
diff --git a/Pipfile.lock b/Pipfile.lock
index cd7b1b2..56b7acc 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "fd63b54e272583b41d9a5c54abdb5c1737cf72c1d0d510a1051d25c0fd61d33e"
+ "sha256": "48063038d08edcb167714b008be15beec84ab053a15d4504cea5d4c7ca6ed321"
},
"pipfile-spec": 6,
"requires": {
@@ -16,6 +16,14 @@
]
},
"default": {
+ "faker": {
+ "hashes": [
+ "sha256:0783729c61501d52efea2967aff6e6fcb8370f0f6b5a558f2a81233642ae529a",
+ "sha256:6b2995ffff6c2b02bc5daad96f8c24c021e5bd491d9d53d31bcbd66f348181d4"
+ ],
+ "index": "pypi",
+ "version": "==5.8.0"
+ },
"humanfriendly": {
"hashes": [
"sha256:066562956639ab21ff2676d1fda0b5987e985c534fc76700a19bd54bcb81121d",
@@ -61,6 +69,7 @@
"sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827",
"sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60"
],
+ "index": "pypi",
"version": "==1.19.5"
},
"pandas": {
@@ -107,6 +116,13 @@
],
"version": "==1.15.0"
},
+ "text-unidecode": {
+ "hashes": [
+ "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8",
+ "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"
+ ],
+ "version": "==1.3"
+ },
"tqdm": {
"hashes": [
"sha256:4621f6823bab46a9cc33d48105753ccbea671b68bab2c50a9f0be23d4065cb5a",
diff --git a/bin/make-sample b/bin/make-sample
index c282a30..7d405b5 100755
--- a/bin/make-sample
+++ b/bin/make-sample
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Build a sample dataset.
+# Build a sample dataset, shell script version.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
diff --git a/csv-hasher.py b/csv-hasher.py
index fe206f8..6415c7b 100755
--- a/csv-hasher.py
+++ b/csv-hasher.py
@@ -44,16 +44,19 @@ class CsvHasher:
print('Invalid hash function ' + self.args.hashfunc)
exit (1)
- def apply_hash(self, df):
+ def apply_hash(self, df, skip=0):
"""Apply the hash function into a column from a dataframe"""
- return df[self.args.colname[0]].apply(lambda x: \
+ return df[self.args.colname[0]][skip:].apply(lambda x: \
getattr(hashlib, self.args.hashfunc)(str(x).encode('utf-8')).hexdigest())
def run_legacy(self):
"""
- Process CSV in "legacy" mode: open the input file, process and write the output in a single step.
- This won't work with CSVs larger than the available memory in the system.
+ Process CSV in "legacy" mode: open the input file, process and write
+ the output in a single step.
+
+ This won't work with CSVs larger than the available memory in the
+ system.
Thanks https://stackoverflow.com/questions/55775674/how-do-i-hash-specific-columns-from-a-csv-file
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
@@ -76,26 +79,44 @@ class CsvHasher:
Thanks https://stackoverflow.com/questions/11622652/large-persistent-dataframe-in-pandas/12193309#12193309
"""
- infile = self.args.infile[0]
+ # Shorthands
+ infile = self.args.infile[0]
+ outfile = self.args.outfile[0]
# Get number of lines in the CSV file
nlines = subprocess.check_output('wc -l %s' % infile, shell=True)
nlines = int(nlines.split()[0])
+ # Check the input file
if nlines < 2:
print('CSV file is too small.')
exit (1)
- # Read the just to get the column names
- sample_tp = pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize)
- sample = pd.concat(sample_tp, ignore_index=True)
+ # Start with an empty file
+ try:
+ with open(outfile, 'w') as f:
+ f.truncate(0)
+ except IOError:
+ print('Error writing to ' + outfile)
+ exit(1)
+
+ # Holds columns definition
+ columns = None
+
+ # Read a chunk just to get the column names
+ with pd.read_csv(self.args.infile[0], sep=self.args.sep, iterator=True, chunksize=self.args.chunksize) as sample:
+ for chunk in sample:
+ columns = chunk.columns
+ break
# Initialize progress bar
progress_bar = tqdm(total=nlines) if self.args.progress else False
+ # Controls if the header should be included
write_header = True
- for i in range(0, nlines, self.args.chunksize):
+ # Start iteration from 1 so the CSV header is skipped
+ for i in range(1, nlines, self.args.chunksize):
df = pd.read_csv(infile,
sep=self.args.sep,
header=None, # no header, define column header manually later
@@ -103,7 +124,7 @@ class CsvHasher:
skiprows=i) # skip rows that were already read
# Add column information
- df.columns = sample.columns
+ df.columns = columns
# Hashing the column
try:
@@ -113,7 +134,7 @@ class CsvHasher:
exit (1)
# Writing the new CSV output
- df.to_csv(self.args.outfile[0], index=False, mode='a', header=write_header)
+ df.to_csv(outfile, index=False, mode='a', header=write_header)
# Write the header only in the first iteration
write_header = False
@@ -148,7 +169,8 @@ def cmdline():
parser.add_argument('--sep', dest='sep', help='Separator, defaults to ","')
- parser.add_argument('--chunksize', dest='chunksize', help='Read chunks at a time, defaults to 1M, supports human-readable notation')
+ parser.add_argument('--chunksize', dest='chunksize',
+ help='Read chunks at a time, defaults to 1M, supports human-readable notation')
parser.add_argument('--hashfunc', dest='hashfunc', help='Hash function, defaults to sha256')
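(Stepping back from the patch context: the loop above re-reads the input at a moving skiprows offset so that only one chunk is ever resident in memory, hashing the target column and appending to the output as it goes. Below is a minimal sketch of the same idea using pandas' built-in chunk iterator instead of the manual skiprows bookkeeping; the paths, column name, and chunk size are the test defaults from the Makefile above, not part of csv-hasher.py itself.)

    # Sketch: hash one column of a large CSV chunk-by-chunk.
    import hashlib
    import pandas as pd

    infile, outfile, colname = 'tests/sample.csv', 'tests/output.csv', 'id'
    write_header = True

    for chunk in pd.read_csv(infile, chunksize=10000):
        # Replace the column in place with its sha256 hex digest
        chunk[colname] = chunk[colname].apply(
            lambda x: hashlib.sha256(str(x).encode('utf-8')).hexdigest())
        # 'w' truncates the output on the first chunk, 'a' appends afterwards
        chunk.to_csv(outfile, index=False, mode='w' if write_header else 'a',
                     header=write_header)
        write_header = False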
diff --git a/csv-sampler.py b/csv-sampler.py
new file mode 100755
index 0000000..35d82db
--- /dev/null
+++ b/csv-sampler.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate a sample CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+class CsvSampler:
+ def __init__(self, args):
+ self.args = args
+ self.args.iterations = int(self.args.iterations)
+ self.args.rows_per_iteration = int(self.args.rows_per_iteration)
+
+ @staticmethod
+ def random_col(size, low=1):
+ return np.random.randint(low, size, size=size)
+
+ # Inspired by
+ # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
+ def write_csv(self, write_header=True, mode='w'):
+ df = pd.DataFrame(columns=['id', 'a', 'b', 'c', 'd'])
+ df['id'] = self.random_col(self.args.rows_per_iteration)
+ df['a'] = self.random_col(self.args.rows_per_iteration)
+ df['b'] = self.random_col(self.args.rows_per_iteration)
+ df['c'] = self.random_col(self.args.rows_per_iteration)
+ df['d'] = self.random_col(self.args.rows_per_iteration)
+
+ df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)
+
+ def run(self):
+ progress_bar = tqdm(total=self.args.iterations) if self.args.progress else False
+
+ # Write the first portion
+ self.write_csv()
+
+ # Dispatch
+ for i in range(0, self.args.iterations):
+ self.write_csv(write_header=False, mode='a')
+
+ if hasattr(progress_bar, 'update'):
+ progress_bar.update(1)
+
+def cmdline():
+ """
+ Evaluate the command line.
+
+ :return: Command line arguments.
+ """
+
+ basename = os.path.basename(__file__)
+
+ # Parse CLI
+ #examples = "Examples:\n\t" + basename + " --no-progress \n"
+
+ epilog = ''
+ parser = argparse.ArgumentParser(description='Generates a sample CSV file.',
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter,)
+
+ parser.add_argument('outfile', nargs=1, help='CSV output file name')
+
+ parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
+ type=int, help='Rows per iteration, defaults to 1000')
+
+ parser.add_argument('--iterations', dest='iterations',
+ help='Number of iterations, defaults to 1000')
+
+ parser.add_argument('--progress', dest='progress', action='store_true',
+ help='Enable progress bar.')
+
+ parser.add_argument('--no-progress', dest='progress', action='store_false',
+ help='Disable progress bar.')
+
+ # Add default values and get args
+ parser.set_defaults(rows_per_iteration=1000)
+ parser.set_defaults(iterations=1000)
+ parser.set_defaults(progress=True)
+ args = parser.parse_args()
+
+ return args
+
+if __name__ == "__main__":
+ args = cmdline()
+ instance = CsvSampler(args)
+
+ instance.run()
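(With the Makefile defaults above, the generator is invoked as

    pipenv run ./csv-sampler.py --iterations 1000 --rows_per_iteration 1000 tests/sample.csv

which writes one initial batch plus 1000 appended batches of 1000 rows each, i.e. roughly 1,001,000 data rows, large enough to exercise the chunked code path in csv-hasher.py.)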