aboutsummaryrefslogtreecommitdiff
path: root/csv-sampler.py
diff options
context:
space:
mode:
author Silvio Rhatto <rhatto@riseup.net> 2021-01-28 21:41:53 -0300
committer Silvio Rhatto <rhatto@riseup.net> 2021-01-28 21:41:53 -0300
commit caac6a103f1a76a4ec4a096bb569cd7820a0ff14 (patch)
tree 1c41a36ed49e85afb95b64722632818503c2e77f /csv-sampler.py
parent ae3abe5a4c14a2e1b50aaf1f41e3225a5c34140b (diff)
download csv-hasher-caac6a103f1a76a4ec4a096bb569cd7820a0ff14.tar.gz
csv-hasher-caac6a103f1a76a4ec4a096bb569cd7820a0ff14.tar.bz2
Fix: improvements and tests for large files
Diffstat (limited to 'csv-sampler.py')
-rwxr-xr-x csv-sampler.py 105
1 files changed, 105 insertions, 0 deletions
diff --git a/csv-sampler.py b/csv-sampler.py
new file mode 100755
index 0000000..35d82db
--- /dev/null
+++ b/csv-sampler.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate a sample CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+class CsvSampler:
+    """
+    Generate a CSV file filled with columns of random integers.
+
+    The file is produced in chunks of ``rows_per_iteration`` rows; each
+    chunk is built and written to disk separately.
+    """
+
+    # Column names of the generated file, in output order.
+    COLUMNS = ('id', 'a', 'b', 'c', 'd')
+
+    def __init__(self, args):
+        """
+        Store the parsed arguments, coercing the numeric ones to int.
+
+        :param args: Namespace providing ``outfile`` (one-element list),
+                     ``iterations``, ``rows_per_iteration`` and ``progress``.
+        :raises ValueError: If a numeric argument cannot be converted to int.
+        """
+        self.args = args
+        self.args.iterations = int(self.args.iterations)
+        self.args.rows_per_iteration = int(self.args.rows_per_iteration)
+
+    @staticmethod
+    def random_col(size, low=1):
+        """
+        Build a column of random integers.
+
+        :param size: Number of values; also used as the exclusive upper bound.
+        :param low:  Inclusive lower bound of the generated values.
+        :return: NumPy integer array with ``size`` elements.
+        """
+        # randint() rejects an empty range (low >= high), which would happen
+        # for size <= low; keep at least ``low`` itself available.
+        high = max(size, low + 1)
+
+        return np.random.randint(low, high, size=size)
+
+    # Inspired by
+    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
+    def write_csv(self, write_header=True, mode='w'):
+        """
+        Write one chunk of ``rows_per_iteration`` random rows to the outfile.
+
+        :param write_header: Whether the column header line is written.
+        :param mode: File mode handed to pandas ('w' to overwrite, 'a' to append).
+        """
+        rows = self.args.rows_per_iteration
+        df = pd.DataFrame({name: self.random_col(rows) for name in self.COLUMNS})
+
+        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)
+
+    def run(self):
+        """
+        Write the whole sample file.
+
+        A first chunk (with header) is written, followed by ``iterations``
+        appended chunks, so the file ends up holding
+        ``(iterations + 1) * rows_per_iteration`` data rows.
+        """
+        progress_bar = tqdm(total=self.args.iterations) if self.args.progress else None
+
+        try:
+            # Write the first portion, creating or truncating the file
+            self.write_csv()
+
+            # Append the remaining portions
+            for _ in range(self.args.iterations):
+                self.write_csv(write_header=False, mode='a')
+
+                if progress_bar is not None:
+                    progress_bar.update(1)
+        finally:
+            # Release the progress bar even if writing fails midway
+            if progress_bar is not None:
+                progress_bar.close()
+
+
+def cmdline(argv=None):
+    """
+    Evaluate the command line.
+
+    :param argv: Optional list of argument strings to parse; when None,
+                 argparse reads the arguments from ``sys.argv``.
+    :return: Command line arguments.
+    """
+
+    parser = argparse.ArgumentParser(description='Generate a sample CSV file.',
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,)
+
+    parser.add_argument('outfile', nargs=1, help='CSV output file name')
+
+    # Both counters are parsed as integers so that argparse rejects
+    # non-numeric values with a usage message.
+    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
+                        type=int, default=1000,
+                        help='Rows per iteration, defaults to 1000')
+
+    parser.add_argument('--iterations', dest='iterations',
+                        type=int, default=1000,
+                        help='Number of iterations, defaults to 1000')
+
+    parser.add_argument('--progress', dest='progress', action='store_true',
+                        help='Enable progress bar.')
+
+    parser.add_argument('--no-progress', dest='progress', action='store_false',
+                        help='Disable progress bar.')
+
+    # The two progress flags share one destination, so its default is set here
+    parser.set_defaults(progress=True)
+
+    return parser.parse_args(argv)
+
+
+if __name__ == "__main__":
+    # Parse the command line, then generate the sample file
+    CsvSampler(cmdline()).run()