From caac6a103f1a76a4ec4a096bb569cd7820a0ff14 Mon Sep 17 00:00:00 2001
From: Silvio Rhatto <rhatto@riseup.net>
Date: Thu, 28 Jan 2021 21:41:53 -0300
Subject: Fix: improvements and tests for large files

---
 csv-sampler.py | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100755 csv-sampler.py

(limited to 'csv-sampler.py')

diff --git a/csv-sampler.py b/csv-sampler.py
new file mode 100755
index 0000000..35d82db
--- /dev/null
+++ b/csv-sampler.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Generate a sample CSV file.
+#
+# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import argparse
+import numpy  as np
+import pandas as pd
+from tqdm import tqdm
+
+class CsvSampler:
+    """Write a CSV file of random integer columns, one chunk per iteration."""
+
+    def __init__(self, args):
+        # Coerce both counters to int in case they arrive as strings.
+        self.args                    = args
+        self.args.iterations         = int(self.args.iterations)
+        self.args.rows_per_iteration = int(self.args.rows_per_iteration)
+
+    @staticmethod
+    def random_col(size, low=1):
+        """Return `size` random integers in [low, max(size, low + 1))."""
+        # randint() requires low < high; the clamp keeps sizes 0 and 1 valid.
+        return np.random.randint(low, max(size, low + 1), size=size)
+
+    # Inspired by
+    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
+    def write_csv(self, write_header=True, mode='w'):
+        """Write one chunk of `rows_per_iteration` random rows to the outfile."""
+        rows = self.args.rows_per_iteration
+        df   = pd.DataFrame({col: self.random_col(rows) for col in ('id', 'a', 'b', 'c', 'd')})
+        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)
+
+    def run(self):
+        """Write exactly `iterations` chunks; only the first carries the header."""
+        progress_bar = tqdm(total=self.args.iterations) if self.args.progress else None
+
+        for i in range(self.args.iterations):
+            # The first chunk creates/truncates the file; the rest are appended.
+            self.write_csv(write_header=(i == 0), mode='w' if i == 0 else 'a')
+            if progress_bar is not None:
+                progress_bar.update(1)
+
+        if progress_bar is not None:
+            progress_bar.close()
+def cmdline():
+    """
+    Evaluate the command line.
+
+    Both counters are parsed as integers and default to 1000; the progress
+    bar is enabled unless --no-progress is given.
+
+    :return: Command line arguments; `outfile` is a one-element list
+             because it is declared with nargs=1.
+    """
+
+    # Parse CLI
+    parser = argparse.ArgumentParser(description='Generate a sample CSV file.',
+                                     formatter_class=argparse.RawDescriptionHelpFormatter,)
+
+    parser.add_argument('outfile',  nargs=1, help='CSV output file name')
+
+    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
+            type=int, help='Rows per iteration, defaults to 1000')
+
+    # type=int matches --rows_per_iteration, so both counters arrive as numbers.
+    parser.add_argument('--iterations', dest='iterations',
+            type=int, help='Number of iterations, defaults to 1000')
+
+    # Both flags share one dest, so the last one given on the command line wins.
+    parser.add_argument('--progress', dest='progress', action='store_true',
+                        help='Enable progress bar.')
+
+    parser.add_argument('--no-progress', dest='progress', action='store_false',
+                        help='Disable progress bar.')
+
+    # Add default values and get args
+    parser.set_defaults(rows_per_iteration=1000)
+    parser.set_defaults(iterations=1000)
+    parser.set_defaults(progress=True)
+    args = parser.parse_args()
+
+    return args
+
+if __name__ == "__main__":
+    # Script entry point: parse the command line, then write the CSV file.
+    cli_args = cmdline()
+
+    CsvSampler(cli_args).run()
-- 
cgit v1.2.3