From caac6a103f1a76a4ec4a096bb569cd7820a0ff14 Mon Sep 17 00:00:00 2001 From: Silvio Rhatto <rhatto@riseup.net> Date: Thu, 28 Jan 2021 21:41:53 -0300 Subject: Fix: improvements and tests for large files --- csv-sampler.py | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100755 csv-sampler.py (limited to 'csv-sampler.py') diff --git a/csv-sampler.py b/csv-sampler.py new file mode 100755 index 0000000..35d82db --- /dev/null +++ b/csv-sampler.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Generate a sample CSV file. +# +# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Generate a sample CSV file filled with columns of random integers."""

import os
import argparse
import numpy as np
import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic: keep the generator usable when
    # tqdm is not installed (the bar is then silently skipped).
    tqdm = None


class CsvSampler:
    """Write a CSV file of random integers, one fixed-size chunk at a time.

    The file is produced in ``args.iterations`` chunks of
    ``args.rows_per_iteration`` rows each, so only one chunk is held in
    memory at any time.
    """

    # Column names, in the order they appear in the output file.
    COLUMNS = ('id', 'a', 'b', 'c', 'd')

    def __init__(self, args):
        """Store *args* and normalise its numeric options to ``int``.

        :param args: Namespace providing ``outfile`` (a one-element list),
                     ``iterations``, ``rows_per_iteration`` and ``progress``.
                     The two numeric attributes are coerced in place.
        :raises ValueError: If ``iterations`` or ``rows_per_iteration`` is
                            not an integer greater than or equal to 1.
        """
        self.args = args
        self.args.iterations = int(self.args.iterations)
        self.args.rows_per_iteration = int(self.args.rows_per_iteration)

        if self.args.iterations < 1:
            raise ValueError('iterations must be at least 1')

        if self.args.rows_per_iteration < 1:
            raise ValueError('rows_per_iteration must be at least 1')

    @staticmethod
    def random_col(size, low=1):
        """Return *size* random integers drawn from ``[low, size)``.

        ``numpy.random.randint`` rejects an empty range, so when ``size``
        does not exceed ``low`` the upper bound is widened to ``low + 1``
        and every value equals ``low``.

        :param size: Number of values; also the exclusive upper bound.
        :param low:  Inclusive lower bound.
        :return: A NumPy integer array of length *size*.
        """
        upper = max(size, low + 1)

        return np.random.randint(low, upper, size=size)

    # Inspired by
    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
    def write_csv(self, write_header=True, mode='w'):
        """Write one chunk of ``rows_per_iteration`` random rows to the output.

        :param write_header: Emit the column header line before the rows.
        :param mode: File mode handed to ``DataFrame.to_csv`` (``'w'``
                     truncates the file, ``'a'`` appends to it).
        """
        rows = self.args.rows_per_iteration
        data = {name: self.random_col(rows) for name in self.COLUMNS}
        df = pd.DataFrame(data, columns=list(self.COLUMNS))

        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)

    def run(self):
        """Write exactly ``iterations`` chunks, the first one with the header.

        The first chunk truncates the output file and writes the header;
        every later chunk is appended without one. The progress bar, when
        enabled and available, is closed even if writing fails.
        """
        total = self.args.iterations
        show_bar = self.args.progress and tqdm is not None
        progress_bar = tqdm(total=total) if show_bar else None

        try:
            for index in range(total):
                first = index == 0
                self.write_csv(write_header=first, mode='w' if first else 'a')

                if progress_bar is not None:
                    progress_bar.update(1)
        finally:
            if progress_bar is not None:
                progress_bar.close()


def _positive_int(value):
    """argparse ``type`` callback accepting only integers >= 1."""
    try:
        number = int(value)
    except (TypeError, ValueError):
        raise argparse.ArgumentTypeError('invalid integer value: {!r}'.format(value))

    if number < 1:
        raise argparse.ArgumentTypeError('must be at least 1, got {}'.format(number))

    return number


def cmdline(argv=None):
    """
    Evaluate the command line.

    :param argv: Optional list of argument strings; when ``None`` argparse
                 reads ``sys.argv[1:]``.
    :return: Command line arguments.
    """

    parser = argparse.ArgumentParser(description='Generate a sample CSV file.',
                                     formatter_class=argparse.RawDescriptionHelpFormatter,)

    parser.add_argument('outfile', nargs=1, help='CSV output file name')

    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
                        type=_positive_int,
                        help='Rows per iteration, defaults to 1000')

    parser.add_argument('--iterations', dest='iterations',
                        type=_positive_int,
                        help='Number of iterations, defaults to 1000')

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    # Add default values and get args
    parser.set_defaults(rows_per_iteration=1000)
    parser.set_defaults(iterations=1000)
    parser.set_defaults(progress=True)

    return parser.parse_args(argv)


if __name__ == "__main__":
    args = cmdline()
    instance = CsvSampler(args)

    instance.run()