#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Generate a sample CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see https://www.gnu.org/licenses/.

import os
import sys
import argparse

import numpy as np
import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only drives the optional progress bar; CSV generation itself
    # does not need it, so a missing package must not break the script.
    tqdm = None


class CsvSampler:
    """Write a CSV file of random integer columns, one chunk at a time.

    The instance is configured by an ``argparse.Namespace``-like object
    exposing ``outfile`` (a one-element list holding the output path),
    ``iterations``, ``rows_per_iteration`` and ``progress``.
    """

    # Column names of the generated file, in output order.
    COLUMNS = ('id', 'a', 'b', 'c', 'd')

    def __init__(self, args):
        """Store *args*, normalizing the numeric options to ``int``.

        :param args: Parsed command line arguments (mutated in place).
        :raises ValueError: If a numeric option cannot be converted.
        """
        self.args = args
        self.args.iterations = int(self.args.iterations)
        self.args.rows_per_iteration = int(self.args.rows_per_iteration)

    @staticmethod
    def random_col(size, low=1):
        """Return *size* random integers for a single column.

        Values are drawn from ``[low, size)``.  When ``size <= low`` that
        interval would be empty and ``numpy`` would raise, so the upper
        bound is widened to ``low + 1`` (every value is then ``low``).

        :param size: Number of values to generate.
        :param low: Smallest value that may be drawn.
        :return: A one-dimensional ``numpy`` array with *size* integers.
        """
        high = max(size, low + 1)

        return np.random.randint(low, high, size=size)

    # Inspired by
    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
    def write_csv(self, write_header=True, mode='w'):
        """Generate one chunk of random rows and write it to the output file.

        :param write_header: Whether to emit the column header line.
        :param mode: File mode handed to ``DataFrame.to_csv`` (``'w'`` to
                     start the file, ``'a'`` to append to it).
        """
        rows = self.args.rows_per_iteration
        frame = pd.DataFrame({name: self.random_col(rows) for name in self.COLUMNS})

        frame.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)

    def _progress_bar(self, total):
        """Return a tqdm bar for *total* steps, or ``None`` when disabled."""
        if not self.args.progress:
            return None

        if tqdm is None:
            print('tqdm is not installed, progress bar disabled.', file=sys.stderr)
            return None

        return tqdm(total=total)

    def run(self):
        """Write the whole sample file.

        Exactly ``iterations`` chunks of ``rows_per_iteration`` rows are
        written; the first one creates the file and carries the header, the
        others are appended.  At least one chunk is always written so that
        the output file exists even when ``iterations`` is below one.
        """
        chunks = max(self.args.iterations, 1)
        progress_bar = self._progress_bar(chunks)

        try:
            for index in range(chunks):
                first = index == 0

                self.write_csv(write_header=first, mode='w' if first else 'a')

                if progress_bar is not None:
                    progress_bar.update(1)
        finally:
            # Release the bar even if writing fails midway.
            if progress_bar is not None:
                progress_bar.close()


def cmdline(argv=None):
    """
    Evaluate the command line.

    :param argv: Optional list of arguments to parse; when ``None`` the
                 arguments are read from ``sys.argv``.
    :return: Command line arguments.
    """

    # Defaults
    rows_per_iteration = 1024
    iterations = 1024
    progress = True

    # Parse CLI
    parser = argparse.ArgumentParser(
        description='Generate a sample CSV file.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('outfile', nargs=1, help='CSV output file name')

    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
                        type=int, default=rows_per_iteration,
                        help='Rows per iteration, defaults to ' + str(rows_per_iteration))

    parser.add_argument('--iterations', dest='iterations',
                        type=int, default=iterations,
                        help='Number of iterations, defaults to ' + str(iterations))

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar, defaults to ' + str(progress))

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    # Both flags share one destination, so its default is set only once here.
    parser.set_defaults(progress=progress)

    return parser.parse_args(argv)


if __name__ == "__main__":
    args = cmdline()
    instance = CsvSampler(args)

    instance.run()