1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Generate a sample CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
class CsvSampler:
    """
    Generate a sample CSV file filled with random integer columns.

    The file is produced in chunks: every iteration builds a DataFrame with
    ``rows_per_iteration`` rows and writes it to the output file, so the
    whole data set never has to be held in memory at once.
    """

    # Column names of the generated file, in output order.
    COLUMNS = ('id', 'a', 'b', 'c', 'd')

    def __init__(self, args):
        """
        Store the parsed arguments and normalise the numeric ones.

        :param args: Namespace-like object providing ``outfile`` (a sequence
                     whose first item is the output path), ``iterations``,
                     ``rows_per_iteration`` and ``progress``.
        """
        self.args = args

        # Both counters may arrive as strings; make sure they are integers.
        self.args.iterations = int(self.args.iterations)
        self.args.rows_per_iteration = int(self.args.rows_per_iteration)

    @staticmethod
    def random_col(size, low=1):
        """
        Build one column of random integers.

        :param size: Number of values to generate; also used as the
                     exclusive upper bound of the values.
        :param low:  Inclusive lower bound of the values.
        :return: NumPy array with ``size`` integers drawn from
                 ``[low, max(size, low + 1))``.
        """
        # randint() rejects low >= high, which would happen for tiny sizes
        # (e.g. a single row); keep the interval non-empty in that case.
        high = max(size, low + 1)

        return np.random.randint(low, high, size=size)

    # Inspired by
    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
    def write_csv(self, write_header=True, mode='w'):
        """
        Write one chunk of ``rows_per_iteration`` random rows to the outfile.

        :param write_header: Whether the column names are written first.
        :param mode: File mode handed to ``DataFrame.to_csv`` (``'w'`` to
                     truncate, ``'a'`` to append).
        """
        rows = self.args.rows_per_iteration
        df = pd.DataFrame({name: self.random_col(rows) for name in self.COLUMNS})

        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)

    def run(self):
        """
        Write ``iterations`` chunks to the output file.

        The first chunk truncates the file and carries the header; the
        remaining ones are appended. With zero (or negative) iterations a
        header-only file is written.
        """
        iterations = self.args.iterations

        if iterations <= 0:
            empty = pd.DataFrame(columns=list(self.COLUMNS))
            empty.to_csv(self.args.outfile[0], index=False)
            return

        progress_bar = tqdm(total=iterations) if self.args.progress else None

        try:
            for i in range(iterations):
                first = i == 0

                # Only the first chunk creates the file and emits the header,
                # so exactly `iterations` chunks end up in the file.
                self.write_csv(write_header=first, mode='w' if first else 'a')

                if progress_bar is not None:
                    progress_bar.update(1)
        finally:
            # Release the progress bar even if a write fails.
            if progress_bar is not None:
                progress_bar.close()
def cmdline(argv=None):
    """
    Evaluate the command line.

    :param argv: Optional list of argument strings to parse. When ``None``
                 the arguments are taken from ``sys.argv``, as before.
    :return: Command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Generate a sample CSV file.',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument('outfile', nargs=1, help='CSV output file name')

    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
                        type=int, default=1000,
                        help='Rows per iteration, defaults to 1000')

    # Parsed as an integer so invalid values are rejected by argparse with a
    # usage message instead of failing later on.
    parser.add_argument('--iterations', dest='iterations',
                        type=int, default=1000,
                        help='Number of iterations, defaults to 1000')

    # Both flags share one destination; the progress bar is on by default.
    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar.')

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    parser.set_defaults(progress=True)

    return parser.parse_args(argv)
# Script entry point: parse the command line, then generate the sample file.
if __name__ == "__main__":
    CsvSampler(cmdline()).run()
|