aboutsummaryrefslogtreecommitdiff
path: root/csv-sampler.py
blob: fa861a8ba3dbfe6e4e4b860b56a6cf2d2e921739 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Generate a sample CSV file.
#
# Copyright (C) 2021 Silvio Rhatto - rhatto@riseup.net
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import argparse
import numpy  as np
import pandas as pd
from tqdm import tqdm

class CsvSampler:
    """
    Generate a sample CSV file filled with random integer columns.

    The file is produced in chunks of ``rows_per_iteration`` rows so that
    arbitrarily large samples can be written without holding them in memory.
    """

    # Column names of the generated CSV, in output order.
    COLUMNS = ('id', 'a', 'b', 'c', 'd')

    def __init__(self, args):
        """
        Store the settings and normalize the numeric ones to ``int``.

        :param args: Namespace-like object providing ``outfile`` (a sequence
                     whose first item is the output path), ``iterations``,
                     ``rows_per_iteration`` and ``progress``.
        """
        self.args                    = args
        self.args.iterations         = int(self.args.iterations)
        self.args.rows_per_iteration = int(self.args.rows_per_iteration)

    @staticmethod
    def random_col(size, low=1):
        """
        Build one column of random integers.

        :param size: Number of values to generate; also used as the exclusive
                     upper bound of the generated values.
        :param low:  Inclusive lower bound of the generated values.
        :return: NumPy array with ``size`` integers drawn from ``[low, size)``.
        """
        # numpy requires low < high; clamp the upper bound so that tiny sizes
        # (0 or 1 row) yield a valid column instead of raising ValueError.
        high = max(size, low + 1)

        return np.random.randint(low, high, size=size)

    # Inspired by
    # https://www.caktusgroup.com/blog/2020/04/15/quick-guide-generating-fake-data-with-pandas/
    def write_csv(self, write_header=True, mode='w'):
        """
        Write one chunk of ``rows_per_iteration`` random rows to the output file.

        :param write_header: Whether to emit the CSV header line.
        :param mode:         File mode handed to pandas: ``'w'`` truncates the
                             file, ``'a'`` appends to it.
        """
        rows = self.args.rows_per_iteration
        df   = pd.DataFrame({name: self.random_col(rows) for name in self.COLUMNS},
                            columns=list(self.COLUMNS))

        df.to_csv(self.args.outfile[0], index=False, header=write_header, mode=mode)

    def run(self):
        """
        Write the whole sample file.

        A first chunk is written together with the header, then ``iterations``
        further chunks are appended, so the file ends up with
        ``(iterations + 1) * rows_per_iteration`` data rows.
        """
        progress_bar = tqdm(total=self.args.iterations) if self.args.progress else None

        try:
            # Write the first portion (creates/truncates the file, adds the header)
            self.write_csv()

            # Dispatch
            for _ in range(self.args.iterations):
                self.write_csv(write_header=False, mode='a')

                if progress_bar is not None:
                    progress_bar.update(1)
        finally:
            # Always release the progress bar so the terminal is left clean,
            # even when writing fails midway.
            if progress_bar is not None:
                progress_bar.close()

def cmdline(argv=None):
    """
    Evaluate the command line.

    :param argv: Optional list of argument strings to parse; when ``None``
                 the arguments are taken from ``sys.argv``.
    :return: Command line arguments.
    """

    # Defaults
    rows_per_iteration = 1024
    iterations         = 1024
    progress           = True

    # Parse CLI
    parser = argparse.ArgumentParser(description='Generate a sample CSV file.',
                                     formatter_class=argparse.RawDescriptionHelpFormatter,)

    parser.add_argument('outfile', nargs=1, help='CSV output file name')

    parser.add_argument('--rows_per_iteration', dest='rows_per_iteration',
            type=int, default=rows_per_iteration,
            help='Rows per iteration, defaults to ' + str(rows_per_iteration))

    # type=int so that invalid values are rejected by argparse with a clear
    # usage error instead of failing later inside the sampler.
    parser.add_argument('--iterations', dest='iterations',
            type=int, default=iterations,
            help='Number of iterations, defaults to ' + str(iterations))

    parser.add_argument('--progress', dest='progress', action='store_true',
                        help='Enable progress bar, defaults to ' + str(progress))

    parser.add_argument('--no-progress', dest='progress', action='store_false',
                        help='Disable progress bar.')

    # Both progress flags share the same destination, so the default has to
    # be set on the parser rather than on either individual argument.
    parser.set_defaults(progress=progress)

    return parser.parse_args(argv)

if __name__ == "__main__":
    # Parse the command line, build the sampler from it and write the file.
    CsvSampler(cmdline()).run()