From 3932b2640920ded3f5966937899626bea2cb37b7 Mon Sep 17 00:00:00 2001 From: Silvio Rhatto Date: Thu, 16 May 2019 08:04:29 -0300 Subject: Enhanced output with tqdm --- ckandumper | 111 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 54 insertions(+), 57 deletions(-) diff --git a/ckandumper b/ckandumper index 9f19f8c..979db18 100755 --- a/ckandumper +++ b/ckandumper @@ -21,49 +21,23 @@ # Dependencies import asyncio import argparse -import curses import sys, os, json from urllib.parse import urlencode - -class StatusLine: - """Handle printing in a given status line""" - - def __init__(self): - self.stdscr = curses.initscr() - - curses.noecho() - curses.cbreak() - - def print(self, line, content): - """Print content in a specific status line""" - height, width = self.stdscr.getmaxyx() - - if line < height: - self.stdscr.addstr(line, 0, ' ' * width) - self.stdscr.addstr(line, 0, content) - self.stdscr.refresh() - - def clear(self, line): - self.print(line, ' ') - - def teardown(self): - """Finish status lines""" - curses.echo() - curses.nocbreak() - curses.endwin() +from tqdm import tqdm class DownloadMultiple: """Downloads multiple files simultaneously with error logging and fancy output""" wget = '/usr/bin/wget' - def __init__(self, limit_rate, limit_concurrent = 20): + def __init__(self, limit_rate, limit_concurrent = 20, progress = True, debug = False): if not os.path.exists(self.wget): raise FileNotFoundError('Wget not found in your path; please install it first.') self.limit_rate = limit_rate self.limit_concurrent = asyncio.Semaphore(int(limit_concurrent)) - self.status = StatusLine() + self.progress = progress + self.debug = debug def ensuredir(self, dest): """Ensures that the destination folder exists""" @@ -81,7 +55,9 @@ class DownloadMultiple: """ async with semaphore: - #print('Downloading ' + url + '...') + if self.debug: + print('Downloading ' + url + '...') + self.ensuredir(os.path.dirname(local_filename)); # Other opts: -q --show-progress @@ -92,15 +68,21 @@ class DownloadMultiple: stdout, stderr = await proc.communicate() - #if stdout: - # self.status.print(semaphore._value, f'[stdout] {stdout.decode()}') + if stdout: + if self.debug: + print(f'[stdout] {url} {stdout.decode()}') if stderr: - self.status.print(semaphore._value, f'[stderr] {stderr.decode()} {url}') + if self.debug: + print(f'[stderr] {url} {stderr.decode()}') - #self.status.print(semaphore._value, f'[{cmd!r} exited with {proc.returncode}]') + if self.debug: + print(f'[{cmd!r} exited with {proc.returncode}]') - self.status.clear(semaphore._value) + if hasattr(self.bar, 'update'): + self.bar.update(1) + + return proc.returncode async def gather(self, filepairs): """Gather all files to be downloaded @@ -116,8 +98,10 @@ class DownloadMultiple: await asyncio.gather(*jobs) def get(self, filepairs): - loop = asyncio.get_event_loop() - #loop.set_debug(True) + self.bar = tqdm(total=len(filepairs)) if self.progress and len(filepairs) > 1 else False + loop = asyncio.get_event_loop() + + loop.set_debug(self.debug) loop.run_until_complete(self.gather(filepairs)) class CkanDumper: @@ -133,6 +117,12 @@ class CkanDumper: self.tag_list = '/api/3/action/tag_list' self.tag_show = '/api/3/action/tag_show?' + if args.debug != None: + self.debug = args.debug + + if args.progress != None: + self.progress = args.progress + if args.limit_rate != None: self.limit_rate = '--limit-rate=' + args.limit_rate else: @@ -143,7 +133,7 @@ class CkanDumper: else: self.limit_concurrent = '20' - self.download = DownloadMultiple(self.limit_rate, self.limit_concurrent) + self.download = DownloadMultiple(self.limit_rate, self.limit_concurrent, self.progress, self.debug) def load_json(self, file): """Loads a file with contents serialized as JSON""" @@ -153,12 +143,14 @@ class CkanDumper: descriptor.close() return data + def iterate(self, iter): + if self.progress == True: + return tqdm(iter) + else: + return iter + def dump(self): """Downloads all content listed in a CKAN repository""" - # Switch to dest folder - #self.ensuredir(self.dest) - #os.chdir(self.dest) - package_list = self.dest + os.sep + 'package_list.json' group_list = self.dest + os.sep + 'group_list.json' tag_list = self.dest + os.sep + 'tag_list.json' @@ -166,17 +158,17 @@ class CkanDumper: # # Groups # + print(f'Downloading {self.url}{self.group_list}...') self.download.get([[self.url + self.group_list, group_list]]) groups = self.load_json(group_list) group_downloads = [] + print(f'Downloading each group info...') for group in groups['result']: group_folder = self.dest + os.sep + 'groups' + os.sep + group group_file = group_folder + os.sep + 'group.json' - #print("Downloading " + self.url + self.group_show + 'id=' + group + '...') - #self.ensuredir(group_folder) group_downloads.append([self.url + self.group_show + urlencode({ 'id': group }, False, '', 'utf-8'), group_file]) self.download.get(group_downloads) @@ -184,17 +176,17 @@ class CkanDumper: # # Tags # + print(f'Downloading {self.url}{self.tag_list}...') self.download.get([[self.url + self.tag_list, tag_list]]) tags = self.load_json(tag_list) tags_downloads = [] + print(f'Downloading each tag info...') for tag in tags['result']: tag_folder = self.dest + os.sep + 'tags' + os.sep + tag tag_file = tag_folder + os.sep + 'tag.json' - #print("Downloading " + self.url + self.tag_show + 'id=' + tag + '...') - #self.ensuredir(tag_folder) tags_downloads.append([self.url + self.tag_show + urlencode({ 'id': tag }, False, '', 'utf-8'), tag_file]) self.download.get(tags_downloads) @@ -202,17 +194,18 @@ class CkanDumper: # # Packages # + print(f'Downloading {self.url}{self.package_list}...') self.download.get([[self.url + self.package_list, package_list]]) packages = self.load_json(package_list) packages_downloads = [] + print(f'Downloading each package info...') + for package in packages['result']: package_folder = self.dest + os.sep + 'packages' + os.sep + package package_file = package_folder + os.sep + 'package.json' - #print("Downloading " + self.url + self.package_show + 'id=' + package + '...') - #self.ensuredir(package_folder + os.sep + 'data') packages_downloads.append([self.url + self.package_show + urlencode({ 'id': package }, False, '', 'utf-8'), package_file]) self.download.get(packages_downloads) @@ -226,6 +219,8 @@ class CkanDumper: package_file = package_folder + os.sep + 'package.json' contents = self.load_json(package_file) + print(f'Downloading contents of package {package}...') + for resource in contents['result']['resources']: #if resource['name'] != None: # name = resource['name'] @@ -233,7 +228,6 @@ class CkanDumper: # name = resource['id'] name = resource['id'] - if resource['format'] != None: format = '.' + resource['format'].lower() else: @@ -245,16 +239,19 @@ class CkanDumper: self.download.get(package_downloads) - # Run only once during development - #return - if __name__ == "__main__": # Parse CLI parser = argparse.ArgumentParser(description='Dump CKAN metadata and datasets.') - parser.add_argument('url', nargs='+', help='CKAN instance URL') - parser.add_argument('dest', nargs='+', help='Destination folder') - parser.add_argument("--limit-rate", help="Limit the download speed to amount bytes per second, per download") - parser.add_argument("--limit-concurrent", help="Limit the total concurrent downloads") + parser.add_argument('url', nargs='+', help='CKAN instance URL') + parser.add_argument('dest', nargs='+', help='Destination folder') + parser.add_argument("--limit-rate", help="Limit the download speed to amount bytes per second, per download") + parser.add_argument("--limit-concurrent", help="Limit the total concurrent downloads") + parser.add_argument('--debug', dest='debug', action='store_true', help="Enable debug") + parser.add_argument('--no-debug', dest='debug', action='store_false', help="Disable debug") + parser.add_argument('--progress', dest='progress', action='store_true', help="Enable progress") + parser.add_argument('--no-progress', dest='progress', action='store_false', help="Disable progress") + parser.set_defaults(debug=False) + parser.set_defaults(progress=True) args = parser.parse_args() # Dispatch -- cgit v1.2.3