diff options
-rwxr-xr-x | ckandumper | 224 |
1 files changed, 113 insertions, 111 deletions
import sys
import os
import subprocess
import json
from urllib.parse import urlencode

try:
    import pycurl
except ImportError:
    # NOTE(review): nothing in the visible code uses pycurl (transfers are
    # delegated to wget), so its absence is tolerated here -- confirm that
    # no other part of the file depends on it.
    pycurl = None


class ckandumper:
    """Dumps CKAN data: metadata plus entire datasets.

    The dump is written below the destination folder as:

        group_list.json, tag_list.json, package_list.json
        groups/<name>/group.json
        tags/<name>/tag.json
        packages/<name>/package.json
        packages/<name>/data/<resource id>[.<format>]

    All transfers are delegated to /usr/bin/wget.
    """

    def __init__(self, args):
        """Store the settings parsed from the command line.

        args must provide ``url`` and ``dest`` (sequences whose first item
        is used) and ``limit_rate`` (a string understood by wget's
        --limit-rate option, or None for no limit).
        """
        self.url = args.url[0]
        self.dest = args.dest[0]
        self.package_list = '/api/3/action/package_list'
        self.package_show = '/api/3/action/package_show?'
        self.group_list = '/api/3/action/group_list'
        self.group_show = '/api/3/action/group_show?'
        self.tag_list = '/api/3/action/tag_list'
        self.tag_show = '/api/3/action/tag_show?'

        # Always define the attribute: download() reads it even when
        # --limit-rate was not given on the command line.
        if args.limit_rate is not None:
            self.limit_rate = '--limit-rate=' + args.limit_rate
        else:
            self.limit_rate = ''

    def _wget_command(self, url, local_filename):
        """Return the wget argument list that saves url to local_filename."""
        command = ['/usr/bin/wget']
        if self.limit_rate:
            command.append(self.limit_rate)
        # -c resumes a partial download, -O selects the output file.
        command.extend(['-c', '-O', local_filename, url])
        return command

    # Using wget as it is more reliable
    def download(self, url, local_filename):
        """Download url into local_filename and return wget's exit status.

        The command is passed as an argument list without a shell, so URLs
        and file names containing shell metacharacters ('&', ';', spaces,
        quotes) are handed to wget verbatim instead of being interpreted.
        """
        return subprocess.call(self._wget_command(url, local_filename))

    def ensuredir(self, dest):
        """Ensure that the destination folder exists.

        Raises ValueError when dest exists but is not a folder.
        """
        if os.path.exists(dest) and not os.path.isdir(dest):
            raise ValueError('File exists and is not a folder:' + dest)
        os.makedirs(dest, 0o755, exist_ok=True)

    def loadJSON(self, file):
        """Parse the JSON document stored at path ``file`` and return it."""
        with open(file, encoding='utf-8') as descriptor:
            return json.load(descriptor)

    def _list_names(self, list_action, filename):
        """Download a *_list API answer into dest and return its 'result'."""
        list_file = os.path.join(self.dest, filename)
        self.download(self.url + list_action, list_file)
        return self.loadJSON(list_file)['result']

    def _dump_entity(self, kind, name, show_action):
        """Download one *_show answer; return (entity folder, JSON file)."""
        folder = os.path.join(self.dest, kind + 's', name)
        entity_file = os.path.join(folder, kind + '.json')

        self.ensuredir(folder)
        print("Downloading " + self.url + show_action + 'id=' + name + '...')
        query = urlencode({'id': name}, False, '', 'utf-8')
        self.download(self.url + show_action + query, entity_file)
        return folder, entity_file

    def _dump_resources(self, package_folder, package_file):
        """Download every resource listed in a package's metadata file."""
        data_folder = os.path.join(package_folder, 'data')
        self.ensuredir(data_folder)

        contents = self.loadJSON(package_file)
        for resource in contents['result']['resources']:
            # Resources are stored under their id; a missing or empty
            # format simply yields no file extension.
            resource_format = resource.get('format')
            suffix = '.' + resource_format.lower() if resource_format else ''
            resource_file = os.path.join(data_folder, resource['id'] + suffix)
            self.download(resource['url'], resource_file)

    def dump(self):
        """Dump groups, tags and packages (with their resources) to dest."""
        self.ensuredir(self.dest)

        #
        # Groups
        #
        for group in self._list_names(self.group_list, 'group_list.json'):
            self._dump_entity('group', group, self.group_show)

        #
        # Tags
        #
        for tag in self._list_names(self.tag_list, 'tag_list.json'):
            self._dump_entity('tag', tag, self.tag_show)

        #
        # Packages
        #
        for package in self._list_names(self.package_list, 'package_list.json'):
            package_folder, package_file = self._dump_entity(
                'package', package, self.package_show)
            self._dump_resources(package_folder, package_file)


# Standalone usage
if __name__ == "__main__":
    # Imported here so the entry point does not rely on an import that is
    # outside the visible part of the file; harmless if already imported.
    import argparse

    # Parse CLI
    parser = argparse.ArgumentParser(description='Dump CKAN metadata and datasets.')
    parser.add_argument('url', nargs='+', help='CKAN instance URL')
    parser.add_argument('dest', nargs='+', help='Destination folder')
    parser.add_argument("--limit-rate", help="Limit the download speed to amount bytes per second, per download")
    args = parser.parse_args()

    # Dispatch
    ckan = ckandumper(args)
    ckan.dump()