aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Silvio Rhatto <rhatto@riseup.net> 2019-04-25 22:01:29 -0300
committer Silvio Rhatto <rhatto@riseup.net> 2019-04-25 22:01:29 -0300
commit 5277bd8d0a5b36c51796cd33ee956099592bc3a9 (patch)
tree 4fd683533619d266eef97992bf28f5419abcf759
parent 68171613087e27262c5b7faa5790d58580930083 (diff)
download ckandumper-5277bd8d0a5b36c51796cd33ee956099592bc3a9.tar.gz
ckandumper-5277bd8d0a5b36c51796cd33ee956099592bc3a9.tar.bz2
Initial working version
-rw-r--r-- README.md 3
-rw-r--r-- TODO.md 9
-rwxr-xr-x ckandumper 135
3 files changed, 147 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f0f90bc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# CKAN Dumper
+
+Downloads both metadata and datasets from a CKAN instance.
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..a7e959c
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,9 @@
+TODO
+====
+
+* Error/exception handling.
+* File hashing.
+* Log facility.
+* Parallel downloads.
+* Git-annex support.
+* Documentation.
diff --git a/ckandumper b/ckandumper
new file mode 100755
index 0000000..3cfdf37
--- /dev/null
+++ b/ckandumper
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Dumps CKAN instance data: metadata plus entire datasets.
+#
+# Copyright (C) 2019 Silvio Rhatto <rhatto@riseup.net>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, either version 3 of the License,
+# or any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Dependencies
+import argparse
+import sys, os, subprocess, pycurl, json
+from urllib.parse import urlencode
+
+class ckandumper:
+    """Dumps CKAN data: metadata plus entire datasets.
+
+    Metadata is requested from the version 3 action API paths set in
+    __init__ and saved as JSON files under the destination folder; each
+    resource listed by a package is saved under that package's data folder.
+    """
+
+    def __init__(self, args):
+        """Store the instance URL, destination folder and API paths.
+
+        args is the argparse namespace; args.url and args.dest are lists
+        whose first element is used.
+        """
+        self.url = args.url[0]
+        self.dest = args.dest[0]
+        self.package_list = '/api/3/action/package_list'
+        self.package_show = '/api/3/action/package_show?'
+        self.group_list = '/api/3/action/group_list'
+        self.group_show = '/api/3/action/group_show?'
+        self.tag_list = '/api/3/action/tag_list'
+        self.tag_show = '/api/3/action/tag_show?'
+
+    # Using wget as it is more reliable
+    def download(self, url, local_filename):
+        """Download url into local_filename with wget and return its exit status.
+
+        The command is an argument list run without a shell, so characters
+        such as '&', ';' or quotes in a URL or file name reach wget verbatim
+        instead of being interpreted; '--' stops a URL that starts with '-'
+        from being parsed as an option.
+        """
+        return subprocess.call(['/usr/bin/wget', '-c', '-O', local_filename, '--', url])
+
+    def ensuredir(self, dest):
+        """Ensure that the destination folder exists, creating parents as needed.
+
+        Raises ValueError when dest exists but is not a folder.
+        """
+        if os.path.exists(dest) and not os.path.isdir(dest):
+            raise ValueError('File exists and is not a folder:' + dest)
+        # exist_ok avoids failing if the folder appears after the check above
+        os.makedirs(dest, 0o755, exist_ok=True)
+
+    def loadJSON(self, file):
+        """Return the parsed contents of the JSON file at path file."""
+        # The context manager closes the file even when parsing fails
+        with open(file, encoding='utf-8') as descriptor:
+            return json.load(descriptor)
+
+    def _fetch_list(self, kind, list_path):
+        """Save <kind>_list.json in the destination and return its 'result'."""
+        list_file = self.dest + os.sep + kind + '_list.json'
+        self.download(self.url + list_path, list_file)
+        return self.loadJSON(list_file)['result']
+
+    def _fetch_entry(self, kind, show_path, name):
+        """Save <kind>s/<name>/<kind>.json and return the entry folder."""
+        folder = self.dest + os.sep + kind + 's' + os.sep + name
+        self.ensuredir(folder)
+        print("Downloading " + self.url + show_path + 'id=' + name + '...')
+        show_url = self.url + show_path + urlencode({'id': name})
+        self.download(show_url, folder + os.sep + kind + '.json')
+        return folder
+
+    def dump(self):
+        """Download group, tag and package metadata, then every package resource.
+
+        Entries are processed in the order returned by the list calls; each
+        package's resources are fetched right after its own metadata.
+        """
+        self.ensuredir(self.dest)
+
+        # Groups
+        for group in self._fetch_list('group', self.group_list):
+            self._fetch_entry('group', self.group_show, group)
+
+        # Tags
+        for tag in self._fetch_list('tag', self.tag_list):
+            self._fetch_entry('tag', self.tag_show, tag)
+
+        # Packages
+        for package in self._fetch_list('package', self.package_list):
+            package_folder = self._fetch_entry('package', self.package_show, package)
+            data_folder = package_folder + os.sep + 'data'
+            self.ensuredir(data_folder)
+            contents = self.loadJSON(package_folder + os.sep + 'package.json')
+
+            for resource in contents['result']['resources']:
+                # Tolerate a missing, None or empty 'format'; path separators
+                # are replaced so the extension stays part of the file name.
+                extension = (resource.get('format') or '').lower().replace(os.sep, '_')
+                if extension:
+                    extension = '.' + extension
+
+                # Files are named after the resource id
+                resource_file = data_folder + os.sep + resource['id'] + extension
+                self.download(resource['url'], resource_file)
+
+# Standalone usage
+if __name__ == "__main__":
+    # Parse CLI: nargs=1 keeps args.url/args.dest as one-item lists (the
+    # constructor reads element [0]) while rejecting stray extra arguments.
+    parser = argparse.ArgumentParser(description='Dumps CKAN instance data: metadata plus entire datasets.')
+    parser.add_argument('url', nargs=1, help='CKAN instance URL')
+    parser.add_argument('dest', nargs=1, help='Destination folder')
+    args = parser.parse_args()
+
+    # Dispatch
+    ckandumper(args).dump()