From de0c3f46c616fe8e1f2d3a80b747a69d4bf7da14 Mon Sep 17 00:00:00 2001
From: Harrison Wright
Date: Fri, 21 Jun 2019 20:03:14 -0500
Subject: [PATCH 1/4] WIP: download assets

---
 bin/github-backup | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/bin/github-backup b/bin/github-backup
index 6c88dd2..e349eaa 100755
--- a/bin/github-backup
+++ b/bin/github-backup
@@ -537,6 +537,24 @@ def _request_url_error(template, retry_timeout):
     return False
 
 
+def download_file(url, path, auth):
+    request = Request(url)
+    request.add_header('Accept', 'application/octet-stream')
+    request.add_header('Authorization', 'Basic '.encode('ascii') + auth)
+    data = urlopen(request)
+    with open(path, 'wb') as f:
+        f.write(data.read())
+
+    # import requests
+    # r = requests.get(url, stream=True, headers={
+    #     'Accept': 'application/octet-stream',
+    #     'Authorization': 'Basic '.encode('ascii') + auth
+    # })
+    # with open(path, 'wb') as f:
+    #     for chunk in r.iter_content(1024):
+    #         f.write(chunk)
+
+
 def get_authenticated_user(args):
     template = 'https://{0}/user'.format(get_github_api_host(args))
     data = retrieve_data(args, template, single_request=True)
@@ -898,17 +916,21 @@ def backup_releases(args, repo_cwd, repository, repos_template):
 
     query_args = {}
 
-    _release_template = '{0}/{1}/releases'.format(repos_template, repository_fullname)
-    _releases = retrieve_data(args, _release_template, query_args=query_args)
+    release_template = '{0}/{1}/releases'.format(repos_template, repository_fullname)
+    releases = retrieve_data(args, release_template, query_args=query_args)
 
     # for each release, store it
-    log_info('Saving {0} releases to disk'.format(len(_releases)))
-    for release in _releases:
+    log_info('Saving {0} releases to disk'.format(len(releases)))
+    for release in releases:
         release_name = release['tag_name']
         output_filepath = os.path.join(release_cwd, '{0}.json'.format(release_name))
         with codecs.open(output_filepath, 'w+', encoding='utf-8') as f:
             json_dump(release, f)
 
+        assets = retrieve_data(args, release['assets_url'])
+        for asset in assets:
+            download_file(asset['url'], os.path.join(release_cwd, asset['name']), get_auth(args))
+
 
 def fetch_repository(name,
                      remote_url,
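Note on PATCH 1/4: the new loop in backup_releases relies on two fields of the asset objects returned by GitHub's v3 release-assets endpoint, and on download_file's auth argument already being base64-encoded bytes (that is what concatenating it to 'Basic '.encode('ascii') implies, and what get_auth() elsewhere in this script appears to return). A minimal sketch of those assumptions; the values below are placeholders, not taken from this patch:

    import base64

    # release['assets_url'] returns a JSON array of asset objects; the loop
    # above uses only the 'url' and 'name' fields of each entry:
    asset = {
        'url': 'https://api.github.com/repos/octocat/Hello-World/releases/assets/1',
        'name': 'example.zip',
    }

    # the auth argument is expected to be base64-encoded "user:token" bytes:
    auth = base64.b64encode('octocat:placeholder-token'.encode('ascii'))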
+ """ + def redirect_request(self, req, fp, code, msg, headers, newurl): + request = super(S3HTTPRedirectHandler, self).redirect_request(req, fp, code, msg, headers, newurl) + del request.headers['Authorization'] + return request + + def download_file(url, path, auth): request = Request(url) request.add_header('Accept', 'application/octet-stream') request.add_header('Authorization', 'Basic '.encode('ascii') + auth) - data = urlopen(request) - with open(path, 'wb') as f: - f.write(data.read()) + opener = build_opener(S3HTTPRedirectHandler) + response = opener.open(request) - # import requests - # r = requests.get(url, stream=True, headers={ - # 'Accept': 'application/octet-stream', - # 'Authorization': 'Basic '.encode('ascii') + auth - # }) - # with open(path, 'wb') as f: - # for chunk in r.iter_content(1024): - # f.write(chunk) + chunk_size = 16 * 1024 + with open(path, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) def get_authenticated_user(args): From ea4c3d0f6f79aec742f4497b502a757e185d6e4e Mon Sep 17 00:00:00 2001 From: Harrison Wright Date: Sat, 22 Jun 2019 13:05:54 -0500 Subject: [PATCH 3/4] Fix super call for python2 --- bin/github-backup | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/github-backup b/bin/github-backup index 583d3ee..fe3f8e9 100755 --- a/bin/github-backup +++ b/bin/github-backup @@ -18,6 +18,7 @@ import subprocess import sys import time import platform +PY2 = False try: # python 3 from urllib.parse import urlparse @@ -30,6 +31,7 @@ try: from urllib.request import build_opener except ImportError: # python 2 + PY2 = True from urlparse import urlparse from urllib import quote as urlquote from urllib import urlencode @@ -549,7 +551,11 @@ class S3HTTPRedirectHandler(HTTPRedirectHandler): so we should remove said header on redirect. 
""" def redirect_request(self, req, fp, code, msg, headers, newurl): - request = super(S3HTTPRedirectHandler, self).redirect_request(req, fp, code, msg, headers, newurl) + if PY2: + # HTTPRedirectHandler is an old style class + request = HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl) + else: + request = super(S3HTTPRedirectHandler, self).redirect_request(req, fp, code, msg, headers, newurl) del request.headers['Authorization'] return request From 89f59cc7a2e10f6e2878821e7ee8d7f8d1f64d76 Mon Sep 17 00:00:00 2001 From: Harrison Wright Date: Mon, 24 Jun 2019 15:49:19 -0500 Subject: [PATCH 4/4] Make assets it's own flag --- bin/github-backup | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/bin/github-backup b/bin/github-backup index fe3f8e9..5a09483 100755 --- a/bin/github-backup +++ b/bin/github-backup @@ -314,6 +314,10 @@ def parse_args(): dest='include_releases', help='include release information, not including assets or binaries' ) + parser.add_argument('--assets', + action='store_true', + dest='include_assets', + help='include assets alongside release information; only applies if including releases') return parser.parse_args() @@ -744,7 +748,8 @@ def backup_repositories(args, output_directory, repositories): backup_hooks(args, repo_cwd, repository, repos_template) if args.include_releases or args.include_everything: - backup_releases(args, repo_cwd, repository, repos_template) + backup_releases(args, repo_cwd, repository, repos_template, + include_assets=args.include_assets or args.include_everything) if args.incremental: open(last_update_path, 'w').write(last_update) @@ -927,7 +932,7 @@ def backup_hooks(args, repo_cwd, repository, repos_template): log_info("Unable to read hooks, skipping") -def backup_releases(args, repo_cwd, repository, repos_template): +def backup_releases(args, repo_cwd, repository, repos_template, include_assets=False): repository_fullname = repository['full_name'] # give release files somewhere to live & log intent @@ -948,9 +953,10 @@ def backup_releases(args, repo_cwd, repository, repos_template): with codecs.open(output_filepath, 'w+', encoding='utf-8') as f: json_dump(release, f) - assets = retrieve_data(args, release['assets_url']) - for asset in assets: - download_file(asset['url'], os.path.join(release_cwd, asset['name']), get_auth(args)) + if include_assets: + assets = retrieve_data(args, release['assets_url']) + for asset in assets: + download_file(asset['url'], os.path.join(release_cwd, asset['name']), get_auth(args)) def fetch_repository(name,