#!/usr/bin/env python

from __future__ import print_function

import argparse
import base64
import calendar
import codecs
import errno
import getpass
import json
import logging
import os
import re
import select
import subprocess
import sys
import time

try:
    # python 3
    from urllib.parse import urlparse
    from urllib.parse import quote as urlquote
    from urllib.parse import urlencode
    from urllib.error import HTTPError, URLError
    from urllib.request import urlopen
    from urllib.request import Request
except ImportError:
    # python 2
    from urlparse import urlparse
    from urllib import quote as urlquote
    from urllib import urlencode
    from urllib2 import HTTPError, URLError
    from urllib2 import urlopen
    from urllib2 import Request

__version__ = 'asdf'  # from github_backup import __version__

FNULL = open(os.devnull, 'w')


def log_error(message):
    if type(message) == str:
        message = [message]

    for msg in message:
        sys.stderr.write("{0}\n".format(msg))

    sys.exit(1)


def log_info(message):
    if type(message) == str:
        message = [message]

    for msg in message:
        sys.stdout.write("{0}\n".format(msg))


def logging_subprocess(popenargs,
                       logger,
                       stdout_log_level=logging.DEBUG,
                       stderr_log_level=logging.ERROR,
                       **kwargs):
    """
    Variant of subprocess.call that accepts a logger instead of stdout/stderr,
    and logs stdout messages via logger.debug and stderr messages via
    logger.error.
    """
    child = subprocess.Popen(popenargs, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, **kwargs)

    log_level = {child.stdout: stdout_log_level,
                 child.stderr: stderr_log_level}

    def check_io():
        ready_to_read = select.select([child.stdout, child.stderr],
                                      [], [], 1000)[0]
        for io in ready_to_read:
            line = io.readline()
            if not logger:
                continue
            if not (io == child.stderr and not line):
                logger.log(log_level[io], line[:-1])

    # keep checking stdout/stderr until the child exits
    while child.poll() is None:
        check_io()

    check_io()  # check again to catch anything after the process exits

    rc = child.wait()

    if rc != 0:
        print('{} returned {}:'.format(popenargs[0], rc), file=sys.stderr)
        print('\t', ' '.join(popenargs), file=sys.stderr)

    return rc


def mkdir_p(*args):
    for path in args:
        try:
            os.makedirs(path)
        except OSError as exc:  # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise


def mask_password(url, secret='*****'):
    parsed = urlparse(url)

    if not parsed.password:
        return url
    elif parsed.password == 'x-oauth-basic':
        return url.replace(parsed.username, secret)

    return url.replace(parsed.password, secret)
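

# Illustrative examples of mask_password (the URLs here are hypothetical):
#
#   >>> mask_password('https://user:secret@github.com/user/repo.git')
#   'https://user:*****@github.com/user/repo.git'
#   >>> mask_password('https://abc123:x-oauth-basic@github.com/user/repo.git')
#   'https://*****:x-oauth-basic@github.com/user/repo.git'
#
# Token clone URLs carry the secret in the username position (with the
# literal password 'x-oauth-basic'), so the username is masked instead.

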
def parse_args():
    parser = argparse.ArgumentParser(description='Backup a github account')
    parser.add_argument('user', metavar='USER', type=str,
                        help='github username')
    parser.add_argument('-u', '--username', dest='username',
                        help='username for basic auth')
    parser.add_argument('-p', '--password', dest='password',
                        help='password for basic auth. '
                             'If a username is given but not a password, the '
                             'password will be prompted for.')
    parser.add_argument('-t', '--token', dest='token',
                        help='personal access or OAuth token')
    parser.add_argument('-o', '--output-directory', default='.',
                        dest='output_directory',
                        help='directory at which to backup the repositories')
    parser.add_argument('-i', '--incremental', action='store_true',
                        dest='incremental',
                        help='incremental backup')
    parser.add_argument('--starred', action='store_true',
                        dest='include_starred',
                        help='include starred repositories in backup')
    parser.add_argument('--watched', action='store_true',
                        dest='include_watched',
                        help='include watched repositories in backup')
    parser.add_argument('--all', action='store_true',
                        dest='include_everything',
                        help='include everything in backup')
    parser.add_argument('--issues', action='store_true',
                        dest='include_issues',
                        help='include issues in backup')
    parser.add_argument('--issue-comments', action='store_true',
                        dest='include_issue_comments',
                        help='include issue comments in backup')
    parser.add_argument('--issue-events', action='store_true',
                        dest='include_issue_events',
                        help='include issue events in backup')
    parser.add_argument('--pulls', action='store_true',
                        dest='include_pulls',
                        help='include pull requests in backup')
    parser.add_argument('--pull-comments', action='store_true',
                        dest='include_pull_comments',
                        help='include pull request review comments in backup')
    parser.add_argument('--pull-commits', action='store_true',
                        dest='include_pull_commits',
                        help='include pull request commits in backup')
    parser.add_argument('--labels', action='store_true',
                        dest='include_labels',
                        help='include labels in backup')
    parser.add_argument('--hooks', action='store_true',
                        dest='include_hooks',
                        help='include hooks in backup (works only when authenticated)')
    parser.add_argument('--milestones', action='store_true',
                        dest='include_milestones',
                        help='include milestones in backup')
    parser.add_argument('--repositories', action='store_true',
                        dest='include_repository',
                        help='include repository clone in backup')
    parser.add_argument('--wikis', action='store_true',
                        dest='include_wiki',
                        help='include wiki clone in backup')
    parser.add_argument('--skip-existing', action='store_true',
                        dest='skip_existing',
                        help='skip project if a backup directory exists')
    parser.add_argument('-L', '--languages', dest='languages', nargs='*',
                        help='only allow these languages')
    parser.add_argument('-N', '--name-regex', dest='name_regex',
                        help='python regex to match names against')
    parser.add_argument('-H', '--github-host', dest='github_host',
                        help='GitHub Enterprise hostname')
    parser.add_argument('-O', '--organization', action='store_true',
                        dest='organization',
                        help='whether or not this is an organization user')
    parser.add_argument('-R', '--repository', dest='repository',
                        help='name of repository to limit backup to')
    parser.add_argument('-P', '--private', action='store_true',
                        dest='private',
                        help='include private repositories')
    parser.add_argument('-F', '--fork', action='store_true',
                        dest='fork',
                        help='include forked repositories')
    parser.add_argument('--prefer-ssh', action='store_true',
                        help='Clone repositories using SSH instead of HTTPS')
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __version__)
    return parser.parse_args()
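

# Example invocation (hypothetical username, token, and paths; assumes the
# script is installed on PATH as `github-backup`):
#
#   github-backup someuser -t sometoken -o /backups/github \
#       --repositories --issues --issue-comments --wikis

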
def get_auth(args, encode=True):
    auth = None
    if args.token:
        auth = args.token + ':' + 'x-oauth-basic'
    elif args.username:
        if not args.password:
            args.password = getpass.getpass()
        if encode:
            # raw password; the whole user:password pair is base64d below
            password = args.password
        else:
            # quoted so the password can be embedded in a clone URL
            password = urlquote(args.password)
        auth = args.username + ':' + password
    elif args.password:
        log_error('You must specify a username for basic auth')

    if not auth:
        return None

    if not encode:
        return auth

    # encode/decode so this works on both python 2 and python 3,
    # where b64encode requires and returns bytes
    return base64.b64encode(auth.encode('ascii')).decode('ascii')


def get_github_api_host(args):
    if args.github_host:
        host = args.github_host + '/api/v3'
    else:
        host = 'api.github.com'

    return host


def get_github_host(args):
    if args.github_host:
        host = args.github_host
    else:
        host = 'github.com'

    return host


def get_github_repo_url(args, repository):
    if args.prefer_ssh:
        return repository['ssh_url']

    auth = get_auth(args, False)
    if auth:
        repo_url = 'https://{0}@{1}/{2}/{3}.git'.format(
            auth,
            get_github_host(args),
            args.user,
            repository['name'])
    else:
        repo_url = repository['clone_url']

    return repo_url


def retrieve_data(args, template, query_args=None, single_request=False):
    auth = get_auth(args)
    query_args = get_query_args(query_args)
    per_page = 100
    page = 0
    data = []

    while True:
        page = page + 1
        request = _construct_request(per_page, page, query_args, template, auth)  # noqa
        r, errors = _get_response(request, auth, template)

        status_code = int(r.getcode())
        if status_code != 200:
            errors.append('API request returned HTTP {0}: {1}'.format(
                status_code, r.reason))
            log_error(errors)

        response = json.loads(r.read().decode('utf-8'))
        if len(errors) == 0:
            if type(response) == list:
                data.extend(response)
                if len(response) < per_page:
                    break
            elif type(response) == dict and single_request:
                data.append(response)

        if len(errors) > 0:
            log_error(errors)

        if single_request:
            break

    return data


def get_query_args(query_args=None):
    if not query_args:
        query_args = {}
    return query_args


def _get_response(request, auth, template):
    retry_timeout = 3
    errors = []
    # We'll make requests in a loop so we can
    # delay and retry in the case of rate-limiting
    while True:
        should_continue = False
        try:
            r = urlopen(request)
        except HTTPError as exc:
            errors, should_continue = _request_http_error(exc, auth, errors)  # noqa
            r = exc
        except URLError:
            # count the retry down here so the limit is actually enforced
            retry_timeout -= 1
            should_continue = _request_url_error(template, retry_timeout)
            if not should_continue:
                raise

        if should_continue:
            continue

        break
    return r, errors


def _construct_request(per_page, page, query_args, template, auth):
    querystring = urlencode(dict(list({
        'per_page': per_page,
        'page': page
    }.items()) + list(query_args.items())))

    request = Request(template + '?' + querystring)
    if auth is not None:
        request.add_header('Authorization', 'Basic ' + auth)
    return request
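

# For illustration: with per_page=100, page=2, and query_args={'state': 'open'},
# _construct_request builds a GET request for a URL such as
#
#   https://api.github.com/user/repos?per_page=100&page=2&state=open
#
# (parameter order may vary, since urlencode receives a dict), adding an
# 'Authorization: Basic <base64 credentials>' header when auth is present.

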
def _request_http_error(exc, auth, errors):
    # HTTPError behaves like a Response so we can
    # check the status code and headers to see exactly
    # what failed.
    should_continue = False
    headers = exc.headers
    limit_remaining = int(headers.get('x-ratelimit-remaining', 0))

    if exc.code == 403 and limit_remaining < 1:
        # The X-RateLimit-Reset header includes a
        # timestamp telling us when the limit will reset
        # so we can calculate how long to wait rather
        # than inefficiently polling:
        gm_now = calendar.timegm(time.gmtime())
        reset = int(headers.get('x-ratelimit-reset', 0)) or gm_now
        # We'll never sleep for less than 10 seconds:
        delta = max(10, reset - gm_now)

        limit = headers.get('x-ratelimit-limit')
        print('Exceeded rate limit of {} requests; waiting {} seconds to reset'.format(limit, delta),  # noqa
              file=sys.stderr)

        if auth is None:
            print('Hint: Authenticate to raise your GitHub rate limit',
                  file=sys.stderr)

        time.sleep(delta)
        should_continue = True
    return errors, should_continue


def _request_url_error(template, retry_timeout):
    # In case of a connection timing out, we can retry a few times
    # instead of crashing and abandoning the rest of the backup
    log_info('{0} timed out'.format(template))
    if retry_timeout >= 0:
        return True

    log_error('{0} timed out too many times, giving up!'.format(template))
    return False


def retrieve_repositories(args):
    log_info('Retrieving repositories')
    single_request = False
    template = 'https://{0}/user/repos'.format(
        get_github_api_host(args))
    if args.organization:
        template = 'https://{0}/orgs/{1}/repos'.format(
            get_github_api_host(args),
            args.user)
    if args.repository:
        single_request = True
        template = 'https://{0}/repos/{1}/{2}'.format(
            get_github_api_host(args),
            args.user,
            args.repository)

    return retrieve_data(args, template, single_request=single_request)


def filter_repositories(args, repositories):
    log_info('Filtering repositories')
    repositories = [r for r in repositories
                    if r['owner']['login'] == args.user]

    name_regex = None
    if args.name_regex:
        name_regex = re.compile(args.name_regex)

    languages = None
    if args.languages:
        languages = [x.lower() for x in args.languages]

    if not args.fork:
        repositories = [r for r in repositories if not r['fork']]
    if not args.private:
        repositories = [r for r in repositories if not r['private']]
    if languages:
        repositories = [r for r in repositories
                        if r['language'] and r['language'].lower() in languages]  # noqa
    if name_regex:
        repositories = [r for r in repositories
                        if name_regex.match(r['name'])]

    return repositories
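

# For illustration, a repository entry needs at least these fields (a small
# subset of what the GitHub API actually returns) to make it through
# filter_repositories; the values here are hypothetical:
#
#   {
#       'name': 'example-repo',
#       'owner': {'login': 'someuser'},
#       'fork': False,
#       'private': False,
#       'language': 'Python',
#   }

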
def backup_repositories(args, output_directory, repositories):
    log_info('Backing up repositories')
    repos_template = 'https://{0}/repos'.format(get_github_api_host(args))

    if args.incremental:
        last_update = max(repository['updated_at']
                          for repository in repositories)
        last_update_path = os.path.join(output_directory, 'last_update')
        if os.path.exists(last_update_path):
            args.since = open(last_update_path).read().strip()
        else:
            args.since = None
    else:
        args.since = None

    for repository in repositories:
        backup_cwd = os.path.join(output_directory, 'repositories')
        repo_cwd = os.path.join(backup_cwd, repository['name'])
        repo_dir = os.path.join(repo_cwd, 'repository')
        repo_url = get_github_repo_url(args, repository)

        if args.include_repository or args.include_everything:
            fetch_repository(repository['name'],
                             repo_url,
                             repo_dir,
                             skip_existing=args.skip_existing)

        download_wiki = (args.include_wiki or args.include_everything)
        if repository['has_wiki'] and download_wiki:
            fetch_repository(repository['name'],
                             repo_url.replace('.git', '.wiki.git'),
                             os.path.join(repo_cwd, 'wiki'),
                             skip_existing=args.skip_existing)

        if args.include_issues or args.include_everything:
            backup_issues(args, repo_cwd, repository, repos_template)

        if args.include_pulls or args.include_everything:
            backup_pulls(args, repo_cwd, repository, repos_template)

        if args.include_milestones or args.include_everything:
            backup_milestones(args, repo_cwd, repository, repos_template)

        if args.include_labels or args.include_everything:
            backup_labels(args, repo_cwd, repository, repos_template)

        if args.include_hooks or args.include_everything:
            backup_hooks(args, repo_cwd, repository, repos_template)

    if args.incremental:
        open(last_update_path, 'w').write(last_update)


def backup_issues(args, repo_cwd, repository, repos_template):
    has_issues_dir = os.path.isdir('{0}/issues/.git'.format(repo_cwd))
    if args.skip_existing and has_issues_dir:
        return

    log_info('Retrieving {0} issues'.format(repository['full_name']))
    issue_cwd = os.path.join(repo_cwd, 'issues')
    mkdir_p(repo_cwd, issue_cwd)

    issues = {}
    issues_skipped = 0
    issues_skipped_message = ''
    _issue_template = '{0}/{1}/issues'.format(repos_template,
                                              repository['full_name'])

    issue_states = ['open', 'closed']
    for issue_state in issue_states:
        query_args = {
            'filter': 'all',
            'state': issue_state
        }
        if args.since:
            query_args['since'] = args.since

        _issues = retrieve_data(args,
                                _issue_template,
                                query_args=query_args)
        for issue in _issues:
            # skip pull requests which are also returned as issues
            # if retrieving pull requests is requested as well
            if 'pull_request' in issue and \
                    (args.include_pulls or args.include_everything):
                issues_skipped += 1
                continue

            issues[issue['number']] = issue

    if issues_skipped:
        issues_skipped_message = ' (skipped {0} pull requests)'.format(
            issues_skipped)

    log_info('Saving {0} issues to disk{1}'.format(
        len(list(issues.keys())), issues_skipped_message))
    comments_template = _issue_template + '/{0}/comments'
    events_template = _issue_template + '/{0}/events'
    for number, issue in list(issues.items()):
        if args.include_issue_comments or args.include_everything:
            template = comments_template.format(number)
            issues[number]['comment_data'] = retrieve_data(args, template)
        if args.include_issue_events or args.include_everything:
            template = events_template.format(number)
            issues[number]['event_data'] = retrieve_data(args, template)

        issue_file = '{0}/{1}.json'.format(issue_cwd, number)
        with codecs.open(issue_file, 'w', encoding='utf-8') as f:
            json_dump(issue, f)
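

# The backup_* functions above and below write a per-repository tree shaped
# roughly like this (directories appear only for the data types requested):
#
#   <output_directory>/repositories/<name>/repository/  - git clone
#   <output_directory>/repositories/<name>/wiki/        - wiki clone
#   <output_directory>/repositories/<name>/issues/<n>.json
#   <output_directory>/repositories/<name>/pulls/<n>.json
#   <output_directory>/repositories/<name>/milestones/<n>.json
#   <output_directory>/repositories/<name>/labels/labels.json
#   <output_directory>/repositories/<name>/hooks/hooks.json

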
def backup_pulls(args, repo_cwd, repository, repos_template):
    has_pulls_dir = os.path.isdir('{0}/pulls/.git'.format(repo_cwd))
    if args.skip_existing and has_pulls_dir:
        return

    log_info('Retrieving {0} pull requests'.format(repository['full_name']))  # noqa
    pulls_cwd = os.path.join(repo_cwd, 'pulls')
    mkdir_p(repo_cwd, pulls_cwd)

    pulls = {}
    _pulls_template = '{0}/{1}/pulls'.format(repos_template,
                                             repository['full_name'])

    pull_states = ['open', 'closed']
    for pull_state in pull_states:
        query_args = {
            'filter': 'all',
            'state': pull_state,
            'sort': 'updated',
            'direction': 'desc',
        }

        # It'd be nice to be able to apply the args.since filter here...
        _pulls = retrieve_data(args,
                               _pulls_template,
                               query_args=query_args)
        for pull in _pulls:
            if not args.since or pull['updated_at'] >= args.since:
                pulls[pull['number']] = pull

    log_info('Saving {0} pull requests to disk'.format(
        len(list(pulls.keys()))))
    comments_template = _pulls_template + '/{0}/comments'
    commits_template = _pulls_template + '/{0}/commits'
    for number, pull in list(pulls.items()):
        if args.include_pull_comments or args.include_everything:
            template = comments_template.format(number)
            pulls[number]['comment_data'] = retrieve_data(args, template)
        if args.include_pull_commits or args.include_everything:
            template = commits_template.format(number)
            pulls[number]['commit_data'] = retrieve_data(args, template)

        pull_file = '{0}/{1}.json'.format(pulls_cwd, number)
        with codecs.open(pull_file, 'w', encoding='utf-8') as f:
            json_dump(pull, f)


def backup_milestones(args, repo_cwd, repository, repos_template):
    milestone_cwd = os.path.join(repo_cwd, 'milestones')
    if args.skip_existing and os.path.isdir(milestone_cwd):
        return

    log_info('Retrieving {0} milestones'.format(repository['full_name']))
    mkdir_p(repo_cwd, milestone_cwd)

    template = '{0}/{1}/milestones'.format(repos_template,
                                           repository['full_name'])

    query_args = {
        'state': 'all'
    }

    _milestones = retrieve_data(args, template, query_args=query_args)

    milestones = {}
    for milestone in _milestones:
        milestones[milestone['number']] = milestone

    log_info('Saving {0} milestones to disk'.format(
        len(list(milestones.keys()))))
    for number, milestone in list(milestones.items()):
        milestone_file = '{0}/{1}.json'.format(milestone_cwd, number)
        with codecs.open(milestone_file, 'w', encoding='utf-8') as f:
            json_dump(milestone, f)


def backup_labels(args, repo_cwd, repository, repos_template):
    label_cwd = os.path.join(repo_cwd, 'labels')
    output_file = '{0}/labels.json'.format(label_cwd)
    template = '{0}/{1}/labels'.format(repos_template,
                                       repository['full_name'])
    _backup_data(args,
                 'labels',
                 template,
                 output_file,
                 label_cwd)


def backup_hooks(args, repo_cwd, repository, repos_template):
    auth = get_auth(args)
    if not auth:
        log_info("Skipping hooks since no authentication provided")
        return
    hook_cwd = os.path.join(repo_cwd, 'hooks')
    output_file = '{0}/hooks.json'.format(hook_cwd)
    template = '{0}/{1}/hooks'.format(repos_template,
                                      repository['full_name'])
    try:
        _backup_data(args,
                     'hooks',
                     template,
                     output_file,
                     hook_cwd)
    except SystemExit:
        log_info("Unable to read hooks, skipping")


def fetch_repository(name, remote_url, local_dir, skip_existing=False):
    clone_exists = os.path.exists(os.path.join(local_dir, '.git'))

    if clone_exists and skip_existing:
        return

    masked_remote_url = mask_password(remote_url)

    initialized = subprocess.call('git ls-remote ' + remote_url,
                                  stdout=FNULL,
                                  stderr=FNULL,
                                  shell=True)
    if initialized == 128:
        log_info("Skipping {0} ({1}) since it's not initialized".format(
            name, masked_remote_url))
        return

    if clone_exists:
        log_info('Updating {0} in {1}'.format(name, local_dir))

        git_command = ['git', 'remote', 'rm', 'origin']
        logging_subprocess(git_command, None, cwd=local_dir)

        git_command = ['git', 'remote', 'add', 'origin', remote_url]
        logging_subprocess(git_command, None, cwd=local_dir)

        git_command = ['git', 'fetch', '--all', '--tags', '--prune']
        logging_subprocess(git_command, None, cwd=local_dir)
    else:
        log_info('Cloning {0} repository from {1} to {2}'.format(
            name, masked_remote_url, local_dir))
        git_command = ['git', 'clone', remote_url, local_dir]
        logging_subprocess(git_command, None)
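

# fetch_repository shells out to git; updating an existing clone is roughly
# equivalent to running, inside the local clone:
#
#   git remote rm origin
#   git remote add origin <remote_url>
#   git fetch --all --tags --prune
#
# Removing and re-adding origin (rather than just fetching) presumably keeps
# any credentials embedded in the remote URL current between runs.

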
def backup_account(args, output_directory):
    account_cwd = os.path.join(output_directory, 'account')

    if args.include_starred or args.include_everything:
        output_file = '{0}/starred.json'.format(account_cwd)
        template = "https://{0}/users/{1}/starred"
        template = template.format(get_github_api_host(args), args.user)
        _backup_data(args,
                     'starred repositories',
                     template,
                     output_file,
                     account_cwd)

    if args.include_watched or args.include_everything:
        output_file = '{0}/watched.json'.format(account_cwd)
        template = "https://{0}/users/{1}/subscriptions"
        template = template.format(get_github_api_host(args), args.user)
        _backup_data(args,
                     'watched repositories',
                     template,
                     output_file,
                     account_cwd)


def _backup_data(args, name, template, output_file, output_directory):
    skip_existing = args.skip_existing
    if not skip_existing or not os.path.exists(output_file):
        log_info('Retrieving {0} {1}'.format(args.user, name))
        mkdir_p(output_directory)
        data = retrieve_data(args, template)

        log_info('Writing {0} {1} to disk'.format(len(data), name))
        with codecs.open(output_file, 'w', encoding='utf-8') as f:
            json_dump(data, f)


def json_dump(data, output_file):
    json.dump(data,
              output_file,
              ensure_ascii=False,
              sort_keys=True,
              indent=4,
              separators=(',', ': '))


def main():
    args = parse_args()

    output_directory = os.path.realpath(args.output_directory)
    if not os.path.isdir(output_directory):
        log_info('Creating output directory {0}'.format(output_directory))
        mkdir_p(output_directory)

    log_info('Backing up user {0} to {1}'.format(args.user, output_directory))

    repositories = retrieve_repositories(args)
    repositories = filter_repositories(args, repositories)
    backup_repositories(args, output_directory, repositories)
    backup_account(args, output_directory)


if __name__ == '__main__':
    main()
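

# Example usage (hypothetical account names, token, and paths; assumes this
# file is saved as github_backup.py):
#
#   python github_backup.py someuser --all -t sometoken -o /backups/github
#   python github_backup.py someorg -O -P --repositories -t sometoken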