Files
python-github-backup/bin/github-backup
2016-11-22 20:11:26 +00:00

825 lines
28 KiB
Python
Executable File

#!/usr/bin/env python
from __future__ import print_function
import argparse
import base64
import calendar
import codecs
import errno
import getpass
import json
import logging
import os
import re
import select
import subprocess
import sys
import time
try:
# python 3
from urllib.parse import urlparse
from urllib.parse import quote as urlquote
from urllib.parse import urlencode
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from urllib.request import Request
except ImportError:
# python 2
from urlparse import urlparse
from urllib import quote as urlquote
from urllib import urlencode
from urllib2 import HTTPError, URLError
from urllib2 import urlopen
from urllib2 import Request
from github_backup import __version__
FNULL = open(os.devnull, 'w')
def log_error(message):
if type(message) == str:
message = [message]
for msg in message:
sys.stderr.write("{0}\n".format(msg))
sys.exit(1)
def log_info(message):
if type(message) == str:
message = [message]
for msg in message:
sys.stdout.write("{0}\n".format(msg))
def logging_subprocess(popenargs,
logger,
stdout_log_level=logging.DEBUG,
stderr_log_level=logging.ERROR,
**kwargs):
"""
Variant of subprocess.call that accepts a logger instead of stdout/stderr,
and logs stdout messages via logger.debug and stderr messages via
logger.error.
"""
child = subprocess.Popen(popenargs, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, **kwargs)
log_level = {child.stdout: stdout_log_level,
child.stderr: stderr_log_level}
def check_io():
ready_to_read = select.select([child.stdout, child.stderr],
[],
[],
1000)[0]
for io in ready_to_read:
line = io.readline()
if not logger:
continue
if not (io == child.stderr and not line):
logger.log(log_level[io], line[:-1])
# keep checking stdout/stderr until the child exits
while child.poll() is None:
check_io()
check_io() # check again to catch anything after the process exits
rc = child.wait()
if rc != 0:
print('{} returned {}:'.format(popenargs[0], rc), file=sys.stderr)
print('\t', ' '.join(popenargs), file=sys.stderr)
return rc
def mkdir_p(*args):
for path in args:
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise
def mask_password(url, secret='*****'):
parsed = urlparse(url)
if not parsed.password:
return url
elif parsed.password == 'x-oauth-basic':
return url.replace(parsed.username, secret)
return url.replace(parsed.password, secret)
def parse_args():
parser = argparse.ArgumentParser(description='Backup a github account')
parser.add_argument('user',
metavar='USER',
type=str,
help='github username')
parser.add_argument('-u',
'--username',
dest='username',
help='username for basic auth')
parser.add_argument('-p',
'--password',
dest='password',
help='password for basic auth. '
'If a username is given but not a password, the '
'password will be prompted for.')
parser.add_argument('-t',
'--token',
dest='token',
help='personal access or OAuth token, or path to token (file://...)') # noqa
parser.add_argument('-o',
'--output-directory',
default='.',
dest='output_directory',
help='directory at which to backup the repositories')
parser.add_argument('-i',
'--incremental',
action='store_true',
dest='incremental',
help='incremental backup')
parser.add_argument('--starred',
action='store_true',
dest='include_starred',
help='include starred repositories in backup')
parser.add_argument('--watched',
action='store_true',
dest='include_watched',
help='include watched repositories in backup')
parser.add_argument('--all',
action='store_true',
dest='include_everything',
help='include everything in backup')
parser.add_argument('--issues',
action='store_true',
dest='include_issues',
help='include issues in backup')
parser.add_argument('--issue-comments',
action='store_true',
dest='include_issue_comments',
help='include issue comments in backup')
parser.add_argument('--issue-events',
action='store_true',
dest='include_issue_events',
help='include issue events in backup')
parser.add_argument('--pulls',
action='store_true',
dest='include_pulls',
help='include pull requests in backup')
parser.add_argument('--pull-comments',
action='store_true',
dest='include_pull_comments',
help='include pull request review comments in backup')
parser.add_argument('--pull-commits',
action='store_true',
dest='include_pull_commits',
help='include pull request commits in backup')
parser.add_argument('--labels',
action='store_true',
dest='include_labels',
help='include labels in backup')
parser.add_argument('--hooks',
action='store_true',
dest='include_hooks',
help='include hooks in backup (works only when authenticated)') # noqa
parser.add_argument('--milestones',
action='store_true',
dest='include_milestones',
help='include milestones in backup')
parser.add_argument('--repositories',
action='store_true',
dest='include_repository',
help='include repository clone in backup')
parser.add_argument('--bare',
action='store_true',
dest='bare_clone',
help='clone bare repositories')
parser.add_argument('--wikis',
action='store_true',
dest='include_wiki',
help='include wiki clone in backup')
parser.add_argument('--skip-existing',
action='store_true',
dest='skip_existing',
help='skip project if a backup directory exists')
parser.add_argument('-L',
'--languages',
dest='languages',
help='only allow these languages',
nargs='*')
parser.add_argument('-N',
'--name-regex',
dest='name_regex',
help='python regex to match names against')
parser.add_argument('-H',
'--github-host',
dest='github_host',
help='GitHub Enterprise hostname')
parser.add_argument('-O',
'--organization',
action='store_true',
dest='organization',
help='whether or not this is an organization user')
parser.add_argument('-R',
'--repository',
dest='repository',
help='name of repository to limit backup to')
parser.add_argument('-P', '--private',
action='store_true',
dest='private',
help='include private repositories')
parser.add_argument('-F', '--fork',
action='store_true',
dest='fork',
help='include forked repositories')
parser.add_argument('--prefer-ssh',
action='store_true',
help='Clone repositories using SSH instead of HTTPS')
parser.add_argument('-v', '--version',
action='version',
version='%(prog)s ' + __version__)
return parser.parse_args()
def get_auth(args, encode=True):
auth = None
if args.token:
_path_specifier = 'file://'
if args.token.startswith(_path_specifier):
args.token = open(args.token[len(_path_specifier):],
'rt').readline().strip()
auth = args.token + ':' + 'x-oauth-basic'
elif args.username:
if not args.password:
args.password = getpass.getpass()
if encode:
password = args.password
else:
password = urlquote(args.password)
auth = args.username + ':' + password
elif args.password:
log_error('You must specify a username for basic auth')
if not auth:
return None
if not encode:
return auth
return base64.b64encode(auth.encode('ascii'))
def get_github_api_host(args):
if args.github_host:
host = args.github_host + '/api/v3'
else:
host = 'api.github.com'
return host
def get_github_host(args):
if args.github_host:
host = args.github_host
else:
host = 'github.com'
return host
def get_github_repo_url(args, repository):
if args.prefer_ssh:
return repository['ssh_url']
auth = get_auth(args, False)
if auth:
repo_url = 'https://{0}@{1}/{2}/{3}.git'.format(
auth,
get_github_host(args),
args.user,
repository['name'])
else:
repo_url = repository['clone_url']
return repo_url
def retrieve_data(args, template, query_args=None, single_request=False):
auth = get_auth(args)
query_args = get_query_args(query_args)
per_page = 100
page = 0
data = []
while True:
page = page + 1
request = _construct_request(per_page, page, query_args, template, auth) # noqa
r, errors = _get_response(request, auth, template)
status_code = int(r.getcode())
if status_code != 200:
template = 'API request returned HTTP {0}: {1}'
errors.append(template.format(status_code, r.reason))
log_error(errors)
response = json.loads(r.read().decode('utf-8'))
if len(errors) == 0:
if type(response) == list:
data.extend(response)
if len(response) < per_page:
break
elif type(response) == dict and single_request:
data.append(response)
if len(errors) > 0:
log_error(errors)
if single_request:
break
return data
def get_query_args(query_args=None):
if not query_args:
query_args = {}
return query_args
def _get_response(request, auth, template):
retry_timeout = 3
errors = []
# We'll make requests in a loop so we can
# delay and retry in the case of rate-limiting
while True:
should_continue = False
try:
r = urlopen(request)
except HTTPError as exc:
errors, should_continue = _request_http_error(exc, auth, errors) # noqa
r = exc
except URLError:
should_continue = _request_url_error(template, retry_timeout)
if not should_continue:
raise
if should_continue:
continue
break
return r, errors
def _construct_request(per_page, page, query_args, template, auth):
querystring = urlencode(dict(list({
'per_page': per_page,
'page': page
}.items()) + list(query_args.items())))
request = Request(template + '?' + querystring)
if auth is not None:
request.add_header('Authorization', 'Basic '.encode('ascii') + auth)
return request
def _request_http_error(exc, auth, errors):
# HTTPError behaves like a Response so we can
# check the status code and headers to see exactly
# what failed.
should_continue = False
headers = exc.headers
limit_remaining = int(headers.get('x-ratelimit-remaining', 0))
if exc.code == 403 and limit_remaining < 1:
# The X-RateLimit-Reset header includes a
# timestamp telling us when the limit will reset
# so we can calculate how long to wait rather
# than inefficiently polling:
gm_now = calendar.timegm(time.gmtime())
reset = int(headers.get('x-ratelimit-reset', 0)) or gm_now
# We'll never sleep for less than 10 seconds:
delta = max(10, reset - gm_now)
limit = headers.get('x-ratelimit-limit')
print('Exceeded rate limit of {} requests; waiting {} seconds to reset'.format(limit, delta), # noqa
file=sys.stderr)
if auth is None:
print('Hint: Authenticate to raise your GitHub rate limit',
file=sys.stderr)
time.sleep(delta)
should_continue = True
return errors, should_continue
def _request_url_error(template, retry_timeout):
# Incase of a connection timing out, we can retry a few time
# But we won't crash and not back-up the rest now
log_info('{} timed out'.format(template))
retry_timeout -= 1
if retry_timeout >= 0:
return True
log_error('{} timed out to much, skipping!')
return False
def retrieve_repositories(args):
log_info('Retrieving repositories')
single_request = False
template = 'https://{0}/user/repos'.format(
get_github_api_host(args))
if args.organization:
template = 'https://{0}/orgs/{1}/repos'.format(
get_github_api_host(args),
args.user)
if args.repository:
single_request = True
template = 'https://{0}/repos/{1}/{2}'.format(
get_github_api_host(args),
args.user,
args.repository)
return retrieve_data(args, template, single_request=single_request)
def filter_repositories(args, unfiltered_repositories):
log_info('Filtering repositories')
repositories = []
for r in unfiltered_repositories:
if r['owner']['login'] == args.user:
repositories.append(r)
name_regex = None
if args.name_regex:
name_regex = re.compile(args.name_regex)
languages = None
if args.languages:
languages = [x.lower() for x in args.languages]
if not args.fork:
repositories = [r for r in repositories if not r['fork']]
if not args.private:
repositories = [r for r in repositories if not r['private']]
if languages:
repositories = [r for r in repositories if r['language'] and r['language'].lower() in languages] # noqa
if name_regex:
repositories = [r for r in repositories if name_regex.match(r['name'])]
return repositories
def backup_repositories(args, output_directory, repositories):
log_info('Backing up repositories')
repos_template = 'https://{0}/repos'.format(get_github_api_host(args))
if args.incremental:
last_update = max(repository['updated_at'] for repository in repositories) # noqa
last_update_path = os.path.join(output_directory, 'last_update')
if os.path.exists(last_update_path):
args.since = open(last_update_path).read().strip()
else:
args.since = None
else:
args.since = None
for repository in repositories:
backup_cwd = os.path.join(output_directory, 'repositories')
repo_cwd = os.path.join(backup_cwd, repository['name'])
repo_dir = os.path.join(repo_cwd, 'repository')
repo_url = get_github_repo_url(args, repository)
if args.include_repository or args.include_everything:
fetch_repository(repository['name'],
repo_url,
repo_dir,
skip_existing=args.skip_existing,
bare_clone=args.bare_clone)
download_wiki = (args.include_wiki or args.include_everything)
if repository['has_wiki'] and download_wiki:
fetch_repository(repository['name'],
repo_url.replace('.git', '.wiki.git'),
os.path.join(repo_cwd, 'wiki'),
skip_existing=args.skip_existing,
bare_clone=args.bare_clone)
if args.include_issues or args.include_everything:
backup_issues(args, repo_cwd, repository, repos_template)
if args.include_pulls or args.include_everything:
backup_pulls(args, repo_cwd, repository, repos_template)
if args.include_milestones or args.include_everything:
backup_milestones(args, repo_cwd, repository, repos_template)
if args.include_labels or args.include_everything:
backup_labels(args, repo_cwd, repository, repos_template)
if args.include_hooks or args.include_everything:
backup_hooks(args, repo_cwd, repository, repos_template)
if args.incremental:
open(last_update_path, 'w').write(last_update)
def backup_issues(args, repo_cwd, repository, repos_template):
has_issues_dir = os.path.isdir('{0}/issues/.git'.format(repo_cwd))
if args.skip_existing and has_issues_dir:
return
log_info('Retrieving {0} issues'.format(repository['full_name']))
issue_cwd = os.path.join(repo_cwd, 'issues')
mkdir_p(repo_cwd, issue_cwd)
issues = {}
issues_skipped = 0
issues_skipped_message = ''
_issue_template = '{0}/{1}/issues'.format(repos_template,
repository['full_name'])
should_include_pulls = args.include_pulls or args.include_everything
issue_states = ['open', 'closed']
for issue_state in issue_states:
query_args = {
'filter': 'all',
'state': issue_state
}
if args.since:
query_args['since'] = args.since
_issues = retrieve_data(args,
_issue_template,
query_args=query_args)
for issue in _issues:
# skip pull requests which are also returned as issues
# if retrieving pull requests is requested as well
if 'pull_request' in issue and should_include_pulls:
issues_skipped += 1
continue
issues[issue['number']] = issue
if issues_skipped:
issues_skipped_message = ' (skipped {0} pull requests)'.format(
issues_skipped)
log_info('Saving {0} issues to disk{1}'.format(
len(list(issues.keys())), issues_skipped_message))
comments_template = _issue_template + '/{0}/comments'
events_template = _issue_template + '/{0}/events'
for number, issue in list(issues.items()):
if args.include_issue_comments or args.include_everything:
template = comments_template.format(number)
issues[number]['comment_data'] = retrieve_data(args, template)
if args.include_issue_events or args.include_everything:
template = events_template.format(number)
issues[number]['event_data'] = retrieve_data(args, template)
issue_file = '{0}/{1}.json'.format(issue_cwd, number)
with codecs.open(issue_file, 'w', encoding='utf-8') as f:
json_dump(issue, f)
def backup_pulls(args, repo_cwd, repository, repos_template):
has_pulls_dir = os.path.isdir('{0}/pulls/.git'.format(repo_cwd))
if args.skip_existing and has_pulls_dir:
return
log_info('Retrieving {0} pull requests'.format(repository['full_name'])) # noqa
pulls_cwd = os.path.join(repo_cwd, 'pulls')
mkdir_p(repo_cwd, pulls_cwd)
pulls = {}
_pulls_template = '{0}/{1}/pulls'.format(repos_template,
repository['full_name'])
pull_states = ['open', 'closed']
for pull_state in pull_states:
query_args = {
'filter': 'all',
'state': pull_state,
'sort': 'updated',
'direction': 'desc',
}
# It'd be nice to be able to apply the args.since filter here...
_pulls = retrieve_data(args,
_pulls_template,
query_args=query_args)
for pull in _pulls:
if not args.since or pull['updated_at'] >= args.since:
pulls[pull['number']] = pull
log_info('Saving {0} pull requests to disk'.format(
len(list(pulls.keys()))))
comments_template = _pulls_template + '/{0}/comments'
commits_template = _pulls_template + '/{0}/commits'
for number, pull in list(pulls.items()):
if args.include_pull_comments or args.include_everything:
template = comments_template.format(number)
pulls[number]['comment_data'] = retrieve_data(args, template)
if args.include_pull_commits or args.include_everything:
template = commits_template.format(number)
pulls[number]['commit_data'] = retrieve_data(args, template)
pull_file = '{0}/{1}.json'.format(pulls_cwd, number)
with codecs.open(pull_file, 'w', encoding='utf-8') as f:
json_dump(pull, f)
def backup_milestones(args, repo_cwd, repository, repos_template):
milestone_cwd = os.path.join(repo_cwd, 'milestones')
if args.skip_existing and os.path.isdir(milestone_cwd):
return
log_info('Retrieving {0} milestones'.format(repository['full_name']))
mkdir_p(repo_cwd, milestone_cwd)
template = '{0}/{1}/milestones'.format(repos_template,
repository['full_name'])
query_args = {
'state': 'all'
}
_milestones = retrieve_data(args, template, query_args=query_args)
milestones = {}
for milestone in _milestones:
milestones[milestone['number']] = milestone
log_info('Saving {0} milestones to disk'.format(
len(list(milestones.keys()))))
for number, milestone in list(milestones.items()):
milestone_file = '{0}/{1}.json'.format(milestone_cwd, number)
with codecs.open(milestone_file, 'w', encoding='utf-8') as f:
json_dump(milestone, f)
def backup_labels(args, repo_cwd, repository, repos_template):
label_cwd = os.path.join(repo_cwd, 'labels')
output_file = '{0}/labels.json'.format(label_cwd)
template = '{0}/{1}/labels'.format(repos_template,
repository['full_name'])
_backup_data(args,
'labels',
template,
output_file,
label_cwd)
def backup_hooks(args, repo_cwd, repository, repos_template):
auth = get_auth(args)
if not auth:
log_info("Skipping hooks since no authentication provided")
return
hook_cwd = os.path.join(repo_cwd, 'hooks')
output_file = '{0}/hooks.json'.format(hook_cwd)
template = '{0}/{1}/hooks'.format(repos_template,
repository['full_name'])
try:
_backup_data(args,
'hooks',
template,
output_file,
hook_cwd)
except SystemExit:
log_info("Unable to read hooks, skipping")
def fetch_repository(name,
remote_url,
local_dir,
skip_existing=False,
bare_clone=False):
if bare_clone:
if os.path.exists(local_dir):
clone_exists = subprocess.check_output(['git',
'rev-parse',
'--is-bare-repository'],
cwd=local_dir) == "true\n"
else:
clone_exists = False
else:
clone_exists = os.path.exists(os.path.join(local_dir, '.git'))
if clone_exists and skip_existing:
return
masked_remote_url = mask_password(remote_url)
initialized = subprocess.call('git ls-remote ' + remote_url,
stdout=FNULL,
stderr=FNULL,
shell=True)
if initialized == 128:
log_info("Skipping {0} ({1}) since it's not initialized".format(
name, masked_remote_url))
return
if clone_exists:
log_info('Updating {0} in {1}'.format(name, local_dir))
git_command = ['git', 'remote', 'rm', 'origin']
logging_subprocess(git_command, None, cwd=local_dir)
git_command = ['git', 'remote', 'add', 'origin', remote_url]
logging_subprocess(git_command, None, cwd=local_dir)
git_command = ['git', 'fetch', '--all', '--tags', '--prune']
logging_subprocess(git_command, None, cwd=local_dir)
else:
log_info('Cloning {0} repository from {1} to {2}'.format(
name,
masked_remote_url,
local_dir))
if bare_clone:
git_command = ['git', 'clone', '--mirror', remote_url, local_dir]
else:
git_command = ['git', 'clone', remote_url, local_dir]
logging_subprocess(git_command, None)
def backup_account(args, output_directory):
account_cwd = os.path.join(output_directory, 'account')
if args.include_starred or args.include_everything:
output_file = '{0}/starred.json'.format(account_cwd)
template = "https://{0}/users/{1}/starred"
template = template.format(get_github_api_host(args), args.user)
_backup_data(args,
'starred repositories',
template,
output_file,
account_cwd)
if args.include_watched or args.include_everything:
output_file = '{0}/watched.json'.format(account_cwd)
template = "https://{0}/users/{1}/subscriptions"
template = template.format(get_github_api_host(args), args.user)
_backup_data(args,
'watched repositories',
template,
output_file,
account_cwd)
def _backup_data(args, name, template, output_file, output_directory):
skip_existing = args.skip_existing
if not skip_existing or not os.path.exists(output_file):
log_info('Retrieving {0} {1}'.format(args.user, name))
mkdir_p(output_directory)
data = retrieve_data(args, template)
log_info('Writing {0} {1} to disk'.format(len(data), name))
with codecs.open(output_file, 'w', encoding='utf-8') as f:
json_dump(data, f)
def json_dump(data, output_file):
json.dump(data,
output_file,
ensure_ascii=False,
sort_keys=True,
indent=4,
separators=(',', ': '))
def main():
args = parse_args()
output_directory = os.path.realpath(args.output_directory)
if not os.path.isdir(output_directory):
log_info('Create output directory {0}'.format(output_directory))
mkdir_p(output_directory)
log_info('Backing up user {0} to {1}'.format(args.user, output_directory))
repositories = retrieve_repositories(args)
repositories = filter_repositories(args, repositories)
backup_repositories(args, output_directory, repositories)
backup_account(args, output_directory)
if __name__ == '__main__':
main()