python-github-backup/bin/github-backup

#!/usr/bin/env python

from __future__ import print_function

import argparse
import base64
import calendar
import codecs
import errno
import getpass
import json
import logging
import os
import re
import select
import subprocess
import sys
import time
try:
    # python 3
    from urllib.parse import urlparse
    from urllib.parse import quote as urlquote
    from urllib.parse import urlencode
    from urllib.error import HTTPError, URLError
    from urllib.request import urlopen
    from urllib.request import Request
except ImportError:
    # python 2
    from urlparse import urlparse
    from urllib import quote as urlquote
    from urllib import urlencode
    from urllib2 import HTTPError, URLError
    from urllib2 import urlopen
    from urllib2 import Request

from github_backup import __version__

# Write-only handle on the null device, kept open for the life of the process;
# fetch_repository passes it as stdout/stderr to silence a git subprocess.
FNULL = open(os.devnull, 'w')


def log_error(message):
    """
    Write one or more error messages to stderr and terminate the program.

    ``message`` is either a single string or an iterable of messages; each
    entry is written on its own line. This function does not return: it
    always finishes with ``sys.exit(1)``.
    """
    # isinstance (rather than an exact type comparison) makes str subclasses
    # count as one message instead of being iterated character by character.
    if isinstance(message, str):
        message = [message]

    for msg in message:
        sys.stderr.write("{0}\n".format(msg))

    sys.exit(1)


def log_info(message):
    """
    Write one or more informational messages to stdout, one per line.

    ``message`` is either a single string or an iterable of messages.
    """
    # isinstance (rather than an exact type comparison) makes str subclasses
    # count as one message instead of being iterated character by character.
    if isinstance(message, str):
        message = [message]

    for msg in message:
        sys.stdout.write("{0}\n".format(msg))


def logging_subprocess(popenargs,
                       logger,
                       stdout_log_level=logging.DEBUG,
                       stderr_log_level=logging.ERROR,
                       **kwargs):
    """
    Run a command like subprocess.call, but hand its output to a logger.

    Lines read from the child's stdout are logged at ``stdout_log_level`` and
    lines read from its stderr at ``stderr_log_level``. When ``logger`` is
    falsy the output is still read from the pipes but thrown away. Any extra
    keyword arguments are forwarded to subprocess.Popen.

    Returns the child's exit code. A non-zero exit code is additionally
    reported on this process's stderr together with the command line.
    """
    proc = subprocess.Popen(popenargs,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            **kwargs)

    streams = [proc.stdout, proc.stderr]
    levels = dict(zip(streams, (stdout_log_level, stderr_log_level)))

    def pump_once():
        # Wait (with a 1000 second timeout) until a pipe is readable, then
        # consume a single line from each pipe that is ready.
        readable, _, _ = select.select(streams, [], [], 1000)
        for stream in readable:
            text = stream.readline()
            if not logger:
                continue
            # An empty read from stderr is skipped instead of being logged.
            if text or stream != proc.stderr:
                logger.log(levels[stream], text[:-1])

    # Keep pumping for as long as the child is alive ...
    while proc.poll() is None:
        pump_once()

    # ... and once more to pick up output produced right before it exited.
    pump_once()

    exit_code = proc.wait()

    if exit_code != 0:
        print('{} returned {}:'.format(popenargs[0], exit_code),
              file=sys.stderr)
        print('\t', ' '.join(popenargs), file=sys.stderr)

    return exit_code


def mkdir_p(*args):
    """
    Create every given directory, including any missing parent directories.

    A path that already exists as a directory is accepted silently; every
    other OSError raised by os.makedirs is propagated to the caller.
    """
    for directory in args:
        try:
            os.makedirs(directory)
        except OSError as err:
            already_there = (err.errno == errno.EEXIST and
                             os.path.isdir(directory))
            if not already_there:
                raise


def mask_password(url, secret='*****'):
    """
    Return ``url`` with its credential replaced by ``secret``.

    URLs without a password are returned unchanged. For token URLs (password
    part equal to ``x-oauth-basic``) the token sits in the username position,
    so the username is masked; otherwise the password is masked.

    Only the ``user:password@`` part of the URL is rewritten. A credential
    whose text also occurs in the host or path (for example a password of
    ``git``) therefore no longer corrupts the rest of the URL.
    """
    parsed = urlparse(url)

    if not parsed.password:
        return url

    # Split the network location the same way urlparse derives
    # username/password: last '@' separates userinfo, first ':' the password.
    userinfo, _, hostinfo = parsed.netloc.rpartition('@')
    username, _, password = userinfo.partition(':')

    if parsed.password == 'x-oauth-basic':
        masked_userinfo = secret + ':' + password
    else:
        masked_userinfo = username + ':' + secret

    # The netloc contains '@', so its first occurrence in the URL is the
    # network location itself; everything else is kept byte-for-byte.
    return url.replace(parsed.netloc, masked_userinfo + '@' + hostinfo, 1)


def parse_args(argv=None):
    """
    Build the command line parser and parse the arguments.

    ``argv`` is an optional list of argument strings. When it is None (the
    default) argparse reads the arguments from ``sys.argv``, which is what
    the script entry point relies on; passing a list allows the parser to be
    driven programmatically.

    Returns the populated argparse namespace.
    """
    parser = argparse.ArgumentParser(description='Backup a github account')
    parser.add_argument('user',
                        metavar='USER',
                        type=str,
                        help='github username')
    parser.add_argument('-u',
                        '--username',
                        dest='username',
                        help='username for basic auth')
    parser.add_argument('-p',
                        '--password',
                        dest='password',
                        help='password for basic auth. '
                             'If a username is given but not a password, the '
                             'password will be prompted for.')
    parser.add_argument('-t',
                        '--token',
                        dest='token',
                        help='personal access or OAuth token, or path to token (file://...)')  # noqa
    parser.add_argument('-o',
                        '--output-directory',
                        default='.',
                        dest='output_directory',
                        help='directory at which to backup the repositories')
    parser.add_argument('-i',
                        '--incremental',
                        action='store_true',
                        dest='incremental',
                        help='incremental backup')
    parser.add_argument('--starred',
                        action='store_true',
                        dest='include_starred',
                        help='include starred repositories in backup')
    parser.add_argument('--watched',
                        action='store_true',
                        dest='include_watched',
                        help='include watched repositories in backup')
    parser.add_argument('--all',
                        action='store_true',
                        dest='include_everything',
                        help='include everything in backup')
    parser.add_argument('--issues',
                        action='store_true',
                        dest='include_issues',
                        help='include issues in backup')
    parser.add_argument('--issue-comments',
                        action='store_true',
                        dest='include_issue_comments',
                        help='include issue comments in backup')
    parser.add_argument('--issue-events',
                        action='store_true',
                        dest='include_issue_events',
                        help='include issue events in backup')
    parser.add_argument('--pulls',
                        action='store_true',
                        dest='include_pulls',
                        help='include pull requests in backup')
    parser.add_argument('--pull-comments',
                        action='store_true',
                        dest='include_pull_comments',
                        help='include pull request review comments in backup')
    parser.add_argument('--pull-commits',
                        action='store_true',
                        dest='include_pull_commits',
                        help='include pull request commits in backup')
    parser.add_argument('--labels',
                        action='store_true',
                        dest='include_labels',
                        help='include labels in backup')
    parser.add_argument('--hooks',
                        action='store_true',
                        dest='include_hooks',
                        help='include hooks in backup (works only when authenticated)')  # noqa
    parser.add_argument('--milestones',
                        action='store_true',
                        dest='include_milestones',
                        help='include milestones in backup')
    parser.add_argument('--repositories',
                        action='store_true',
                        dest='include_repository',
                        help='include repository clone in backup')
    parser.add_argument('--bare',
                        action='store_true',
                        dest='bare_clone',
                        help='clone bare repositories')
    parser.add_argument('--wikis',
                        action='store_true',
                        dest='include_wiki',
                        help='include wiki clone in backup')
    parser.add_argument('--skip-existing',
                        action='store_true',
                        dest='skip_existing',
                        help='skip project if a backup directory exists')
    parser.add_argument('-L',
                        '--languages',
                        dest='languages',
                        help='only allow these languages',
                        nargs='*')
    parser.add_argument('-N',
                        '--name-regex',
                        dest='name_regex',
                        help='python regex to match names against')
    parser.add_argument('-H',
                        '--github-host',
                        dest='github_host',
                        help='GitHub Enterprise hostname')
    parser.add_argument('-O',
                        '--organization',
                        action='store_true',
                        dest='organization',
                        help='whether or not this is an organization user')
    parser.add_argument('-R',
                        '--repository',
                        dest='repository',
                        help='name of repository to limit backup to')
    parser.add_argument('-P', '--private',
                        action='store_true',
                        dest='private',
                        help='include private repositories')
    parser.add_argument('-F', '--fork',
                        action='store_true',
                        dest='fork',
                        help='include forked repositories')
    # No explicit dest: argparse derives ``prefer_ssh`` from the option name.
    parser.add_argument('--prefer-ssh',
                        action='store_true',
                        help='Clone repositories using SSH instead of HTTPS')
    parser.add_argument('-v', '--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    return parser.parse_args(argv)


def get_auth(args, encode=True):
    """
    Build the credential string for the parsed command line arguments.

    A token takes precedence over username/password. A token given as
    ``file://<path>`` is replaced on ``args`` by the first line of that file.
    When a username is given without a password, the password is prompted
    for and stored on ``args``.

    With ``encode=True`` the ``user:secret`` pair is returned base64-encoded
    (bytes), as used for the HTTP Basic Authorization header. With
    ``encode=False`` the plain pair is returned with the password URL-quoted,
    as used when embedding credentials in a clone URL.

    Returns None when no credentials were supplied. A password without a
    username is reported through log_error.
    """
    auth = None

    if args.token:
        _path_specifier = 'file://'
        if args.token.startswith(_path_specifier):
            # Read the token inside a context manager so the file handle is
            # closed right away instead of being left to the garbage
            # collector.
            token_path = args.token[len(_path_specifier):]
            with open(token_path, 'rt') as token_file:
                args.token = token_file.readline().strip()
        auth = args.token + ':' + 'x-oauth-basic'
    elif args.username:
        if not args.password:
            args.password = getpass.getpass()
        if encode:
            password = args.password
        else:
            password = urlquote(args.password)
        auth = args.username + ':' + password
    elif args.password:
        log_error('You must specify a username for basic auth')

    if not auth:
        return None

    if not encode:
        return auth

    return base64.b64encode(auth.encode('ascii'))


def get_github_api_host(args):
    """
    Return the API host (and path prefix) to send API requests to.

    A configured ``args.github_host`` yields ``<host>/api/v3``; otherwise the
    public ``api.github.com`` is used.
    """
    enterprise_host = args.github_host
    return enterprise_host + '/api/v3' if enterprise_host else 'api.github.com'


def get_github_host(args):
    """
    Return the host name used for repository URLs.

    This is ``args.github_host`` when one is configured and ``github.com``
    otherwise.
    """
    return args.github_host if args.github_host else 'github.com'


def get_github_repo_url(args, repository):
    """
    Pick the URL from which ``repository`` is cloned.

    With ``args.prefer_ssh`` the repository's ``ssh_url`` is returned.
    Otherwise, if credentials are available, an HTTPS URL embedding them is
    built from the host, ``args.user`` and the repository name; without
    credentials the repository's ``clone_url`` is returned.
    """
    if args.prefer_ssh:
        return repository['ssh_url']

    credentials = get_auth(args, False)
    if not credentials:
        return repository['clone_url']

    return 'https://{0}@{1}/{2}/{3}.git'.format(credentials,
                                                get_github_host(args),
                                                args.user,
                                                repository['name'])


def retrieve_data(args, template, query_args=None, single_request=False):
    """
    Fetch JSON data from the API URL ``template``, following pagination.

    Pages of 100 items are requested one after another. A list response is
    appended to the result, and paging stops once a page holds fewer than
    100 items. With ``single_request=True`` only one request is made, and a
    dict response is returned wrapped in a one-element list.

    ``query_args`` is an optional dict of extra query string parameters.
    A response with a status code other than 200 is reported through
    log_error.

    Returns the list of collected items.
    """
    auth = get_auth(args)
    query_args = get_query_args(query_args)
    per_page = 100
    page = 0
    data = []

    while True:
        page += 1
        request = _construct_request(per_page, page, query_args, template, auth)  # noqa
        r, errors = _get_response(request, auth, template)

        status_code = int(r.getcode())

        if status_code != 200:
            # Use a separate name so the request URL in ``template`` is not
            # overwritten by the message format.
            error_template = 'API request returned HTTP {0}: {1}'
            errors.append(error_template.format(status_code, r.reason))
            log_error(errors)

        response = json.loads(r.read().decode('utf-8'))

        if errors:
            log_error(errors)

        if isinstance(response, list):
            data.extend(response)
            if len(response) < per_page:
                break
        else:
            if isinstance(response, dict) and single_request:
                data.append(response)
            # A non-list body cannot be paged through; asking for the next
            # page would return the same kind of body again, forever.
            break

        if single_request:
            break

    return data


def get_query_args(query_args=None):
    """
    Normalise optional query arguments.

    Returns ``query_args`` itself when it is truthy and a new empty dict
    otherwise.
    """
    return query_args or {}


def _get_response(request, auth, template):
    """
    Open ``request``, retrying on rate limiting and on connection errors.

    An HTTPError is handed to _request_http_error, which decides whether the
    request should be retried; if not, the HTTPError itself is returned as
    the response object. A URLError is handed to _request_url_error together
    with the number of retries left; when that helper declines to retry, the
    URLError is re-raised.

    Returns a ``(response, errors)`` tuple.
    """
    retry_timeout = 3
    errors = []
    # We'll make requests in a loop so we can
    # delay and retry in the case of rate-limiting
    while True:
        should_continue = False
        try:
            r = urlopen(request)
        except HTTPError as exc:
            errors, should_continue = _request_http_error(exc, auth, errors)  # noqa
            r = exc
        except URLError:
            should_continue = _request_url_error(template, retry_timeout)
            # The helper receives the counter by value, so its own decrement
            # is not visible here. Count the attempt in this loop, otherwise
            # a permanently failing connection would be retried forever.
            retry_timeout -= 1
            if not should_continue:
                raise

        if not should_continue:
            break

    return r, errors


def _construct_request(per_page, page, query_args, template, auth):
    """
    Build the Request for one page of an API listing.

    The query string carries ``per_page`` and ``page`` followed by the
    entries of ``query_args`` (which win on duplicate keys). When ``auth`` is
    not None it is appended to ``Basic `` and sent as the Authorization
    header.
    """
    params = {'per_page': per_page, 'page': page}
    params.update(query_args)

    url = '?'.join((template, urlencode(params)))
    request = Request(url)

    if auth is not None:
        request.add_header('Authorization', b'Basic ' + auth)
    return request


def _request_http_error(exc, auth, errors):
    """
    Decide whether a request that failed with an HTTPError should be retried.

    A 403 response whose ``x-ratelimit-remaining`` header is below 1 is
    treated as rate limiting: the function sleeps until the time given by
    ``x-ratelimit-reset`` (at least 10 seconds) and asks for a retry. Any
    other failure is not retried.

    Returns ``(errors, should_continue)``; ``errors`` is passed through
    unchanged.
    """
    # HTTPError behaves like a response, so its status code and headers
    # tell us exactly what failed.
    headers = exc.headers
    limit_remaining = int(headers.get('x-ratelimit-remaining', 0))

    rate_limited = exc.code == 403 and limit_remaining < 1
    if not rate_limited:
        return errors, False

    # x-ratelimit-reset holds the timestamp at which the limit resets, so
    # the waiting time can be computed instead of polling.
    now = calendar.timegm(time.gmtime())
    reset_at = int(headers.get('x-ratelimit-reset', 0)) or now
    # Never sleep for less than 10 seconds.
    wait_seconds = max(10, reset_at - now)

    print('Exceeded rate limit of {} requests; waiting {} seconds to reset'.format(headers.get('x-ratelimit-limit'), wait_seconds),  # noqa
          file=sys.stderr)

    if auth is None:
        print('Hint: Authenticate to raise your GitHub rate limit',
              file=sys.stderr)

    time.sleep(wait_seconds)
    return errors, True


def _request_url_error(template, retry_timeout):
    """
    Report a failed connection to ``template`` and decide whether to retry.

    ``retry_timeout`` is the number of retries the caller has left. It is
    decremented only locally, so the caller must keep its own count.

    Returns True when another attempt should be made. Once the retries are
    used up, the failure is reported through log_error.
    """
    # In case of a connection timing out, we can retry a few times
    log_info('{} timed out'.format(template))
    retry_timeout -= 1

    if retry_timeout >= 0:
        return True

    # Fill in the URL; previously the literal "{}" placeholder was printed.
    log_error('{} timed out to much, skipping!'.format(template))
    return False


def retrieve_repositories(args):
    """
    Fetch the repository records that are candidates for the backup.

    With ``args.repository`` set, only that repository of ``args.user`` is
    requested (as a single request). Otherwise the organization's
    repositories are listed when ``args.organization`` is set, and the
    ``/user/repos`` listing is used in all other cases.
    """
    log_info('Retrieving repositories')
    api_host = get_github_api_host(args)

    if args.repository:
        # A single named repository takes precedence over any listing.
        url = 'https://{0}/repos/{1}/{2}'.format(api_host,
                                                 args.user,
                                                 args.repository)
        return retrieve_data(args, url, single_request=True)

    if args.organization:
        url = 'https://{0}/orgs/{1}/repos'.format(api_host, args.user)
    else:
        url = 'https://{0}/user/repos'.format(api_host)

    return retrieve_data(args, url, single_request=False)


def filter_repositories(args, unfiltered_repositories):
    """
    Reduce the repository list to the entries selected on the command line.

    Only repositories owned by ``args.user`` are kept. Forks and private
    repositories are dropped unless ``args.fork`` / ``args.private`` are set.
    ``args.languages`` (compared case-insensitively) and ``args.name_regex``
    (matched against the repository name) narrow the list further when given.
    """
    log_info('Filtering repositories')

    kept = [repo for repo in unfiltered_repositories
            if repo['owner']['login'] == args.user]

    pattern = re.compile(args.name_regex) if args.name_regex else None
    wanted_languages = ([lang.lower() for lang in args.languages]
                        if args.languages else None)

    if not args.fork:
        kept = [repo for repo in kept if not repo['fork']]
    if not args.private:
        kept = [repo for repo in kept if not repo['private']]
    if wanted_languages:
        kept = [repo for repo in kept
                if repo['language'] and
                repo['language'].lower() in wanted_languages]
    if pattern:
        kept = [repo for repo in kept if pattern.match(repo['name'])]

    return kept


def backup_repositories(args, output_directory, repositories):
    """
    Back up every repository in ``repositories`` below ``output_directory``.

    Depending on the command line flags this clones or updates the
    repository and its wiki, and saves issues, pull requests, milestones,
    labels and hooks. Each repository gets its own directory under
    ``<output_directory>/repositories/<name>``.

    In incremental mode ``args.since`` is set from the ``last_update`` file
    in ``output_directory`` (if present), and that file is rewritten at the
    end with the newest ``updated_at`` value among the repositories.
    Otherwise ``args.since`` is set to None.
    """
    log_info('Backing up repositories')
    repos_template = 'https://{0}/repos'.format(get_github_api_host(args))

    args.since = None
    last_update = None
    last_update_path = os.path.join(output_directory, 'last_update')
    if args.incremental:
        # max() raises ValueError on an empty sequence, so the newest
        # timestamp is only computed when there is something to back up.
        if repositories:
            last_update = max(repository['updated_at'] for repository in repositories)  # noqa
        if os.path.exists(last_update_path):
            with open(last_update_path) as marker:
                args.since = marker.read().strip()

    for repository in repositories:
        backup_cwd = os.path.join(output_directory, 'repositories')
        repo_cwd = os.path.join(backup_cwd, repository['name'])
        repo_dir = os.path.join(repo_cwd, 'repository')
        repo_url = get_github_repo_url(args, repository)

        if args.include_repository or args.include_everything:
            fetch_repository(repository['name'],
                             repo_url,
                             repo_dir,
                             skip_existing=args.skip_existing,
                             bare_clone=args.bare_clone)

        download_wiki = (args.include_wiki or args.include_everything)
        if repository['has_wiki'] and download_wiki:
            # Rewrite only the last ".git" of the URL. Replacing every
            # occurrence would mangle names such as "user.github.io".
            wiki_url = '.wiki.git'.join(repo_url.rsplit('.git', 1))
            fetch_repository(repository['name'],
                             wiki_url,
                             os.path.join(repo_cwd, 'wiki'),
                             skip_existing=args.skip_existing,
                             bare_clone=args.bare_clone)

        if args.include_issues or args.include_everything:
            backup_issues(args, repo_cwd, repository, repos_template)

        if args.include_pulls or args.include_everything:
            backup_pulls(args, repo_cwd, repository, repos_template)

        if args.include_milestones or args.include_everything:
            backup_milestones(args, repo_cwd, repository, repos_template)

        if args.include_labels or args.include_everything:
            backup_labels(args, repo_cwd, repository, repos_template)

        if args.include_hooks or args.include_everything:
            backup_hooks(args, repo_cwd, repository, repos_template)

    # last_update is only set in incremental mode with at least one
    # repository; in every other case the marker file is left alone.
    if last_update is not None:
        with open(last_update_path, 'w') as marker:
            marker.write(last_update)


def backup_issues(args, repo_cwd, repository, repos_template):
    """
    Save the issues of ``repository`` as one JSON file per issue.

    Open and closed issues are retrieved (restricted by ``args.since`` when
    set) and written to ``<repo_cwd>/issues/<number>.json``. Entries that are
    pull requests are left out when pull requests are backed up as well.
    Comments and events are attached to each issue when requested.

    Nothing is done when ``args.skip_existing`` is set and
    ``<repo_cwd>/issues/.git`` is an existing directory.
    """
    already_present = os.path.isdir('{0}/issues/.git'.format(repo_cwd))
    if args.skip_existing and already_present:
        return

    full_name = repository['full_name']
    log_info('Retrieving {0} issues'.format(full_name))
    issue_cwd = os.path.join(repo_cwd, 'issues')
    mkdir_p(repo_cwd, issue_cwd)

    issues_url = '{0}/{1}/issues'.format(repos_template, full_name)
    pulls_backed_up_separately = (args.include_pulls or
                                  args.include_everything)

    collected = {}
    skipped = 0
    for state in ('open', 'closed'):
        params = {'filter': 'all', 'state': state}
        if args.since:
            params['since'] = args.since

        for issue in retrieve_data(args, issues_url, query_args=params):
            # Pull requests are also returned as issues; leave them out
            # when they are going to be retrieved as pull requests anyway.
            if 'pull_request' in issue and pulls_backed_up_separately:
                skipped += 1
            else:
                collected[issue['number']] = issue

    skipped_note = ''
    if skipped:
        skipped_note = ' (skipped {0} pull requests)'.format(skipped)

    log_info('Saving {0} issues to disk{1}'.format(len(collected),
                                                   skipped_note))

    want_comments = args.include_issue_comments or args.include_everything
    want_events = args.include_issue_events or args.include_everything
    comments_url = issues_url + '/{0}/comments'
    events_url = issues_url + '/{0}/events'

    for number, issue in list(collected.items()):
        if want_comments:
            issue['comment_data'] = retrieve_data(
                args, comments_url.format(number))
        if want_events:
            issue['event_data'] = retrieve_data(
                args, events_url.format(number))

        target = '{0}/{1}.json'.format(issue_cwd, number)
        with codecs.open(target, 'w', encoding='utf-8') as handle:
            json_dump(issue, handle)


def backup_pulls(args, repo_cwd, repository, repos_template):
    """
    Save the pull requests of ``repository`` as one JSON file per pull.

    Open and closed pull requests are retrieved and written to
    ``<repo_cwd>/pulls/<number>.json``. When ``args.since`` is set, only
    pull requests whose ``updated_at`` is not older than it are kept. Review
    comments and commits are attached to each pull request when requested.

    Nothing is done when ``args.skip_existing`` is set and
    ``<repo_cwd>/pulls/.git`` is an existing directory.
    """
    already_present = os.path.isdir('{0}/pulls/.git'.format(repo_cwd))
    if args.skip_existing and already_present:
        return

    full_name = repository['full_name']
    log_info('Retrieving {0} pull requests'.format(full_name))  # noqa
    pulls_cwd = os.path.join(repo_cwd, 'pulls')
    mkdir_p(repo_cwd, pulls_cwd)

    pulls_url = '{0}/{1}/pulls'.format(repos_template, full_name)

    collected = {}
    for state in ('open', 'closed'):
        params = {
            'filter': 'all',
            'state': state,
            'sort': 'updated',
            'direction': 'desc',
        }

        # The since filter is not sent with the request, so it is applied
        # to the returned pull requests instead.
        for pull in retrieve_data(args, pulls_url, query_args=params):
            if not args.since or pull['updated_at'] >= args.since:
                collected[pull['number']] = pull

    log_info('Saving {0} pull requests to disk'.format(len(collected)))

    want_comments = args.include_pull_comments or args.include_everything
    want_commits = args.include_pull_commits or args.include_everything
    comments_url = pulls_url + '/{0}/comments'
    commits_url = pulls_url + '/{0}/commits'

    for number, pull in list(collected.items()):
        if want_comments:
            pull['comment_data'] = retrieve_data(
                args, comments_url.format(number))
        if want_commits:
            pull['commit_data'] = retrieve_data(
                args, commits_url.format(number))

        target = '{0}/{1}.json'.format(pulls_cwd, number)
        with codecs.open(target, 'w', encoding='utf-8') as handle:
            json_dump(pull, handle)


def backup_milestones(args, repo_cwd, repository, repos_template):
    """
    Save the milestones of ``repository`` as one JSON file per milestone.

    Milestones in every state are retrieved and written to
    ``<repo_cwd>/milestones/<number>.json``. Nothing is done when
    ``args.skip_existing`` is set and the milestones directory exists.
    """
    milestone_cwd = os.path.join(repo_cwd, 'milestones')
    if args.skip_existing and os.path.isdir(milestone_cwd):
        return

    full_name = repository['full_name']
    log_info('Retrieving {0} milestones'.format(full_name))
    mkdir_p(repo_cwd, milestone_cwd)

    url = '{0}/{1}/milestones'.format(repos_template, full_name)
    fetched = retrieve_data(args, url, query_args={'state': 'all'})

    # Index by number; a later entry with the same number replaces an
    # earlier one.
    by_number = {}
    for entry in fetched:
        by_number[entry['number']] = entry

    log_info('Saving {0} milestones to disk'.format(len(by_number)))
    for number, entry in list(by_number.items()):
        target = '{0}/{1}.json'.format(milestone_cwd, number)
        with codecs.open(target, 'w', encoding='utf-8') as handle:
            json_dump(entry, handle)


def backup_labels(args, repo_cwd, repository, repos_template):
    """
    Save the labels of ``repository`` to ``<repo_cwd>/labels/labels.json``.
    """
    label_cwd = os.path.join(repo_cwd, 'labels')
    labels_url = '{0}/{1}/labels'.format(repos_template,
                                         repository['full_name'])
    labels_file = '{0}/labels.json'.format(label_cwd)

    _backup_data(args, 'labels', labels_url, labels_file, label_cwd)


def backup_hooks(args, repo_cwd, repository, repos_template):
    """
    Save the hooks of ``repository`` to ``<repo_cwd>/hooks/hooks.json``.

    Hooks are skipped when no credentials are available. A SystemExit raised
    while retrieving them is caught and reported, so the backup continues.
    """
    if not get_auth(args):
        log_info("Skipping hooks since no authentication provided")
        return

    hook_cwd = os.path.join(repo_cwd, 'hooks')
    hooks_file = '{0}/hooks.json'.format(hook_cwd)
    hooks_url = '{0}/{1}/hooks'.format(repos_template,
                                       repository['full_name'])

    try:
        _backup_data(args, 'hooks', hooks_url, hooks_file, hook_cwd)
    except SystemExit:
        log_info("Unable to read hooks, skipping")

def fetch_repository(name,
                     remote_url,
                     local_dir,
                     skip_existing=False,
                     bare_clone=False):
    """
    Clone ``remote_url`` into ``local_dir``, or update an existing clone.

    ``name`` is only used in log messages. With ``bare_clone`` a mirror
    clone is created and an existing clone is detected by asking git whether
    ``local_dir`` is a bare repository; otherwise a ``.git`` entry inside
    ``local_dir`` marks an existing clone.

    An existing clone is left untouched when ``skip_existing`` is set;
    otherwise its ``origin`` remote is re-created and everything is fetched.
    A remote for which ``git ls-remote`` exits with status 128 is reported
    as not initialized and skipped.
    """
    if bare_clone:
        clone_exists = False
        if os.path.exists(local_dir):
            try:
                is_bare = subprocess.check_output(['git',
                                                   'rev-parse',
                                                   '--is-bare-repository'],
                                                  cwd=local_dir)
            except subprocess.CalledProcessError:
                # git fails when local_dir is not inside a repository;
                # treat that as "no clone yet" instead of crashing.
                is_bare = b''
            # check_output returns bytes on Python 3, so comparing against
            # a text string would never be equal there.
            clone_exists = is_bare.strip() == b'true'
    else:
        clone_exists = os.path.exists(os.path.join(local_dir, '.git'))

    if clone_exists and skip_existing:
        return

    masked_remote_url = mask_password(remote_url)

    # Pass an argument list rather than a shell string so that characters
    # in the URL are never interpreted by a shell.
    initialized = subprocess.call(['git', 'ls-remote', remote_url],
                                  stdout=FNULL,
                                  stderr=FNULL)
    if initialized == 128:
        log_info("Skipping {0} ({1}) since it's not initialized".format(
            name, masked_remote_url))
        return

    if clone_exists:
        log_info('Updating {0} in {1}'.format(name, local_dir))
        # Re-create the remote so the URL in use is always the current one.
        git_command = ['git', 'remote', 'rm', 'origin']
        logging_subprocess(git_command, None, cwd=local_dir)
        git_command = ['git', 'remote', 'add', 'origin', remote_url]
        logging_subprocess(git_command, None, cwd=local_dir)
        git_command = ['git', 'fetch', '--all', '--tags', '--prune']
        logging_subprocess(git_command, None, cwd=local_dir)
    else:
        log_info('Cloning {0} repository from {1} to {2}'.format(
            name,
            masked_remote_url,
            local_dir))
        if bare_clone:
            git_command = ['git', 'clone', '--mirror', remote_url, local_dir]
        else:
            git_command = ['git', 'clone', remote_url, local_dir]
        logging_subprocess(git_command, None)


def backup_account(args, output_directory):
    """
    Save account-level data below ``<output_directory>/account``.

    The starred repositories go to ``starred.json`` and the watched
    repositories to ``watched.json``, each only when its flag or
    ``args.include_everything`` is set.
    """
    account_cwd = os.path.join(output_directory, 'account')

    # (flag on args, output file pattern, API URL pattern, display name)
    sections = (
        ('include_starred',
         '{0}/starred.json',
         "https://{0}/users/{1}/starred",
         'starred repositories'),
        ('include_watched',
         '{0}/watched.json',
         "https://{0}/users/{1}/subscriptions",
         'watched repositories'),
    )

    for flag, file_pattern, url_pattern, label in sections:
        if not (getattr(args, flag) or args.include_everything):
            continue

        output_file = file_pattern.format(account_cwd)
        template = url_pattern.format(get_github_api_host(args), args.user)
        _backup_data(args, label, template, output_file, account_cwd)


def _backup_data(args, name, template, output_file, output_directory):
    """
    Retrieve the data behind ``template`` and write it to ``output_file``.

    ``name`` is only used in log messages. ``output_directory`` is created
    before the download. Nothing is done when ``args.skip_existing`` is set
    and ``output_file`` already exists.
    """
    if args.skip_existing and os.path.exists(output_file):
        return

    log_info('Retrieving {0} {1}'.format(args.user, name))
    mkdir_p(output_directory)
    payload = retrieve_data(args, template)

    log_info('Writing {0} {1} to disk'.format(len(payload), name))
    with codecs.open(output_file, 'w', encoding='utf-8') as handle:
        json_dump(payload, handle)


def json_dump(data, output_file):
    """
    Serialise ``data`` as JSON into the open file object ``output_file``.

    The output keeps non-ASCII characters as they are, sorts keys and is
    indented by four spaces.
    """
    options = {
        'ensure_ascii': False,
        'sort_keys': True,
        'indent': 4,
        'separators': (',', ': '),
    }
    json.dump(data, output_file, **options)


def main():
    """
    Script entry point: parse the command line and run the backup.

    The output directory is created when missing. The repositories are then
    retrieved, filtered and backed up, followed by the account data.
    """
    args = parse_args()

    target_dir = os.path.realpath(args.output_directory)
    if not os.path.isdir(target_dir):
        log_info('Create output directory {0}'.format(target_dir))
        mkdir_p(target_dir)

    log_info('Backing up user {0} to {1}'.format(args.user, target_dir))

    selected = filter_repositories(args, retrieve_repositories(args))
    backup_repositories(args, target_dir, selected)
    backup_account(args, target_dir)


# Run the backup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()