Refactor to simplify the code path and follow PEP 8 standards

2026-03-07 19:06:53 +01:00 · 2015-10-10 00:16:30 -04:00
parent 6193efb798
commit 708b377918
1 changed files with 410 additions and 216 deletions
--- a/bin/github-backup
+++ b/bin/github-backup
@@ -22,6 +22,7 @@ from github_backup import __version__

 FNULL = open(os.devnull, 'w')

+
 def log_error(message):
    if type(message) == str:
        message = [message]
@@ -40,7 +41,11 @@ def log_info(message):
        sys.stdout.write("{0}\n".format(msg))


-def logging_subprocess(popenargs, logger, stdout_log_level=logging.DEBUG, stderr_log_level=logging.ERROR, **kwargs):
+def logging_subprocess(popenargs,
+                       logger,
+                       stdout_log_level=logging.DEBUG,
+                       stderr_log_level=logging.ERROR,
+                       **kwargs):
    """
    Variant of subprocess.call that accepts a logger instead of stdout/stderr,
    and logs stdout messages via logger.debug and stderr messages via
@@ -53,7 +58,10 @@ def logging_subprocess(popenargs, logger, stdout_log_level=logging.DEBUG, stderr
                 child.stderr: stderr_log_level}

    def check_io():
-        ready_to_read = select.select([child.stdout, child.stderr], [], [], 1000)[0]
+        ready_to_read = select.select([child.stdout, child.stderr],
+                                      [],
+                                      [],
+                                      1000)[0]
        for io in ready_to_read:
            line = io.readline()
            if not logger:
@@ -88,35 +96,121 @@ def mkdir_p(*args):


 def parse_args():
-    parser = argparse.ArgumentParser(description='Backup a github users account', prog='Github Backup')
-    parser.add_argument('user', metavar='USER', type=str, help='github username')
-    parser.add_argument('-u', '--username', dest='username', help='username for basic auth')
-    parser.add_argument('-p', '--password', dest='password', help='password for basic auth')
-    parser.add_argument('-t', '--token', dest='token', help='personal access or OAuth token')
-    parser.add_argument('-o', '--output-directory', default='.', dest='output_directory', help='directory at which to backup the repositories')
-    parser.add_argument('--starred', action='store_true', dest='include_starred', help='include starred repositories in backup')
-    parser.add_argument('--watched', action='store_true', dest='include_watched', help='include watched repositories in backup')
-    parser.add_argument('--all', action='store_true', dest='include_everything', help='include everything in backup')
-    parser.add_argument('--issues', action='store_true', dest='include_issues', help='include issues in backup')
-    parser.add_argument('--issue-comments', action='store_true', dest='include_issue_comments', help='include issue comments in backup')
-    parser.add_argument('--issue-events', action='store_true', dest='include_issue_events', help='include issue events in backup')
-    parser.add_argument('--pulls', action='store_true', dest='include_pulls', help='include pull requests in backup')
-    parser.add_argument('--pull-comments', action='store_true', dest='include_pull_comments', help='include pull request review comments in backup')
-    parser.add_argument('--pull-commits', action='store_true', dest='include_pull_commits', help='include pull request commits in backup')
-    parser.add_argument('--labels', action='store_true', dest='include_labels', help='include labels in backup')
-    parser.add_argument('--milestones', action='store_true', dest='include_milestones', help='include milestones in backup')
-    parser.add_argument('--repositories', action='store_true', dest='include_repository', help='include repository clone in backup')
-    parser.add_argument('--wikis', action='store_true', dest='include_wiki', help='include wiki clone in backup')
-    parser.add_argument('--skip-existing', action='store_true', dest='skip_existing', help='skip project if a backup directory exists')
-    parser.add_argument('-L', '--languages', dest='languages', help='only allow these languages', nargs='*')
-    parser.add_argument('-N', '--name-regex', dest='name_regex', help='python regex to match names against')
-    parser.add_argument('-H', '--github-host', dest='github_host', help='GitHub Enterprise hostname')
-    parser.add_argument('-O', '--organization', action='store_true', dest='organization', help='whether or not this is a query for an organization')
-    parser.add_argument('-R', '--repository', dest='repository', help='name of repository to limit backup to')
-    parser.add_argument('-P', '--private', action='store_true', dest='private', help='include private repositories')
-    parser.add_argument('-F', '--fork', action='store_true', dest='fork', help='include forked repositories')
-    parser.add_argument('--prefer-ssh', action='store_true', help='Clone repositories using SSH instead of HTTPS')
-    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)
+    parser = argparse.ArgumentParser(description='Backup a github account',
+                                     prog='Github Backup')
+    parser.add_argument('user',
+                        metavar='USER',
+                        type=str,
+                        help='github username')
+    parser.add_argument('-u',
+                        '--username',
+                        dest='username',
+                        help='username for basic auth')
+    parser.add_argument('-p',
+                        '--password',
+                        dest='password',
+                        help='password for basic auth')
+    parser.add_argument('-t',
+                        '--token',
+                        dest='token',
+                        help='personal access or OAuth token')
+    parser.add_argument('-o',
+                        '--output-directory',
+                        default='.',
+                        dest='output_directory',
+                        help='directory at which to backup the repositories')
+    parser.add_argument('--starred',
+                        action='store_true',
+                        dest='include_starred',
+                        help='include starred repositories in backup')
+    parser.add_argument('--watched',
+                        action='store_true',
+                        dest='include_watched',
+                        help='include watched repositories in backup')
+    parser.add_argument('--all',
+                        action='store_true',
+                        dest='include_everything',
+                        help='include everything in backup')
+    parser.add_argument('--issues',
+                        action='store_true',
+                        dest='include_issues',
+                        help='include issues in backup')
+    parser.add_argument('--issue-comments',
+                        action='store_true',
+                        dest='include_issue_comments',
+                        help='include issue comments in backup')
+    parser.add_argument('--issue-events',
+                        action='store_true',
+                        dest='include_issue_events',
+                        help='include issue events in backup')
+    parser.add_argument('--pulls',
+                        action='store_true',
+                        dest='include_pulls',
+                        help='include pull requests in backup')
+    parser.add_argument('--pull-comments',
+                        action='store_true',
+                        dest='include_pull_comments',
+                        help='include pull request review comments in backup')
+    parser.add_argument('--pull-commits',
+                        action='store_true',
+                        dest='include_pull_commits',
+                        help='include pull request commits in backup')
+    parser.add_argument('--labels',
+                        action='store_true',
+                        dest='include_labels',
+                        help='include labels in backup')
+    parser.add_argument('--milestones',
+                        action='store_true',
+                        dest='include_milestones',
+                        help='include milestones in backup')
+    parser.add_argument('--repositories',
+                        action='store_true',
+                        dest='include_repository',
+                        help='include repository clone in backup')
+    parser.add_argument('--wikis',
+                        action='store_true',
+                        dest='include_wiki',
+                        help='include wiki clone in backup')
+    parser.add_argument('--skip-existing',
+                        action='store_true',
+                        dest='skip_existing',
+                        help='skip project if a backup directory exists')
+    parser.add_argument('-L',
+                        '--languages',
+                        dest='languages',
+                        help='only allow these languages',
+                        nargs='*')
+    parser.add_argument('-N',
+                        '--name-regex',
+                        dest='name_regex',
+                        help='python regex to match names against')
+    parser.add_argument('-H',
+                        '--github-host',
+                        dest='github_host',
+                        help='GitHub Enterprise hostname')
+    parser.add_argument('-O',
+                        '--organization',
+                        action='store_true',
+                        dest='organization',
+                        help='whether or not this is an organization user')
+    parser.add_argument('-R',
+                        '--repository',
+                        dest='repository',
+                        help='name of repository to limit backup to')
+    parser.add_argument('-P', '--private',
+                        action='store_true',
+                        dest='private',
+                        help='include private repositories')
+    parser.add_argument('-F', '--fork',
+                        action='store_true',
+                        dest='fork',
+                        help='include forked repositories')
+    parser.add_argument('--prefer-ssh',
+                        action='store_true',
+                        help='Clone repositories using SSH instead of HTTPS')
+    parser.add_argument('-v', '--version',
+                        action='version',
+                        version='%(prog)s ' + __version__)
    return parser.parse_args()


@@ -127,12 +221,13 @@ def get_auth(args):
    elif args.username and args.password:
        auth = base64.b64encode(args.username + ':' + args.password)
    elif args.username and not args.password:
-        log_error('You must specify a password for basic auth when specifying a username')
+        log_error('You must specify a password for basic auth')
    elif args.password and not args.username:
-        log_error('You must specify a username for basic auth when specifying a password')
+        log_error('You must specify a username for basic auth')

    return auth

+
 def get_github_api_host(args):
    if args.github_host:
        host = args.github_host + '/api/v3'
@@ -141,6 +236,7 @@ def get_github_api_host(args):

    return host

+
 def get_github_ssh_host(args):
    if args.github_host:
        host = args.github_host
@@ -149,74 +245,24 @@ def get_github_ssh_host(args):

    return host

+
 def retrieve_data(args, template, query_args=None, single_request=False):
    auth = get_auth(args)
+    query_args = get_query_args(query_args)
    per_page = 100
    page = 0
    data = []
-    if not query_args:
-        query_args = {}

    while True:
        page = page + 1
-        querystring = urllib.urlencode(dict({
-            'per_page': per_page,
-            'page': page
-        }.items() + query_args.items()))
-
-        request = urllib2.Request(template + '?' + querystring)
-        if auth is not None:
-            request.add_header('Authorization', 'Basic ' + auth)
-
-        errors = []
-        retry_timeout = 3
-
-        # We'll make requests in a loop so we can delay and retry in the case of rate-limiting
-        while True:
-            try:
-                r = urllib2.urlopen(request)
-            except urllib2.HTTPError as exc:
-                # HTTPError behaves like a Response so we can check the status code and headers to see exactly
-                # what failed.
-
-                limit_remaining = int(exc.headers.get('x-ratelimit-remaining', 0))
-
-                if exc.code == 403 and limit_remaining < 1:
-                    # The X-RateLimit-Reset header includes a timestamp telling us when the limit will reset
-                    # so we can calculate how long to wait rather than inefficiently polling:
-                    gm_now = calendar.timegm(time.gmtime())
-                    reset = int(exc.headers.get('x-ratelimit-reset', 0)) or gm_now
-                    # We'll never sleep for less than 10 seconds:
-                    delta = max(10, reset - gm_now)
-
-                    limit = exc.headers.get('x-ratelimit-limit')
-                    print('Exceeded rate limit of {} requests; waiting {} seconds to reset'.format(limit, delta),
-                          file=sys.stderr)
-
-                    ratelimit_error = 'No more requests remaining'
-                    if auth is None:
-                        ratelimit_error = ratelimit_error + '; authenticate to raise your GitHub rate limit'
-                    errors.append(ratelimit_error)
-
-                    time.sleep(delta)
-                    continue
-            except urllib2.URLError:
-                # Incase of a connection timing out, we can retry a few time
-                # But we won't crash and not back-up the rest now
-                log_info('{} timed out'.format(template))
-                retry_timeout -= 1
-
-                if retry_timeout >= 0:
-                    continue
-                
-                log_error('{} timed out to much, skipping!')
-
-            break
+        request = _construct_request(per_page, page, query_args, template, auth)  # noqa
+        r, errors = _get_response(request, template)

        status_code = int(r.getcode())

        if status_code != 200:
-            errors.append('API request returned HTTP {}: {}'.format(status_code, r.reason))
+            template = 'API request returned HTTP {0}: {1}'
+            errors.append(template.format(status_code, r.reason))
            log_error(errors)

        response = json.loads(r.read())
@@ -237,16 +283,108 @@ def retrieve_data(args, template, query_args=None, single_request=False):
    return data


+def get_query_args(query_args=None):
+    if not query_args:
+        query_args = {}
+    return query_args
+
+
+def _get_response(request, template):
+    retry_timeout = 3
+    errors = []
+    # We'll make requests in a loop so we can
+    # delay and retry in the case of rate-limiting
+    while True:
+        should_continue = False
+        try:
+            r = urllib2.urlopen(request)
+        except urllib2.HTTPError as exc:
+            errors, should_continue = _request_http_error(exc, auth, errors)  # noqa
+        except urllib2.URLError:
+            should_continue = _request_url_error(template, retry_timeout)
+
+        if should_continue:
+            continue
+
+        break
+    return r, errors
+
+
+def _construct_request(per_page, page, query_args, template, auth):
+    querystring = urllib.urlencode(dict({
+        'per_page': per_page,
+        'page': page
+    }.items() + query_args.items()))
+
+    request = urllib2.Request(template + '?' + querystring)
+    if auth is not None:
+        request.add_header('Authorization', 'Basic ' + auth)
+    return request
+
+
+def _request_http_error(exc, auth, errors):
+    # HTTPError behaves like a Response so we can
+    # check the status code and headers to see exactly
+    # what failed.
+
+    should_continue = False
+    headers = exc.headers
+    limit_remaining = int(headers.get('x-ratelimit-remaining', 0))
+
+    if exc.code == 403 and limit_remaining < 1:
+        # The X-RateLimit-Reset header includes a
+        # timestamp telling us when the limit will reset
+        # so we can calculate how long to wait rather
+        # than inefficiently polling:
+        gm_now = calendar.timegm(time.gmtime())
+        reset = int(headers.get('x-ratelimit-reset', 0)) or gm_now
+        # We'll never sleep for less than 10 seconds:
+        delta = max(10, reset - gm_now)
+
+        limit = headers.get('x-ratelimit-limit')
+        print('Exceeded rate limit of {} requests; waiting {} seconds to reset'.format(limit, delta),  # noqa
+              file=sys.stderr)
+
+        ratelimit_error = 'No more requests remaining'
+        if auth is None:
+            ratelimit_error += '; authenticate to raise your GitHub rate limit'  # noqa
+        errors.append(ratelimit_error)
+
+        time.sleep(delta)
+        should_continue = True
+    return errors, should_continue
+
+
+def _request_url_error(template, retry_timeout):
+    # Incase of a connection timing out, we can retry a few time
+    # But we won't crash and not back-up the rest now
+    log_info('{} timed out'.format(template))
+    retry_timeout -= 1
+
+    if retry_timeout >= 0:
+        return True
+
+    log_error('{} timed out to much, skipping!')
+    return False
+
+
 def retrieve_repositories(args):
    log_info('Retrieving repositories')
    single_request = False
-    template = 'https://{0}/users/{1}/repos'.format(get_github_api_host(args), args.user)
+    template = 'https://{0}/users/{1}/repos'.format(
+        get_github_api_host(args),
+        args.user)
    if args.organization:
-        template = 'https://{0}/orgs/{1}/repos'.format(get_github_api_host(args), args.user)
+        template = 'https://{0}/orgs/{1}/repos'.format(
+            get_github_api_host(args),
+            args.user)

    if args.repository:
        single_request = True
-        template = 'https://{0}/repos/{1}/{2}'.format(get_github_api_host(args), args.user, args.repository)
+        template = 'https://{0}/repos/{1}/{2}'.format(
+            get_github_api_host(args),
+            args.user,
+            args.repository)

    return retrieve_data(args, template, single_request=single_request)

@@ -266,7 +404,7 @@ def filter_repositories(args, repositories):
    if not args.private:
        repositories = [r for r in repositories if not r['private']]
    if languages:
-        repositories = [r for r in repositories if r['language'] and r['language'].lower() in languages]
+        repositories = [r for r in repositories if r['language'] and r['language'].lower() in languages]  # noqa
    if name_regex:
        repositories = [r for r in repositories if name_regex.match(r['name'])]

@@ -277,9 +415,6 @@ def backup_repositories(args, output_directory, repositories):
    log_info('Backing up repositories')
    repos_template = 'https://{0}/repos'.format(get_github_api_host(args))

-    issue_states = ['open', 'closed']
-    pull_states = ['open', 'closed']
-
    for repository in repositories:
        backup_cwd = os.path.join(output_directory, 'repositories')
        repo_cwd = os.path.join(backup_cwd, repository['name'])
@@ -288,123 +423,156 @@ def backup_repositories(args, output_directory, repositories):
        if args.prefer_ssh:
            repo_url = repository['ssh_url']
        else:
-            repo_url = repository['git_url']
+            repo_url = repository['clone_url']

        if args.include_repository or args.include_everything:
-            fetch_repository(repository['name'], repo_url, repo_dir, skip_existing=args.skip_existing)
+            fetch_repository(repository['name'],
+                             repo_url,
+                             repo_dir,
+                             skip_existing=args.skip_existing)

-        if repository['has_wiki'] and (args.include_wiki or args.include_everything):
+        download_wiki = (args.include_wiki or args.include_everything)
+        if repository['has_wiki'] and download_wiki:
            fetch_repository(repository['name'],
                             repo_url.replace('.git', '.wiki.git'),
                             os.path.join(repo_cwd, 'wiki'),
                             skip_existing=args.skip_existing)

        if args.include_issues or args.include_everything:
-            if args.skip_existing and os.path.isdir('{0}/issues/.git'.format(repo_cwd)):
-                continue
-
-            log_info('Retrieving {0} issues'.format(repository['full_name']))
-            issue_cwd = os.path.join(repo_cwd, 'issues')
-            mkdir_p(backup_cwd, repo_cwd, issue_cwd)
-
-            issues = {}
-            _issue_template = '{0}/{1}/issues'.format(repos_template, repository['full_name'])
-
-            for issue_state in issue_states:
-                query_args = {
-                    'filter': 'all',
-                    'state': issue_state
-                }
-
-                _issues = retrieve_data(args, _issue_template, query_args=query_args)
-                for issue in _issues:
-                    issues[issue['number']] = issue
-
-            log_info('Saving {0} issues to disk'.format(len(issues.keys())))
-            for number, issue in issues.iteritems():
-                comments_template = _issue_template + '/{0}/comments'
-                events_template = _issue_template + '/{0}/events'
-                if args.include_issue_comments or args.include_everything:
-                    issues[number]['comment_data'] = retrieve_data(args, comments_template.format(number))
-                if args.include_issue_events or args.include_everything:
-                    issues[number]['event_data'] = retrieve_data(args, events_template.format(number))
-
-                with codecs.open('{0}/{1}.json'.format(issue_cwd, number), 'w', encoding='utf-8') as issue_file:
-                    json.dump(issue, issue_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))                    
+            backup_issues(args, repo_cwd, repository, repos_template)

        if args.include_pulls or args.include_everything:
-            if args.skip_existing and os.path.isdir('{0}/pulls/.git'.format(repo_cwd)):
-                continue
-
-            log_info('Retrieving {0} pull requests'.format(repository['full_name']))
-            pulls_cwd = os.path.join(repo_cwd, 'pulls')
-            mkdir_p(backup_cwd, repo_cwd, pulls_cwd)
-
-            pulls = {}
-            _pulls_template = '{0}/{1}/pulls'.format(repos_template, repository['full_name'])
-
-            for pull_state in pull_states:
-                query_args = {
-                    'filter': 'all',
-                    'state': pull_state
-                }
-
-                _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
-                for pull in _pulls:
-                    pulls[pull['number']] = pull
-
-            log_info('Saving {0} pull requests to disk'.format(len(pulls.keys())))
-            for number, pull in pulls.iteritems():
-                comments_template = _pulls_template + '/{0}/comments'
-                commits_template = _pulls_template + '/{0}/commits'
-                if args.include_pull_comments or args.include_everything:
-                    pulls[number]['comment_data'] = retrieve_data(args, comments_template.format(number))
-                if args.include_pull_commits or args.include_everything:
-                    pulls[number]['commit_data'] = retrieve_data(args, commits_template.format(number))
-
-                with codecs.open('{0}/{1}.json'.format(pulls_cwd, number), 'w', encoding='utf-8') as pull_file:
-                    json.dump(pull, pull_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
+            backup_pulls(args, repo_cwd, repository, repos_template)

        if args.include_milestones or args.include_everything:
-            if args.skip_existing and os.path.isdir('{0}/milestones/.git'.format(repo_cwd)):
-                continue
-
-            log_info('Retrieving {0} milestones'.format(repository['full_name']))
-            milestone_cwd = os.path.join(repo_cwd, 'milestones')
-            mkdir_p(backup_cwd, repo_cwd, milestone_cwd)
-
-            milestones = {}
-            _milestone_template = '{0}/{1}/milestones'.format(repos_template, repository['full_name'])
-
-            query_args = {
-                'state': 'all'
-            }
-
-            _milestones = retrieve_data(args, _milestone_template, query_args=query_args)
-
-            for milestone in _milestones:
-                milestones[milestone['number']] = milestone
-
-            log_info('Saving {0} milestones to disk'.format(len(milestones.keys())))
-            for number, milestone in milestones.iteritems():
-                with codecs.open('{0}/{1}.json'.format(milestone_cwd, number), 'w', encoding='utf-8') as milestone_file:
-                    json.dump(milestone, milestone_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
+            backup_milestones(args, repo_cwd, repository, repos_template)

        if args.include_labels or args.include_everything:
-            if args.skip_existing and os.path.isdir('{0}/labels/.git'.format(repo_cwd)):
-                continue
+            backup_labels(args, repo_cwd, repository, repos_template)

-            log_info('Retrieving {0} labels'.format(repository['full_name']))
-            label_cwd = os.path.join(repo_cwd, 'labels')
-            mkdir_p(backup_cwd, repo_cwd, label_cwd)

-            _label_template = '{0}/{1}/labels'.format(repos_template, repository['full_name'])
+def backup_issues(args, repo_cwd, repository, repos_template):
+    has_issues_dir = os.path.isdir('{0}/issues/.git'.format(repo_cwd))
+    if args.skip_existing and has_issues_dir:
+        return

-            labels = retrieve_data(args, _label_template, query_args={})
+    log_info('Retrieving {0} issues'.format(repository['full_name']))
+    issue_cwd = os.path.join(repo_cwd, 'issues')
+    mkdir_p(repo_cwd, issue_cwd)

-            log_info('Saving {0} labels to disk'.format(len(labels)))
-            with codecs.open('{0}/labels.json'.format(label_cwd), 'w', encoding='utf-8') as label_file:
-                json.dump(labels, label_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
+    issues = {}
+    _issue_template = '{0}/{1}/issues'.format(repos_template,
+                                              repository['full_name'])
+
+    issue_states = ['open', 'closed']
+    for issue_state in issue_states:
+        query_args = {
+            'filter': 'all',
+            'state': issue_state
+        }
+
+        _issues = retrieve_data(args,
+                                _issue_template,
+                                query_args=query_args)
+        for issue in _issues:
+            issues[issue['number']] = issue
+
+    log_info('Saving {0} issues to disk'.format(len(issues.keys())))
+    comments_template = _issue_template + '/{0}/comments'
+    events_template = _issue_template + '/{0}/events'
+    for number, issue in issues.iteritems():
+        if args.include_issue_comments or args.include_everything:
+            template = comments_template.format(number)
+            issues[number]['comment_data'] = retrieve_data(args, template)
+        if args.include_issue_events or args.include_everything:
+            template = events_template.format(number)
+            issues[number]['event_data'] = retrieve_data(args, template)
+
+        issue_file = '{0}/{1}.json'.format(issue_cwd, number)
+        with codecs.open(issue_file, 'w', encoding='utf-8') as f:
+            json_dump(issue, f)
+
+
+def backup_pulls(args, repo_cwd, repository, repos_template):
+    has_pulls_dir = os.path.isdir('{0}/pulls/.git'.format(repo_cwd))
+    if args.skip_existing and has_pulls_dir:
+        return
+
+    log_info('Retrieving {0} pull requests'.format(repository['full_name']))  # noqa
+    pulls_cwd = os.path.join(repo_cwd, 'pulls')
+    mkdir_p(repo_cwd, pulls_cwd)
+
+    pulls = {}
+    _pulls_template = '{0}/{1}/pulls'.format(repos_template,
+                                             repository['full_name'])
+
+    pull_states = ['open', 'closed']
+    for pull_state in pull_states:
+        query_args = {
+            'filter': 'all',
+            'state': pull_state
+        }
+
+        _pulls = retrieve_data(args,
+                               _pulls_template,
+                               query_args=query_args)
+        for pull in _pulls:
+            pulls[pull['number']] = pull
+
+    log_info('Saving {0} pull requests to disk'.format(len(pulls.keys())))
+    comments_template = _pulls_template + '/{0}/comments'
+    commits_template = _pulls_template + '/{0}/commits'
+    for number, pull in pulls.iteritems():
+        if args.include_pull_comments or args.include_everything:
+            template = comments_template.format(number)
+            pulls[number]['comment_data'] = retrieve_data(args, template)
+        if args.include_pull_commits or args.include_everything:
+            template = commits_template.format(number)
+            pulls[number]['commit_data'] = retrieve_data(args, template)
+
+        pull_file = '{0}/{1}.json'.format(pulls_cwd, number)
+        with codecs.open(pull_file, 'w', encoding='utf-8') as f:
+            json_dump(pull, f)
+
+
+def backup_milestones(args, repo_cwd, repository, repos_template):
+    milestone_cwd = os.path.join(repo_cwd, 'milestones')
+    if args.skip_existing and os.path.isdir(milestone_cwd):
+        return
+
+    log_info('Retrieving {0} milestones'.format(repository['full_name']))
+    mkdir_p(repo_cwd, milestone_cwd)
+
+    template = '{0}/{1}/milestones'.format(repos_template,
+                                           repository['full_name'])
+
+    query_args = {
+        'state': 'all'
+    }
+
+    _milestones = retrieve_data(args, template, query_args=query_args)
+
+    milestones = {}
+    for milestone in _milestones:
+        milestones[milestone['number']] = milestone
+
+    log_info('Saving {0} milestones to disk'.format(len(milestones.keys())))
+    for number, milestone in milestones.iteritems():
+        milestone_file = '{0}/{1}.json'.format(milestone_cwd, number)
+        with codecs.open(milestone_file, 'w', encoding='utf-8') as f:
+            json_dump(milestone, f)
+
+
+def backup_labels(args, repo_cwd, repository, repos_template):
+    label_cwd = os.path.join(repo_cwd, 'labels')
+    output_file = '{0}/labels.json'.format(label_cwd)
+    template = '{0}/{1}/labels'.format(repos_template,
+                                       repository['full_name'])
+    _backup_data(args,
+                 'labels',
+                 template,
+                 output_file,
+                 label_cwd)


 def fetch_repository(name, remote_url, local_dir, skip_existing=False):
@@ -413,44 +581,69 @@ def fetch_repository(name, remote_url, local_dir, skip_existing=False):
    if clone_exists and skip_existing:
        return

-    initalized = subprocess.call('git ls-remote ' + remote_url,  stdout=FNULL, stderr=FNULL, shell=True)
+    initalized = subprocess.call('git ls-remote ' + remote_url,
+                                 stdout=FNULL,
+                                 stderr=FNULL,
+                                 shell=True)
    if initalized == 128:
-        log_info("Skipping {} since it's not initalized".format(name))
+        log_info("Skipping {0} since it's not initalized".format(name))
        return

    if clone_exists:
-        log_info('Updating {} in {}'.format(name, local_dir))
+        log_info('Updating {0} in {1}'.format(name, local_dir))
        git_command = ['git', 'fetch', '--all', '--tags', '--prune']
        logging_subprocess(git_command, None, cwd=local_dir)
    else:
-        log_info('Cloning {} repository from {} to {}'.format(name, remote_url, local_dir))
+        log_info('Cloning {0} repository from {1} to {2}'.format(name,
+                                                                 remote_url,
+                                                                 local_dir))
        git_command = ['git', 'clone', remote_url, local_dir]
        logging_subprocess(git_command, None)


 def backup_account(args, output_directory):
    account_cwd = os.path.join(output_directory, 'account')
-    if args.include_starred or args.include_everything:
-        if not args.skip_existing or not os.path.exists('{0}/starred.json'.format(account_cwd)):
-            log_info('Retrieving {0} starred repositories'.format(args.user))
-            mkdir_p(account_cwd)

-            starred_template = "https://{0}/users/{1}/starred"
-            starred = retrieve_data(args, starred_template.format(get_github_api_host(args), args.user))
-            log_info('Writing {0} starred repositories'.format(len(starred)))
-            with codecs.open('{0}/starred.json'.format(account_cwd), 'w', encoding='utf-8') as starred_file:
-                json.dump(starred, starred_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
+    # Starred repositories are saved as <account_cwd>/starred.json; the
+    # skip-existing check, retrieval and writing happen in _backup_data.
+    if args.include_starred or args.include_everything:
+        output_file = '{0}/starred.json'.format(account_cwd)
+        template = "https://{0}/users/{1}/starred"
+        template = template.format(get_github_api_host(args), args.user)
+        _backup_data(args,
+                     'starred repositories',
+                     template,
+                     output_file,
+                     account_cwd)

    if args.include_watched or args.include_everything:
-        if not args.skip_existing or not os.path.exists('{0}/watched.json'.format(account_cwd)):
-            log_info('Retrieving {0} watched repositories'.format(args.user))
-            mkdir_p(account_cwd)
+        # Watched repositories are read from the user's "subscriptions" URL
+        # and saved as <account_cwd>/watched.json.
+        output_file = '{0}/watched.json'.format(account_cwd)
+        template = "https://{0}/users/{1}/subscriptions"
+        template = template.format(get_github_api_host(args), args.user)
+        _backup_data(args,
+                     'watched repositories',
+                     template,
+                     output_file,
+                     account_cwd)

-            watched_template = "https://{0}/users/{1}/subscriptions"
-            watched = retrieve_data(args, watched_template.format(get_github_api_host(args), args.user))
-            log_info('Writing {0} watched repositories'.format(len(watched)))
-            with codecs.open('{0}/watched.json'.format(account_cwd), 'w', encoding='utf-8') as watched_file:
-                json.dump(watched, watched_file, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
+
+def _backup_data(args, name, template, output_file, output_directory):
+    """
+    Retrieve the data behind template and write it to output_file as JSON.
+
+    Does nothing when args.skip_existing is set and output_file already
+    exists. output_directory is created via mkdir_p before writing; name is
+    only used in the log messages.
+    """
+    if args.skip_existing and os.path.exists(output_file):
+        return
+
+    log_info('Retrieving {0} {1}'.format(args.user, name))
+    mkdir_p(output_directory)
+    payload = retrieve_data(args, template)
+
+    log_info('Writing {0} {1} to disk'.format(len(payload), name))
+    with codecs.open(output_file, 'w', encoding='utf-8') as handle:
+        json_dump(payload, handle)
+
+
+def json_dump(data, output_file):
+    """
+    Write data as JSON to the open file object output_file.
+
+    Non-ASCII characters are emitted unescaped, keys are sorted and the
+    output is indented by four spaces.
+    """
+    options = {
+        'ensure_ascii': False,
+        'sort_keys': True,
+        'indent': 4,
+        'separators': (',', ': '),
+    }
+    json.dump(data, output_file, **options)


 def main():
@@ -458,7 +651,8 @@ def main():

    output_directory = os.path.realpath(args.output_directory)
    if not os.path.isdir(output_directory):
-        log_error('Specified output directory is not a directory: {0}'.format(output_directory))
+        log_error('Specified output directory is not a directory: {0}'.format(
+            output_directory))

    log_info('Backing up user {0} to {1}'.format(args.user, output_directory))