diff --git a/CHANGES.rst b/CHANGES.rst
index 50f8d54..b790ce1 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -5,6 +5,8 @@ Unreleased
 ----------
 
 - Add GitHub Discussions backups via GraphQL, including comments, replies,
   optional attachment downloads, and per-repository incremental checkpoints.
+- Add pull request review backups with ``--pull-reviews`` and one-time
+  incremental backfill for existing backups.
 - Add ``--token-from-gh`` to read authentication from ``gh auth token``.
diff --git a/README.rst b/README.rst
index 4135743..52d7222 100644
--- a/README.rst
+++ b/README.rst
@@ -42,7 +42,8 @@ CLI Help output::
                      [--starred] [--all-starred] [--starred-skip-size-over MB]
                      [--watched] [--followers] [--following] [--all] [--issues]
                      [--issue-comments] [--issue-events] [--pulls]
-                     [--pull-comments] [--pull-commits] [--pull-details]
+                     [--pull-comments] [--pull-reviews] [--pull-commits]
+                     [--pull-details]
                      [--labels] [--hooks] [--milestones] [--security-advisories]
                      [--discussions] [--repositories] [--bare] [--no-prune]
                      [--lfs] [--wikis] [--gists] [--starred-gists]
@@ -97,6 +98,7 @@ CLI Help output::
   --issue-events        include issue events in backup
   --pulls               include pull requests in backup
   --pull-comments       include pull request review comments in backup
+  --pull-reviews        include pull request reviews in backup
   --pull-commits        include pull request commits in backup
   --pull-details        include more pull request details in backup [*]
   --labels              include labels in backup
@@ -340,6 +342,14 @@ For finer control, avoid using ``--assets`` with starred repos, or use ``--skip-
 Alternatively, consider just storing links to starred repos in JSON format with
 ``--starred``.
 
+About pull request reviews
+--------------------------
+
+Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body.
+
+``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added.
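+
+For example, a pull request saved as ``repositories/{repo}/pulls/1.json`` (``1`` standing in for any pull request number) might then carry an entry like the following under ``review_data``. This is an illustrative sketch only; the field names and values come from the GitHub pull request reviews API and may vary::
+
+    {
+        "id": 123,
+        "state": "APPROVED",
+        "body": "Looks good",
+        "submitted_at": "2026-02-01T00:00:00Z",
+        "commit_id": "abc123"
+    }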
+
+
 Incremental Backup
 ------------------
 
@@ -431,14 +441,14 @@ Quietly and incrementally backup useful Github user data (public and private rep
 
     export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN
     GH_USER=YOUR-GITHUB-USER
-    github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER
+    github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER
 
 Debug an error/block or incomplete backup into a temporary directory. Omit
 "incremental" to fill a previous incomplete backup. ::
 
     export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN
     GH_USER=YOUR-GITHUB-USER
-    github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER
+    github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER
 
 Pipe a token from stdin to avoid storing it in environment variables or command
 history (Unix-like systems only)::
diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py
index c1245bd..054d0c6 100644
--- a/github_backup/github_backup.py
+++ b/github_backup/github_backup.py
@@ -293,6 +293,12 @@ def parse_args(args=None):
         dest="include_pull_comments",
         help="include pull request review comments in backup",
     )
+    parser.add_argument(
+        "--pull-reviews",
+        action="store_true",
+        dest="include_pull_reviews",
+        help="include pull request reviews in backup",
+    )
     parser.add_argument(
         "--pull-commits",
         action="store_true",
@@ -2427,6 +2433,57 @@ def backup_issues(args, repo_cwd, repository, repos_template):
         os.replace(issue_file + ".temp", issue_file)  # Atomic write
 
 
+PULL_OPTIONAL_DATA_KEYS = (
+    "comment_regular_data",
+    "comment_data",
+    "commit_data",
+    "review_data",
+)
+PULL_REVIEWS_LAST_UPDATE_FILENAME = "reviews_last_update"
+
+
+def read_json_file_if_exists(path):
+    if not os.path.isfile(path):
+        return None
+
+    try:
+        with codecs.open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (OSError, UnicodeDecodeError, json.decoder.JSONDecodeError) as e:
+        logger.debug("Error reading existing JSON file {0}: {1}".format(path, e))
+        return None
+
+
+def restore_existing_pull_optional_data(pull, existing_pull):
+    if not existing_pull:
+        return
+
+    for key in PULL_OPTIONAL_DATA_KEYS:
+        if key not in pull and key in existing_pull:
+            pull[key] = existing_pull[key]
+
+
+def get_pull_reviews_since(args, pulls_cwd):
+    args_since = getattr(args, "since", None)
+    if not args.incremental:
+        return args_since, None, None
+
+    reviews_last_update_path = os.path.join(
+        pulls_cwd, PULL_REVIEWS_LAST_UPDATE_FILENAME
+    )
+    if not os.path.exists(reviews_last_update_path):
+        # One-time backfill for existing incremental backups: if the user adds
+        # --pull-reviews after a repository checkpoint already exists, the
+        # repository-level checkpoint would otherwise skip old PRs forever.
+        return None, None, reviews_last_update_path
+
+    reviews_since = open(reviews_last_update_path).read().strip()
+    if args_since and reviews_since:
+        return min(args_since, reviews_since), reviews_since, reviews_last_update_path
+
+    return args_since or reviews_since, reviews_since, reviews_last_update_path
+
+
 def backup_pulls(args, repo_cwd, repository, repos_template):
     has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd))
     if args.skip_existing and has_pulls_dir:
@@ -2436,7 +2493,20 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
     pulls_cwd = os.path.join(repo_cwd, "pulls")
     mkdir_p(repo_cwd, pulls_cwd)
 
+    include_pull_reviews = args.include_pull_reviews or args.include_everything
+    repository_since = getattr(args, "since", None)
+    pulls_since = repository_since
+    pull_reviews_since = None
+    pull_reviews_last_update_path = None
+    if include_pull_reviews:
+        (
+            pulls_since,
+            pull_reviews_since,
+            pull_reviews_last_update_path,
+        ) = get_pull_reviews_since(args, pulls_cwd)
+
     pulls = {}
+    newest_pull_update = None
     _pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"])
     _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"])
     query_args = {
@@ -2446,27 +2516,43 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
         "direction": "desc",
     }
 
+    def track_newest_pull_update(pull):
+        nonlocal newest_pull_update
+        updated_at = pull.get("updated_at")
+        if updated_at and (
+            newest_pull_update is None or updated_at > newest_pull_update
+        ):
+            newest_pull_update = updated_at
+
+    def pull_is_due_for_repository_checkpoint(pull):
+        return not repository_since or pull["updated_at"] >= repository_since
+
     if not args.include_pull_details:
         pull_states = ["open", "closed"]
         for pull_state in pull_states:
             query_args["state"] = pull_state
             _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
             for pull in _pulls:
-                if args.since and pull["updated_at"] < args.since:
+                track_newest_pull_update(pull)
+                if pulls_since and pull["updated_at"] < pulls_since:
                     break
-                if not args.since or pull["updated_at"] >= args.since:
+                if not pulls_since or pull["updated_at"] >= pulls_since:
                     pulls[pull["number"]] = pull
     else:
         _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
         for pull in _pulls:
-            if args.since and pull["updated_at"] < args.since:
+            track_newest_pull_update(pull)
+            if pulls_since and pull["updated_at"] < pulls_since:
                 break
-            if not args.since or pull["updated_at"] >= args.since:
-                pulls[pull["number"]] = retrieve_data(
-                    args,
-                    _pulls_template + "/{}".format(pull["number"]),
-                    paginated=False,
-                )[0]
+            if not pulls_since or pull["updated_at"] >= pulls_since:
+                if pull_is_due_for_repository_checkpoint(pull):
+                    pulls[pull["number"]] = retrieve_data(
+                        args,
+                        _pulls_template + "/{}".format(pull["number"]),
+                        paginated=False,
+                    )[0]
+                else:
+                    pulls[pull["number"]] = pull
 
     logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys()))))
     # Comments from pulls API are only _review_ comments
@@ -2476,24 +2562,50 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
     comments_regular_template = _issue_template + "/{0}/comments"
     comments_template = _pulls_template + "/{0}/comments"
     commits_template = _pulls_template + "/{0}/commits"
+    reviews_template = _pulls_template + "/{0}/reviews"
+    pull_review_errors = False
+
     for number, pull in list(pulls.items()):
         pull_file = "{0}/{1}.json".format(pulls_cwd, number)
+        existing_pull = read_json_file_if_exists(pull_file)
+        needs_review_backfill = (
+            include_pull_reviews
+            and (not existing_pull or "review_data" not in existing_pull)
+        )
+
         if args.incremental_by_files and os.path.isfile(pull_file):
             modified = os.path.getmtime(pull_file)
             modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
-            if modified > pull["updated_at"]:
+            if modified > pull["updated_at"] and not needs_review_backfill:
                 logger.info(
                     "Skipping pull request {0} because it wasn't modified since last backup".format(
                         number
                     )
                 )
                 continue
-        if args.include_pull_comments or args.include_everything:
+
+        should_fetch_non_review_data = pull_is_due_for_repository_checkpoint(pull)
+        if (
+            args.include_pull_comments or args.include_everything
+        ) and should_fetch_non_review_data:
             template = comments_regular_template.format(number)
             pulls[number]["comment_regular_data"] = retrieve_data(args, template)
             template = comments_template.format(number)
             pulls[number]["comment_data"] = retrieve_data(args, template)
-        if args.include_pull_commits or args.include_everything:
+        if include_pull_reviews:
+            template = reviews_template.format(number)
+            try:
+                pulls[number]["review_data"] = retrieve_data(args, template)
+            except Exception as e:
+                pull_review_errors = True
+                logger.warning(
+                    "Unable to retrieve reviews for pull request {0}#{1}, skipping reviews: {2}".format(
+                        repository["full_name"], number, e
+                    )
+                )
+        if (
+            args.include_pull_commits or args.include_everything
+        ) and should_fetch_non_review_data:
             template = commits_template.format(number)
             pulls[number]["commit_data"] = retrieve_data(args, template)
         if args.include_attachments:
@@ -2501,10 +2613,22 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
             download_attachments(
                 args, pulls_cwd, pulls[number], number, repository, item_type="pull"
             )
+        restore_existing_pull_optional_data(pull, existing_pull)
+
         with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f:
             json_dump(pull, f)
         os.replace(pull_file + ".temp", pull_file)  # Atomic write
 
+    if (
+        include_pull_reviews
+        and args.incremental
+        and pull_reviews_last_update_path
+        and newest_pull_update
+        and not pull_review_errors
+        and (not pull_reviews_since or newest_pull_update > pull_reviews_since)
+    ):
+        open(pull_reviews_last_update_path, "w").write(newest_pull_update)
+
 
 def backup_milestones(args, repo_cwd, repository, repos_template):
     milestone_cwd = os.path.join(repo_cwd, "milestones")
diff --git a/tests/test_pull_reviews.py b/tests/test_pull_reviews.py
new file mode 100644
index 0000000..6130269
--- /dev/null
+++ b/tests/test_pull_reviews.py
@@ -0,0 +1,237 @@
+"""Tests for pull request review backups."""
+
+import json
+import os
+
+from github_backup import github_backup
+
+
+def test_parse_args_pull_reviews_flag():
+    args = github_backup.parse_args(["--pull-reviews", "testuser"])
+    assert args.include_pull_reviews is True
+
+
+def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch):
+    args = create_args(include_pulls=True, include_pull_reviews=True)
+    repository = {"full_name": "owner/repo"}
+    calls = []
+
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+        calls.append((template, query_args))
+        if template == "https://api.github.com/repos/owner/repo/pulls":
+            if query_args["state"] == "open":
+                return [
+                    {
+                        "number": 1,
+                        "updated_at": "2026-02-01T00:00:00Z",
+                        "title": "Add feature",
+                    }
+                ]
+            return []
+        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
+            return [
+                {
+                    "id": 123,
+                    "state": "APPROVED",
+                    "body": "Looks good",
+                    "submitted_at": "2026-02-01T00:00:00Z",
+                }
+            ]
+        raise AssertionError("Unexpected template: {0}".format(template))
+
+    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
+
+    github_backup.backup_pulls(
+        args, tmp_path, repository, "https://api.github.com/repos"
+    )
+
+    with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f:
+        pull = json.load(f)
+
+    assert pull["review_data"] == [
+        {
+            "body": "Looks good",
+            "id": 123,
+            "state": "APPROVED",
+            "submitted_at": "2026-02-01T00:00:00Z",
+        }
+    ]
+    assert (
+        "https://api.github.com/repos/owner/repo/pulls/1/reviews",
+        None,
+    ) in calls
+
+
+def test_pull_reviews_backfill_ignores_repository_checkpoint(
+    create_args, tmp_path, monkeypatch
+):
+    args = create_args(
+        include_pulls=True,
+        include_pull_reviews=True,
+        incremental=True,
+    )
+    args.since = "2026-01-01T00:00:00Z"
+    repository = {"full_name": "owner/repo"}
+
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+        if template == "https://api.github.com/repos/owner/repo/pulls":
+            if query_args["state"] == "open":
+                return [
+                    {
+                        "number": 1,
+                        "updated_at": "2025-01-01T00:00:00Z",
+                        "title": "Old pull request",
+                    }
+                ]
+            return []
+        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
+            return [{"id": 123, "state": "APPROVED"}]
+        raise AssertionError("Unexpected template: {0}".format(template))
+
+    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
+
+    github_backup.backup_pulls(
+        args, tmp_path, repository, "https://api.github.com/repos"
+    )
+
+    with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f:
+        pull = json.load(f)
+
+    assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}]
+    assert (tmp_path / "pulls" / "reviews_last_update").read_text() == (
+        "2025-01-01T00:00:00Z"
+    )
+
+
+def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoint(
+    create_args, tmp_path, monkeypatch
+):
+    args = create_args(
+        include_pulls=True,
+        include_pull_reviews=True,
+        incremental=True,
+    )
+    args.since = "2026-01-01T00:00:00Z"
+    repository = {"full_name": "owner/repo"}
+    pulls_dir = tmp_path / "pulls"
+    pulls_dir.mkdir()
+    (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
+
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+        if template == "https://api.github.com/repos/owner/repo/pulls":
+            if query_args["state"] == "open":
+                return [
+                    {
+                        "number": 1,
+                        "updated_at": "2025-06-01T00:00:00Z",
+                        "title": "Review changed while feature was disabled",
+                    },
+                    {
+                        "number": 2,
+                        "updated_at": "2024-12-01T00:00:00Z",
+                        "title": "Too old",
+                    },
+                ]
+            return []
+        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
+            return [{"id": 123, "state": "COMMENTED"}]
+        raise AssertionError("Unexpected template: {0}".format(template))
+
+    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
+
+    github_backup.backup_pulls(
+        args, tmp_path, repository, "https://api.github.com/repos"
+    )
+
+    assert os.path.exists(tmp_path / "pulls" / "1.json")
+    assert not os.path.exists(tmp_path / "pulls" / "2.json")
+    assert (tmp_path / "pulls" / "reviews_last_update").read_text() == (
+        "2025-06-01T00:00:00Z"
+    )
+
+
+def test_pull_reviews_preserves_existing_optional_pull_data(
+    create_args, tmp_path, monkeypatch
+):
+    args = create_args(include_pulls=True, include_pull_reviews=True)
+    repository = {"full_name": "owner/repo"}
+    pulls_dir = tmp_path / "pulls"
+    pulls_dir.mkdir()
+    with open(pulls_dir / "1.json", "w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "number": 1,
+                "updated_at": "2026-01-01T00:00:00Z",
+                "comment_data": [{"id": 10, "body": "inline comment"}],
+                "comment_regular_data": [{"id": 11, "body": "regular comment"}],
+                "commit_data": [{"sha": "abc"}],
+            },
+            f,
+        )
+
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+        if template == "https://api.github.com/repos/owner/repo/pulls":
+            if query_args["state"] == "open":
+                return [
+                    {
+                        "number": 1,
+                        "updated_at": "2026-02-01T00:00:00Z",
+                        "title": "Add reviews",
+                    }
+                ]
+            return []
+        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
+            return [{"id": 123, "state": "APPROVED"}]
+        raise AssertionError("Unexpected template: {0}".format(template))
+
+    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
+
+    github_backup.backup_pulls(
+        args, tmp_path, repository, "https://api.github.com/repos"
+    )
+
+    with open(pulls_dir / "1.json", encoding="utf-8") as f:
+        pull = json.load(f)
+
+    assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}]
+    assert pull["comment_data"] == [{"id": 10, "body": "inline comment"}]
+    assert pull["comment_regular_data"] == [{"id": 11, "body": "regular comment"}]
+    assert pull["commit_data"] == [{"sha": "abc"}]
+
+
+def test_pull_reviews_does_not_advance_checkpoint_on_review_error(
+    create_args, tmp_path, monkeypatch
+):
+    args = create_args(
+        include_pulls=True,
+        include_pull_reviews=True,
+        incremental=True,
+    )
+    args.since = "2026-01-01T00:00:00Z"
+    repository = {"full_name": "owner/repo"}
+    pulls_dir = tmp_path / "pulls"
+    pulls_dir.mkdir()
+    (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
+
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+        if template == "https://api.github.com/repos/owner/repo/pulls":
+            if query_args["state"] == "open":
+                return [
+                    {
+                        "number": 1,
+                        "updated_at": "2025-06-01T00:00:00Z",
+                        "title": "Review retrieval fails",
+                    }
+                ]
+            return []
+        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
+            raise Exception("temporary API failure")
+        raise AssertionError("Unexpected template: {0}".format(template))
+
+    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
+
+    github_backup.backup_pulls(
+        args, tmp_path, repository, "https://api.github.com/repos"
+    )
+
+    assert (pulls_dir / "reviews_last_update").read_text() == "2025-01-01T00:00:00Z"