Add support for pull request reviews

Closes #124
This commit is contained in:
Duncan Ogilvie
2026-04-26 14:08:42 +02:00
parent 013b27208e
commit 24b3fdb4f3
4 changed files with 388 additions and 15 deletions

View File

@@ -5,6 +5,8 @@ Unreleased
----------
- Add GitHub Discussions backups via GraphQL, including comments, replies,
optional attachment downloads, and per-repository incremental checkpoints.
- Add pull request review backups with ``--pull-reviews`` and one-time
incremental backfill for existing backups.
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.

View File

@@ -42,7 +42,8 @@ CLI Help output::
[--starred] [--all-starred] [--starred-skip-size-over MB]
[--watched] [--followers] [--following] [--all]
[--issues] [--issue-comments] [--issue-events] [--pulls]
[--pull-comments] [--pull-commits] [--pull-details]
[--pull-comments] [--pull-reviews] [--pull-commits]
[--pull-details]
[--labels] [--hooks] [--milestones] [--security-advisories]
[--discussions] [--repositories] [--bare] [--no-prune]
[--lfs] [--wikis] [--gists] [--starred-gists]
@@ -97,6 +98,7 @@ CLI Help output::
--issue-events include issue events in backup
--pulls include pull requests in backup
--pull-comments include pull request review comments in backup
--pull-reviews include pull request reviews in backup
--pull-commits include pull request commits in backup
--pull-details include more pull request details in backup [*]
--labels include labels in backup
@@ -340,6 +342,14 @@ For finer control, avoid using ``--assets`` with starred repos, or use ``--skip-
Alternatively, consider just storing links to starred repos in JSON format with ``--starred``.
About pull request reviews
--------------------------
Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body.
``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added.
Incremental Backup
------------------
@@ -431,14 +441,14 @@ Quietly and incrementally backup useful Github user data (public and private rep
export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN
GH_USER=YOUR-GITHUB-USER
github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER
github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER
Debug an errored/blocked run or an incomplete backup in a temporary directory. Omit "incremental" to fill in a previous incomplete backup. ::
export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN
GH_USER=YOUR-GITHUB-USER
github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER
github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER
Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only)::

View File

@@ -293,6 +293,12 @@ def parse_args(args=None):
dest="include_pull_comments",
help="include pull request review comments in backup",
)
parser.add_argument(
"--pull-reviews",
action="store_true",
dest="include_pull_reviews",
help="include pull request reviews in backup",
)
parser.add_argument(
"--pull-commits",
action="store_true",
@@ -2427,6 +2433,57 @@ def backup_issues(args, repo_cwd, repository, repos_template):
os.replace(issue_file + ".temp", issue_file) # Atomic write
# Optional per-pull payload keys that may already exist in an on-disk
# backup and should be carried forward when the current run does not
# refresh them (see restore_existing_pull_optional_data).
PULL_OPTIONAL_DATA_KEYS = (
    "comment_regular_data",
    "comment_data",
    "commit_data",
    "review_data",
)
# Per-repository checkpoint file for incremental pull-review backups,
# stored inside the repository's pulls/ directory.
PULL_REVIEWS_LAST_UPDATE_FILENAME = "reviews_last_update"


def read_json_file_if_exists(path):
    """Return the parsed JSON content of *path*, or None.

    Returns None when the file does not exist or cannot be read/parsed;
    read failures are logged at debug level and never raised, since a
    corrupt previous backup file must not abort the run.
    """
    if not os.path.isfile(path):
        return None
    try:
        with codecs.open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, UnicodeDecodeError, json.decoder.JSONDecodeError) as e:
        logger.debug("Error reading existing JSON file {0}: {1}".format(path, e))
        return None


def restore_existing_pull_optional_data(pull, existing_pull):
    """Copy previously backed-up optional keys from *existing_pull* onto *pull*.

    Only keys listed in PULL_OPTIONAL_DATA_KEYS are considered, and a key
    already present on *pull* (i.e. freshly fetched this run) is never
    overwritten. No-op when *existing_pull* is falsy.
    """
    if not existing_pull:
        return
    for key in PULL_OPTIONAL_DATA_KEYS:
        if key not in pull and key in existing_pull:
            pull[key] = existing_pull[key]


def get_pull_reviews_since(args, pulls_cwd):
    """Compute the effective "since" cutoff for a pull-review backup.

    Returns a 3-tuple ``(pulls_since, reviews_since, checkpoint_path)``:

    - ``pulls_since``: cutoff to use when listing pull requests;
    - ``reviews_since``: the raw reviews-checkpoint value (or None);
    - ``checkpoint_path``: path of the per-repository reviews checkpoint
      (None when not running incrementally).

    When not incremental, ``args.since`` is used unchanged. When the
    checkpoint file is missing, ``(None, None, path)`` is returned to
    trigger a one-time backfill: if --pull-reviews is enabled after a
    repository checkpoint already exists, that repository-level checkpoint
    would otherwise skip old PRs forever. When both cutoffs exist, the
    older (min) one wins so no reviews are missed.
    """
    args_since = getattr(args, "since", None)
    if not args.incremental:
        return args_since, None, None
    reviews_last_update_path = os.path.join(
        pulls_cwd, PULL_REVIEWS_LAST_UPDATE_FILENAME
    )
    if not os.path.exists(reviews_last_update_path):
        # One-time backfill for existing incremental backups: if the user adds
        # --pull-reviews after a repository checkpoint already exists, the
        # repository-level checkpoint would otherwise skip old PRs forever.
        return None, None, reviews_last_update_path
    # Close the checkpoint file deterministically (the original relied on
    # garbage collection to close the handle returned by open()).
    with open(reviews_last_update_path) as checkpoint_file:
        reviews_since = checkpoint_file.read().strip()
    if args_since and reviews_since:
        return min(args_since, reviews_since), reviews_since, reviews_last_update_path
    return args_since or reviews_since, reviews_since, reviews_last_update_path
# NOTE(review): this span is a unified-diff rendering with the +/- markers
# stripped, so removed (pre-change) lines and their replacements appear
# interleaved below, along with @@ hunk headers; it is not runnable as-is.
# Comments below annotate intent and mark the interleaved pairs.
def backup_pulls(args, repo_cwd, repository, repos_template):
has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd))
if args.skip_existing and has_pulls_dir:
@@ -2436,7 +2493,20 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
pulls_cwd = os.path.join(repo_cwd, "pulls")
mkdir_p(repo_cwd, pulls_cwd)
# Reviews are implied by --all (include_everything) as well as --pull-reviews.
include_pull_reviews = args.include_pull_reviews or args.include_everything
repository_since = getattr(args, "since", None)
pulls_since = repository_since
pull_reviews_since = None
pull_reviews_last_update_path = None
if include_pull_reviews:
# The listing cutoff may be widened (min of the repository checkpoint and
# the reviews checkpoint) so review backfill is not skipped by the
# repository-level checkpoint; see get_pull_reviews_since.
(
pulls_since,
pull_reviews_since,
pull_reviews_last_update_path,
) = get_pull_reviews_since(args, pulls_cwd)
pulls = {}
newest_pull_update = None
_pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"])
_issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"])
query_args = {
@@ -2446,27 +2516,43 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
"direction": "desc",
}
# Tracks the newest updated_at seen across listed pulls; used at the end
# of the run to advance the reviews checkpoint.
def track_newest_pull_update(pull):
nonlocal newest_pull_update
updated_at = pull.get("updated_at")
if updated_at and (
newest_pull_update is None or updated_at > newest_pull_update
):
newest_pull_update = updated_at
# True when the pull is new/changed relative to the repository checkpoint,
# i.e. non-review data (comments/commits) should be (re-)fetched.
def pull_is_due_for_repository_checkpoint(pull):
return not repository_since or pull["updated_at"] >= repository_since
if not args.include_pull_details:
pull_states = ["open", "closed"]
for pull_state in pull_states:
query_args["state"] = pull_state
_pulls = retrieve_data(args, _pulls_template, query_args=query_args)
for pull in _pulls:
# NOTE(review): the next line is the removed pre-change condition;
# the two lines after it are its replacement using pulls_since.
if args.since and pull["updated_at"] < args.since:
track_newest_pull_update(pull)
if pulls_since and pull["updated_at"] < pulls_since:
break
# NOTE(review): removed condition, then its pulls_since replacement.
if not args.since or pull["updated_at"] >= args.since:
if not pulls_since or pull["updated_at"] >= pulls_since:
pulls[pull["number"]] = pull
else:
_pulls = retrieve_data(args, _pulls_template, query_args=query_args)
for pull in _pulls:
# NOTE(review): removed condition, then its replacement, as above.
if args.since and pull["updated_at"] < args.since:
track_newest_pull_update(pull)
if pulls_since and pull["updated_at"] < pulls_since:
break
# NOTE(review): the removed branch fetched details unconditionally;
# the replacement only fetches details when the pull is due per the
# repository checkpoint, using the listing payload during backfill.
if not args.since or pull["updated_at"] >= args.since:
pulls[pull["number"]] = retrieve_data(
args,
_pulls_template + "/{}".format(pull["number"]),
paginated=False,
)[0]
if not pulls_since or pull["updated_at"] >= pulls_since:
if pull_is_due_for_repository_checkpoint(pull):
pulls[pull["number"]] = retrieve_data(
args,
_pulls_template + "/{}".format(pull["number"]),
paginated=False,
)[0]
else:
pulls[pull["number"]] = pull
logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys()))))
# Comments from pulls API are only _review_ comments
@@ -2476,24 +2562,50 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
comments_regular_template = _issue_template + "/{0}/comments"
comments_template = _pulls_template + "/{0}/comments"
commits_template = _pulls_template + "/{0}/commits"
reviews_template = _pulls_template + "/{0}/reviews"
pull_review_errors = False
for number, pull in list(pulls.items()):
pull_file = "{0}/{1}.json".format(pulls_cwd, number)
existing_pull = read_json_file_if_exists(pull_file)
# Backfill reviews when the on-disk copy predates --pull-reviews.
needs_review_backfill = (
include_pull_reviews
and (not existing_pull or "review_data" not in existing_pull)
)
if args.incremental_by_files and os.path.isfile(pull_file):
modified = os.path.getmtime(pull_file)
modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
# NOTE(review): removed condition, then its backfill-aware replacement.
if modified > pull["updated_at"]:
if modified > pull["updated_at"] and not needs_review_backfill:
logger.info(
"Skipping pull request {0} because it wasn't modified since last backup".format(
number
)
)
continue
# NOTE(review): removed guard, then its replacement gated on the
# repository checkpoint.
if args.include_pull_comments or args.include_everything:
should_fetch_non_review_data = pull_is_due_for_repository_checkpoint(pull)
if (
args.include_pull_comments or args.include_everything
) and should_fetch_non_review_data:
template = comments_regular_template.format(number)
pulls[number]["comment_regular_data"] = retrieve_data(args, template)
template = comments_template.format(number)
pulls[number]["comment_data"] = retrieve_data(args, template)
# NOTE(review): removed guard, then the review-fetching replacement.
if args.include_pull_commits or args.include_everything:
if include_pull_reviews:
template = reviews_template.format(number)
try:
pulls[number]["review_data"] = retrieve_data(args, template)
# NOTE(review): broad Exception catch is deliberate best-effort —
# a failed review fetch must not abort the backup, but it blocks
# advancing the reviews checkpoint below via pull_review_errors.
except Exception as e:
pull_review_errors = True
logger.warning(
"Unable to retrieve reviews for pull request {0}#{1}, skipping reviews: {2}".format(
repository["full_name"], number, e
)
)
if (
args.include_pull_commits or args.include_everything
) and should_fetch_non_review_data:
template = commits_template.format(number)
pulls[number]["commit_data"] = retrieve_data(args, template)
if args.include_attachments:
@@ -2501,10 +2613,22 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
args, pulls_cwd, pulls[number], number, repository, item_type="pull"
)
# Preserve comment/commit/review data from a previous backup when this
# run did not refresh those keys.
restore_existing_pull_optional_data(pull, existing_pull)
with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f:
json_dump(pull, f)
os.replace(pull_file + ".temp", pull_file) # Atomic write
# Advance the reviews checkpoint only on a clean incremental run that
# actually saw newer pulls and fetched all reviews without errors.
if (
include_pull_reviews
and args.incremental
and pull_reviews_last_update_path
and newest_pull_update
and not pull_review_errors
and (not pull_reviews_since or newest_pull_update > pull_reviews_since)
):
# NOTE(review): file handle not explicitly closed — prefer
# 'with open(...) as f: f.write(...)'.
open(pull_reviews_last_update_path, "w").write(newest_pull_update)
def backup_milestones(args, repo_cwd, repository, repos_template):
milestone_cwd = os.path.join(repo_cwd, "milestones")

237
tests/test_pull_reviews.py Normal file
View File

@@ -0,0 +1,237 @@
"""Tests for pull request review backups."""
import json
import os
from github_backup import github_backup
def test_parse_args_pull_reviews_flag():
    """Passing --pull-reviews on the CLI sets include_pull_reviews."""
    parsed = github_backup.parse_args(["--pull-reviews", "testuser"])
    assert parsed.include_pull_reviews is True
def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch):
    """backup_pulls fetches reviews and persists them under review_data."""
    args = create_args(include_pulls=True, include_pull_reviews=True)
    repository = {"full_name": "owner/repo"}
    calls = []
    review_payload = [
        {
            "id": 123,
            "state": "APPROVED",
            "body": "Looks good",
            "submitted_at": "2026-02-01T00:00:00Z",
        }
    ]

    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
        calls.append((template, query_args))
        if template == "https://api.github.com/repos/owner/repo/pulls":
            if query_args["state"] != "open":
                return []
            return [
                {
                    "number": 1,
                    "updated_at": "2026-02-01T00:00:00Z",
                    "title": "Add feature",
                }
            ]
        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
            return list(review_payload)
        raise AssertionError("Unexpected template: {0}".format(template))

    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
    github_backup.backup_pulls(
        args, tmp_path, repository, "https://api.github.com/repos"
    )

    with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f:
        saved = json.load(f)
    assert saved["review_data"] == review_payload
    # The reviews endpoint must have been hit with no extra query args.
    assert (
        "https://api.github.com/repos/owner/repo/pulls/1/reviews",
        None,
    ) in calls
def test_pull_reviews_backfill_ignores_repository_checkpoint(
    create_args, tmp_path, monkeypatch
):
    """With no reviews checkpoint, PRs older than args.since are backfilled."""
    args = create_args(
        include_pulls=True,
        include_pull_reviews=True,
        incremental=True,
    )
    args.since = "2026-01-01T00:00:00Z"
    repository = {"full_name": "owner/repo"}

    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
        if template == "https://api.github.com/repos/owner/repo/pulls":
            if query_args["state"] != "open":
                return []
            return [
                {
                    "number": 1,
                    "updated_at": "2025-01-01T00:00:00Z",
                    "title": "Old pull request",
                }
            ]
        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
            return [{"id": 123, "state": "APPROVED"}]
        raise AssertionError("Unexpected template: {0}".format(template))

    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
    github_backup.backup_pulls(
        args, tmp_path, repository, "https://api.github.com/repos"
    )

    with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f:
        saved = json.load(f)
    assert saved["review_data"] == [{"id": 123, "state": "APPROVED"}]
    # The checkpoint is seeded from the newest pull seen during backfill.
    checkpoint = tmp_path / "pulls" / "reviews_last_update"
    assert checkpoint.read_text() == "2025-01-01T00:00:00Z"
def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoint(
    create_args, tmp_path, monkeypatch
):
    """Listing honors the older reviews checkpoint instead of args.since."""
    args = create_args(
        include_pulls=True,
        include_pull_reviews=True,
        incremental=True,
    )
    args.since = "2026-01-01T00:00:00Z"
    repository = {"full_name": "owner/repo"}
    pulls_dir = tmp_path / "pulls"
    pulls_dir.mkdir()
    (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")

    listing = [
        {
            "number": 1,
            "updated_at": "2025-06-01T00:00:00Z",
            "title": "Review changed while feature was disabled",
        },
        {
            "number": 2,
            "updated_at": "2024-12-01T00:00:00Z",
            "title": "Too old",
        },
    ]

    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
        if template == "https://api.github.com/repos/owner/repo/pulls":
            return listing if query_args["state"] == "open" else []
        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
            return [{"id": 123, "state": "COMMENTED"}]
        raise AssertionError("Unexpected template: {0}".format(template))

    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
    github_backup.backup_pulls(
        args, tmp_path, repository, "https://api.github.com/repos"
    )

    # PR 1 is newer than the reviews checkpoint; PR 2 predates both cutoffs.
    assert (pulls_dir / "1.json").exists()
    assert not (pulls_dir / "2.json").exists()
    assert (pulls_dir / "reviews_last_update").read_text() == "2025-06-01T00:00:00Z"
def test_pull_reviews_preserves_existing_optional_pull_data(
    create_args, tmp_path, monkeypatch
):
    """Adding review data keeps comment/commit data already on disk."""
    args = create_args(include_pulls=True, include_pull_reviews=True)
    repository = {"full_name": "owner/repo"}
    pulls_dir = tmp_path / "pulls"
    pulls_dir.mkdir()
    existing = {
        "number": 1,
        "updated_at": "2026-01-01T00:00:00Z",
        "comment_data": [{"id": 10, "body": "inline comment"}],
        "comment_regular_data": [{"id": 11, "body": "regular comment"}],
        "commit_data": [{"sha": "abc"}],
    }
    (pulls_dir / "1.json").write_text(json.dumps(existing), encoding="utf-8")

    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
        if template == "https://api.github.com/repos/owner/repo/pulls":
            if query_args["state"] != "open":
                return []
            return [
                {
                    "number": 1,
                    "updated_at": "2026-02-01T00:00:00Z",
                    "title": "Add reviews",
                }
            ]
        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
            return [{"id": 123, "state": "APPROVED"}]
        raise AssertionError("Unexpected template: {0}".format(template))

    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
    github_backup.backup_pulls(
        args, tmp_path, repository, "https://api.github.com/repos"
    )

    saved = json.loads((pulls_dir / "1.json").read_text(encoding="utf-8"))
    assert saved["review_data"] == [{"id": 123, "state": "APPROVED"}]
    assert saved["comment_data"] == existing["comment_data"]
    assert saved["comment_regular_data"] == existing["comment_regular_data"]
    assert saved["commit_data"] == existing["commit_data"]
def test_pull_reviews_does_not_advance_checkpoint_on_review_error(
    create_args, tmp_path, monkeypatch
):
    """A failed review fetch must leave the reviews checkpoint untouched."""
    args = create_args(
        include_pulls=True,
        include_pull_reviews=True,
        incremental=True,
    )
    args.since = "2026-01-01T00:00:00Z"
    repository = {"full_name": "owner/repo"}
    pulls_dir = tmp_path / "pulls"
    pulls_dir.mkdir()
    (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")

    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
        if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews":
            raise Exception("temporary API failure")
        if template == "https://api.github.com/repos/owner/repo/pulls":
            if query_args["state"] != "open":
                return []
            return [
                {
                    "number": 1,
                    "updated_at": "2025-06-01T00:00:00Z",
                    "title": "Review retrieval fails",
                }
            ]
        raise AssertionError("Unexpected template: {0}".format(template))

    monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
    github_backup.backup_pulls(
        args, tmp_path, repository, "https://api.github.com/repos"
    )

    assert (pulls_dir / "reviews_last_update").read_text() == "2025-01-01T00:00:00Z"