Skip checkpoint-equal incremental items

This commit is contained in:
Duncan Ogilvie
2026-04-26 16:09:42 +02:00
parent 9d0cfdb61d
commit 014eff395a
4 changed files with 90 additions and 7 deletions

View File

@@ -13,7 +13,9 @@ Unreleased
legacy file is removed once existing issue/pull backups have resource
checkpoints (#62).
- Stop paginating pull requests during incremental backups once the sorted
results are older than the active checkpoint.
results are at or older than the active checkpoint.
- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly
matches the active incremental checkpoint.
- Avoid extra release asset list requests by using asset metadata already
included in GitHub's releases response.
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.

View File

@@ -2233,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None):
if updated_at and (newest_seen is None or updated_at > newest_seen):
newest_seen = updated_at
if since and updated_at and updated_at < since:
if since and updated_at and updated_at <= since:
stop = True
break
@@ -2654,7 +2654,7 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
newest_pull_update = updated_at
def pull_is_due_for_repository_checkpoint(pull):
return not repository_since or pull["updated_at"] >= repository_since
return not repository_since or pull["updated_at"] > repository_since
if not args.include_pull_details:
pull_states = ["open", "closed"]
@@ -2664,18 +2664,18 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
args, _pulls_template, query_args=query_args, lazy=True
):
track_newest_pull_update(pull)
if pulls_since and pull["updated_at"] < pulls_since:
if pulls_since and pull["updated_at"] <= pulls_since:
break
if not pulls_since or pull["updated_at"] >= pulls_since:
if not pulls_since or pull["updated_at"] > pulls_since:
pulls[pull["number"]] = pull
else:
for pull in retrieve_data(
args, _pulls_template, query_args=query_args, lazy=True
):
track_newest_pull_update(pull)
if pulls_since and pull["updated_at"] < pulls_since:
if pulls_since and pull["updated_at"] <= pulls_since:
break
if not pulls_since or pull["updated_at"] >= pulls_since:
if not pulls_since or pull["updated_at"] > pulls_since:
if pull_is_due_for_repository_checkpoint(pull):
pulls[pull["number"]] = retrieve_data(
args,

View File

@@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args):
)
def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args):
    """A discussion whose updatedAt equals the incremental checkpoint is skipped.

    The single GraphQL page advertises a next page; because the only node is
    exactly at the checkpoint, no further page may be requested and the node
    must not appear in the returned summaries.
    """
    backup_args = create_args()
    target_repo = {"full_name": "owner/repo"}
    # One discussion updated exactly at the `since` checkpoint, with
    # hasNextPage=True to prove pagination stops early.
    graphql_page = {
        "repository": {
            "hasDiscussionsEnabled": True,
            "discussions": {
                "totalCount": 1,
                "nodes": [
                    {
                        "number": 1,
                        "title": "already backed up",
                        "updatedAt": "2026-01-01T00:00:00Z",
                    },
                ],
                "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"},
            },
        }
    }
    with patch(
        "github_backup.github_backup.retrieve_graphql_data", return_value=graphql_page
    ) as graphql_mock:
        summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries(
            backup_args, target_repo, since="2026-01-01T00:00:00Z"
        )
    assert enabled is True
    assert total == 1
    # The checkpoint-equal node still advances the "newest seen" watermark...
    assert newest == "2026-01-01T00:00:00Z"
    # ...but is excluded from the work list, and only one page was fetched.
    assert summaries == []
    assert graphql_mock.call_count == 1
def test_retrieve_discussion_summaries_disabled_discussions(create_args):
args = create_args()
repository = {"full_name": "owner/repo"}

View File

@@ -31,6 +31,52 @@ class MockHTTPResponse:
return headers
def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path):
    """A pull whose updated_at equals the incremental checkpoint is skipped.

    The closed-pulls response advertises a page 2 via the Link header; since
    the newest pull is exactly at the checkpoint, pagination must stop after
    page 1 and neither pull may be written to disk.
    """
    backup_args = create_args(include_pulls=True, incremental=True)
    backup_args.since = "2026-04-26T08:13:46Z"
    target_repo = {"full_name": "owner/repo"}
    # Responses are consumed in request order: open pulls (empty), closed
    # pulls page 1 (checkpoint-equal pull + next-page link), closed page 2
    # (older pull that must never be requested).
    canned_responses = [
        MockHTTPResponse([]),
        MockHTTPResponse(
            [
                {
                    "number": 1,
                    "title": "already backed up",
                    "updated_at": "2026-04-26T08:13:46Z",
                },
            ],
            link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
        ),
        MockHTTPResponse(
            [
                {
                    "number": 0,
                    "title": "older pull on page 2",
                    "updated_at": "2026-04-25T07:00:00Z",
                }
            ]
        ),
    ]
    seen_urls = []

    def fake_urlopen(request, *call_args, **call_kwargs):
        # Record the URL, then serve the next canned response in sequence.
        seen_urls.append(request.get_full_url())
        return canned_responses[len(seen_urls) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=fake_urlopen):
        github_backup.backup_pulls(
            backup_args, tmp_path, target_repo, "https://api.github.com/repos"
        )
    # Exactly one request per state; page 2 was never fetched.
    assert len(seen_urls) == 2
    assert "state=open" in seen_urls[0]
    assert "state=closed" in seen_urls[1]
    assert all("page=2" not in url for url in seen_urls)
    # Neither the checkpoint-equal pull nor the older one was backed up.
    assert not os.path.exists(tmp_path / "pulls" / "1.json")
    assert not os.path.exists(tmp_path / "pulls" / "0.json")
def test_backup_pulls_incremental_stops_before_fetching_old_pages(
create_args, tmp_path
):