From 014eff395a999f82674547efd77a6470b038ce91 Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 16:09:42 +0200 Subject: [PATCH] Skip checkpoint-equal incremental items --- CHANGES.rst | 4 +- github_backup/github_backup.py | 12 +++--- tests/test_discussions.py | 35 +++++++++++++++++ tests/test_pull_incremental_pagination.py | 46 +++++++++++++++++++++++ 4 files changed, 90 insertions(+), 7 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3d2ceb0..3d4cdce 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -13,7 +13,9 @@ Unreleased legacy file is removed once existing issue/pull backups have resource checkpoints (#62). - Stop paginating pull requests during incremental backups once the sorted - results are older than the active checkpoint. + results are at or older than the active checkpoint. +- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly + matches the active incremental checkpoint. - Avoid extra release asset list requests by using asset metadata already included in GitHub's releases response. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 6edfb05..ae4ef2e 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -2233,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None): if updated_at and (newest_seen is None or updated_at > newest_seen): newest_seen = updated_at - if since and updated_at and updated_at < since: + if since and updated_at and updated_at <= since: stop = True break @@ -2654,7 +2654,7 @@ def backup_pulls(args, repo_cwd, repository, repos_template): newest_pull_update = updated_at def pull_is_due_for_repository_checkpoint(pull): - return not repository_since or pull["updated_at"] >= repository_since + return not repository_since or pull["updated_at"] > repository_since if not args.include_pull_details: pull_states = ["open", "closed"] @@ -2664,18 +2664,18 @@ def backup_pulls(args, repo_cwd, repository, repos_template): args, _pulls_template, query_args=query_args, lazy=True ): track_newest_pull_update(pull) - if pulls_since and pull["updated_at"] < pulls_since: + if pulls_since and pull["updated_at"] <= pulls_since: break - if not pulls_since or pull["updated_at"] >= pulls_since: + if not pulls_since or pull["updated_at"] > pulls_since: pulls[pull["number"]] = pull else: for pull in retrieve_data( args, _pulls_template, query_args=query_args, lazy=True ): track_newest_pull_update(pull) - if pulls_since and pull["updated_at"] < pulls_since: + if pulls_since and pull["updated_at"] <= pulls_since: break - if not pulls_since or pull["updated_at"] >= pulls_since: + if not pulls_since or pull["updated_at"] > pulls_since: if pull_is_due_for_repository_checkpoint(pull): pulls[pull["number"]] = retrieve_data( args, diff --git a/tests/test_discussions.py b/tests/test_discussions.py index 89fd8dd..2b5e3fb 100644 --- a/tests/test_discussions.py +++ b/tests/test_discussions.py @@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args): ) +def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + page = { + "repository": { + "hasDiscussionsEnabled": True, + "discussions": { + "totalCount": 1, + "nodes": [ + { + "number": 1, + "title": "already backed up", + "updatedAt": "2026-01-01T00:00:00Z", + }, + ], + "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"}, + }, + } + } + + with patch( + "github_backup.github_backup.retrieve_graphql_data", return_value=page + ) as mock_retrieve: + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository, since="2026-01-01T00:00:00Z" + ) + + assert enabled is True + assert total == 1 + assert newest == "2026-01-01T00:00:00Z" + assert summaries == [] + assert mock_retrieve.call_count == 1 + + def test_retrieve_discussion_summaries_disabled_discussions(create_args): args = create_args() repository = {"full_name": "owner/repo"} diff --git a/tests/test_pull_incremental_pagination.py b/tests/test_pull_incremental_pagination.py index 11230b0..ac0f83f 100644 --- a/tests/test_pull_incremental_pagination.py +++ b/tests/test_pull_incremental_pagination.py @@ -31,6 +31,52 @@ class MockHTTPResponse: return headers +def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path): + args = create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + MockHTTPResponse( + [ + { + "number": 1, + "title": "already backed up", + "updated_at": "2026-04-26T08:13:46Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") + + def test_backup_pulls_incremental_stops_before_fetching_old_pages( create_args, tmp_path ):