Reduce unnecessary pull requests with incremental fetching

This commit is contained in:
Duncan Ogilvie
2026-04-26 15:15:22 +02:00
parent 02e833e40a
commit 6cd0ab3633
4 changed files with 104 additions and 11 deletions

View File

@@ -12,6 +12,8 @@ Unreleased
backups use the legacy global checkpoint as a migration fallback, and the
legacy file is removed once existing issue/pull backups have resource
checkpoints (#62).
- Stop paginating pull requests during incremental backups once the sorted
results are older than the active checkpoint.
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.

View File

@@ -717,11 +717,12 @@ def calculate_retry_delay(attempt, headers):
return delay + random.uniform(0, delay * 0.1)
def retrieve_data(args, template, query_args=None, paginated=True):
def retrieve_data(args, template, query_args=None, paginated=True, lazy=False):
"""
Fetch the data from GitHub API.
Handle both single requests and pagination with yield of individual dicts.
Handle both single requests and pagination. Returns a list by default, or
a generator when lazy=True so callers can stop before fetching every page.
Handles throttling, retries, read errors, and DMCA takedowns.
"""
query_args = query_args or {}
@@ -851,6 +852,9 @@ def retrieve_data(args, template, query_args=None, paginated=True):
):
break # No more data
if lazy:
return fetch_all()
return list(fetch_all())
@@ -2656,16 +2660,18 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
pull_states = ["open", "closed"]
for pull_state in pull_states:
query_args["state"] = pull_state
_pulls = retrieve_data(args, _pulls_template, query_args=query_args)
for pull in _pulls:
for pull in retrieve_data(
args, _pulls_template, query_args=query_args, lazy=True
):
track_newest_pull_update(pull)
if pulls_since and pull["updated_at"] < pulls_since:
break
if not pulls_since or pull["updated_at"] >= pulls_since:
pulls[pull["number"]] = pull
else:
_pulls = retrieve_data(args, _pulls_template, query_args=query_args)
for pull in _pulls:
for pull in retrieve_data(
args, _pulls_template, query_args=query_args, lazy=True
):
track_newest_pull_update(pull)
if pulls_since and pull["updated_at"] < pulls_since:
break

View File

@@ -0,0 +1,85 @@
"""Tests for incremental pull request pagination."""
import json
import os
from unittest.mock import patch
from github_backup import github_backup
class MockHTTPResponse:
    """Minimal stand-in for the response object returned by ``urlopen``.

    Serves *data* as a JSON-encoded body exactly once; subsequent calls to
    :meth:`read` yield an empty byte string, mimicking an exhausted stream.
    An optional ``Link`` header can be supplied to simulate pagination.
    """

    def __init__(self, data, link_header=None):
        # Serialize up front so read() can hand back raw bytes directly.
        self._content = json.dumps(data).encode("utf-8")
        self._link_header = link_header
        self._read = False  # flips to True after the first read()
        self.reason = "OK"

    def getcode(self):
        """Always report HTTP 200."""
        return 200

    def read(self):
        """Return the JSON body on the first call, ``b""`` afterwards."""
        if not self._read:
            self._read = True
            return self._content
        return b""

    @property
    def headers(self):
        """Response headers, including a pagination ``Link`` when configured."""
        result = {"x-ratelimit-remaining": "5000"}
        if self._link_header:
            result["Link"] = self._link_header
        return result
def test_backup_pulls_incremental_stops_before_fetching_old_pages(
    create_args, tmp_path
):
    """An incremental backup stops paginating pulls once the (sorted) results
    fall behind the checkpoint, so the advertised second page is never fetched
    and only pulls newer than the checkpoint are written to disk."""
    args = create_args(include_pulls=True, incremental=True)
    args.since = "2026-04-26T08:13:46Z"
    repository = {"full_name": "owner/repo"}

    # Canned responses, consumed in request order:
    #   1) state=open          -> empty page
    #   2) state=closed page 1 -> one pull newer and one older than the
    #      checkpoint, plus a Link header advertising page 2
    #   3) state=closed page 2 -> must never be requested
    canned_responses = [
        MockHTTPResponse([]),
        MockHTTPResponse(
            [
                {
                    "number": 2,
                    "title": "new pull",
                    "updated_at": "2026-04-26T09:00:00Z",
                },
                {
                    "number": 1,
                    "title": "old pull",
                    "updated_at": "2026-04-26T07:00:00Z",
                },
            ],
            link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
        ),
        MockHTTPResponse(
            [
                {
                    "number": 0,
                    "title": "older pull on page 2",
                    "updated_at": "2026-04-25T07:00:00Z",
                }
            ]
        ),
    ]

    requests_made = []

    def mock_urlopen(request, *_args, **_kwargs):
        # Record the URL and serve the next canned response.
        requests_made.append(request.get_full_url())
        return canned_responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        github_backup.backup_pulls(
            args, tmp_path, repository, "https://api.github.com/repos"
        )

    # Only the open page and the first closed page were fetched.
    assert len(requests_made) == 2
    assert "state=open" in requests_made[0]
    assert "state=closed" in requests_made[1]
    assert all("page=2" not in url for url in requests_made)
    # Only the pull newer than the checkpoint was persisted.
    assert os.path.exists(tmp_path / "pulls" / "2.json")
    assert not os.path.exists(tmp_path / "pulls" / "1.json")
    assert not os.path.exists(tmp_path / "pulls" / "0.json")

View File

@@ -16,7 +16,7 @@ def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch):
repository = {"full_name": "owner/repo"}
calls = []
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
calls.append((template, query_args))
if template == "https://api.github.com/repos/owner/repo/pulls":
if query_args["state"] == "open":
@@ -73,7 +73,7 @@ def test_pull_reviews_backfill_ignores_repository_checkpoint(
args.since = "2026-01-01T00:00:00Z"
repository = {"full_name": "owner/repo"}
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
if template == "https://api.github.com/repos/owner/repo/pulls":
if query_args["state"] == "open":
return [
@@ -117,7 +117,7 @@ def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoi
pulls_dir.mkdir()
(pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
if template == "https://api.github.com/repos/owner/repo/pulls":
if query_args["state"] == "open":
return [
@@ -169,7 +169,7 @@ def test_pull_reviews_preserves_existing_optional_pull_data(
f,
)
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
if template == "https://api.github.com/repos/owner/repo/pulls":
if query_args["state"] == "open":
return [
@@ -213,7 +213,7 @@ def test_pull_reviews_does_not_advance_checkpoint_on_review_error(
pulls_dir.mkdir()
(pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
if template == "https://api.github.com/repos/owner/repo/pulls":
if query_args["state"] == "open":
return [