Merge pull request #505 from mrexodia/redundant-fetches
Reduce redundant fetches
@@ -12,6 +12,12 @@ Unreleased
   backups use the legacy global checkpoint as a migration fallback, and the
   legacy file is removed once existing issue/pull backups have resource
   checkpoints (#62).
+- Stop paginating pull requests during incremental backups once the sorted
+  results are at or older than the active checkpoint.
+- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly
+  matches the active incremental checkpoint.
+- Avoid extra release asset list requests by using asset metadata already
+  included in GitHub's releases response.
 - Add ``--token-from-gh`` to read authentication from ``gh auth token``.

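Note: the ``--token-from-gh`` flag reads the token from the GitHub CLI. A minimal sketch of that approach, illustrative only (the helper name is hypothetical, and it assumes ``gh`` is installed and logged in; this is not the project's actual implementation):

```python
import subprocess


def token_from_gh():
    # "gh auth token" prints the OAuth token for the current gh login.
    result = subprocess.run(
        ["gh", "auth", "token"],
        capture_output=True,
        text=True,
        check=True,  # raises CalledProcessError if gh is not authenticated
    )
    return result.stdout.strip()
```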
@@ -717,11 +717,12 @@ def calculate_retry_delay(attempt, headers):
     return delay + random.uniform(0, delay * 0.1)


-def retrieve_data(args, template, query_args=None, paginated=True):
+def retrieve_data(args, template, query_args=None, paginated=True, lazy=False):
     """
     Fetch the data from GitHub API.

-    Handle both single requests and pagination with yield of individual dicts.
+    Handle both single requests and pagination. Returns a list by default, or
+    a generator when lazy=True so callers can stop before fetching every page.
     Handles throttling, retries, read errors, and DMCA takedowns.
     """
     query_args = query_args or {}
@@ -851,6 +852,9 @@ def retrieve_data(args, template, query_args=None, paginated=True):
             ):
                 break  # No more data

+    if lazy:
+        return fetch_all()
+
     return list(fetch_all())
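Note: the lazy flag preserves the public contract (a list) while letting incremental callers consume pages as a generator and stop early. A self-contained sketch of the pattern, with fetch_page and the canned PAGES data as hypothetical stand-ins for the real HTTP requests:

```python
PAGES = [[{"id": 1}, {"id": 2}], [{"id": 3}], []]  # canned pages for the demo


def fetch_page(page):
    # Hypothetical stand-in for one paginated HTTP request.
    return PAGES[page - 1] if page <= len(PAGES) else []


def retrieve_items(lazy=False):
    def fetch_all():
        page = 1
        while True:
            items = fetch_page(page)
            if not items:
                break  # no more data
            yield from items
            page += 1

    if lazy:
        return fetch_all()  # generator: later pages are fetched only on demand
    return list(fetch_all())  # existing callers still get a list


assert retrieve_items() == [{"id": 1}, {"id": 2}, {"id": 3}]
```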
@@ -2229,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None):
             if updated_at and (newest_seen is None or updated_at > newest_seen):
                 newest_seen = updated_at

-            if since and updated_at and updated_at < since:
+            if since and updated_at and updated_at <= since:
                 stop = True
                 break
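Note: these checkpoint checks compare raw ISO-8601 UTC strings, which sort lexicographically in chronological order, so no datetime parsing is needed; switching ``<`` to ``<=`` makes the checkpoint exclusive, skipping an item whose ``updated_at`` exactly equals the stored timestamp. For example:

```python
# Fixed-width, zero-padded ISO-8601 UTC timestamps compare correctly as strings.
since = "2026-01-01T00:00:00Z"

assert "2025-12-31T23:59:59Z" < since   # older: already backed up
assert "2026-01-01T00:00:00Z" <= since  # equal: now skipped too (the fix)
assert "2026-01-01T00:00:01Z" > since   # newer: still fetched
```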
@@ -2650,26 +2654,28 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
             newest_pull_update = updated_at

     def pull_is_due_for_repository_checkpoint(pull):
-        return not repository_since or pull["updated_at"] >= repository_since
+        return not repository_since or pull["updated_at"] > repository_since

     if not args.include_pull_details:
         pull_states = ["open", "closed"]
         for pull_state in pull_states:
             query_args["state"] = pull_state
-            _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
-            for pull in _pulls:
+            for pull in retrieve_data(
+                args, _pulls_template, query_args=query_args, lazy=True
+            ):
                 track_newest_pull_update(pull)
-                if pulls_since and pull["updated_at"] < pulls_since:
+                if pulls_since and pull["updated_at"] <= pulls_since:
                     break
-                if not pulls_since or pull["updated_at"] >= pulls_since:
+                if not pulls_since or pull["updated_at"] > pulls_since:
                     pulls[pull["number"]] = pull
     else:
-        _pulls = retrieve_data(args, _pulls_template, query_args=query_args)
-        for pull in _pulls:
+        for pull in retrieve_data(
+            args, _pulls_template, query_args=query_args, lazy=True
+        ):
             track_newest_pull_update(pull)
-            if pulls_since and pull["updated_at"] < pulls_since:
+            if pulls_since and pull["updated_at"] <= pulls_since:
                 break
-            if not pulls_since or pull["updated_at"] >= pulls_since:
+            if not pulls_since or pull["updated_at"] > pulls_since:
                 if pull_is_due_for_repository_checkpoint(pull):
                     pulls[pull["number"]] = retrieve_data(
                         args,
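Note: breaking out of a loop over the lazy generator abandons it, so pages past the checkpoint are never requested. A simplified consumer-side sketch (the real code also re-checks the boundary before storing, and I assume the query args request pulls sorted newest-update-first, e.g. sort=updated with descending direction):

```python
def lazy_pulls():
    # Hypothetical stand-in for the lazy retrieve_data generator,
    # yielding pulls in descending updated_at order.
    yield {"number": 2, "updated_at": "2026-04-26T09:00:00Z"}
    yield {"number": 1, "updated_at": "2026-04-26T08:13:46Z"}  # == checkpoint
    raise AssertionError("never reached: the caller breaks first")


pulls_since = "2026-04-26T08:13:46Z"  # active incremental checkpoint
pulls = {}
for pull in lazy_pulls():
    if pulls_since and pull["updated_at"] <= pulls_since:
        break  # this pull and everything after it is already backed up
    pulls[pull["number"]] = pull

assert list(pulls) == [2]
```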
@@ -2913,7 +2919,12 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
             written_count += 1

         if include_assets and not skip_assets:
-            assets = retrieve_data(args, release["assets_url"])
+            # The releases list API already includes release asset metadata. Use
+            # it to avoid an extra /releases/{id}/assets request per release.
+            # Keep a fallback for older/enterprise responses that might omit it.
+            assets = release.get("assets")
+            if assets is None:
+                assets = retrieve_data(args, release["assets_url"])
             if len(assets) > 0:
                 # give release asset files somewhere to live & download them (not including source archives)
                 release_assets_cwd = os.path.join(release_cwd, release_name_safe)
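Note: testing ``is None`` rather than writing ``release.get("assets") or ...`` matters because an embedded empty list is a real answer ("this release has no assets") and must not trigger the fallback request. A tiny sketch of the distinction, with fetch_assets as a hypothetical stand-in for the extra API call:

```python
def fetch_assets(release):
    # Hypothetical stand-in for the extra /releases/{id}/assets request.
    raise AssertionError("should not be called when assets are embedded")


release = {"assets": []}  # embedded but empty: no assets, no extra request
assets = release.get("assets")
if assets is None:  # only a truly absent "assets" key falls back
    assets = fetch_assets(release)
assert assets == []
```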
@@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args):
     )


+def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args):
+    args = create_args()
+    repository = {"full_name": "owner/repo"}
+
+    page = {
+        "repository": {
+            "hasDiscussionsEnabled": True,
+            "discussions": {
+                "totalCount": 1,
+                "nodes": [
+                    {
+                        "number": 1,
+                        "title": "already backed up",
+                        "updatedAt": "2026-01-01T00:00:00Z",
+                    },
+                ],
+                "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"},
+            },
+        }
+    }
+
+    with patch(
+        "github_backup.github_backup.retrieve_graphql_data", return_value=page
+    ) as mock_retrieve:
+        summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries(
+            args, repository, since="2026-01-01T00:00:00Z"
+        )
+
+    assert enabled is True
+    assert total == 1
+    assert newest == "2026-01-01T00:00:00Z"
+    assert summaries == []
+    assert mock_retrieve.call_count == 1
+
+
 def test_retrieve_discussion_summaries_disabled_discussions(create_args):
     args = create_args()
     repository = {"full_name": "owner/repo"}
tests/test_pull_incremental_pagination.py (new file, 131 lines)
@@ -0,0 +1,131 @@
"""Tests for incremental pull request pagination."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
from github_backup import github_backup
|
||||
|
||||
|
||||
class MockHTTPResponse:
|
||||
def __init__(self, data, link_header=None):
|
||||
self._content = json.dumps(data).encode("utf-8")
|
||||
self._link_header = link_header
|
||||
self._read = False
|
||||
self.reason = "OK"
|
||||
|
||||
def getcode(self):
|
||||
return 200
|
||||
|
||||
def read(self):
|
||||
if self._read:
|
||||
return b""
|
||||
self._read = True
|
||||
return self._content
|
||||
|
||||
@property
|
||||
def headers(self):
|
||||
headers = {"x-ratelimit-remaining": "5000"}
|
||||
if self._link_header:
|
||||
headers["Link"] = self._link_header
|
||||
return headers
|
||||
|
||||
|
||||
def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path):
|
||||
args = create_args(include_pulls=True, incremental=True)
|
||||
args.since = "2026-04-26T08:13:46Z"
|
||||
repository = {"full_name": "owner/repo"}
|
||||
|
||||
responses = [
|
||||
MockHTTPResponse([]),
|
||||
MockHTTPResponse(
|
||||
[
|
||||
{
|
||||
"number": 1,
|
||||
"title": "already backed up",
|
||||
"updated_at": "2026-04-26T08:13:46Z",
|
||||
},
|
||||
],
|
||||
link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
|
||||
),
|
||||
MockHTTPResponse(
|
||||
[
|
||||
{
|
||||
"number": 0,
|
||||
"title": "older pull on page 2",
|
||||
"updated_at": "2026-04-25T07:00:00Z",
|
||||
}
|
||||
]
|
||||
),
|
||||
]
|
||||
requests_made = []
|
||||
|
||||
def mock_urlopen(request, *args, **kwargs):
|
||||
requests_made.append(request.get_full_url())
|
||||
return responses[len(requests_made) - 1]
|
||||
|
||||
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
|
||||
github_backup.backup_pulls(
|
||||
args, tmp_path, repository, "https://api.github.com/repos"
|
||||
)
|
||||
|
||||
assert len(requests_made) == 2
|
||||
assert "state=open" in requests_made[0]
|
||||
assert "state=closed" in requests_made[1]
|
||||
assert all("page=2" not in url for url in requests_made)
|
||||
assert not os.path.exists(tmp_path / "pulls" / "1.json")
|
||||
assert not os.path.exists(tmp_path / "pulls" / "0.json")
|
||||
|
||||
|
||||
def test_backup_pulls_incremental_stops_before_fetching_old_pages(
|
||||
create_args, tmp_path
|
||||
):
|
||||
args = create_args(include_pulls=True, incremental=True)
|
||||
args.since = "2026-04-26T08:13:46Z"
|
||||
repository = {"full_name": "owner/repo"}
|
||||
|
||||
responses = [
|
||||
MockHTTPResponse([]),
|
||||
MockHTTPResponse(
|
||||
[
|
||||
{
|
||||
"number": 2,
|
||||
"title": "new pull",
|
||||
"updated_at": "2026-04-26T09:00:00Z",
|
||||
},
|
||||
{
|
||||
"number": 1,
|
||||
"title": "old pull",
|
||||
"updated_at": "2026-04-26T07:00:00Z",
|
||||
},
|
||||
],
|
||||
link_header='<https://api.github.com/repos/owner/repo/pulls?per_page=100&state=closed&page=2>; rel="next"',
|
||||
),
|
||||
MockHTTPResponse(
|
||||
[
|
||||
{
|
||||
"number": 0,
|
||||
"title": "older pull on page 2",
|
||||
"updated_at": "2026-04-25T07:00:00Z",
|
||||
}
|
||||
]
|
||||
),
|
||||
]
|
||||
requests_made = []
|
||||
|
||||
def mock_urlopen(request, *args, **kwargs):
|
||||
requests_made.append(request.get_full_url())
|
||||
return responses[len(requests_made) - 1]
|
||||
|
||||
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
|
||||
github_backup.backup_pulls(
|
||||
args, tmp_path, repository, "https://api.github.com/repos"
|
||||
)
|
||||
|
||||
assert len(requests_made) == 2
|
||||
assert "state=open" in requests_made[0]
|
||||
assert "state=closed" in requests_made[1]
|
||||
assert all("page=2" not in url for url in requests_made)
|
||||
assert os.path.exists(tmp_path / "pulls" / "2.json")
|
||||
assert not os.path.exists(tmp_path / "pulls" / "1.json")
|
||||
assert not os.path.exists(tmp_path / "pulls" / "0.json")
|
||||
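Note: the MockHTTPResponse above drives pagination through the HTTP Link header. For reference, a simplified sketch of extracting the rel="next" target from an RFC 8288 Link header; the project's actual parsing may differ:

```python
import re


def next_page_url(link_header):
    # Return the rel="next" URL from a Link header, or None.
    if not link_header:
        return None
    for part in link_header.split(","):
        match = re.search(r'<([^>]+)>;\s*rel="next"', part)
        if match:
            return match.group(1)
    return None


header = '<https://api.github.com/repos/owner/repo/pulls?page=2>; rel="next"'
assert next_page_url(header) == "https://api.github.com/repos/owner/repo/pulls?page=2"
```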
@@ -16,7 +16,7 @@ def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch):
     repository = {"full_name": "owner/repo"}
     calls = []

-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         calls.append((template, query_args))
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
@@ -73,7 +73,7 @@ def test_pull_reviews_backfill_ignores_repository_checkpoint(
     args.since = "2026-01-01T00:00:00Z"
     repository = {"full_name": "owner/repo"}

-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
@@ -117,7 +117,7 @@ def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoi
     pulls_dir.mkdir()
     (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")

-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
@@ -169,7 +169,7 @@ def test_pull_reviews_preserves_existing_optional_pull_data(
             f,
         )

-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
@@ -213,7 +213,7 @@ def test_pull_reviews_does_not_advance_checkpoint_on_review_error(
     pulls_dir.mkdir()
     (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z")

-    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True):
+    def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
         if template == "https://api.github.com/repos/owner/repo/pulls":
             if query_args["state"] == "open":
                 return [
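Note: the fake_retrieve_data changes above all make the same move: accepting ``**kwargs`` keeps a test double compatible when the real signature grows a new keyword (here, lazy=True). A minimal illustration:

```python
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
    # Swallowing unknown keywords means new options in the real function
    # do not break existing fixtures.
    return []


fake_retrieve_data(None, "https://api.github.com/x", lazy=True)  # no TypeError
```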
tests/test_releases.py (new file, 95 lines)
@@ -0,0 +1,95 @@
"""Tests for release backup behavior."""
|
||||
|
||||
from github_backup import github_backup
|
||||
|
||||
|
||||
def test_backup_releases_uses_embedded_assets_without_extra_asset_list_request(
|
||||
create_args, tmp_path, monkeypatch
|
||||
):
|
||||
args = create_args(include_releases=True, include_assets=True)
|
||||
repository = {"full_name": "owner/repo", "name": "repo"}
|
||||
calls = []
|
||||
downloads = []
|
||||
|
||||
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
|
||||
calls.append(template)
|
||||
if template == "https://api.github.com/repos/owner/repo/releases":
|
||||
return [
|
||||
{
|
||||
"tag_name": "v1.0.0",
|
||||
"created_at": "2026-01-01T00:00:00Z",
|
||||
"updated_at": "2026-01-01T00:00:00Z",
|
||||
"prerelease": False,
|
||||
"draft": False,
|
||||
"assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets",
|
||||
"assets": [
|
||||
{
|
||||
"name": "artifact.zip",
|
||||
"url": "https://api.github.com/repos/owner/repo/releases/assets/1",
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
raise AssertionError("Unexpected API request: {0}".format(template))
|
||||
|
||||
def fake_download_file(url, path, auth, as_app=False, fine=False):
|
||||
downloads.append((url, path))
|
||||
|
||||
monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
|
||||
monkeypatch.setattr(github_backup, "download_file", fake_download_file)
|
||||
|
||||
github_backup.backup_releases(
|
||||
args,
|
||||
tmp_path,
|
||||
repository,
|
||||
"https://api.github.com/repos",
|
||||
include_assets=True,
|
||||
)
|
||||
|
||||
assert calls == ["https://api.github.com/repos/owner/repo/releases"]
|
||||
assert downloads == [
|
||||
(
|
||||
"https://api.github.com/repos/owner/repo/releases/assets/1",
|
||||
str(tmp_path / "releases" / "v1.0.0" / "artifact.zip"),
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def test_backup_releases_falls_back_to_assets_url_when_assets_missing(
|
||||
create_args, tmp_path, monkeypatch
|
||||
):
|
||||
args = create_args(include_releases=True, include_assets=True)
|
||||
repository = {"full_name": "owner/repo", "name": "repo"}
|
||||
calls = []
|
||||
|
||||
def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs):
|
||||
calls.append(template)
|
||||
if template == "https://api.github.com/repos/owner/repo/releases":
|
||||
return [
|
||||
{
|
||||
"tag_name": "v1.0.0",
|
||||
"created_at": "2026-01-01T00:00:00Z",
|
||||
"updated_at": "2026-01-01T00:00:00Z",
|
||||
"prerelease": False,
|
||||
"draft": False,
|
||||
"assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets",
|
||||
}
|
||||
]
|
||||
if template == "https://api.github.com/repos/owner/repo/releases/1/assets":
|
||||
return []
|
||||
raise AssertionError("Unexpected API request: {0}".format(template))
|
||||
|
||||
monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data)
|
||||
|
||||
github_backup.backup_releases(
|
||||
args,
|
||||
tmp_path,
|
||||
repository,
|
||||
"https://api.github.com/repos",
|
||||
include_assets=True,
|
||||
)
|
||||
|
||||
assert calls == [
|
||||
"https://api.github.com/repos/owner/repo/releases",
|
||||
"https://api.github.com/repos/owner/repo/releases/1/assets",
|
||||
]
|
||||