Merge pull request #504 from mrexodia/per-resource-last-update

Implement per-resource last_update timestamps
Jose Diaz-Gonzalez
2026-04-29 06:25:05 -04:00
committed by GitHub
4 changed files with 348 additions and 25 deletions

View File

@@ -7,6 +7,11 @@ Unreleased
optional attachment downloads, and per-repository incremental checkpoints.
- Add pull request review backups with ``--pull-reviews`` and one-time
incremental backfill for existing backups.
- Store incremental ``last_update`` checkpoints per repository resource instead
of using one global checkpoint for the whole output directory. Existing
backups use the legacy global checkpoint as a migration fallback, and the
legacy file is removed once existing issue/pull backups have resource
checkpoints (#62).
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.

View File

@@ -347,15 +347,19 @@ About pull request reviews
Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body.
``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing pull request checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added.
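For example, enabling review backups on an existing incremental backup might look like the following (an illustrative sketch assuming the usual ``github-backup`` entry point; the user name and output directory are placeholders)::

    github-backup USER --incremental --pulls --pull-reviews -o /path/to/backup

The first such run performs the one-time review backfill described above; subsequent runs only fetch reviews for pull requests updated since the per-repository ``reviews_last_update`` checkpoint.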
Incremental Backup
------------------
Using (``-i, --incremental``) will only request new data from the API **since the last successful resource backup**. e.g. only request issues from the API since the last issue backup for that repository.
Incremental checkpoints for issue and pull request API backups are stored per resource in that repository's backup directory (for example ``repositories/{repo}/issues/last_update``, ``repositories/{repo}/pulls/last_update`` or ``starred/{owner}/{repo}/pulls/last_update``). Older versions stored a single global ``last_update`` file in the output directory root. During migration, the legacy global checkpoint is used as a fallback only for resource directories that already contain backup data but do not yet have their own checkpoint. New repositories or newly enabled resources with no existing data get a full backup instead of inheriting an unrelated global checkpoint.
After all existing issue and pull request resource directories have per-resource checkpoints, the legacy global ``last_update`` file is removed automatically.
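For example, a fully migrated incremental backup might look like this (illustrative repository and file names; only sample backup data is shown alongside the ``last_update`` checkpoints)::

    output-dir/
        repositories/
            my-repo/
                issues/
                    1.json
                    last_update
                pulls/
                    2.json
                    last_update
        starred/
            some-owner/
                some-repo/
                    pulls/
                        last_update

Once every existing issue/pull resource directory has its own checkpoint, no global ``last_update`` file remains at the ``output-dir/`` root.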
This means any blocking errors on previous runs can cause missing data in backups for the affected repository resource.
Using (``--incremental-by-files``) will request new data from the API **based on when the file was last modified on the filesystem**. e.g. if you modify a backed-up file yourself, the next run may miss updates.
@@ -368,7 +372,7 @@ Known blocking errors
Some errors will block the backup run by exiting the script, e.g. receiving a 403 Forbidden error from the GitHub API.
If the incremental argument is used, per-resource checkpoints are only advanced after that resource's backup work completes. A blocking error can still abort the overall run, but repositories and resources that were not processed will keep their previous checkpoints.
It is therefore recommended to use the incremental argument only if the output is being actively monitored, or to complement it with periodic full non-incremental runs, to avoid unexpected missing data in regular backup runs.

View File

@@ -1928,26 +1928,138 @@ def filter_repositories(args, unfiltered_repositories):
return repositories
INCREMENTAL_LAST_UPDATE_FILENAME = "last_update"
INCREMENTAL_RESOURCE_DIRECTORIES = ("issues", "pulls")
def get_repository_checkpoint_time(repository):
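    """Return the newest of the repository's updated_at/pushed_at
    timestamps, falling back to the current UTC time when neither
    is available."""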
timestamps = [
timestamp
for timestamp in (repository.get("updated_at"), repository.get("pushed_at"))
if timestamp
]
if timestamps:
return max(timestamps)
    # Fall back to "now" in UTC to match GitHub's ISO-8601 "Z" timestamps.
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
def resource_backup_exists(resource_cwd):
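    """Return True when the resource directory holds backup data other
    than checkpoint files and unfinished .temp downloads."""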
if not os.path.isdir(resource_cwd):
return False
ignored_names = {
INCREMENTAL_LAST_UPDATE_FILENAME,
PULL_REVIEWS_LAST_UPDATE_FILENAME,
}
for name in os.listdir(resource_cwd):
if name in ignored_names or name.endswith(".temp"):
continue
return True
return False
def read_legacy_last_update(args, output_directory):
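    """Return (path, value) for the legacy global checkpoint file;
    value is None when the file is absent, and both are None for
    non-incremental runs."""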
if not args.incremental:
return None, None
last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME)
if os.path.exists(last_update_path):
return last_update_path, open(last_update_path).read().strip()
return last_update_path, None
def read_resource_last_update(args, resource_cwd, legacy_last_update=None):
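    """Return this resource's own checkpoint, falling back to the legacy
    global checkpoint only when the resource already has backup data."""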
if not args.incremental:
return None
last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME)
if os.path.exists(last_update_path):
return open(last_update_path).read().strip()
if legacy_last_update and resource_backup_exists(resource_cwd):
return legacy_last_update
return None
def write_resource_last_update(args, resource_cwd, repository):
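    """Record the repository's checkpoint time for this resource; a
    no-op for non-incremental runs."""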
if not args.incremental:
return
mkdir_p(resource_cwd)
last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME)
open(last_update_path, "w").write(get_repository_checkpoint_time(repository))
def iter_incremental_resource_dirs(output_directory):
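    """Yield every issues/pulls resource directory under the
    repositories/ and starred/ trees of the output directory."""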
repositories_dir = os.path.join(output_directory, "repositories")
if os.path.isdir(repositories_dir):
for repository_name in os.listdir(repositories_dir):
repo_cwd = os.path.join(repositories_dir, repository_name)
if not os.path.isdir(repo_cwd):
continue
for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES:
yield os.path.join(repo_cwd, resource_name)
starred_dir = os.path.join(output_directory, "starred")
if os.path.isdir(starred_dir):
for owner_name in os.listdir(starred_dir):
owner_cwd = os.path.join(starred_dir, owner_name)
if not os.path.isdir(owner_cwd):
continue
for repository_name in os.listdir(owner_cwd):
repo_cwd = os.path.join(owner_cwd, repository_name)
if not os.path.isdir(repo_cwd):
continue
for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES:
yield os.path.join(repo_cwd, resource_name)
def has_unmigrated_incremental_resources(output_directory):
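    """Return True while any resource directory still has backup data
    but no per-resource checkpoint of its own."""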
for resource_cwd in iter_incremental_resource_dirs(output_directory):
last_update_path = os.path.join(
resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME
)
if resource_backup_exists(resource_cwd) and not os.path.exists(
last_update_path
):
return True
return False
def remove_legacy_last_update_if_migrated(
args, output_directory, legacy_last_update_path
):
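    """Delete the legacy global last_update file once every existing
    issue/pull backup directory has a per-resource checkpoint."""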
if not args.incremental or not legacy_last_update_path:
return
if not os.path.exists(legacy_last_update_path):
return
if has_unmigrated_incremental_resources(output_directory):
logger.info(
"Keeping legacy global last_update until all existing issue/pull "
"backups have per-resource checkpoints"
)
return
os.remove(legacy_last_update_path)
logger.info(
"Removed legacy global last_update after migrating incremental checkpoints"
)
def backup_repositories(args, output_directory, repositories):
logger.info("Backing up repositories")
repos_template = "https://{0}/repos".format(get_github_api_host(args))
legacy_last_update_path, legacy_last_update = read_legacy_last_update(
args, output_directory
)
incremental_resource_work_attempted = False
for repository in repositories:
if repository.get("updated_at") and repository["updated_at"] > last_update:
last_update = repository["updated_at"]
elif repository.get("pushed_at") and repository["pushed_at"] > last_update:
last_update = repository["pushed_at"]
if repository.get("is_gist"):
repo_cwd = os.path.join(output_directory, "gists", repository["id"])
elif repository.get("is_starred"):
@@ -2010,10 +2122,22 @@ def backup_repositories(args, output_directory, repositories):
no_prune=args.no_prune,
)
if args.include_issues or args.include_everything:
incremental_resource_work_attempted = True
issue_cwd = os.path.join(repo_cwd, "issues")
args.since = read_resource_last_update(
args, issue_cwd, legacy_last_update
)
backup_issues(args, repo_cwd, repository, repos_template)
write_resource_last_update(args, issue_cwd, repository)
if args.include_pulls or args.include_everything:
incremental_resource_work_attempted = True
pulls_cwd = os.path.join(repo_cwd, "pulls")
args.since = read_resource_last_update(
args, pulls_cwd, legacy_last_update
)
backup_pulls(args, repo_cwd, repository, repos_template)
write_resource_last_update(args, pulls_cwd, repository)
if args.include_discussions or args.include_everything:
backup_discussions(args, repo_cwd, repository)
@@ -2021,7 +2145,9 @@ def backup_repositories(args, output_directory, repositories):
if args.include_milestones or args.include_everything:
backup_milestones(args, repo_cwd, repository, repos_template)
if args.include_security_advisories or (
args.include_everything and not repository.get("private", False)
):
backup_security_advisories(args, repo_cwd, repository, repos_template)
if args.include_labels or args.include_everything:
@@ -2045,11 +2171,10 @@ def backup_repositories(args, output_directory, repositories):
logger.info(f"Skipping remaining resources for {repository['full_name']}")
continue
if args.incremental:
if incremental_resource_work_attempted:
remove_legacy_last_update_if_migrated(
args, output_directory, legacy_last_update_path
)
def _repository_owner_name(repository):

View File

@@ -0,0 +1,189 @@
"""Tests for per-resource incremental checkpoints."""
import json
import os
from github_backup import github_backup
def _repo(name, updated_at, pushed_at=None):
return {
"name": name,
"full_name": "owner/{0}".format(name),
"owner": {"login": "owner"},
"clone_url": "https://github.com/owner/{0}.git".format(name),
"private": False,
"fork": False,
"has_wiki": False,
"updated_at": updated_at,
"pushed_at": pushed_at,
}
def test_incremental_uses_per_resource_last_update(
create_args, tmp_path, monkeypatch
):
args = create_args(incremental=True, include_issues=True)
repositories = [
_repo("repo-one", "2026-02-01T00:00:00Z"),
_repo("repo-two", "2026-03-01T00:00:00Z"),
]
repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues"
repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues"
repo_one_issues.mkdir(parents=True)
repo_two_issues.mkdir(parents=True)
(repo_one_issues / "last_update").write_text("2026-01-01T00:00:00Z")
(repo_two_issues / "last_update").write_text("2025-01-01T00:00:00Z")
seen_since = []
def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
seen_since.append((repository["name"], passed_args.since))
monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)
github_backup.backup_repositories(args, tmp_path, repositories)
assert seen_since == [
("repo-one", "2026-01-01T00:00:00Z"),
("repo-two", "2025-01-01T00:00:00Z"),
]
assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z"
assert (repo_two_issues / "last_update").read_text() == "2026-03-01T00:00:00Z"
assert not os.path.exists(tmp_path / "last_update")
def test_incremental_uses_independent_issue_and_pull_checkpoints(
create_args, tmp_path, monkeypatch
):
args = create_args(incremental=True, include_issues=True, include_pulls=True)
repository = _repo("repo-one", "2026-02-01T00:00:00Z")
repo_dir = tmp_path / "repositories" / "repo-one"
issues_dir = repo_dir / "issues"
pulls_dir = repo_dir / "pulls"
issues_dir.mkdir(parents=True)
pulls_dir.mkdir(parents=True)
(issues_dir / "last_update").write_text("2026-01-01T00:00:00Z")
(pulls_dir / "last_update").write_text("2025-01-01T00:00:00Z")
seen_since = []
def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
seen_since.append(("issues", passed_args.since))
def fake_backup_pulls(passed_args, repo_cwd, repository, repos_template):
seen_since.append(("pulls", passed_args.since))
monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)
monkeypatch.setattr(github_backup, "backup_pulls", fake_backup_pulls)
github_backup.backup_repositories(args, tmp_path, [repository])
assert seen_since == [
("issues", "2026-01-01T00:00:00Z"),
("pulls", "2025-01-01T00:00:00Z"),
]
assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"
assert (pulls_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"
def test_incremental_uses_legacy_global_last_update_for_existing_resource_backup(
create_args, tmp_path, monkeypatch
):
args = create_args(incremental=True, include_issues=True)
repository = _repo("repo-one", "2026-02-01T00:00:00Z")
(tmp_path / "last_update").write_text("2026-01-01T00:00:00Z")
issues_dir = tmp_path / "repositories" / "repo-one" / "issues"
issues_dir.mkdir(parents=True)
with open(issues_dir / "1.json", "w", encoding="utf-8") as f:
json.dump({"number": 1}, f)
seen_since = []
def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
seen_since.append(passed_args.since)
monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)
github_backup.backup_repositories(args, tmp_path, [repository])
assert seen_since == ["2026-01-01T00:00:00Z"]
assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"
assert not os.path.exists(tmp_path / "last_update")
def test_incremental_does_not_use_legacy_global_last_update_for_new_resource_backup(
create_args, tmp_path, monkeypatch
):
args = create_args(incremental=True, include_issues=True)
repository = _repo("repo-one", "2026-02-01T00:00:00Z")
(tmp_path / "last_update").write_text("2099-01-01T00:00:00Z")
seen_since = []
def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
seen_since.append(passed_args.since)
monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)
github_backup.backup_repositories(args, tmp_path, [repository])
assert seen_since == [None]
assert (
tmp_path / "repositories" / "repo-one" / "issues" / "last_update"
).read_text() == "2026-02-01T00:00:00Z"
assert not os.path.exists(tmp_path / "last_update")
def test_incremental_keeps_legacy_global_last_update_until_all_existing_resources_migrated(
create_args, tmp_path, monkeypatch
):
args = create_args(incremental=True, include_issues=True)
repository = _repo("repo-one", "2026-02-01T00:00:00Z")
(tmp_path / "last_update").write_text("2026-01-01T00:00:00Z")
repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues"
repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues"
repo_one_issues.mkdir(parents=True)
repo_two_issues.mkdir(parents=True)
with open(repo_one_issues / "1.json", "w", encoding="utf-8") as f:
json.dump({"number": 1}, f)
with open(repo_two_issues / "2.json", "w", encoding="utf-8") as f:
json.dump({"number": 2}, f)
def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
pass
monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)
github_backup.backup_repositories(args, tmp_path, [repository])
assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z"
assert not os.path.exists(repo_two_issues / "last_update")
assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z"
def test_incremental_does_not_remove_legacy_checkpoint_without_resource_work(
create_args, tmp_path
):
args = create_args(incremental=True, include_repository=True)
repository = _repo("repo-one", "2026-02-01T00:00:00Z")
(tmp_path / "last_update").write_text("2026-01-01T00:00:00Z")
github_backup.backup_repositories(args, tmp_path, [repository])
assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z"
assert not os.path.exists(
tmp_path / "repositories" / "repo-one" / "issues" / "last_update"
)
def test_repository_checkpoint_time_uses_newest_available_repo_timestamp():
repository = _repo(
"repo-one",
updated_at="2026-02-01T00:00:00Z",
pushed_at="2026-03-01T00:00:00Z",
)
assert github_backup.get_repository_checkpoint_time(repository) == (
"2026-03-01T00:00:00Z"
)