From b3a8241c9ab5930acfae2014d6a48a4feabe95ae Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 15:03:48 +0200 Subject: [PATCH] Implement per-resource last_update timestamps Closes #62 --- CHANGES.rst | 5 + README.rst | 12 +- github_backup/github_backup.py | 167 +++++++++++++++++--- tests/test_incremental_per_repository.py | 189 +++++++++++++++++++++++ 4 files changed, 348 insertions(+), 25 deletions(-) create mode 100644 tests/test_incremental_per_repository.py diff --git a/CHANGES.rst b/CHANGES.rst index b790ce1..6cf9f17 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,11 @@ Unreleased optional attachment downloads, and per-repository incremental checkpoints. - Add pull request review backups with ``--pull-reviews`` and one-time incremental backfill for existing backups. +- Store incremental ``last_update`` checkpoints per repository resource instead + of using one global checkpoint for the whole output directory. Existing + backups use the legacy global checkpoint as a migration fallback, and the + legacy file is removed once existing issue/pull backups have resource + checkpoints (#62). - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/README.rst b/README.rst index 52d7222..3a4be3b 100644 --- a/README.rst +++ b/README.rst @@ -347,15 +347,19 @@ About pull request reviews Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body. -``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. +``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing pull request checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. Incremental Backup ------------------ -Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**. e.g. only request issues from the API since the last run. +Using (``-i, --incremental``) will only request new data from the API **since the last successful resource backup**. e.g. only request issues from the API since the last issue backup for that repository. -This means any blocking errors on previous runs can cause a large amount of missing data in backups. +Incremental checkpoints for issue and pull request API backups are stored per resource in that repository's backup directory (for example ``repositories/{repo}/issues/last_update``, ``repositories/{repo}/pulls/last_update`` or ``starred/{owner}/{repo}/pulls/last_update``). Older versions stored a single global ``last_update`` file in the output directory root. During migration, the legacy global checkpoint is used as a fallback only for resource directories that already contain backup data but do not yet have their own checkpoint. New repositories or newly enabled resources with no existing data get a full backup instead of inheriting an unrelated global checkpoint. + +After all existing issue and pull request resource directories have per-resource checkpoints, the legacy global ``last_update`` file is removed automatically. + +This means any blocking errors on previous runs can cause missing data in backups for the affected repository resource. Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something. @@ -368,7 +372,7 @@ Known blocking errors Some errors will block the backup run by exiting the script. e.g. receiving a 403 Forbidden error from the Github API. -If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run. Potentially causing unexpected large amounts of missing data. +If the incremental argument is used, per-resource checkpoints are only advanced after that resource's backup work completes. A blocking error can still abort the overall run, but repositories and resources that were not processed will keep their previous checkpoints. It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or complimented with periodic full non-incremental runs, to avoid unexpected missing data in a regular backup runs. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 054d0c6..e56bb28 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1928,26 +1928,138 @@ def filter_repositories(args, unfiltered_repositories): return repositories +INCREMENTAL_LAST_UPDATE_FILENAME = "last_update" +INCREMENTAL_RESOURCE_DIRECTORIES = ("issues", "pulls") + + +def get_repository_checkpoint_time(repository): + timestamps = [ + timestamp + for timestamp in (repository.get("updated_at"), repository.get("pushed_at")) + if timestamp + ] + if timestamps: + return max(timestamps) + + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) + + +def resource_backup_exists(resource_cwd): + if not os.path.isdir(resource_cwd): + return False + + ignored_names = { + INCREMENTAL_LAST_UPDATE_FILENAME, + PULL_REVIEWS_LAST_UPDATE_FILENAME, + } + for name in os.listdir(resource_cwd): + if name in ignored_names or name.endswith(".temp"): + continue + return True + + return False + + +def read_legacy_last_update(args, output_directory): + if not args.incremental: + return None, None + + last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME) + if os.path.exists(last_update_path): + return last_update_path, open(last_update_path).read().strip() + + return last_update_path, None + + +def read_resource_last_update(args, resource_cwd, legacy_last_update=None): + if not args.incremental: + return None + + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + if os.path.exists(last_update_path): + return open(last_update_path).read().strip() + + if legacy_last_update and resource_backup_exists(resource_cwd): + return legacy_last_update + + return None + + +def write_resource_last_update(args, resource_cwd, repository): + if not args.incremental: + return + + mkdir_p(resource_cwd) + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + open(last_update_path, "w").write(get_repository_checkpoint_time(repository)) + + +def iter_incremental_resource_dirs(output_directory): + repositories_dir = os.path.join(output_directory, "repositories") + if os.path.isdir(repositories_dir): + for repository_name in os.listdir(repositories_dir): + repo_cwd = os.path.join(repositories_dir, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + starred_dir = os.path.join(output_directory, "starred") + if os.path.isdir(starred_dir): + for owner_name in os.listdir(starred_dir): + owner_cwd = os.path.join(starred_dir, owner_name) + if not os.path.isdir(owner_cwd): + continue + for repository_name in os.listdir(owner_cwd): + repo_cwd = os.path.join(owner_cwd, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + +def has_unmigrated_incremental_resources(output_directory): + for resource_cwd in iter_incremental_resource_dirs(output_directory): + last_update_path = os.path.join( + resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME + ) + if resource_backup_exists(resource_cwd) and not os.path.exists( + last_update_path + ): + return True + + return False + + +def remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path +): + if not args.incremental or not legacy_last_update_path: + return + if not os.path.exists(legacy_last_update_path): + return + if has_unmigrated_incremental_resources(output_directory): + logger.info( + "Keeping legacy global last_update until all existing issue/pull " + "backups have per-resource checkpoints" + ) + return + + os.remove(legacy_last_update_path) + logger.info( + "Removed legacy global last_update after migrating incremental checkpoints" + ) + + def backup_repositories(args, output_directory, repositories): logger.info("Backing up repositories") repos_template = "https://{0}/repos".format(get_github_api_host(args)) + legacy_last_update_path, legacy_last_update = read_legacy_last_update( + args, output_directory + ) + incremental_resource_work_attempted = False - if args.incremental: - last_update_path = os.path.join(output_directory, "last_update") - if os.path.exists(last_update_path): - args.since = open(last_update_path).read().strip() - else: - args.since = None - else: - args.since = None - - last_update = "0000-00-00T00:00:00Z" for repository in repositories: - if repository.get("updated_at") and repository["updated_at"] > last_update: - last_update = repository["updated_at"] - elif repository.get("pushed_at") and repository["pushed_at"] > last_update: - last_update = repository["pushed_at"] - if repository.get("is_gist"): repo_cwd = os.path.join(output_directory, "gists", repository["id"]) elif repository.get("is_starred"): @@ -2010,10 +2122,22 @@ def backup_repositories(args, output_directory, repositories): no_prune=args.no_prune, ) if args.include_issues or args.include_everything: + incremental_resource_work_attempted = True + issue_cwd = os.path.join(repo_cwd, "issues") + args.since = read_resource_last_update( + args, issue_cwd, legacy_last_update + ) backup_issues(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, issue_cwd, repository) if args.include_pulls or args.include_everything: + incremental_resource_work_attempted = True + pulls_cwd = os.path.join(repo_cwd, "pulls") + args.since = read_resource_last_update( + args, pulls_cwd, legacy_last_update + ) backup_pulls(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, pulls_cwd, repository) if args.include_discussions or args.include_everything: backup_discussions(args, repo_cwd, repository) @@ -2021,7 +2145,9 @@ def backup_repositories(args, output_directory, repositories): if args.include_milestones or args.include_everything: backup_milestones(args, repo_cwd, repository, repos_template) - if args.include_security_advisories or (args.include_everything and not repository.get("private", False)): + if args.include_security_advisories or ( + args.include_everything and not repository.get("private", False) + ): backup_security_advisories(args, repo_cwd, repository, repos_template) if args.include_labels or args.include_everything: @@ -2045,11 +2171,10 @@ def backup_repositories(args, output_directory, repositories): logger.info(f"Skipping remaining resources for {repository['full_name']}") continue - if args.incremental: - if last_update == "0000-00-00T00:00:00Z": - last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) - - open(last_update_path, "w").write(last_update) + if incremental_resource_work_attempted: + remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path + ) def _repository_owner_name(repository): diff --git a/tests/test_incremental_per_repository.py b/tests/test_incremental_per_repository.py new file mode 100644 index 0000000..f1fd67a --- /dev/null +++ b/tests/test_incremental_per_repository.py @@ -0,0 +1,189 @@ +"""Tests for per-resource incremental checkpoints.""" + +import json +import os + +from github_backup import github_backup + + +def _repo(name, updated_at, pushed_at=None): + return { + "name": name, + "full_name": "owner/{0}".format(name), + "owner": {"login": "owner"}, + "clone_url": "https://github.com/owner/{0}.git".format(name), + "private": False, + "fork": False, + "has_wiki": False, + "updated_at": updated_at, + "pushed_at": pushed_at, + } + + +def test_incremental_uses_per_resource_last_update( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repositories = [ + _repo("repo-one", "2026-02-01T00:00:00Z"), + _repo("repo-two", "2026-03-01T00:00:00Z"), + ] + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + (repo_one_issues / "last_update").write_text("2026-01-01T00:00:00Z") + (repo_two_issues / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append((repository["name"], passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, repositories) + + assert seen_since == [ + ("repo-one", "2026-01-01T00:00:00Z"), + ("repo-two", "2025-01-01T00:00:00Z"), + ] + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (repo_two_issues / "last_update").read_text() == "2026-03-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_uses_independent_issue_and_pull_checkpoints( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True, include_pulls=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + repo_dir = tmp_path / "repositories" / "repo-one" + issues_dir = repo_dir / "issues" + pulls_dir = repo_dir / "pulls" + issues_dir.mkdir(parents=True) + pulls_dir.mkdir(parents=True) + (issues_dir / "last_update").write_text("2026-01-01T00:00:00Z") + (pulls_dir / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("issues", passed_args.since)) + + def fake_backup_pulls(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("pulls", passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + monkeypatch.setattr(github_backup, "backup_pulls", fake_backup_pulls) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [ + ("issues", "2026-01-01T00:00:00Z"), + ("pulls", "2025-01-01T00:00:00Z"), + ] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (pulls_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + + +def test_incremental_uses_legacy_global_last_update_for_existing_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + issues_dir = tmp_path / "repositories" / "repo-one" / "issues" + issues_dir.mkdir(parents=True) + with open(issues_dir / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == ["2026-01-01T00:00:00Z"] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_does_not_use_legacy_global_last_update_for_new_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2099-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [None] + assert ( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ).read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_keeps_legacy_global_last_update_until_all_existing_resources_migrated( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + with open(repo_one_issues / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + with open(repo_two_issues / "2.json", "w", encoding="utf-8") as f: + json.dump({"number": 2}, f) + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + pass + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(repo_two_issues / "last_update") + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + + +def test_incremental_does_not_remove_legacy_checkpoint_without_resource_work( + create_args, tmp_path +): + args = create_args(incremental=True, include_repository=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + assert not os.path.exists( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ) + + +def test_repository_checkpoint_time_uses_newest_available_repo_timestamp(): + repository = _repo( + "repo-one", + updated_at="2026-02-01T00:00:00Z", + pushed_at="2026-03-01T00:00:00Z", + ) + + assert github_backup.get_repository_checkpoint_time(repository) == ( + "2026-03-01T00:00:00Z" + )