Mirror of https://github.com/josegonzalez/python-github-backup.git
@@ -7,6 +7,11 @@ Unreleased
optional attachment downloads, and per-repository incremental checkpoints.
- Add pull request review backups with ``--pull-reviews`` and one-time
  incremental backfill for existing backups.
- Store incremental ``last_update`` checkpoints per repository resource instead
  of using one global checkpoint for the whole output directory. Existing
  backups use the legacy global checkpoint as a migration fallback, and the
  legacy file is removed once existing issue/pull backups have resource
  checkpoints (#62).
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.

README.rst (12 lines changed)
@@ -347,15 +347,19 @@ About pull request reviews
Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body.
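
A backed-up pull request can then be inspected with a short script. This is an illustrative sketch, not part of the tool: the ``repositories/{repo}/pulls/{number}.json`` layout and the ``state``/``submitted_at`` field names inside ``review_data`` are assumptions based on GitHub's review API schema::

    import json
    import os

    def summarize_reviews(pulls_dir):
        # Print one line per review found in the backed-up pull request files.
        for name in sorted(os.listdir(pulls_dir)):
            if not name.endswith(".json"):
                continue  # skip checkpoint files such as reviews_last_update
            with open(os.path.join(pulls_dir, name)) as f:
                pull = json.load(f)
            for review in pull.get("review_data", []):
                print(pull.get("number"), review.get("state"), review.get("submitted_at"))

    summarize_reviews("backup/repositories/example-repo/pulls")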
-``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added.
``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing pull request checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added.
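
The backfill trigger can be pictured as a simple filesystem check. A sketch under the layout stated above (the helper name is hypothetical)::

    import os

    def reviews_need_backfill(pulls_dir):
        # An existing pulls checkpoint with no reviews checkpoint means the
        # backup predates --pull-reviews, so reviews get one full pass.
        has_pulls_checkpoint = os.path.exists(os.path.join(pulls_dir, "last_update"))
        has_reviews_checkpoint = os.path.exists(
            os.path.join(pulls_dir, "reviews_last_update")
        )
        return has_pulls_checkpoint and not has_reviews_checkpoint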
Incremental Backup
------------------
-Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**. e.g. only request issues from the API since the last run.
Using (``-i, --incremental``) will only request new data from the API **since the last successful resource backup**, e.g. only request issues from the API since the last issue backup for that repository.
-This means any blocking errors on previous runs can cause a large amount of missing data in backups.
Incremental checkpoints for issue and pull request API backups are stored per resource in that repository's backup directory (for example ``repositories/{repo}/issues/last_update``, ``repositories/{repo}/pulls/last_update`` or ``starred/{owner}/{repo}/pulls/last_update``). Older versions stored a single global ``last_update`` file in the output directory root. During migration, the legacy global checkpoint is used as a fallback only for resource directories that already contain backup data but do not yet have their own checkpoint. New repositories or newly enabled resources with no existing data get a full backup instead of inheriting an unrelated global checkpoint.

After all existing issue and pull request resource directories have per-resource checkpoints, the legacy global ``last_update`` file is removed automatically.

This means any blocking errors on previous runs can cause missing data in backups for the affected repository resource.
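
The resolution order can be restated compactly; the actual implementation is ``read_resource_last_update`` in the code below, and this standalone sketch only mirrors its three rules::

    import os

    def effective_since(resource_dir, legacy_since, has_existing_data):
        # 1. A per-resource checkpoint always wins.
        path = os.path.join(resource_dir, "last_update")
        if os.path.exists(path):
            with open(path) as f:
                return f.read().strip()
        # 2. The legacy global checkpoint applies only to resources that
        #    already hold backup data (migration fallback).
        if legacy_since and has_existing_data:
            return legacy_since
        # 3. Otherwise request a full backup.
        return None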
Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something.
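
Conceptually this derives ``since`` from the file's modification time rather than a stored checkpoint; a sketch of the idea, not the tool's actual code::

    import os
    import time

    def mtime_since(path):
        # A manual edit updates the mtime too, which is why self-modified
        # files can cause newer upstream data to be skipped.
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(os.path.getmtime(path)))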
@@ -368,7 +372,7 @@ Known blocking errors
Some errors will block the backup run by exiting the script, e.g. receiving a 403 Forbidden error from the GitHub API.
-If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run. Potentially causing unexpected large amounts of missing data.
If the incremental argument is used, per-resource checkpoints are only advanced after that resource's backup work completes. A blocking error can still abort the overall run, but repositories and resources that were not processed will keep their previous checkpoints.
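
The guarantee follows from ordering alone: a resource's checkpoint is written only after its backup call returns. A toy illustration (names hypothetical)::

    def run_resource_backup(fetch, write_checkpoint):
        # The checkpoint advances only when fetch() completes, so a blocking
        # error leaves the previous checkpoint in place for the next run.
        fetch()
        write_checkpoint()

    def failing_fetch():
        raise RuntimeError("403 Forbidden")  # stand-in for a blocking API error

    state = {"checkpoint": "2026-01-01T00:00:00Z"}
    try:
        run_resource_backup(failing_fetch, lambda: state.update(checkpoint="2026-02-01T00:00:00Z"))
    except RuntimeError:
        pass
    assert state["checkpoint"] == "2026-01-01T00:00:00Z"  # unchanged after failure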
It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or complemented with periodic full non-incremental runs, to avoid unexpected missing data in regular backup runs.
@@ -1928,26 +1928,138 @@ def filter_repositories(args, unfiltered_repositories):
    return repositories


INCREMENTAL_LAST_UPDATE_FILENAME = "last_update"
INCREMENTAL_RESOURCE_DIRECTORIES = ("issues", "pulls")


def get_repository_checkpoint_time(repository):
    # Use the newest repository timestamp the API provided; fall back to the
    # current time when neither updated_at nor pushed_at is present.
    timestamps = [
        timestamp
        for timestamp in (repository.get("updated_at"), repository.get("pushed_at"))
        if timestamp
    ]
    if timestamps:
        return max(timestamps)

    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())


def resource_backup_exists(resource_cwd):
    # Checkpoint files and in-progress ``.temp`` downloads do not count as
    # existing backup data.
    if not os.path.isdir(resource_cwd):
        return False

    ignored_names = {
        INCREMENTAL_LAST_UPDATE_FILENAME,
        PULL_REVIEWS_LAST_UPDATE_FILENAME,
    }
    for name in os.listdir(resource_cwd):
        if name in ignored_names or name.endswith(".temp"):
            continue
        return True

    return False


def read_legacy_last_update(args, output_directory):
    # Return (path, value) for the legacy global checkpoint; the value is
    # None when no checkpoint applies.
    if not args.incremental:
        return None, None

    last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME)
    if os.path.exists(last_update_path):
        return last_update_path, open(last_update_path).read().strip()

    return last_update_path, None


def read_resource_last_update(args, resource_cwd, legacy_last_update=None):
    if not args.incremental:
        return None

    last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME)
    if os.path.exists(last_update_path):
        return open(last_update_path).read().strip()

    # Migration fallback: only resources that already contain backup data may
    # inherit the legacy global checkpoint; new resources get a full backup.
    if legacy_last_update and resource_backup_exists(resource_cwd):
        return legacy_last_update

    return None


def write_resource_last_update(args, resource_cwd, repository):
    if not args.incremental:
        return

    mkdir_p(resource_cwd)
    last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME)
    open(last_update_path, "w").write(get_repository_checkpoint_time(repository))


def iter_incremental_resource_dirs(output_directory):
    repositories_dir = os.path.join(output_directory, "repositories")
    if os.path.isdir(repositories_dir):
        for repository_name in os.listdir(repositories_dir):
            repo_cwd = os.path.join(repositories_dir, repository_name)
            if not os.path.isdir(repo_cwd):
                continue
            for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES:
                yield os.path.join(repo_cwd, resource_name)

    starred_dir = os.path.join(output_directory, "starred")
    if os.path.isdir(starred_dir):
        for owner_name in os.listdir(starred_dir):
            owner_cwd = os.path.join(starred_dir, owner_name)
            if not os.path.isdir(owner_cwd):
                continue
            for repository_name in os.listdir(owner_cwd):
                repo_cwd = os.path.join(owner_cwd, repository_name)
                if not os.path.isdir(repo_cwd):
                    continue
                for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES:
                    yield os.path.join(repo_cwd, resource_name)


def has_unmigrated_incremental_resources(output_directory):
    for resource_cwd in iter_incremental_resource_dirs(output_directory):
        last_update_path = os.path.join(
            resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME
        )
        if resource_backup_exists(resource_cwd) and not os.path.exists(
            last_update_path
        ):
            return True

    return False


def remove_legacy_last_update_if_migrated(
    args, output_directory, legacy_last_update_path
):
    if not args.incremental or not legacy_last_update_path:
        return
    if not os.path.exists(legacy_last_update_path):
        return
    if has_unmigrated_incremental_resources(output_directory):
        logger.info(
            "Keeping legacy global last_update until all existing issue/pull "
            "backups have per-resource checkpoints"
        )
        return

    os.remove(legacy_last_update_path)
    logger.info(
        "Removed legacy global last_update after migrating incremental checkpoints"
    )


def backup_repositories(args, output_directory, repositories):
    logger.info("Backing up repositories")
    repos_template = "https://{0}/repos".format(get_github_api_host(args))
    legacy_last_update_path, legacy_last_update = read_legacy_last_update(
        args, output_directory
    )
    incremental_resource_work_attempted = False

-    if args.incremental:
-        last_update_path = os.path.join(output_directory, "last_update")
-        if os.path.exists(last_update_path):
-            args.since = open(last_update_path).read().strip()
-        else:
-            args.since = None
-    else:
-        args.since = None
-
-    last_update = "0000-00-00T00:00:00Z"
    for repository in repositories:
-        if repository.get("updated_at") and repository["updated_at"] > last_update:
-            last_update = repository["updated_at"]
-        elif repository.get("pushed_at") and repository["pushed_at"] > last_update:
-            last_update = repository["pushed_at"]
-
        if repository.get("is_gist"):
            repo_cwd = os.path.join(output_directory, "gists", repository["id"])
        elif repository.get("is_starred"):
@@ -2010,10 +2122,22 @@ def backup_repositories(args, output_directory, repositories):
            no_prune=args.no_prune,
        )
        if args.include_issues or args.include_everything:
            incremental_resource_work_attempted = True
            issue_cwd = os.path.join(repo_cwd, "issues")
            args.since = read_resource_last_update(
                args, issue_cwd, legacy_last_update
            )
            backup_issues(args, repo_cwd, repository, repos_template)
            # Advance the checkpoint only after the backup call has returned.
            write_resource_last_update(args, issue_cwd, repository)

        if args.include_pulls or args.include_everything:
            incremental_resource_work_attempted = True
            pulls_cwd = os.path.join(repo_cwd, "pulls")
            args.since = read_resource_last_update(
                args, pulls_cwd, legacy_last_update
            )
            backup_pulls(args, repo_cwd, repository, repos_template)
            write_resource_last_update(args, pulls_cwd, repository)

        if args.include_discussions or args.include_everything:
            backup_discussions(args, repo_cwd, repository)
@@ -2021,7 +2145,9 @@ def backup_repositories(args, output_directory, repositories):
        if args.include_milestones or args.include_everything:
            backup_milestones(args, repo_cwd, repository, repos_template)

-        if args.include_security_advisories or (args.include_everything and not repository.get("private", False)):
        if args.include_security_advisories or (
            args.include_everything and not repository.get("private", False)
        ):
            backup_security_advisories(args, repo_cwd, repository, repos_template)

        if args.include_labels or args.include_everything:
@@ -2045,11 +2171,10 @@ def backup_repositories(args, output_directory, repositories):
            logger.info(f"Skipping remaining resources for {repository['full_name']}")
            continue

-    if args.incremental:
-        if last_update == "0000-00-00T00:00:00Z":
-            last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
-
-        open(last_update_path, "w").write(last_update)
    if incremental_resource_work_attempted:
        remove_legacy_last_update_if_migrated(
            args, output_directory, legacy_last_update_path
        )


def _repository_owner_name(repository):


tests/test_incremental_per_repository.py (new file, 189 lines)
@@ -0,0 +1,189 @@
"""Tests for per-resource incremental checkpoints."""

import json
import os

from github_backup import github_backup


def _repo(name, updated_at, pushed_at=None):
    return {
        "name": name,
        "full_name": "owner/{0}".format(name),
        "owner": {"login": "owner"},
        "clone_url": "https://github.com/owner/{0}.git".format(name),
        "private": False,
        "fork": False,
        "has_wiki": False,
        "updated_at": updated_at,
        "pushed_at": pushed_at,
    }


def test_incremental_uses_per_resource_last_update(
    create_args, tmp_path, monkeypatch
):
    args = create_args(incremental=True, include_issues=True)
    repositories = [
        _repo("repo-one", "2026-02-01T00:00:00Z"),
        _repo("repo-two", "2026-03-01T00:00:00Z"),
    ]
    repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues"
    repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues"
    repo_one_issues.mkdir(parents=True)
    repo_two_issues.mkdir(parents=True)
    (repo_one_issues / "last_update").write_text("2026-01-01T00:00:00Z")
    (repo_two_issues / "last_update").write_text("2025-01-01T00:00:00Z")

    seen_since = []

    def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
        seen_since.append((repository["name"], passed_args.since))

    monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)

    github_backup.backup_repositories(args, tmp_path, repositories)

    assert seen_since == [
        ("repo-one", "2026-01-01T00:00:00Z"),
        ("repo-two", "2025-01-01T00:00:00Z"),
    ]
    assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z"
    assert (repo_two_issues / "last_update").read_text() == "2026-03-01T00:00:00Z"
    assert not os.path.exists(tmp_path / "last_update")


def test_incremental_uses_independent_issue_and_pull_checkpoints(
    create_args, tmp_path, monkeypatch
):
    args = create_args(incremental=True, include_issues=True, include_pulls=True)
    repository = _repo("repo-one", "2026-02-01T00:00:00Z")
    repo_dir = tmp_path / "repositories" / "repo-one"
    issues_dir = repo_dir / "issues"
    pulls_dir = repo_dir / "pulls"
    issues_dir.mkdir(parents=True)
    pulls_dir.mkdir(parents=True)
    (issues_dir / "last_update").write_text("2026-01-01T00:00:00Z")
    (pulls_dir / "last_update").write_text("2025-01-01T00:00:00Z")

    seen_since = []

    def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
        seen_since.append(("issues", passed_args.since))

    def fake_backup_pulls(passed_args, repo_cwd, repository, repos_template):
        seen_since.append(("pulls", passed_args.since))

    monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)
    monkeypatch.setattr(github_backup, "backup_pulls", fake_backup_pulls)

    github_backup.backup_repositories(args, tmp_path, [repository])

    assert seen_since == [
        ("issues", "2026-01-01T00:00:00Z"),
        ("pulls", "2025-01-01T00:00:00Z"),
    ]
    assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"
    assert (pulls_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"


def test_incremental_uses_legacy_global_last_update_for_existing_resource_backup(
    create_args, tmp_path, monkeypatch
):
    args = create_args(incremental=True, include_issues=True)
    repository = _repo("repo-one", "2026-02-01T00:00:00Z")
    (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z")
    issues_dir = tmp_path / "repositories" / "repo-one" / "issues"
    issues_dir.mkdir(parents=True)
    with open(issues_dir / "1.json", "w", encoding="utf-8") as f:
        json.dump({"number": 1}, f)

    seen_since = []

    def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
        seen_since.append(passed_args.since)

    monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)

    github_backup.backup_repositories(args, tmp_path, [repository])

    assert seen_since == ["2026-01-01T00:00:00Z"]
    assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"
    assert not os.path.exists(tmp_path / "last_update")


def test_incremental_does_not_use_legacy_global_last_update_for_new_resource_backup(
    create_args, tmp_path, monkeypatch
):
    args = create_args(incremental=True, include_issues=True)
    repository = _repo("repo-one", "2026-02-01T00:00:00Z")
    (tmp_path / "last_update").write_text("2099-01-01T00:00:00Z")

    seen_since = []

    def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
        seen_since.append(passed_args.since)

    monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)

    github_backup.backup_repositories(args, tmp_path, [repository])

    assert seen_since == [None]
    assert (
        tmp_path / "repositories" / "repo-one" / "issues" / "last_update"
    ).read_text() == "2026-02-01T00:00:00Z"
    assert not os.path.exists(tmp_path / "last_update")


def test_incremental_keeps_legacy_global_last_update_until_all_existing_resources_migrated(
    create_args, tmp_path, monkeypatch
):
    args = create_args(incremental=True, include_issues=True)
    repository = _repo("repo-one", "2026-02-01T00:00:00Z")
    (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z")
    repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues"
    repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues"
    repo_one_issues.mkdir(parents=True)
    repo_two_issues.mkdir(parents=True)
    with open(repo_one_issues / "1.json", "w", encoding="utf-8") as f:
        json.dump({"number": 1}, f)
    with open(repo_two_issues / "2.json", "w", encoding="utf-8") as f:
        json.dump({"number": 2}, f)

    def fake_backup_issues(passed_args, repo_cwd, repository, repos_template):
        pass

    monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues)

    github_backup.backup_repositories(args, tmp_path, [repository])

    assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z"
    assert not os.path.exists(repo_two_issues / "last_update")
    assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z"


def test_incremental_does_not_remove_legacy_checkpoint_without_resource_work(
    create_args, tmp_path
):
    args = create_args(incremental=True, include_repository=True)
    repository = _repo("repo-one", "2026-02-01T00:00:00Z")
    (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z")

    github_backup.backup_repositories(args, tmp_path, [repository])

    assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z"
    assert not os.path.exists(
        tmp_path / "repositories" / "repo-one" / "issues" / "last_update"
    )


def test_repository_checkpoint_time_uses_newest_available_repo_timestamp():
    repository = _repo(
        "repo-one",
        updated_at="2026-02-01T00:00:00Z",
        pushed_at="2026-03-01T00:00:00Z",
    )

    assert github_backup.get_repository_checkpoint_time(repository) == (
        "2026-03-01T00:00:00Z"
    )