mirror of https://github.com/josegonzalez/python-github-backup.git
synced 2025-12-05 16:18:02 +01:00

Compare commits

5 Commits

| Author | SHA1 | Date |
|---|---|---|
| | ff2681e196 | |
| | 745b05a63f | |
| | 83ff0ae1dd | |
| | 6ad1959d43 | |
| | 5739ac0745 | |
CHANGES.rst (37 changed lines)

@@ -1,9 +1,44 @@
 Changelog
 =========
 
-0.52.0 (2025-11-28)
+0.53.0 (2025-11-30)
 -------------------
+
+Fix
+~~~
+
+- Case-sensitive username filtering causing silent backup failures.
+  [Rodos]
+
+  GitHub's API accepts usernames in any case but returns canonical case.
+  The case-sensitive comparison in filter_repositories() filtered out all
+  repositories when user-provided case didn't match GitHub's canonical case.
+
+  Changed to case-insensitive comparison.
+
+  Fixes #198
+
+Other
+~~~~~
+
+- Avoid rewriting unchanged JSON files for labels, milestones, releases,
+  hooks, followers, and following. [Rodos]
+
+  This change reduces unnecessary writes when backing up metadata that changes
+  infrequently. The implementation compares existing file content before writing
+  and skips the write if the content is identical, preserving file timestamps.
+
+  Key changes:
+  - Added json_dump_if_changed() helper that compares content before writing
+  - Uses atomic writes (temp file + rename) for all metadata files
+  - NOT applied to issues/pulls (they use incremental_by_files logic)
+  - Made log messages consistent and past tense ("Saved" instead of "Saving")
+  - Added informative logging showing skip counts
+
+  Fixes #133
+
+
+0.52.0 (2025-11-28)
+-------------------
 - Skip DMCA'd repos which return a 451 response. [Rodos]
 
   Log a warning and the link to the DMCA notice. Continue backing up

@@ -1 +1 @@
-__version__ = "0.52.0"
+__version__ = "0.53.0"

@@ -1587,7 +1587,9 @@ def filter_repositories(args, unfiltered_repositories):
     repositories = []
     for r in unfiltered_repositories:
         # gists can be anonymous, so need to safely check owner
-        if r.get("owner", {}).get("login") == args.user or r.get("is_starred"):
+        # Use case-insensitive comparison to match GitHub's case-insensitive username behavior
+        owner_login = r.get("owner", {}).get("login", "")
+        if owner_login.lower() == args.user.lower() or r.get("is_starred"):
             repositories.append(r)
 
     name_regex = None
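
Below is a minimal, self-contained sketch (hypothetical values, not taken from the repository) of the behavior this hunk changes: GitHub accepts a username in any case but reports the owner login in canonical case, so the old exact comparison could silently drop every repository, while lowercasing both sides matches.

```python
# Minimal sketch with hypothetical values: the API returns the canonical
# casing of the owner login, while the user may have typed any casing.
repo = {"owner": {"login": "JoseGonzalez"}}  # canonical case from the API
configured_user = "josegonzalez"             # case supplied on the command line

# Old comparison: exact match fails, so the repository is silently filtered out.
print(repo.get("owner", {}).get("login") == configured_user)  # False

# New comparison: case-insensitive match keeps the repository.
owner_login = repo.get("owner", {}).get("login", "")
print(owner_login.lower() == configured_user.lower())         # True
```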

@@ -1898,11 +1900,21 @@ def backup_milestones(args, repo_cwd, repository, repos_template):
     for milestone in _milestones:
         milestones[milestone["number"]] = milestone
 
-    logger.info("Saving {0} milestones to disk".format(len(list(milestones.keys()))))
+    written_count = 0
     for number, milestone in list(milestones.items()):
         milestone_file = "{0}/{1}.json".format(milestone_cwd, number)
-        with codecs.open(milestone_file, "w", encoding="utf-8") as f:
-            json_dump(milestone, f)
+        if json_dump_if_changed(milestone, milestone_file):
+            written_count += 1
+
+    total = len(milestones)
+    if written_count == total:
+        logger.info("Saved {0} milestones to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} milestones unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} milestones to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
 
 
 def backup_labels(args, repo_cwd, repository, repos_template):
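
For illustration only, here is a hypothetical helper (not part of the patch) that mirrors the logging branches added above, with made-up counts showing the three messages the milestone backup can now emit.

```python
def milestone_log_message(written_count, total):
    """Hypothetical helper mirroring the logging branches added above."""
    if written_count == total:
        return "Saved {0} milestones to disk".format(total)
    elif written_count == 0:
        return "{0} milestones unchanged, skipped write".format(total)
    return "Saved {0} of {1} milestones to disk ({2} unchanged)".format(
        written_count, total, total - written_count
    )


print(milestone_log_message(10, 10))  # Saved 10 milestones to disk
print(milestone_log_message(0, 10))   # 10 milestones unchanged, skipped write
print(milestone_log_message(3, 10))   # Saved 3 of 10 milestones to disk (7 unchanged)
```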

@@ -1955,19 +1967,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
             reverse=True,
         )
         releases = releases[: args.number_of_latest_releases]
-        logger.info("Saving the latest {0} releases to disk".format(len(releases)))
-    else:
-        logger.info("Saving {0} releases to disk".format(len(releases)))
 
     # for each release, store it
+    written_count = 0
     for release in releases:
         release_name = release["tag_name"]
         release_name_safe = release_name.replace("/", "__")
         output_filepath = os.path.join(
             release_cwd, "{0}.json".format(release_name_safe)
         )
-        with codecs.open(output_filepath, "w+", encoding="utf-8") as f:
-            json_dump(release, f)
+        if json_dump_if_changed(release, output_filepath):
+            written_count += 1
 
         if include_assets:
             assets = retrieve_data(args, release["assets_url"])

@@ -1984,6 +1994,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
                     fine=True if args.token_fine is not None else False,
                 )
 
+    # Log the results
+    total = len(releases)
+    if written_count == total:
+        logger.info("Saved {0} releases to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} releases unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} releases to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
+
 
 def fetch_repository(
     name,

@@ -2108,9 +2129,10 @@ def _backup_data(args, name, template, output_file, output_directory):
         mkdir_p(output_directory)
         data = retrieve_data(args, template)
 
-        logger.info("Writing {0} {1} to disk".format(len(data), name))
-        with codecs.open(output_file, "w", encoding="utf-8") as f:
-            json_dump(data, f)
+        if json_dump_if_changed(data, output_file):
+            logger.info("Saved {0} {1} to disk".format(len(data), name))
+        else:
+            logger.info("{0} {1} unchanged, skipped write".format(len(data), name))
 
 
 def json_dump(data, output_file):

@@ -2122,3 +2144,57 @@ def json_dump(data, output_file):
         indent=4,
         separators=(",", ": "),
     )
+
+
+def json_dump_if_changed(data, output_file_path):
+    """
+    Write JSON data to file only if content has changed.
+
+    Compares the serialized JSON data with the existing file content
+    and only writes if different. This prevents unnecessary file
+    modification timestamp updates and disk writes.
+
+    Uses atomic writes (temp file + rename) to prevent corruption
+    if the process is interrupted during the write.
+
+    Args:
+        data: The data to serialize as JSON
+        output_file_path: The path to the output file
+
+    Returns:
+        True if file was written (content changed or new file)
+        False if write was skipped (content unchanged)
+    """
+    # Serialize new data with consistent formatting matching json_dump()
+    new_content = json.dumps(
+        data,
+        ensure_ascii=False,
+        sort_keys=True,
+        indent=4,
+        separators=(",", ": "),
+    )
+
+    # Check if file exists and compare content
+    if os.path.exists(output_file_path):
+        try:
+            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            if existing_content == new_content:
+                logger.debug(
+                    "Content unchanged, skipping write: {0}".format(output_file_path)
+                )
+                return False
+        except (OSError, UnicodeDecodeError) as e:
+            # If we can't read the existing file, write the new one
+            logger.debug(
+                "Error reading existing file {0}, will overwrite: {1}".format(
+                    output_file_path, e
+                )
+            )
+
+    # Write the file atomically using temp file + rename
+    temp_file = output_file_path + ".temp"
+    with codecs.open(temp_file, "w", encoding="utf-8") as f:
+        f.write(new_content)
+    os.rename(temp_file, output_file_path)  # Atomic on POSIX systems
+    return True
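
A short usage sketch of the new helper, assuming the module is importable as `github_backup.github_backup` (the same import the tests below use); the `labels.json` file name and the data are hypothetical. The first call writes the file via the temp-file-plus-rename path and returns True; the second call sees identical serialized content, skips the write, and returns False.

```python
import os
import tempfile

from github_backup import github_backup

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "labels.json")  # hypothetical metadata file
    data = [{"name": "bug", "color": "d73a4a"}]

    # First call: no existing file, so it is written atomically.
    print(github_backup.json_dump_if_changed(data, path))  # True

    # Second call with identical data: serialized content matches, write skipped.
    print(github_backup.json_dump_if_changed(data, path))  # False
```

This boolean return value is what the backup_* callers above use to build their written/unchanged counts.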

tests/test_json_dump_if_changed.py (new file, 198 lines)

@@ -0,0 +1,198 @@
"""Tests for json_dump_if_changed functionality."""

import codecs
import json
import os
import tempfile

import pytest

from github_backup import github_backup


class TestJsonDumpIfChanged:
    """Test suite for json_dump_if_changed function."""

    def test_writes_new_file(self):
        """Should write file when it doesn't exist."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Verify content matches expected format
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()
            loaded = json.loads(content)
            assert loaded == test_data

    def test_skips_unchanged_file(self):
        """Should skip write when content is identical."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result1 is True

            # Get the initial mtime
            mtime1 = os.path.getmtime(output_file)

            # Second write with same data
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

            # File should not have been modified
            mtime2 = os.path.getmtime(output_file)
            assert mtime1 == mtime2

    def test_writes_when_content_changed(self):
        """Should write file when content has changed."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data1 = {"key": "value1"}
            test_data2 = {"key": "value2"}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data1, output_file)
            assert result1 is True

            # Second write with different data
            result2 = github_backup.json_dump_if_changed(test_data2, output_file)
            assert result2 is True

            # Verify new content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data2

    def test_uses_consistent_formatting(self):
        """Should use same JSON formatting as json_dump."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"z": "last", "a": "first", "m": "middle"}

            github_backup.json_dump_if_changed(test_data, output_file)

            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()

            # Check for consistent formatting:
            # - sorted keys
            # - 4-space indent
            # - comma-colon-space separator
            expected = json.dumps(
                test_data,
                ensure_ascii=False,
                sort_keys=True,
                indent=4,
                separators=(",", ": "),
            )
            assert content == expected

    def test_atomic_write_always_used(self):
        """Should always use temp file and rename for atomic writes."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Temp file should not exist after atomic write
            temp_file = output_file + ".temp"
            assert not os.path.exists(temp_file)

            # Verify content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_handles_unicode_content(self):
        """Should correctly handle Unicode content."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "emoji": "🚀",
                "chinese": "你好",
                "arabic": "مرحبا",
                "cyrillic": "Привет",
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify Unicode is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

            # Second write should skip
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

    def test_handles_complex_nested_data(self):
        """Should handle complex nested data structures."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "users": [
                    {"id": 1, "name": "Alice", "tags": ["admin", "user"]},
                    {"id": 2, "name": "Bob", "tags": ["user"]},
                ],
                "metadata": {"version": "1.0", "nested": {"deep": {"value": 42}}},
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify structure is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_overwrites_on_unicode_decode_error(self):
        """Should overwrite if existing file has invalid UTF-8."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            # Write invalid UTF-8 bytes
            with open(output_file, "wb") as f:
                f.write(b"\xff\xfe invalid utf-8")

            # Should catch UnicodeDecodeError and overwrite
            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify new content was written
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_key_order_independence(self):
        """Should treat differently-ordered dicts as same if keys/values match."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")

            # Write first dict
            data1 = {"z": 1, "a": 2, "m": 3}
            github_backup.json_dump_if_changed(data1, output_file)

            # Try to write same data but different order
            data2 = {"a": 2, "m": 3, "z": 1}
            result = github_backup.json_dump_if_changed(data2, output_file)

            # Should skip because content is the same (keys are sorted)
            assert result is False


if __name__ == "__main__":
    pytest.main([__file__, "-v"])