diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py
index dcf79e8..9d39a64 100644
--- a/github_backup/github_backup.py
+++ b/github_backup/github_backup.py
@@ -1898,11 +1898,21 @@ def backup_milestones(args, repo_cwd, repository, repos_template):
     for milestone in _milestones:
         milestones[milestone["number"]] = milestone
 
-    logger.info("Saving {0} milestones to disk".format(len(list(milestones.keys()))))
+    written_count = 0
     for number, milestone in list(milestones.items()):
         milestone_file = "{0}/{1}.json".format(milestone_cwd, number)
-        with codecs.open(milestone_file, "w", encoding="utf-8") as f:
-            json_dump(milestone, f)
+        if json_dump_if_changed(milestone, milestone_file):
+            written_count += 1
+
+    total = len(milestones)
+    if written_count == total:
+        logger.info("Saved {0} milestones to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} milestones unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} milestones to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
 
 
 def backup_labels(args, repo_cwd, repository, repos_template):
@@ -1955,19 +1965,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
             reverse=True,
         )
         releases = releases[: args.number_of_latest_releases]
-        logger.info("Saving the latest {0} releases to disk".format(len(releases)))
-    else:
-        logger.info("Saving {0} releases to disk".format(len(releases)))
 
     # for each release, store it
+    written_count = 0
     for release in releases:
         release_name = release["tag_name"]
         release_name_safe = release_name.replace("/", "__")
         output_filepath = os.path.join(
             release_cwd, "{0}.json".format(release_name_safe)
         )
-        with codecs.open(output_filepath, "w+", encoding="utf-8") as f:
-            json_dump(release, f)
+        if json_dump_if_changed(release, output_filepath):
+            written_count += 1
 
         if include_assets:
             assets = retrieve_data(args, release["assets_url"])
@@ -1984,6 +1992,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
                 fine=True if args.token_fine is not None else False,
             )
 
+    # Log the results
+    total = len(releases)
+    if written_count == total:
+        logger.info("Saved {0} releases to disk".format(total))
+    elif written_count == 0:
+        logger.info("{0} releases unchanged, skipped write".format(total))
+    else:
+        logger.info("Saved {0} of {1} releases to disk ({2} unchanged)".format(
+            written_count, total, total - written_count
+        ))
+
 
 def fetch_repository(
     name,
@@ -2108,9 +2127,10 @@ def _backup_data(args, name, template, output_file, output_directory):
     mkdir_p(output_directory)
 
     data = retrieve_data(args, template)
-    logger.info("Writing {0} {1} to disk".format(len(data), name))
-    with codecs.open(output_file, "w", encoding="utf-8") as f:
-        json_dump(data, f)
+    if json_dump_if_changed(data, output_file):
+        logger.info("Saved {0} {1} to disk".format(len(data), name))
+    else:
+        logger.info("{0} {1} unchanged, skipped write".format(len(data), name))
 
 
 def json_dump(data, output_file):
@@ -2122,3 +2142,57 @@
         indent=4,
         separators=(",", ": "),
     )
+
+
+def json_dump_if_changed(data, output_file_path):
+    """
+    Write JSON data to file only if content has changed.
+
+    Compares the serialized JSON data with the existing file content
+    and only writes if different. This prevents unnecessary file
+    modification timestamp updates and disk writes.
+
+    Uses atomic writes (temp file + rename) to prevent corruption
+    if the process is interrupted during the write.
+
+    Args:
+        data: The data to serialize as JSON
+        output_file_path: The path to the output file
+
+    Returns:
+        True if file was written (content changed or new file)
+        False if write was skipped (content unchanged)
+    """
+    # Serialize new data with consistent formatting matching json_dump()
+    new_content = json.dumps(
+        data,
+        ensure_ascii=False,
+        sort_keys=True,
+        indent=4,
+        separators=(",", ": "),
+    )
+
+    # Check if file exists and compare content
+    if os.path.exists(output_file_path):
+        try:
+            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
+                existing_content = f.read()
+            if existing_content == new_content:
+                logger.debug(
+                    "Content unchanged, skipping write: {0}".format(output_file_path)
+                )
+                return False
+        except (OSError, UnicodeDecodeError) as e:
+            # If we can't read the existing file, write the new one
+            logger.debug(
+                "Error reading existing file {0}, will overwrite: {1}".format(
+                    output_file_path, e
+                )
+            )
+
+    # Write the file atomically using temp file + rename
+    temp_file = output_file_path + ".temp"
+    with codecs.open(temp_file, "w", encoding="utf-8") as f:
+        f.write(new_content)
+    os.replace(temp_file, output_file_path)  # atomic; unlike os.rename, also overwrites on Windows
+    return True
diff --git a/tests/test_json_dump_if_changed.py b/tests/test_json_dump_if_changed.py
new file mode 100644
index 0000000..426baee
--- /dev/null
+++ b/tests/test_json_dump_if_changed.py
@@ -0,0 +1,198 @@
+"""Tests for json_dump_if_changed functionality."""
+
+import codecs
+import json
+import os
+import tempfile
+
+import pytest
+
+from github_backup import github_backup
+
+
+class TestJsonDumpIfChanged:
+    """Test suite for json_dump_if_changed function."""
+
+    def test_writes_new_file(self):
+        """Should write file when it doesn't exist."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value", "number": 42}
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+
+            assert result is True
+            assert os.path.exists(output_file)
+
+            # Verify content matches expected format
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                content = f.read()
+            loaded = json.loads(content)
+            assert loaded == test_data
+
+    def test_skips_unchanged_file(self):
+        """Should skip write when content is identical."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value", "number": 42}
+
+            # First write
+            result1 = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result1 is True
+
+            # Get the initial mtime
+            mtime1 = os.path.getmtime(output_file)
+
+            # Second write with same data
+            result2 = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result2 is False
+
+            # File should not have been modified
+            mtime2 = os.path.getmtime(output_file)
+            assert mtime1 == mtime2
+
+    def test_writes_when_content_changed(self):
+        """Should write file when content has changed."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data1 = {"key": "value1"}
+            test_data2 = {"key": "value2"}
+
+            # First write
+            result1 = github_backup.json_dump_if_changed(test_data1, output_file)
+            assert result1 is True
+
+            # Second write with different data
+            result2 = github_backup.json_dump_if_changed(test_data2, output_file)
+            assert result2 is True
+
+            # Verify new content
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data2
+
+    def test_uses_consistent_formatting(self):
+        """Should use same JSON formatting as json_dump."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"z": "last", "a": "first", "m": "middle"}
+
+            github_backup.json_dump_if_changed(test_data, output_file)
+
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # Check for consistent formatting:
+            # - sorted keys
+            # - 4-space indent
+            # - comma-colon-space separator
+            expected = json.dumps(
+                test_data,
+                ensure_ascii=False,
+                sort_keys=True,
+                indent=4,
+                separators=(",", ": "),
+            )
+            assert content == expected
+
+    def test_atomic_write_always_used(self):
+        """Should always use temp file and rename for atomic writes."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value"}
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+
+            assert result is True
+            assert os.path.exists(output_file)
+
+            # Temp file should not exist after atomic write
+            temp_file = output_file + ".temp"
+            assert not os.path.exists(temp_file)
+
+            # Verify content
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+    def test_handles_unicode_content(self):
+        """Should correctly handle Unicode content."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {
+                "emoji": "🚀",
+                "chinese": "你好",
+                "arabic": "مرحبا",
+                "cyrillic": "Привет",
+            }
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result is True
+
+            # Verify Unicode is preserved
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+            # Second write should skip
+            result2 = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result2 is False
+
+    def test_handles_complex_nested_data(self):
+        """Should handle complex nested data structures."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {
+                "users": [
+                    {"id": 1, "name": "Alice", "tags": ["admin", "user"]},
+                    {"id": 2, "name": "Bob", "tags": ["user"]},
+                ],
+                "metadata": {"version": "1.0", "nested": {"deep": {"value": 42}}},
+            }
+
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result is True
+
+            # Verify structure is preserved
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+    def test_overwrites_on_unicode_decode_error(self):
+        """Should overwrite if existing file has invalid UTF-8."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+            test_data = {"key": "value"}
+
+            # Write invalid UTF-8 bytes
+            with open(output_file, "wb") as f:
+                f.write(b"\xff\xfe invalid utf-8")
+
+            # Should catch UnicodeDecodeError and overwrite
+            result = github_backup.json_dump_if_changed(test_data, output_file)
+            assert result is True
+
+            # Verify new content was written
+            with codecs.open(output_file, "r", encoding="utf-8") as f:
+                loaded = json.load(f)
+            assert loaded == test_data
+
+    def test_key_order_independence(self):
+        """Should treat differently-ordered dicts as same if keys/values match."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            output_file = os.path.join(tmpdir, "test.json")
+
+            # Write first dict
+            data1 = {"z": 1, "a": 2, "m": 3}
+            github_backup.json_dump_if_changed(data1, output_file)
+
+            # Try to write same data but different order
+            data2 = {"a": 2, "m": 3, "z": 1}
+            result = github_backup.json_dump_if_changed(data2, output_file)
+
+            # Should skip because content is the same (keys are sorted)
+            assert result is False
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
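
A note on the atomic-write step, not part of the patch above: os.replace() makes the final swap atomic, but it does not by itself guarantee that the new bytes have reached stable storage before the rename, so a power loss at the wrong moment can still leave a truncated or empty target on some filesystems. If that level of durability matters for backups, the usual hardening is an fsync before the rename. A minimal sketch under that assumption; write_atomic is a hypothetical helper, not a function in github_backup:

    import os

    def write_atomic(path, content, encoding="utf-8"):
        # Hypothetical sketch: flush and fsync the temp file so the data is
        # on disk before the atomic rename makes it visible at `path`.
        temp_path = path + ".temp"
        with open(temp_path, "w", encoding=encoding) as f:
            f.write(content)
            f.flush()
            os.fsync(f.fileno())  # force buffered data to stable storage
        os.replace(temp_path, path)  # atomic swap on both POSIX and Windows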