Skip DMCA'd repos which return a 451 response

Log a warning and the link to the DMCA notice. Continue backing up
other repositories instead of crashing.

Closes #163
This commit is contained in:
Rodos
2025-11-29 09:19:23 +11:00
parent 6fb0d86977
commit 7840528fe2
2 changed files with 201 additions and 29 deletions

View File

@@ -37,6 +37,15 @@ FNULL = open(os.devnull, "w")
FILE_URI_PREFIX = "file://" FILE_URI_PREFIX = "file://"
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class RepositoryUnavailableError(Exception):
"""Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown)."""
def __init__(self, message, dmca_url=None):
super().__init__(message)
self.dmca_url = dmca_url
# Setup SSL context with fallback chain # Setup SSL context with fallback chain
https_ctx = ssl.create_default_context() https_ctx = ssl.create_default_context()
if https_ctx.get_ca_certs(): if https_ctx.get_ca_certs():
@@ -612,6 +621,19 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False):
status_code = int(r.getcode()) status_code = int(r.getcode())
# Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository
if status_code == 451:
dmca_url = None
try:
response_data = json.loads(r.read().decode("utf-8"))
dmca_url = response_data.get("block", {}).get("html_url")
except Exception:
pass
raise RepositoryUnavailableError(
"Repository unavailable due to legal reasons (HTTP 451)",
dmca_url=dmca_url
)
# Check if we got correct data # Check if we got correct data
try: try:
response = json.loads(r.read().decode("utf-8")) response = json.loads(r.read().decode("utf-8"))
@@ -1668,40 +1690,47 @@ def backup_repositories(args, output_directory, repositories):
continue # don't try to back anything else for a gist; it doesn't exist continue # don't try to back anything else for a gist; it doesn't exist
download_wiki = args.include_wiki or args.include_everything try:
if repository["has_wiki"] and download_wiki: download_wiki = args.include_wiki or args.include_everything
fetch_repository( if repository["has_wiki"] and download_wiki:
repository["name"], fetch_repository(
repo_url.replace(".git", ".wiki.git"), repository["name"],
os.path.join(repo_cwd, "wiki"), repo_url.replace(".git", ".wiki.git"),
skip_existing=args.skip_existing, os.path.join(repo_cwd, "wiki"),
bare_clone=args.bare_clone, skip_existing=args.skip_existing,
lfs_clone=args.lfs_clone, bare_clone=args.bare_clone,
no_prune=args.no_prune, lfs_clone=args.lfs_clone,
) no_prune=args.no_prune,
if args.include_issues or args.include_everything: )
backup_issues(args, repo_cwd, repository, repos_template) if args.include_issues or args.include_everything:
backup_issues(args, repo_cwd, repository, repos_template)
if args.include_pulls or args.include_everything: if args.include_pulls or args.include_everything:
backup_pulls(args, repo_cwd, repository, repos_template) backup_pulls(args, repo_cwd, repository, repos_template)
if args.include_milestones or args.include_everything: if args.include_milestones or args.include_everything:
backup_milestones(args, repo_cwd, repository, repos_template) backup_milestones(args, repo_cwd, repository, repos_template)
if args.include_labels or args.include_everything: if args.include_labels or args.include_everything:
backup_labels(args, repo_cwd, repository, repos_template) backup_labels(args, repo_cwd, repository, repos_template)
if args.include_hooks or args.include_everything: if args.include_hooks or args.include_everything:
backup_hooks(args, repo_cwd, repository, repos_template) backup_hooks(args, repo_cwd, repository, repos_template)
if args.include_releases or args.include_everything: if args.include_releases or args.include_everything:
backup_releases( backup_releases(
args, args,
repo_cwd, repo_cwd,
repository, repository,
repos_template, repos_template,
include_assets=args.include_assets or args.include_everything, include_assets=args.include_assets or args.include_everything,
) )
except RepositoryUnavailableError as e:
logger.warning(f"Repository {repository['full_name']} is unavailable (HTTP 451)")
if e.dmca_url:
logger.warning(f"DMCA notice: {e.dmca_url}")
logger.info(f"Skipping remaining resources for {repository['full_name']}")
continue
if args.incremental: if args.incremental:
if last_update == "0000-00-00T00:00:00Z": if last_update == "0000-00-00T00:00:00Z":

143
tests/test_http_451.py Normal file
View File

@@ -0,0 +1,143 @@
"""Tests for HTTP 451 (DMCA takedown) handling."""
import json
from unittest.mock import Mock, patch
import pytest
from github_backup import github_backup
class TestHTTP451Exception:
"""Test suite for HTTP 451 DMCA takedown exception handling."""
def test_repository_unavailable_error_raised(self):
"""HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
# Create mock args
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
# Mock HTTPError 451 response
mock_response = Mock()
mock_response.getcode.return_value = 451
dmca_data = {
"message": "Repository access blocked",
"block": {
"reason": "dmca",
"created_at": "2024-11-12T14:38:04Z",
"html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
}
}
mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8")
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))
# Check exception has DMCA URL
assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
assert "451" in str(exc_info.value)
def test_repository_unavailable_error_without_dmca_url(self):
"""HTTP 451 without DMCA details should still raise exception."""
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
mock_response = Mock()
mock_response.getcode.return_value = 451
mock_response.read.return_value = b'{"message": "Blocked"}'
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))
# Exception raised even without DMCA URL
assert exc_info.value.dmca_url is None
assert "451" in str(exc_info.value)
def test_repository_unavailable_error_with_malformed_json(self):
"""HTTP 451 with malformed JSON should still raise exception."""
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
mock_response = Mock()
mock_response.getcode.return_value = 451
mock_response.read.return_value = b"invalid json {"
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
with pytest.raises(github_backup.RepositoryUnavailableError):
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))
def test_other_http_errors_unchanged(self):
"""Other HTTP errors should still raise generic Exception."""
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
mock_response = Mock()
mock_response.getcode.return_value = 404
mock_response.read.return_value = b'{"message": "Not Found"}'
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Not Found"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
# Should raise generic Exception, not RepositoryUnavailableError
with pytest.raises(Exception) as exc_info:
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/notfound/issues"))
assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError)
assert "404" in str(exc_info.value)
if __name__ == "__main__":
pytest.main([__file__, "-v"])