Merge pull request #454 from Iamrodos/http-451

Skip DMCA'd repos which return a 451 response
This commit is contained in:
Jose Diaz-Gonzalez
2025-11-28 18:38:32 -05:00
committed by GitHub
2 changed files with 201 additions and 29 deletions

View File

@@ -37,6 +37,15 @@ FNULL = open(os.devnull, "w")
FILE_URI_PREFIX = "file://"
logger = logging.getLogger(__name__)
class RepositoryUnavailableError(Exception):
"""Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown)."""
def __init__(self, message, dmca_url=None):
super().__init__(message)
self.dmca_url = dmca_url
# Setup SSL context with fallback chain
https_ctx = ssl.create_default_context()
if https_ctx.get_ca_certs():
@@ -612,6 +621,19 @@ def retrieve_data_gen(args, template, query_args=None, single_request=False):
status_code = int(r.getcode())
# Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository
if status_code == 451:
dmca_url = None
try:
response_data = json.loads(r.read().decode("utf-8"))
dmca_url = response_data.get("block", {}).get("html_url")
except Exception:
pass
raise RepositoryUnavailableError(
"Repository unavailable due to legal reasons (HTTP 451)",
dmca_url=dmca_url
)
# Check if we got correct data
try:
response = json.loads(r.read().decode("utf-8"))
@@ -1668,6 +1690,7 @@ def backup_repositories(args, output_directory, repositories):
continue # don't try to back anything else for a gist; it doesn't exist
try:
download_wiki = args.include_wiki or args.include_everything
if repository["has_wiki"] and download_wiki:
fetch_repository(
@@ -1702,6 +1725,12 @@ def backup_repositories(args, output_directory, repositories):
repos_template,
include_assets=args.include_assets or args.include_everything,
)
except RepositoryUnavailableError as e:
logger.warning(f"Repository {repository['full_name']} is unavailable (HTTP 451)")
if e.dmca_url:
logger.warning(f"DMCA notice: {e.dmca_url}")
logger.info(f"Skipping remaining resources for {repository['full_name']}")
continue
if args.incremental:
if last_update == "0000-00-00T00:00:00Z":

143
tests/test_http_451.py Normal file
View File

@@ -0,0 +1,143 @@
"""Tests for HTTP 451 (DMCA takedown) handling."""
import json
from unittest.mock import Mock, patch
import pytest
from github_backup import github_backup
class TestHTTP451Exception:
"""Test suite for HTTP 451 DMCA takedown exception handling."""
def test_repository_unavailable_error_raised(self):
"""HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
# Create mock args
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
# Mock HTTPError 451 response
mock_response = Mock()
mock_response.getcode.return_value = 451
dmca_data = {
"message": "Repository access blocked",
"block": {
"reason": "dmca",
"created_at": "2024-11-12T14:38:04Z",
"html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
}
}
mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8")
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))
# Check exception has DMCA URL
assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
assert "451" in str(exc_info.value)
def test_repository_unavailable_error_without_dmca_url(self):
"""HTTP 451 without DMCA details should still raise exception."""
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
mock_response = Mock()
mock_response.getcode.return_value = 451
mock_response.read.return_value = b'{"message": "Blocked"}'
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))
# Exception raised even without DMCA URL
assert exc_info.value.dmca_url is None
assert "451" in str(exc_info.value)
def test_repository_unavailable_error_with_malformed_json(self):
"""HTTP 451 with malformed JSON should still raise exception."""
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
mock_response = Mock()
mock_response.getcode.return_value = 451
mock_response.read.return_value = b"invalid json {"
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
with pytest.raises(github_backup.RepositoryUnavailableError):
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))
def test_other_http_errors_unchanged(self):
"""Other HTTP errors should still raise generic Exception."""
args = Mock()
args.as_app = False
args.token_fine = None
args.token_classic = None
args.username = None
args.password = None
args.osx_keychain_item_name = None
args.osx_keychain_item_account = None
args.throttle_limit = None
args.throttle_pause = 0
mock_response = Mock()
mock_response.getcode.return_value = 404
mock_response.read.return_value = b'{"message": "Not Found"}'
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Not Found"
def mock_get_response(request, auth, template):
return mock_response, []
with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
# Should raise generic Exception, not RepositoryUnavailableError
with pytest.raises(Exception) as exc_info:
list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/notfound/issues"))
assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError)
assert "404" in str(exc_info.value)
if __name__ == "__main__":
pytest.main([__file__, "-v"])