Merge pull request #488 from Iamrodos/fix/487-dmca-regression

Fix HTTP 451 DMCA and 403 TOS handling regression (#487)
This commit is contained in:
Jose Diaz-Gonzalez
2026-02-16 00:46:05 -05:00
committed by GitHub
3 changed files with 244 additions and 79 deletions

View File

@@ -39,11 +39,11 @@ logger = logging.getLogger(__name__)
class RepositoryUnavailableError(Exception):
    """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown, TOS violation).

    Args:
        message: Human-readable description of why the repository is unavailable.
        legal_url: Optional URL of the legal notice attached to the block;
            None when no such URL is known.
    """

    def __init__(self, message, legal_url=None):
        super().__init__(message)
        # Kept on the instance so handlers can log the notice next to the message.
        self.legal_url = legal_url
# Setup SSL context with fallback chain # Setup SSL context with fallback chain
@@ -647,6 +647,14 @@ def retrieve_data(args, template, query_args=None, paginated=True):
return None return None
def fetch_all() -> Generator[dict, None, None]: def fetch_all() -> Generator[dict, None, None]:
def _extract_legal_url(response_body_bytes):
    """Extract DMCA/legal notice URL from GitHub API error response body.

    Args:
        response_body_bytes: Raw (undecoded) bytes of the error response body.

    Returns:
        The value stored under ``block.html_url`` in the decoded JSON payload,
        or None when the body cannot be decoded or parsed, or when it does not
        contain such an entry.
    """
    # Best-effort only: a malformed body must never mask the error that the
    # caller is about to raise, so every decode/parse failure maps to None.
    try:
        data = json.loads(response_body_bytes.decode("utf-8"))
    except (AttributeError, UnicodeDecodeError, ValueError, RecursionError):
        return None

    # Valid JSON is not necessarily an object, and "block" may be null.
    block = data.get("block") if isinstance(data, dict) else None
    if not isinstance(block, dict):
        return None
    return block.get("html_url")
next_url = None next_url = None
while True: while True:
@@ -661,47 +669,66 @@ def retrieve_data(args, template, query_args=None, paginated=True):
as_app=args.as_app, as_app=args.as_app,
fine=args.token_fine is not None, fine=args.token_fine is not None,
) )
http_response = make_request_with_retry(request, auth, args.max_retries) try:
http_response = make_request_with_retry(
match http_response.getcode(): request, auth, args.max_retries
case 200: )
# Success - Parse JSON response except HTTPError as exc:
try: if exc.code == 451:
response = json.loads(http_response.read().decode("utf-8")) legal_url = _extract_legal_url(exc.read())
break # Exit retry loop and handle the data returned
except (
IncompleteRead,
json.decoder.JSONDecodeError,
TimeoutError,
) as e:
logger.warning(f"{type(e).__name__} reading response")
if attempt < args.max_retries:
delay = calculate_retry_delay(attempt, {})
logger.warning(
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
)
time.sleep(delay)
continue # Next retry attempt
case 451:
# DMCA takedown - extract URL if available, then raise
dmca_url = None
try:
response_data = json.loads(
http_response.read().decode("utf-8")
)
dmca_url = response_data.get("block", {}).get("html_url")
except Exception:
pass
raise RepositoryUnavailableError( raise RepositoryUnavailableError(
"Repository unavailable due to legal reasons (HTTP 451)", f"Repository unavailable due to legal reasons (HTTP {exc.code})",
dmca_url=dmca_url, legal_url=legal_url,
) )
elif exc.code == 403:
# Rate-limit 403s (x-ratelimit-remaining=0) are retried
# by make_request_with_retry — re-raise if exhausted.
if int(exc.headers.get("x-ratelimit-remaining", 1)) < 1:
raise
# Only convert to RepositoryUnavailableError if GitHub
# indicates a TOS/DMCA block (response contains "block"
# key). Other 403s (permissions, scopes) should propagate.
body = exc.read()
try:
data = json.loads(body.decode("utf-8"))
except Exception:
data = {}
if "block" in data:
raise RepositoryUnavailableError(
"Repository access blocked (HTTP 403)",
legal_url=data.get("block", {}).get("html_url"),
)
raise
else:
raise
case _: # urlopen raises HTTPError for non-2xx, so only success gets here.
raise Exception( # Guard against unexpected status codes from proxies, future Python
f"API request returned HTTP {http_response.getcode()}: {http_response.reason}" # changes, or other edge cases we haven't considered.
status = http_response.getcode()
if status != 200:
raise Exception(
f"Unexpected HTTP {status} from {next_url or template} "
f"(expected non-2xx to raise HTTPError)"
)
# Parse JSON response
try:
response = json.loads(http_response.read().decode("utf-8"))
break # Exit retry loop and handle the data returned
except (
IncompleteRead,
json.decoder.JSONDecodeError,
TimeoutError,
) as e:
logger.warning(f"{type(e).__name__} reading response")
if attempt < args.max_retries:
delay = calculate_retry_delay(attempt, {})
logger.warning(
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
) )
time.sleep(delay)
continue # Next retry attempt
else: else:
logger.error( logger.error(
f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}" f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}"
@@ -1614,7 +1641,13 @@ def retrieve_repositories(args, authenticated_user):
paginated = False paginated = False
template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path) template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path)
repos = retrieve_data(args, template, paginated=paginated) try:
repos = retrieve_data(args, template, paginated=paginated)
except RepositoryUnavailableError as e:
logger.warning(f"Repository is unavailable: {e}")
if e.legal_url:
logger.warning(f"Legal notice: {e.legal_url}")
return []
if args.all_starred: if args.all_starred:
starred_template = "https://{0}/users/{1}/starred".format( starred_template = "https://{0}/users/{1}/starred".format(
@@ -1832,11 +1865,9 @@ def backup_repositories(args, output_directory, repositories):
include_assets=args.include_assets or args.include_everything, include_assets=args.include_assets or args.include_everything,
) )
except RepositoryUnavailableError as e: except RepositoryUnavailableError as e:
logger.warning( logger.warning(f"Repository {repository['full_name']} is unavailable: {e}")
f"Repository {repository['full_name']} is unavailable (HTTP 451)" if e.legal_url:
) logger.warning(f"Legal notice: {e.legal_url}")
if e.dmca_url:
logger.warning(f"DMCA notice: {e.dmca_url}")
logger.info(f"Skipping remaining resources for {repository['full_name']}") logger.info(f"Skipping remaining resources for {repository['full_name']}")
continue continue

View File

@@ -1,13 +1,28 @@
"""Tests for HTTP 451 (DMCA takedown) handling.""" """Tests for HTTP 451 (DMCA takedown) and HTTP 403 (TOS) handling."""
import io
import json import json
from unittest.mock import Mock, patch from unittest.mock import patch
from urllib.error import HTTPError
import pytest import pytest
from github_backup import github_backup from github_backup import github_backup
def _make_http_error(code, body_bytes, msg="Error", headers=None):
    """Create an HTTPError with a readable body (like a real urllib response).

    Args:
        code: HTTP status code to report on the error.
        body_bytes: Bytes served by the error's ``read()`` method.
        msg: Reason phrase attached to the error.
        headers: Mapping used as the response headers. When omitted, a single
            ``x-ratelimit-remaining`` header with the value "5000" is used.

    Returns:
        A new HTTPError whose body stream is positioned at the start.
    """
    if headers is None:
        headers = {"x-ratelimit-remaining": "5000"}
    return HTTPError(
        url="https://api.github.com/repos/test/repo",
        code=code,
        msg=msg,
        hdrs=headers,
        # A fresh in-memory stream per error, so each instance can be read once.
        fp=io.BytesIO(body_bytes),
    )
class TestHTTP451Exception: class TestHTTP451Exception:
"""Test suite for HTTP 451 DMCA takedown exception handling.""" """Test suite for HTTP 451 DMCA takedown exception handling."""
@@ -15,9 +30,6 @@ class TestHTTP451Exception:
"""HTTP 451 should raise RepositoryUnavailableError with DMCA URL.""" """HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
args = create_args() args = create_args()
mock_response = Mock()
mock_response.getcode.return_value = 451
dmca_data = { dmca_data = {
"message": "Repository access blocked", "message": "Repository access blocked",
"block": { "block": {
@@ -26,66 +38,166 @@ class TestHTTP451Exception:
"html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md", "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md",
}, },
} }
mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8") body = json.dumps(dmca_data).encode("utf-8")
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
with patch( def mock_urlopen(*a, **kw):
"github_backup.github_backup.make_request_with_retry", raise _make_http_error(451, body, msg="Unavailable For Legal Reasons")
return_value=mock_response,
): with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
github_backup.retrieve_data( github_backup.retrieve_data(
args, "https://api.github.com/repos/test/dmca/issues" args, "https://api.github.com/repos/test/dmca/issues"
) )
assert ( assert (
exc_info.value.dmca_url exc_info.value.legal_url
== "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
) )
assert "451" in str(exc_info.value) assert "451" in str(exc_info.value)
def test_repository_unavailable_error_without_dmca_url(self, create_args): def test_repository_unavailable_error_without_legal_url(self, create_args):
"""HTTP 451 without DMCA details should still raise exception.""" """HTTP 451 without DMCA details should still raise exception."""
args = create_args() args = create_args()
mock_response = Mock() def mock_urlopen(*a, **kw):
mock_response.getcode.return_value = 451 raise _make_http_error(451, b'{"message": "Blocked"}')
mock_response.read.return_value = b'{"message": "Blocked"}'
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
with patch( with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
"github_backup.github_backup.make_request_with_retry",
return_value=mock_response,
):
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
github_backup.retrieve_data( github_backup.retrieve_data(
args, "https://api.github.com/repos/test/dmca/issues" args, "https://api.github.com/repos/test/dmca/issues"
) )
assert exc_info.value.dmca_url is None assert exc_info.value.legal_url is None
assert "451" in str(exc_info.value) assert "451" in str(exc_info.value)
def test_repository_unavailable_error_with_malformed_json(self, create_args): def test_repository_unavailable_error_with_malformed_json(self, create_args):
"""HTTP 451 with malformed JSON should still raise exception.""" """HTTP 451 with malformed JSON should still raise exception."""
args = create_args() args = create_args()
mock_response = Mock() def mock_urlopen(*a, **kw):
mock_response.getcode.return_value = 451 raise _make_http_error(451, b"invalid json {")
mock_response.read.return_value = b"invalid json {"
mock_response.headers = {"x-ratelimit-remaining": "5000"}
mock_response.reason = "Unavailable For Legal Reasons"
with patch( with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
"github_backup.github_backup.make_request_with_retry",
return_value=mock_response,
):
with pytest.raises(github_backup.RepositoryUnavailableError): with pytest.raises(github_backup.RepositoryUnavailableError):
github_backup.retrieve_data( github_backup.retrieve_data(
args, "https://api.github.com/repos/test/dmca/issues" args, "https://api.github.com/repos/test/dmca/issues"
) )
class TestHTTP403TOS:
    """Test suite for HTTP 403 TOS violation handling."""

    def test_403_tos_raises_repository_unavailable(self, create_args):
        """Non-rate-limit HTTP 403 with a "block" entry should raise RepositoryUnavailableError."""
        args = create_args()

        tos_data = {
            "message": "Repository access blocked",
            "block": {
                "reason": "tos",
                "html_url": "https://github.com/contact/tos-violation",
            },
        }
        body = json.dumps(tos_data).encode("utf-8")

        def mock_urlopen(*a, **kw):
            # Build a new error on every call so its body stream is unread.
            raise _make_http_error(
                403,
                body,
                msg="Forbidden",
                headers={"x-ratelimit-remaining": "5000"},
            )

        with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
            with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
                github_backup.retrieve_data(
                    args, "https://api.github.com/repos/test/blocked/issues"
                )

        assert (
            exc_info.value.legal_url == "https://github.com/contact/tos-violation"
        )
        assert "403" in str(exc_info.value)

    def test_403_permission_denied_not_converted(self, create_args):
        """HTTP 403 without 'block' in body should propagate as HTTPError, not RepositoryUnavailableError."""
        args = create_args()

        body = json.dumps({"message": "Must have admin rights to Repository."}).encode(
            "utf-8"
        )

        def mock_urlopen(*a, **kw):
            raise _make_http_error(
                403,
                body,
                msg="Forbidden",
                headers={"x-ratelimit-remaining": "5000"},
            )

        with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
            with pytest.raises(HTTPError) as exc_info:
                github_backup.retrieve_data(
                    args, "https://api.github.com/repos/test/private/issues"
                )

        assert exc_info.value.code == 403

    def test_403_rate_limit_not_converted(self, create_args):
        """HTTP 403 with rate limit exhausted should NOT become RepositoryUnavailableError."""
        args = create_args()

        call_count = 0

        def mock_urlopen(*a, **kw):
            nonlocal call_count
            call_count += 1
            raise _make_http_error(
                403,
                b'{"message": "rate limit"}',
                msg="Forbidden",
                headers={"x-ratelimit-remaining": "0"},
            )

        # The retry delay is patched to 0 so the retries do not slow the test.
        with patch(
            "github_backup.github_backup.urlopen", side_effect=mock_urlopen
        ), patch("github_backup.github_backup.calculate_retry_delay", return_value=0):
            with pytest.raises(HTTPError) as exc_info:
                github_backup.retrieve_data(
                    args, "https://api.github.com/repos/test/ratelimit/issues"
                )

        assert exc_info.value.code == 403
        # Should have retried (not raised immediately as RepositoryUnavailableError)
        assert call_count > 1
class TestRetrieveRepositoriesUnavailable:
    """Test that retrieve_repositories handles RepositoryUnavailableError gracefully."""

    def test_unavailable_repo_returns_empty_list(self, create_args):
        """retrieve_repositories should return [] when the repo is unavailable."""
        args = create_args(repository="blocked-repo")

        # Serialized once; every simulated request answers with this 451 body.
        blocked_body = json.dumps(
            {
                "message": "Blocked",
                "block": {"html_url": "https://example.com/dmca"},
            }
        ).encode("utf-8")

        def mock_urlopen(*a, **kw):
            raise _make_http_error(
                451,
                blocked_body,
                msg="Unavailable For Legal Reasons",
            )

        with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
            repos = github_backup.retrieve_repositories(args, {"login": None})

        assert repos == []
# Allow running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

View File

@@ -288,6 +288,28 @@ class TestMakeRequestWithRetry:
assert exc_info.value.code == 403 assert exc_info.value.code == 403
assert call_count == 1 # No retries assert call_count == 1 # No retries
def test_451_error_not_retried(self):
    """HTTP 451 should not be retried - raise immediately."""
    call_count = 0

    def mock_urlopen(*args, **kwargs):
        # Count every attempt so the test can prove no retry happened.
        nonlocal call_count
        call_count += 1
        raise HTTPError(
            url="https://api.github.com/test",
            code=451,
            msg="Unavailable For Legal Reasons",
            hdrs={"x-ratelimit-remaining": "5000"},
            fp=None,
        )

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        with pytest.raises(HTTPError) as exc_info:
            make_request_with_retry(Mock(), None)

    assert exc_info.value.code == 451
    assert call_count == 1  # No retries
def test_connection_error_retries_and_succeeds(self): def test_connection_error_retries_and_succeeds(self):
"""URLError (connection error) should retry and succeed if subsequent request works.""" """URLError (connection error) should retry and succeed if subsequent request works."""
good_response = Mock() good_response = Mock()