diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 0b7e1f8..ada2d40 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -39,11 +39,11 @@ logger = logging.getLogger(__name__) class RepositoryUnavailableError(Exception): - """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown).""" + """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown, TOS violation).""" - def __init__(self, message, dmca_url=None): + def __init__(self, message, legal_url=None): super().__init__(message) - self.dmca_url = dmca_url + self.legal_url = legal_url # Setup SSL context with fallback chain @@ -647,6 +647,14 @@ def retrieve_data(args, template, query_args=None, paginated=True): return None def fetch_all() -> Generator[dict, None, None]: + def _extract_legal_url(response_body_bytes): + """Extract DMCA/legal notice URL from GitHub API error response body.""" + try: + data = json.loads(response_body_bytes.decode("utf-8")) + return data.get("block", {}).get("html_url") + except Exception: + return None + next_url = None while True: @@ -661,47 +669,66 @@ def retrieve_data(args, template, query_args=None, paginated=True): as_app=args.as_app, fine=args.token_fine is not None, ) - http_response = make_request_with_retry(request, auth, args.max_retries) - - match http_response.getcode(): - case 200: - # Success - Parse JSON response - try: - response = json.loads(http_response.read().decode("utf-8")) - break # Exit retry loop and handle the data returned - except ( - IncompleteRead, - json.decoder.JSONDecodeError, - TimeoutError, - ) as e: - logger.warning(f"{type(e).__name__} reading response") - if attempt < args.max_retries: - delay = calculate_retry_delay(attempt, {}) - logger.warning( - f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})" - ) - time.sleep(delay) - continue # Next retry attempt - - case 451: - # DMCA takedown - extract URL if available, then raise - dmca_url = None - try: - response_data = json.loads( - http_response.read().decode("utf-8") - ) - dmca_url = response_data.get("block", {}).get("html_url") - except Exception: - pass + try: + http_response = make_request_with_retry( + request, auth, args.max_retries + ) + except HTTPError as exc: + if exc.code == 451: + legal_url = _extract_legal_url(exc.read()) raise RepositoryUnavailableError( - "Repository unavailable due to legal reasons (HTTP 451)", - dmca_url=dmca_url, + f"Repository unavailable due to legal reasons (HTTP {exc.code})", + legal_url=legal_url, ) + elif exc.code == 403: + # Rate-limit 403s (x-ratelimit-remaining=0) are retried + # by make_request_with_retry — re-raise if exhausted. + if int(exc.headers.get("x-ratelimit-remaining", 1)) < 1: + raise + # Only convert to RepositoryUnavailableError if GitHub + # indicates a TOS/DMCA block (response contains "block" + # key). Other 403s (permissions, scopes) should propagate. + body = exc.read() + try: + data = json.loads(body.decode("utf-8")) + except Exception: + data = {} + if "block" in data: + raise RepositoryUnavailableError( + "Repository access blocked (HTTP 403)", + legal_url=data.get("block", {}).get("html_url"), + ) + raise + else: + raise - case _: - raise Exception( - f"API request returned HTTP {http_response.getcode()}: {http_response.reason}" + # urlopen raises HTTPError for non-2xx, so only success gets here. + # Guard against unexpected status codes from proxies, future Python + # changes, or other edge cases we haven't considered. + status = http_response.getcode() + if status != 200: + raise Exception( + f"Unexpected HTTP {status} from {next_url or template} " + f"(expected non-2xx to raise HTTPError)" + ) + + # Parse JSON response + try: + response = json.loads(http_response.read().decode("utf-8")) + break # Exit retry loop and handle the data returned + except ( + IncompleteRead, + json.decoder.JSONDecodeError, + TimeoutError, + ) as e: + logger.warning(f"{type(e).__name__} reading response") + if attempt < args.max_retries: + delay = calculate_retry_delay(attempt, {}) + logger.warning( + f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})" ) + time.sleep(delay) + continue # Next retry attempt else: logger.error( f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}" @@ -1614,7 +1641,13 @@ def retrieve_repositories(args, authenticated_user): paginated = False template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path) - repos = retrieve_data(args, template, paginated=paginated) + try: + repos = retrieve_data(args, template, paginated=paginated) + except RepositoryUnavailableError as e: + logger.warning(f"Repository is unavailable: {e}") + if e.legal_url: + logger.warning(f"Legal notice: {e.legal_url}") + return [] if args.all_starred: starred_template = "https://{0}/users/{1}/starred".format( @@ -1832,11 +1865,9 @@ def backup_repositories(args, output_directory, repositories): include_assets=args.include_assets or args.include_everything, ) except RepositoryUnavailableError as e: - logger.warning( - f"Repository {repository['full_name']} is unavailable (HTTP 451)" - ) - if e.dmca_url: - logger.warning(f"DMCA notice: {e.dmca_url}") + logger.warning(f"Repository {repository['full_name']} is unavailable: {e}") + if e.legal_url: + logger.warning(f"Legal notice: {e.legal_url}") logger.info(f"Skipping remaining resources for {repository['full_name']}") continue diff --git a/tests/test_http_451.py b/tests/test_http_451.py index b556069..bba866e 100644 --- a/tests/test_http_451.py +++ b/tests/test_http_451.py @@ -1,13 +1,28 @@ -"""Tests for HTTP 451 (DMCA takedown) handling.""" +"""Tests for HTTP 451 (DMCA takedown) and HTTP 403 (TOS) handling.""" +import io import json -from unittest.mock import Mock, patch +from unittest.mock import patch +from urllib.error import HTTPError import pytest from github_backup import github_backup +def _make_http_error(code, body_bytes, msg="Error", headers=None): + """Create an HTTPError with a readable body (like a real urllib response).""" + if headers is None: + headers = {"x-ratelimit-remaining": "5000"} + return HTTPError( + url="https://api.github.com/repos/test/repo", + code=code, + msg=msg, + hdrs=headers, + fp=io.BytesIO(body_bytes), + ) + + class TestHTTP451Exception: """Test suite for HTTP 451 DMCA takedown exception handling.""" @@ -15,9 +30,6 @@ class TestHTTP451Exception: """HTTP 451 should raise RepositoryUnavailableError with DMCA URL.""" args = create_args() - mock_response = Mock() - mock_response.getcode.return_value = 451 - dmca_data = { "message": "Repository access blocked", "block": { @@ -26,66 +38,166 @@ class TestHTTP451Exception: "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md", }, } - mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8") - mock_response.headers = {"x-ratelimit-remaining": "5000"} - mock_response.reason = "Unavailable For Legal Reasons" + body = json.dumps(dmca_data).encode("utf-8") - with patch( - "github_backup.github_backup.make_request_with_retry", - return_value=mock_response, - ): + def mock_urlopen(*a, **kw): + raise _make_http_error(451, body, msg="Unavailable For Legal Reasons") + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: github_backup.retrieve_data( args, "https://api.github.com/repos/test/dmca/issues" ) assert ( - exc_info.value.dmca_url + exc_info.value.legal_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" ) assert "451" in str(exc_info.value) - def test_repository_unavailable_error_without_dmca_url(self, create_args): + def test_repository_unavailable_error_without_legal_url(self, create_args): """HTTP 451 without DMCA details should still raise exception.""" args = create_args() - mock_response = Mock() - mock_response.getcode.return_value = 451 - mock_response.read.return_value = b'{"message": "Blocked"}' - mock_response.headers = {"x-ratelimit-remaining": "5000"} - mock_response.reason = "Unavailable For Legal Reasons" + def mock_urlopen(*a, **kw): + raise _make_http_error(451, b'{"message": "Blocked"}') - with patch( - "github_backup.github_backup.make_request_with_retry", - return_value=mock_response, - ): + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: github_backup.retrieve_data( args, "https://api.github.com/repos/test/dmca/issues" ) - assert exc_info.value.dmca_url is None + assert exc_info.value.legal_url is None assert "451" in str(exc_info.value) def test_repository_unavailable_error_with_malformed_json(self, create_args): """HTTP 451 with malformed JSON should still raise exception.""" args = create_args() - mock_response = Mock() - mock_response.getcode.return_value = 451 - mock_response.read.return_value = b"invalid json {" - mock_response.headers = {"x-ratelimit-remaining": "5000"} - mock_response.reason = "Unavailable For Legal Reasons" + def mock_urlopen(*a, **kw): + raise _make_http_error(451, b"invalid json {") - with patch( - "github_backup.github_backup.make_request_with_retry", - return_value=mock_response, - ): + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): with pytest.raises(github_backup.RepositoryUnavailableError): github_backup.retrieve_data( args, "https://api.github.com/repos/test/dmca/issues" ) +class TestHTTP403TOS: + """Test suite for HTTP 403 TOS violation handling.""" + + def test_403_tos_raises_repository_unavailable(self, create_args): + """HTTP 403 (non-rate-limit) should raise RepositoryUnavailableError.""" + args = create_args() + + tos_data = { + "message": "Repository access blocked", + "block": { + "reason": "tos", + "html_url": "https://github.com/contact/tos-violation", + }, + } + body = json.dumps(tos_data).encode("utf-8") + + def mock_urlopen(*a, **kw): + raise _make_http_error( + 403, + body, + msg="Forbidden", + headers={"x-ratelimit-remaining": "5000"}, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/blocked/issues" + ) + + assert ( + exc_info.value.legal_url == "https://github.com/contact/tos-violation" + ) + assert "403" in str(exc_info.value) + + def test_403_permission_denied_not_converted(self, create_args): + """HTTP 403 without 'block' in body should propagate as HTTPError, not RepositoryUnavailableError.""" + args = create_args() + + body = json.dumps({"message": "Must have admin rights to Repository."}).encode( + "utf-8" + ) + + def mock_urlopen(*a, **kw): + raise _make_http_error( + 403, + body, + msg="Forbidden", + headers={"x-ratelimit-remaining": "5000"}, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(HTTPError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/private/issues" + ) + + assert exc_info.value.code == 403 + + def test_403_rate_limit_not_converted(self, create_args): + """HTTP 403 with rate limit exhausted should NOT become RepositoryUnavailableError.""" + args = create_args() + + call_count = 0 + + def mock_urlopen(*a, **kw): + nonlocal call_count + call_count += 1 + raise _make_http_error( + 403, + b'{"message": "rate limit"}', + msg="Forbidden", + headers={"x-ratelimit-remaining": "0"}, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): + with pytest.raises(HTTPError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/ratelimit/issues" + ) + + assert exc_info.value.code == 403 + # Should have retried (not raised immediately as RepositoryUnavailableError) + assert call_count > 1 + + +class TestRetrieveRepositoriesUnavailable: + """Test that retrieve_repositories handles RepositoryUnavailableError gracefully.""" + + def test_unavailable_repo_returns_empty_list(self, create_args): + """retrieve_repositories should return [] when the repo is unavailable.""" + args = create_args(repository="blocked-repo") + + def mock_urlopen(*a, **kw): + raise _make_http_error( + 451, + json.dumps( + { + "message": "Blocked", + "block": {"html_url": "https://example.com/dmca"}, + } + ).encode("utf-8"), + msg="Unavailable For Legal Reasons", + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + repos = github_backup.retrieve_repositories(args, {"login": None}) + + assert repos == [] + + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/test_retrieve_data.py b/tests/test_retrieve_data.py index 159f06e..014c309 100644 --- a/tests/test_retrieve_data.py +++ b/tests/test_retrieve_data.py @@ -288,6 +288,28 @@ class TestMakeRequestWithRetry: assert exc_info.value.code == 403 assert call_count == 1 # No retries + def test_451_error_not_retried(self): + """HTTP 451 should not be retried - raise immediately.""" + call_count = 0 + + def mock_urlopen(*args, **kwargs): + nonlocal call_count + call_count += 1 + raise HTTPError( + url="https://api.github.com/test", + code=451, + msg="Unavailable For Legal Reasons", + hdrs={"x-ratelimit-remaining": "5000"}, + fp=None, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(HTTPError) as exc_info: + make_request_with_retry(Mock(), None) + + assert exc_info.value.code == 451 + assert call_count == 1 # No retries + def test_connection_error_retries_and_succeeds(self): """URLError (connection error) should retry and succeed if subsequent request works.""" good_response = Mock()