mirror of
https://github.com/josegonzalez/python-github-backup.git
synced 2026-02-16 18:04:30 +01:00
Merge pull request #488 from Iamrodos/fix/487-dmca-regression
Fix HTTP 451 DMCA and 403 TOS handling regression (#487)
This commit is contained in:
@@ -39,11 +39,11 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class RepositoryUnavailableError(Exception):
    """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown, TOS violation).

    Args:
        message: Human-readable description; becomes ``str(exc)``.
        legal_url: Optional URL of the legal notice (DMCA notice, TOS page)
            reported by the API, or None when the response carried none.
        dmca_url: Deprecated spelling of ``legal_url`` kept so callers written
            against the previous signature keep working. Only consulted when
            ``legal_url`` is None.

    Attributes:
        legal_url: The legal notice URL, or None.
        dmca_url: Read-only alias of ``legal_url`` for older callers.
    """

    def __init__(self, message, legal_url=None, dmca_url=None):
        super().__init__(message)
        # Prefer the new keyword; fall back to the legacy one so that
        # RepositoryUnavailableError(msg, dmca_url=...) still carries the URL.
        self.legal_url = legal_url if legal_url is not None else dmca_url

    @property
    def dmca_url(self):
        """Deprecated alias for ``legal_url``."""
        return self.legal_url
|
||||||
|
|
||||||
|
|
||||||
# Setup SSL context with fallback chain
|
# Setup SSL context with fallback chain
|
||||||
@@ -647,6 +647,14 @@ def retrieve_data(args, template, query_args=None, paginated=True):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def fetch_all() -> Generator[dict, None, None]:
|
def fetch_all() -> Generator[dict, None, None]:
|
||||||
|
def _extract_legal_url(response_body_bytes):
|
||||||
|
"""Extract DMCA/legal notice URL from GitHub API error response body."""
|
||||||
|
try:
|
||||||
|
data = json.loads(response_body_bytes.decode("utf-8"))
|
||||||
|
return data.get("block", {}).get("html_url")
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
next_url = None
|
next_url = None
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@@ -661,47 +669,66 @@ def retrieve_data(args, template, query_args=None, paginated=True):
|
|||||||
as_app=args.as_app,
|
as_app=args.as_app,
|
||||||
fine=args.token_fine is not None,
|
fine=args.token_fine is not None,
|
||||||
)
|
)
|
||||||
http_response = make_request_with_retry(request, auth, args.max_retries)
|
try:
|
||||||
|
http_response = make_request_with_retry(
|
||||||
match http_response.getcode():
|
request, auth, args.max_retries
|
||||||
case 200:
|
)
|
||||||
# Success - Parse JSON response
|
except HTTPError as exc:
|
||||||
try:
|
if exc.code == 451:
|
||||||
response = json.loads(http_response.read().decode("utf-8"))
|
legal_url = _extract_legal_url(exc.read())
|
||||||
break # Exit retry loop and handle the data returned
|
|
||||||
except (
|
|
||||||
IncompleteRead,
|
|
||||||
json.decoder.JSONDecodeError,
|
|
||||||
TimeoutError,
|
|
||||||
) as e:
|
|
||||||
logger.warning(f"{type(e).__name__} reading response")
|
|
||||||
if attempt < args.max_retries:
|
|
||||||
delay = calculate_retry_delay(attempt, {})
|
|
||||||
logger.warning(
|
|
||||||
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
|
|
||||||
)
|
|
||||||
time.sleep(delay)
|
|
||||||
continue # Next retry attempt
|
|
||||||
|
|
||||||
case 451:
|
|
||||||
# DMCA takedown - extract URL if available, then raise
|
|
||||||
dmca_url = None
|
|
||||||
try:
|
|
||||||
response_data = json.loads(
|
|
||||||
http_response.read().decode("utf-8")
|
|
||||||
)
|
|
||||||
dmca_url = response_data.get("block", {}).get("html_url")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
raise RepositoryUnavailableError(
|
raise RepositoryUnavailableError(
|
||||||
"Repository unavailable due to legal reasons (HTTP 451)",
|
f"Repository unavailable due to legal reasons (HTTP {exc.code})",
|
||||||
dmca_url=dmca_url,
|
legal_url=legal_url,
|
||||||
)
|
)
|
||||||
|
elif exc.code == 403:
|
||||||
|
# Rate-limit 403s (x-ratelimit-remaining=0) are retried
|
||||||
|
# by make_request_with_retry — re-raise if exhausted.
|
||||||
|
if int(exc.headers.get("x-ratelimit-remaining", 1)) < 1:
|
||||||
|
raise
|
||||||
|
# Only convert to RepositoryUnavailableError if GitHub
|
||||||
|
# indicates a TOS/DMCA block (response contains "block"
|
||||||
|
# key). Other 403s (permissions, scopes) should propagate.
|
||||||
|
body = exc.read()
|
||||||
|
try:
|
||||||
|
data = json.loads(body.decode("utf-8"))
|
||||||
|
except Exception:
|
||||||
|
data = {}
|
||||||
|
if "block" in data:
|
||||||
|
raise RepositoryUnavailableError(
|
||||||
|
"Repository access blocked (HTTP 403)",
|
||||||
|
legal_url=data.get("block", {}).get("html_url"),
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
case _:
|
# urlopen raises HTTPError for non-2xx, so only success gets here.
|
||||||
raise Exception(
|
# Guard against unexpected status codes from proxies, future Python
|
||||||
f"API request returned HTTP {http_response.getcode()}: {http_response.reason}"
|
# changes, or other edge cases we haven't considered.
|
||||||
|
status = http_response.getcode()
|
||||||
|
if status != 200:
|
||||||
|
raise Exception(
|
||||||
|
f"Unexpected HTTP {status} from {next_url or template} "
|
||||||
|
f"(expected non-2xx to raise HTTPError)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Parse JSON response
|
||||||
|
try:
|
||||||
|
response = json.loads(http_response.read().decode("utf-8"))
|
||||||
|
break # Exit retry loop and handle the data returned
|
||||||
|
except (
|
||||||
|
IncompleteRead,
|
||||||
|
json.decoder.JSONDecodeError,
|
||||||
|
TimeoutError,
|
||||||
|
) as e:
|
||||||
|
logger.warning(f"{type(e).__name__} reading response")
|
||||||
|
if attempt < args.max_retries:
|
||||||
|
delay = calculate_retry_delay(attempt, {})
|
||||||
|
logger.warning(
|
||||||
|
f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})"
|
||||||
)
|
)
|
||||||
|
time.sleep(delay)
|
||||||
|
continue # Next retry attempt
|
||||||
else:
|
else:
|
||||||
logger.error(
|
logger.error(
|
||||||
f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}"
|
f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}"
|
||||||
@@ -1614,7 +1641,13 @@ def retrieve_repositories(args, authenticated_user):
|
|||||||
paginated = False
|
paginated = False
|
||||||
template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path)
|
template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path)
|
||||||
|
|
||||||
repos = retrieve_data(args, template, paginated=paginated)
|
try:
|
||||||
|
repos = retrieve_data(args, template, paginated=paginated)
|
||||||
|
except RepositoryUnavailableError as e:
|
||||||
|
logger.warning(f"Repository is unavailable: {e}")
|
||||||
|
if e.legal_url:
|
||||||
|
logger.warning(f"Legal notice: {e.legal_url}")
|
||||||
|
return []
|
||||||
|
|
||||||
if args.all_starred:
|
if args.all_starred:
|
||||||
starred_template = "https://{0}/users/{1}/starred".format(
|
starred_template = "https://{0}/users/{1}/starred".format(
|
||||||
@@ -1832,11 +1865,9 @@ def backup_repositories(args, output_directory, repositories):
|
|||||||
include_assets=args.include_assets or args.include_everything,
|
include_assets=args.include_assets or args.include_everything,
|
||||||
)
|
)
|
||||||
except RepositoryUnavailableError as e:
|
except RepositoryUnavailableError as e:
|
||||||
logger.warning(
|
logger.warning(f"Repository {repository['full_name']} is unavailable: {e}")
|
||||||
f"Repository {repository['full_name']} is unavailable (HTTP 451)"
|
if e.legal_url:
|
||||||
)
|
logger.warning(f"Legal notice: {e.legal_url}")
|
||||||
if e.dmca_url:
|
|
||||||
logger.warning(f"DMCA notice: {e.dmca_url}")
|
|
||||||
logger.info(f"Skipping remaining resources for {repository['full_name']}")
|
logger.info(f"Skipping remaining resources for {repository['full_name']}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,28 @@
|
|||||||
"""Tests for HTTP 451 (DMCA takedown) handling."""
|
"""Tests for HTTP 451 (DMCA takedown) and HTTP 403 (TOS) handling."""
|
||||||
|
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
from unittest.mock import Mock, patch
|
from unittest.mock import patch
|
||||||
|
from urllib.error import HTTPError
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from github_backup import github_backup
|
from github_backup import github_backup
|
||||||
|
|
||||||
|
|
||||||
|
def _make_http_error(code, body_bytes, msg="Error", headers=None):
|
||||||
|
"""Create an HTTPError with a readable body (like a real urllib response)."""
|
||||||
|
if headers is None:
|
||||||
|
headers = {"x-ratelimit-remaining": "5000"}
|
||||||
|
return HTTPError(
|
||||||
|
url="https://api.github.com/repos/test/repo",
|
||||||
|
code=code,
|
||||||
|
msg=msg,
|
||||||
|
hdrs=headers,
|
||||||
|
fp=io.BytesIO(body_bytes),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestHTTP451Exception:
|
class TestHTTP451Exception:
|
||||||
"""Test suite for HTTP 451 DMCA takedown exception handling."""
|
"""Test suite for HTTP 451 DMCA takedown exception handling."""
|
||||||
|
|
||||||
@@ -15,9 +30,6 @@ class TestHTTP451Exception:
|
|||||||
"""HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
|
"""HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
|
||||||
args = create_args()
|
args = create_args()
|
||||||
|
|
||||||
mock_response = Mock()
|
|
||||||
mock_response.getcode.return_value = 451
|
|
||||||
|
|
||||||
dmca_data = {
|
dmca_data = {
|
||||||
"message": "Repository access blocked",
|
"message": "Repository access blocked",
|
||||||
"block": {
|
"block": {
|
||||||
@@ -26,66 +38,166 @@ class TestHTTP451Exception:
|
|||||||
"html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md",
|
"html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8")
|
body = json.dumps(dmca_data).encode("utf-8")
|
||||||
mock_response.headers = {"x-ratelimit-remaining": "5000"}
|
|
||||||
mock_response.reason = "Unavailable For Legal Reasons"
|
|
||||||
|
|
||||||
with patch(
|
def mock_urlopen(*a, **kw):
|
||||||
"github_backup.github_backup.make_request_with_retry",
|
raise _make_http_error(451, body, msg="Unavailable For Legal Reasons")
|
||||||
return_value=mock_response,
|
|
||||||
):
|
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
|
||||||
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
|
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
|
||||||
github_backup.retrieve_data(
|
github_backup.retrieve_data(
|
||||||
args, "https://api.github.com/repos/test/dmca/issues"
|
args, "https://api.github.com/repos/test/dmca/issues"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
exc_info.value.dmca_url
|
exc_info.value.legal_url
|
||||||
== "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
|
== "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
|
||||||
)
|
)
|
||||||
assert "451" in str(exc_info.value)
|
assert "451" in str(exc_info.value)
|
||||||
|
|
||||||
def test_repository_unavailable_error_without_dmca_url(self, create_args):
|
def test_repository_unavailable_error_without_legal_url(self, create_args):
|
||||||
"""HTTP 451 without DMCA details should still raise exception."""
|
"""HTTP 451 without DMCA details should still raise exception."""
|
||||||
args = create_args()
|
args = create_args()
|
||||||
|
|
||||||
mock_response = Mock()
|
def mock_urlopen(*a, **kw):
|
||||||
mock_response.getcode.return_value = 451
|
raise _make_http_error(451, b'{"message": "Blocked"}')
|
||||||
mock_response.read.return_value = b'{"message": "Blocked"}'
|
|
||||||
mock_response.headers = {"x-ratelimit-remaining": "5000"}
|
|
||||||
mock_response.reason = "Unavailable For Legal Reasons"
|
|
||||||
|
|
||||||
with patch(
|
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
|
||||||
"github_backup.github_backup.make_request_with_retry",
|
|
||||||
return_value=mock_response,
|
|
||||||
):
|
|
||||||
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
|
with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
|
||||||
github_backup.retrieve_data(
|
github_backup.retrieve_data(
|
||||||
args, "https://api.github.com/repos/test/dmca/issues"
|
args, "https://api.github.com/repos/test/dmca/issues"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert exc_info.value.dmca_url is None
|
assert exc_info.value.legal_url is None
|
||||||
assert "451" in str(exc_info.value)
|
assert "451" in str(exc_info.value)
|
||||||
|
|
||||||
def test_repository_unavailable_error_with_malformed_json(self, create_args):
|
def test_repository_unavailable_error_with_malformed_json(self, create_args):
|
||||||
"""HTTP 451 with malformed JSON should still raise exception."""
|
"""HTTP 451 with malformed JSON should still raise exception."""
|
||||||
args = create_args()
|
args = create_args()
|
||||||
|
|
||||||
mock_response = Mock()
|
def mock_urlopen(*a, **kw):
|
||||||
mock_response.getcode.return_value = 451
|
raise _make_http_error(451, b"invalid json {")
|
||||||
mock_response.read.return_value = b"invalid json {"
|
|
||||||
mock_response.headers = {"x-ratelimit-remaining": "5000"}
|
|
||||||
mock_response.reason = "Unavailable For Legal Reasons"
|
|
||||||
|
|
||||||
with patch(
|
with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
|
||||||
"github_backup.github_backup.make_request_with_retry",
|
|
||||||
return_value=mock_response,
|
|
||||||
):
|
|
||||||
with pytest.raises(github_backup.RepositoryUnavailableError):
|
with pytest.raises(github_backup.RepositoryUnavailableError):
|
||||||
github_backup.retrieve_data(
|
github_backup.retrieve_data(
|
||||||
args, "https://api.github.com/repos/test/dmca/issues"
|
args, "https://api.github.com/repos/test/dmca/issues"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTTP403TOS:
    """HTTP 403 handling: TOS blocks versus permission and rate-limit 403s."""

    @staticmethod
    def _forbidden(body, remaining):
        """Return a urlopen stand-in that fails every call with HTTP 403.

        A new HTTPError is built per call so each attempt gets an unread body.
        """

        def fake_urlopen(*_args, **_kwargs):
            raise _make_http_error(
                403,
                body,
                msg="Forbidden",
                headers={"x-ratelimit-remaining": remaining},
            )

        return fake_urlopen

    def test_403_tos_raises_repository_unavailable(self, create_args):
        """A 403 whose body has a "block" entry raises RepositoryUnavailableError."""
        payload = json.dumps(
            {
                "message": "Repository access blocked",
                "block": {
                    "reason": "tos",
                    "html_url": "https://github.com/contact/tos-violation",
                },
            }
        ).encode("utf-8")
        args = create_args()

        urlopen_patch = patch(
            "github_backup.github_backup.urlopen",
            side_effect=self._forbidden(payload, "5000"),
        )
        with urlopen_patch, pytest.raises(
            github_backup.RepositoryUnavailableError
        ) as raised:
            github_backup.retrieve_data(
                args, "https://api.github.com/repos/test/blocked/issues"
            )

        error = raised.value
        assert error.legal_url == "https://github.com/contact/tos-violation"
        assert "403" in str(error)

    def test_403_permission_denied_not_converted(self, create_args):
        """A 403 without "block" in the body stays an HTTPError."""
        payload = json.dumps(
            {"message": "Must have admin rights to Repository."}
        ).encode("utf-8")
        args = create_args()

        urlopen_patch = patch(
            "github_backup.github_backup.urlopen",
            side_effect=self._forbidden(payload, "5000"),
        )
        with urlopen_patch, pytest.raises(HTTPError) as raised:
            github_backup.retrieve_data(
                args, "https://api.github.com/repos/test/private/issues"
            )

        assert raised.value.code == 403

    def test_403_rate_limit_not_converted(self, create_args):
        """A rate-limited 403 is retried and finally surfaces as HTTPError."""
        args = create_args()

        urlopen_patch = patch(
            "github_backup.github_backup.urlopen",
            side_effect=self._forbidden(b'{"message": "rate limit"}', "0"),
        )
        # Zero delay keeps the retry loop from sleeping during the test.
        no_delay = patch(
            "github_backup.github_backup.calculate_retry_delay", return_value=0
        )
        with urlopen_patch as fake_urlopen, no_delay, pytest.raises(
            HTTPError
        ) as raised:
            github_backup.retrieve_data(
                args, "https://api.github.com/repos/test/ratelimit/issues"
            )

        assert raised.value.code == 403
        # More than one call means the request was retried rather than being
        # converted straight into RepositoryUnavailableError.
        assert fake_urlopen.call_count > 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestRetrieveRepositoriesUnavailable:
    """retrieve_repositories must cope with RepositoryUnavailableError."""

    def test_unavailable_repo_returns_empty_list(self, create_args):
        """A single-repository lookup answered with HTTP 451 yields []."""
        blocked_body = json.dumps(
            {
                "message": "Blocked",
                "block": {"html_url": "https://example.com/dmca"},
            }
        ).encode("utf-8")

        def fake_urlopen(*_args, **_kwargs):
            raise _make_http_error(
                451,
                blocked_body,
                msg="Unavailable For Legal Reasons",
            )

        args = create_args(repository="blocked-repo")
        urlopen_patch = patch(
            "github_backup.github_backup.urlopen", side_effect=fake_urlopen
        )
        with urlopen_patch:
            result = github_backup.retrieve_repositories(args, {"login": None})

        assert result == []
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pytest.main([__file__, "-v"])
|
pytest.main([__file__, "-v"])
|
||||||
|
|||||||
@@ -288,6 +288,28 @@ class TestMakeRequestWithRetry:
|
|||||||
assert exc_info.value.code == 403
|
assert exc_info.value.code == 403
|
||||||
assert call_count == 1 # No retries
|
assert call_count == 1 # No retries
|
||||||
|
|
||||||
|
def test_451_error_not_retried(self):
    """HTTP 451 must surface on the first attempt, with no retry."""
    attempts = []

    def fake_urlopen(*_args, **_kwargs):
        # Record the attempt before failing so retries can be counted.
        attempts.append(1)
        raise HTTPError(
            url="https://api.github.com/test",
            code=451,
            msg="Unavailable For Legal Reasons",
            hdrs={"x-ratelimit-remaining": "5000"},
            fp=None,
        )

    urlopen_patch = patch(
        "github_backup.github_backup.urlopen", side_effect=fake_urlopen
    )
    with urlopen_patch, pytest.raises(HTTPError) as raised:
        make_request_with_retry(Mock(), None)

    assert raised.value.code == 451
    assert len(attempts) == 1  # No retries
|
||||||
|
|
||||||
def test_connection_error_retries_and_succeeds(self):
|
def test_connection_error_retries_and_succeeds(self):
|
||||||
"""URLError (connection error) should retry and succeed if subsequent request works."""
|
"""URLError (connection error) should retry and succeed if subsequent request works."""
|
||||||
good_response = Mock()
|
good_response = Mock()
|
||||||
|
|||||||
Reference in New Issue
Block a user