mirror of
https://github.com/josegonzalez/python-github-backup.git
synced 2026-01-14 02:02:36 +01:00
Fix fine-grained PAT attachment downloads for private repos (#477)
Fine-grained personal access tokens cannot download attachments from private repositories directly due to a GitHub platform limitation. This adds a workaround for image attachments (/assets/ URLs) using GitHub's Markdown API to convert URLs to JWT-signed URLs that can be downloaded without authentication.

Changes:
- Add get_jwt_signed_url_via_markdown_api() function
- Detect fine-grained token + private repo + /assets/ URL upfront
- Use JWT workaround for those cases, mark success with jwt_workaround flag
- Skip download with skipped_at when workaround fails
- Add startup warning when using --attachments with fine-grained tokens
- Document limitation in README (file attachments still fail)
- Add 6 unit tests for JWT workaround logic
This commit is contained in:
@@ -281,6 +281,8 @@ The tool automatically extracts file extensions from HTTP headers to ensure file
|
|||||||
|
|
||||||
**Repository filtering** for repo files/assets handles renamed and transferred repositories gracefully. URLs are included if they either match the current repository name directly, or redirect to it (e.g., ``willmcgugan/rich`` redirects to ``Textualize/rich`` after transfer).
|
**Repository filtering** for repo files/assets handles renamed and transferred repositories gracefully. URLs are included if they either match the current repository name directly, or redirect to it (e.g., ``willmcgugan/rich`` redirects to ``Textualize/rich`` after transfer).
|
||||||
|
|
||||||
|
**Fine-grained token limitation:** Due to a GitHub platform limitation, fine-grained personal access tokens (``github_pat_...``) cannot download attachments from private repositories directly. This affects both ``/assets/`` (images) and ``/files/`` (documents) URLs. The tool implements a workaround for image attachments using GitHub's Markdown API, which converts URLs to temporary JWT-signed URLs that can be downloaded. However, this workaround only works for images - document attachments (PDFs, text files, etc.) will fail with 404 errors when using fine-grained tokens on private repos. For full attachment support on private repositories, use a classic token (``-t``) instead of a fine-grained token (``-f``). See `#477 <https://github.com/josegonzalez/python-github-backup/issues/477>`_ for details.
|
||||||
|
|
||||||
|
|
||||||
Run in Docker container
|
Run in Docker container
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|||||||
@@ -46,6 +46,16 @@ def main():
|
|||||||
"Use -t/--token or -f/--token-fine to authenticate."
|
"Use -t/--token or -f/--token-fine to authenticate."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Issue #477: Fine-grained PATs cannot download all attachment types from
|
||||||
|
# private repos. Image attachments will be retried via Markdown API workaround.
|
||||||
|
if args.include_attachments and args.token_fine:
|
||||||
|
logger.warning(
|
||||||
|
"Using --attachments with fine-grained token. Due to GitHub platform "
|
||||||
|
"limitations, file attachments (PDFs, etc.) from private repos may fail. "
|
||||||
|
"Image attachments will be retried via workaround. For full attachment "
|
||||||
|
"support, use --token-classic instead."
|
||||||
|
)
|
||||||
|
|
||||||
if args.quiet:
|
if args.quiet:
|
||||||
logger.setLevel(logging.WARNING)
|
logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|||||||
@@ -1062,6 +1062,65 @@ def download_attachment_file(url, path, auth, as_app=False, fine=False):
|
|||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
|
def get_jwt_signed_url_via_markdown_api(url, token, repo_context):
    """Convert a user-attachments/assets URL to a JWT-signed URL via Markdown API.

    GitHub's Markdown API renders image URLs and returns HTML containing
    JWT-signed private-user-images.githubusercontent.com URLs that work
    without token authentication.

    This is a workaround for issue #477 where fine-grained PATs cannot
    download user-attachments URLs from private repos directly.

    Limitations:
    - Only works for /assets/ URLs (images)
    - Does NOT work for /files/ URLs (PDFs, text files, etc.)
    - JWT URLs expire after ~5 minutes

    Args:
        url: The github.com/user-attachments/assets/UUID URL
        token: Raw fine-grained PAT (github_pat_...)
        repo_context: Repository context as "owner/repo"

    Returns:
        str: JWT-signed URL from private-user-images.githubusercontent.com
        None: If conversion fails (HTTP error, any other request/decoding
            error, or a response that contains no JWT-signed image URL)
    """
    try:
        # The attachment must be embedded as a Markdown image: only then does
        # the rendered HTML contain an <img> tag whose src is the signed URL.
        payload = json.dumps(
            {"text": f"![img]({url})", "mode": "gfm", "context": repo_context}
        ).encode("utf-8")

        request = Request("https://api.github.com/markdown", data=payload, method="POST")
        request.add_header("Authorization", f"token {token}")
        request.add_header("Content-Type", "application/json")
        request.add_header("Accept", "application/vnd.github+json")

        # Close the response explicitly so the connection is released even
        # when reading or decoding the body fails.
        response = urlopen(request, timeout=30)
        try:
            html = response.read().decode("utf-8")
        finally:
            response.close()

        # Parse JWT-signed URL from HTML response
        # Format: <img src="https://private-user-images.githubusercontent.com/...?jwt=..." ...>
        if match := re.search(
            r'src="(https://private-user-images\.githubusercontent\.com/[^"]+)"', html
        ):
            jwt_url = match.group(1)
            logger.debug("Converted attachment URL to JWT-signed URL via Markdown API")
            return jwt_url

        logger.debug("Markdown API response did not contain JWT-signed URL")
        return None

    except HTTPError as e:
        logger.debug(
            "Markdown API request failed with HTTP {0}: {1}".format(e.code, e.reason)
        )
        return None
    except Exception as e:
        # Deliberate best-effort: any failure means "no workaround available"
        # and the caller decides how to record the skipped attachment.
        logger.debug("Markdown API request failed: {0}".format(str(e)))
        return None
|
||||||
|
|
||||||
|
|
||||||
def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None):
|
def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None):
|
||||||
"""Extract GitHub-hosted attachment URLs from issue/PR body and comments.
|
"""Extract GitHub-hosted attachment URLs from issue/PR body and comments.
|
||||||
|
|
||||||
@@ -1415,15 +1474,46 @@ def download_attachments(
|
|||||||
filename = get_attachment_filename(url)
|
filename = get_attachment_filename(url)
|
||||||
filepath = os.path.join(attachments_dir, filename)
|
filepath = os.path.join(attachments_dir, filename)
|
||||||
|
|
||||||
# Download and get metadata
|
# Issue #477: Fine-grained PATs cannot download user-attachments/assets
|
||||||
metadata = download_attachment_file(
|
# from private repos directly (404). Use Markdown API workaround to get
|
||||||
url,
|
# a JWT-signed URL. Only works for /assets/ (images), not /files/.
|
||||||
filepath,
|
needs_jwt = (
|
||||||
get_auth(args, encode=not args.as_app),
|
args.token_fine is not None
|
||||||
as_app=args.as_app,
|
and repository.get("private", False)
|
||||||
fine=args.token_fine is not None,
|
and "github.com/user-attachments/assets/" in url
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if not needs_jwt:
|
||||||
|
# NORMAL download path
|
||||||
|
metadata = download_attachment_file(
|
||||||
|
url,
|
||||||
|
filepath,
|
||||||
|
get_auth(args, encode=not args.as_app),
|
||||||
|
as_app=args.as_app,
|
||||||
|
fine=args.token_fine is not None,
|
||||||
|
)
|
||||||
|
elif jwt_url := get_jwt_signed_url_via_markdown_api(
|
||||||
|
url, args.token_fine, repository["full_name"]
|
||||||
|
):
|
||||||
|
# JWT needed and extracted, download via JWT
|
||||||
|
metadata = download_attachment_file(
|
||||||
|
jwt_url, filepath, auth=None, as_app=False, fine=False
|
||||||
|
)
|
||||||
|
metadata["url"] = url # Apply back the original URL
|
||||||
|
metadata["jwt_workaround"] = True
|
||||||
|
else:
|
||||||
|
# Markdown API workaround failed - skip download we know will fail
|
||||||
|
metadata = {
|
||||||
|
"url": url,
|
||||||
|
"success": False,
|
||||||
|
"skipped_at": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"error": "Fine-grained token cannot download private repo attachments. "
|
||||||
|
"Markdown API workaround failed. Use --token-classic instead.",
|
||||||
|
}
|
||||||
|
logger.warning(
|
||||||
|
"Skipping attachment {0}: {1}".format(url, metadata["error"])
|
||||||
|
)
|
||||||
|
|
||||||
# If download succeeded but we got an extension from Content-Disposition,
|
# If download succeeded but we got an extension from Content-Disposition,
|
||||||
# we may need to rename the file to add the extension
|
# we may need to rename the file to add the extension
|
||||||
if metadata["success"] and metadata.get("original_filename"):
|
if metadata["success"] and metadata.get("original_filename"):
|
||||||
@@ -1951,7 +2041,9 @@ def backup_security_advisories(args, repo_cwd, repository, repos_template):
|
|||||||
logger.info("Retrieving {0} security advisories".format(repository["full_name"]))
|
logger.info("Retrieving {0} security advisories".format(repository["full_name"]))
|
||||||
mkdir_p(repo_cwd, advisory_cwd)
|
mkdir_p(repo_cwd, advisory_cwd)
|
||||||
|
|
||||||
template = "{0}/{1}/security-advisories".format(repos_template, repository["full_name"])
|
template = "{0}/{1}/security-advisories".format(
|
||||||
|
repos_template, repository["full_name"]
|
||||||
|
)
|
||||||
|
|
||||||
_advisories = retrieve_data(args, template)
|
_advisories = retrieve_data(args, template)
|
||||||
|
|
||||||
|
|||||||
@@ -349,3 +349,139 @@ class TestManifestDuplicatePrevention:
|
|||||||
downloaded_urls[0]
|
downloaded_urls[0]
|
||||||
== "https://github.com/user-attachments/assets/unavailable"
|
== "https://github.com/user-attachments/assets/unavailable"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestJWTWorkaround:
    """Test JWT workaround for fine-grained tokens on private repos (issue #477)."""

    def test_markdown_api_extracts_jwt_url(self):
        """Markdown API response with JWT URL is extracted correctly."""
        from unittest.mock import patch, Mock

        jwt_url = (
            "https://private-user-images.githubusercontent.com/123/abc.png"
            "?jwt=eyJhbGciOiJ"
        )
        # Rendered HTML wraps the image in a link; both carry the signed URL.
        html_response = (
            f'<p><a href="{jwt_url}">'
            f'<img src="{jwt_url}" alt="img"></a></p>'
        )

        mock_response = Mock()
        mock_response.read.return_value = html_response.encode("utf-8")

        with patch("github_backup.github_backup.urlopen", return_value=mock_response):
            result = github_backup.get_jwt_signed_url_via_markdown_api(
                "https://github.com/user-attachments/assets/abc123",
                "github_pat_token",
                "owner/repo",
            )

        assert result == jwt_url

    def test_markdown_api_returns_none_on_http_error(self):
        """HTTP errors return None."""
        from unittest.mock import patch
        from urllib.error import HTTPError

        with patch(
            "github_backup.github_backup.urlopen",
            side_effect=HTTPError(None, 403, "Forbidden", {}, None),
        ):
            result = github_backup.get_jwt_signed_url_via_markdown_api(
                "https://github.com/user-attachments/assets/abc123",
                "github_pat_token",
                "owner/repo",
            )

        assert result is None

    def test_markdown_api_returns_none_when_no_jwt_url(self):
        """Response without JWT URL returns None."""
        from unittest.mock import patch, Mock

        mock_response = Mock()
        mock_response.read.return_value = b"<p>No image here</p>"

        with patch("github_backup.github_backup.urlopen", return_value=mock_response):
            result = github_backup.get_jwt_signed_url_via_markdown_api(
                "https://github.com/user-attachments/assets/abc123",
                "github_pat_token",
                "owner/repo",
            )

        assert result is None

    def test_needs_jwt_only_for_fine_grained_private_assets(self):
        """needs_jwt is True only for fine-grained + private + /assets/ URL."""

        def needs_jwt(token_fine, private, url):
            # Mirrors the needs_jwt predicate in download_attachments. Taking
            # the inputs as variables avoids identity comparisons against
            # string literals, which raise a SyntaxWarning on Python 3.8+.
            return (
                token_fine is not None
                and private
                and "github.com/user-attachments/assets/" in url
            )

        assets_url = "https://github.com/user-attachments/assets/abc123"
        files_url = "https://github.com/user-attachments/files/123/doc.pdf"
        fine_token = "github_pat_"

        # Fine-grained + private + assets = True
        assert needs_jwt(fine_token, True, assets_url) is True

        # Fine-grained + private + files = False
        assert needs_jwt(fine_token, True, files_url) is False

        # Fine-grained + public + assets = False
        assert needs_jwt(fine_token, False, assets_url) is False

        # No fine-grained token + private + assets = False
        assert needs_jwt(None, True, assets_url) is False

    def test_jwt_workaround_sets_manifest_flag(self, attachment_test_setup):
        """Successful JWT workaround sets jwt_workaround flag in manifest."""
        from unittest.mock import patch

        setup = attachment_test_setup
        setup["args"].token_fine = "github_pat_test"
        setup["repository"]["private"] = True

        original_url = "https://github.com/user-attachments/assets/abc123"
        issue_data = {"body": original_url}

        jwt_url = "https://private-user-images.githubusercontent.com/123/abc.png?jwt=token"

        with patch(
            "github_backup.github_backup.get_jwt_signed_url_via_markdown_api",
            return_value=jwt_url,
        ), patch(
            "github_backup.github_backup.download_attachment_file",
            return_value={"success": True, "http_status": 200, "url": jwt_url},
        ):
            github_backup.download_attachments(
                setup["args"], setup["issue_cwd"], issue_data, 123, setup["repository"]
            )

        manifest_path = os.path.join(
            setup["issue_cwd"], "attachments", "123", "manifest.json"
        )
        with open(manifest_path) as f:
            manifest = json.load(f)

        # The manifest must record the original URL, not the short-lived
        # signed one that was actually fetched.
        assert manifest["attachments"][0]["jwt_workaround"] is True
        assert manifest["attachments"][0]["url"] == original_url

    def test_jwt_workaround_failure_uses_skipped_at(self, attachment_test_setup):
        """Failed JWT workaround uses skipped_at instead of downloaded_at."""
        from unittest.mock import patch

        setup = attachment_test_setup
        setup["args"].token_fine = "github_pat_test"
        setup["repository"]["private"] = True

        issue_data = {"body": "https://github.com/user-attachments/assets/abc123"}

        with patch(
            "github_backup.github_backup.get_jwt_signed_url_via_markdown_api",
            return_value=None,  # Markdown API failed
        ):
            github_backup.download_attachments(
                setup["args"], setup["issue_cwd"], issue_data, 123, setup["repository"]
            )

        manifest_path = os.path.join(
            setup["issue_cwd"], "attachments", "123", "manifest.json"
        )
        with open(manifest_path) as f:
            manifest = json.load(f)

        attachment = manifest["attachments"][0]
        assert attachment["success"] is False
        assert "skipped_at" in attachment
        assert "downloaded_at" not in attachment
        assert "Use --token-classic" in attachment["error"]
|
||||||
|
|||||||
Reference in New Issue
Block a user