From fce4abb74ae729679d5a6dc7b0b5cf57044efcf2 Mon Sep 17 00:00:00 2001 From: Rodos Date: Tue, 13 Jan 2026 13:15:38 +1100 Subject: [PATCH] Fix fine-grained PAT attachment downloads for private repos (#477) Fine-grained personal access tokens cannot download attachments from private repositories directly due to a GitHub platform limitation. This adds a workaround for image attachments (/assets/ URLs) using GitHub's Markdown API to convert URLs to JWT-signed URLs that can be downloaded without authentication. Changes: - Add get_jwt_signed_url_via_markdown_api() function - Detect fine-grained token + private repo + /assets/ URL upfront - Use JWT workaround for those cases, mark success with jwt_workaround flag - Skip download with skipped_at when workaround fails - Add startup warning when using --attachments with fine-grained tokens - Document limitation in README (file attachments still fail) - Add 6 unit tests for JWT workaround logic --- README.rst | 2 + github_backup/cli.py | 10 +++ github_backup/github_backup.py | 108 ++++++++++++++++++++++++-- tests/test_attachments.py | 136 +++++++++++++++++++++++++++++++++ 4 files changed, 248 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index e2c8fc2..c23027d 100644 --- a/README.rst +++ b/README.rst @@ -281,6 +281,8 @@ The tool automatically extracts file extensions from HTTP headers to ensure file **Repository filtering** for repo files/assets handles renamed and transferred repositories gracefully. URLs are included if they either match the current repository name directly, or redirect to it (e.g., ``willmcgugan/rich`` redirects to ``Textualize/rich`` after transfer). +**Fine-grained token limitation:** Due to a GitHub platform limitation, fine-grained personal access tokens (``github_pat_...``) cannot download attachments from private repositories directly. This affects both ``/assets/`` (images) and ``/files/`` (documents) URLs. 
The tool implements a workaround for image attachments using GitHub's Markdown API, which converts URLs to temporary JWT-signed URLs that can be downloaded. However, this workaround only works for images - document attachments (PDFs, text files, etc.) will fail with 404 errors when using fine-grained tokens on private repos. For full attachment support on private repositories, use a classic token (``-t``) instead of a fine-grained token (``-f``). See `#477 <https://github.com/josegonzalez/python-github-backup/issues/477>`_ for details. + Run in Docker container ----------------------- diff --git a/github_backup/cli.py b/github_backup/cli.py index 54849d4..987ae71 100644 --- a/github_backup/cli.py +++ b/github_backup/cli.py @@ -46,6 +46,16 @@ def main(): "Use -t/--token or -f/--token-fine to authenticate." ) + # Issue #477: Fine-grained PATs cannot download all attachment types from + # private repos. Image attachments will be retried via Markdown API workaround. + if args.include_attachments and args.token_fine: + logger.warning( + "Using --attachments with fine-grained token. Due to GitHub platform " + "limitations, file attachments (PDFs, etc.) from private repos may fail. " + "Image attachments will be retried via workaround. For full attachment " + "support, use --token-classic instead." + ) + if args.quiet: logger.setLevel(logging.WARNING) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 8a60f66..705f013 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1062,6 +1062,65 @@ def download_attachment_file(url, path, auth, as_app=False, fine=False): return metadata +def get_jwt_signed_url_via_markdown_api(url, token, repo_context): + """Convert a user-attachments/assets URL to a JWT-signed URL via Markdown API. + + GitHub's Markdown API renders image URLs and returns HTML containing + JWT-signed private-user-images.githubusercontent.com URLs that work + without token authentication. 
+ + This is a workaround for issue #477 where fine-grained PATs cannot + download user-attachments URLs from private repos directly. + + Limitations: + - Only works for /assets/ URLs (images) + - Does NOT work for /files/ URLs (PDFs, text files, etc.) + - JWT URLs expire after ~5 minutes + + Args: + url: The github.com/user-attachments/assets/UUID URL + token: Raw fine-grained PAT (github_pat_...) + repo_context: Repository context as "owner/repo" + + Returns: + str: JWT-signed URL from private-user-images.githubusercontent.com + None: If conversion fails + """ + + try: + payload = json.dumps( + {"text": f"![img]({url})", "mode": "gfm", "context": repo_context} + ).encode("utf-8") + + request = Request("https://api.github.com/markdown", data=payload, method="POST") + request.add_header("Authorization", f"token {token}") + request.add_header("Content-Type", "application/json") + request.add_header("Accept", "application/vnd.github+json") + + html = urlopen(request, timeout=30).read().decode("utf-8") + + # Parse JWT-signed URL from HTML response + # Format: <img src="https://private-user-images.githubusercontent.com/...?jwt=..." ...> + if match := re.search( + r'src="(https://private-user-images\.githubusercontent\.com/[^"]+)"', html + ): + jwt_url = match.group(1) + logger.debug("Converted attachment URL to JWT-signed URL via Markdown API") + return jwt_url + + logger.debug("Markdown API response did not contain JWT-signed URL") + return None + + except HTTPError as e: + logger.debug( + "Markdown API request failed with HTTP {0}: {1}".format(e.code, e.reason) + ) + return None + except Exception as e: + logger.debug("Markdown API request failed: {0}".format(str(e))) + return None + + def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None): """Extract GitHub-hosted attachment URLs from issue/PR body and comments. 
@@ -1415,15 +1474,46 @@ def download_attachments( filename = get_attachment_filename(url) filepath = os.path.join(attachments_dir, filename) - # Download and get metadata - metadata = download_attachment_file( - url, - filepath, - get_auth(args, encode=not args.as_app), - as_app=args.as_app, - fine=args.token_fine is not None, + # Issue #477: Fine-grained PATs cannot download user-attachments/assets + # from private repos directly (404). Use Markdown API workaround to get + # a JWT-signed URL. Only works for /assets/ (images), not /files/. + needs_jwt = ( + args.token_fine is not None + and repository.get("private", False) + and "github.com/user-attachments/assets/" in url ) + if not needs_jwt: + # NORMAL download path + metadata = download_attachment_file( + url, + filepath, + get_auth(args, encode=not args.as_app), + as_app=args.as_app, + fine=args.token_fine is not None, + ) + elif jwt_url := get_jwt_signed_url_via_markdown_api( + url, args.token_fine, repository["full_name"] + ): + # JWT needed and extracted, download via JWT + metadata = download_attachment_file( + jwt_url, filepath, auth=None, as_app=False, fine=False + ) + metadata["url"] = url # Apply back the original URL + metadata["jwt_workaround"] = True + else: + # Markdown API workaround failed - skip download we know will fail + metadata = { + "url": url, + "success": False, + "skipped_at": datetime.now(timezone.utc).isoformat(), + "error": "Fine-grained token cannot download private repo attachments. " + "Markdown API workaround failed. 
Use --token-classic instead.", + } + logger.warning( + "Skipping attachment {0}: {1}".format(url, metadata["error"]) + ) + # If download succeeded but we got an extension from Content-Disposition, # we may need to rename the file to add the extension if metadata["success"] and metadata.get("original_filename"): @@ -1951,7 +2041,9 @@ def backup_security_advisories(args, repo_cwd, repository, repos_template): logger.info("Retrieving {0} security advisories".format(repository["full_name"])) mkdir_p(repo_cwd, advisory_cwd) - template = "{0}/{1}/security-advisories".format(repos_template, repository["full_name"]) + template = "{0}/{1}/security-advisories".format( + repos_template, repository["full_name"] + ) _advisories = retrieve_data(args, template) diff --git a/tests/test_attachments.py b/tests/test_attachments.py index b338caf..4613984 100644 --- a/tests/test_attachments.py +++ b/tests/test_attachments.py @@ -349,3 +349,139 @@ class TestManifestDuplicatePrevention: downloaded_urls[0] == "https://github.com/user-attachments/assets/unavailable" ) + + +class TestJWTWorkaround: + """Test JWT workaround for fine-grained tokens on private repos (issue #477).""" + + def test_markdown_api_extracts_jwt_url(self): + """Markdown API response with JWT URL is extracted correctly.""" + from unittest.mock import patch, Mock + + html_response = '''

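<p><img src="https://private-user-images.githubusercontent.com/123/abc.png?jwt=eyJhbGciOiJ" alt="img"></p>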

''' + + mock_response = Mock() + mock_response.read.return_value = html_response.encode("utf-8") + + with patch("github_backup.github_backup.urlopen", return_value=mock_response): + result = github_backup.get_jwt_signed_url_via_markdown_api( + "https://github.com/user-attachments/assets/abc123", + "github_pat_token", + "owner/repo" + ) + + assert result == "https://private-user-images.githubusercontent.com/123/abc.png?jwt=eyJhbGciOiJ" + + def test_markdown_api_returns_none_on_http_error(self): + """HTTP errors return None.""" + from unittest.mock import patch + from urllib.error import HTTPError + + with patch("github_backup.github_backup.urlopen", side_effect=HTTPError(None, 403, "Forbidden", {}, None)): + result = github_backup.get_jwt_signed_url_via_markdown_api( + "https://github.com/user-attachments/assets/abc123", + "github_pat_token", + "owner/repo" + ) + + assert result is None + + def test_markdown_api_returns_none_when_no_jwt_url(self): + """Response without JWT URL returns None.""" + from unittest.mock import patch, Mock + + mock_response = Mock() + mock_response.read.return_value = b"

<p>No image here</p>

" + + with patch("github_backup.github_backup.urlopen", return_value=mock_response): + result = github_backup.get_jwt_signed_url_via_markdown_api( + "https://github.com/user-attachments/assets/abc123", + "github_pat_token", + "owner/repo" + ) + + assert result is None + + def test_needs_jwt_only_for_fine_grained_private_assets(self): + """needs_jwt is True only for fine-grained + private + /assets/ URL.""" + assets_url = "https://github.com/user-attachments/assets/abc123" + files_url = "https://github.com/user-attachments/files/123/doc.pdf" + + # Fine-grained + private + assets = True + assert ( + "github_pat_" is not None + and True # private + and "github.com/user-attachments/assets/" in assets_url + ) is True + + # Fine-grained + private + files = False + assert ( + "github_pat_" is not None + and True + and "github.com/user-attachments/assets/" in files_url + ) is False + + # Fine-grained + public + assets = False + assert ( + "github_pat_" is not None + and False # public + and "github.com/user-attachments/assets/" in assets_url + ) is False + + def test_jwt_workaround_sets_manifest_flag(self, attachment_test_setup): + """Successful JWT workaround sets jwt_workaround flag in manifest.""" + from unittest.mock import patch, Mock + + setup = attachment_test_setup + setup["args"].token_fine = "github_pat_test" + setup["repository"]["private"] = True + + issue_data = {"body": "https://github.com/user-attachments/assets/abc123"} + + jwt_url = "https://private-user-images.githubusercontent.com/123/abc.png?jwt=token" + + with patch( + "github_backup.github_backup.get_jwt_signed_url_via_markdown_api", + return_value=jwt_url + ), patch( + "github_backup.github_backup.download_attachment_file", + return_value={"success": True, "http_status": 200, "url": jwt_url} + ): + github_backup.download_attachments( + setup["args"], setup["issue_cwd"], issue_data, 123, setup["repository"] + ) + + manifest_path = os.path.join(setup["issue_cwd"], "attachments", "123", "manifest.json") 
+ with open(manifest_path) as f: + manifest = json.load(f) + + assert manifest["attachments"][0]["jwt_workaround"] is True + assert manifest["attachments"][0]["url"] == "https://github.com/user-attachments/assets/abc123" + + def test_jwt_workaround_failure_uses_skipped_at(self, attachment_test_setup): + """Failed JWT workaround uses skipped_at instead of downloaded_at.""" + from unittest.mock import patch + + setup = attachment_test_setup + setup["args"].token_fine = "github_pat_test" + setup["repository"]["private"] = True + + issue_data = {"body": "https://github.com/user-attachments/assets/abc123"} + + with patch( + "github_backup.github_backup.get_jwt_signed_url_via_markdown_api", + return_value=None # Markdown API failed + ): + github_backup.download_attachments( + setup["args"], setup["issue_cwd"], issue_data, 123, setup["repository"] + ) + + manifest_path = os.path.join(setup["issue_cwd"], "attachments", "123", "manifest.json") + with open(manifest_path) as f: + manifest = json.load(f) + + attachment = manifest["attachments"][0] + assert attachment["success"] is False + assert "skipped_at" in attachment + assert "downloaded_at" not in attachment + assert "Use --token-classic" in attachment["error"]