feat: Add attachment download support for issues and pull requests

Adds a new --attachments flag that downloads user-uploaded files from
issue and PR bodies and comments (usage sketch below). Key features:

- Extracts GitHub-hosted attachment URLs (markdown, HTML, and bare links), skipping code blocks
- Tracks downloads in manifest.json with metadata
- Supports --skip-existing to avoid re-downloading
- Handles filename collisions with counter suffix
- Retries transient failures (5xx, timeouts) but not permanent ones (404/410/451)
- Uses Content-Disposition for correct file extensions
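
A minimal usage sketch; the owner, repository, token, and output path below
are placeholders, and all flags other than --attachments are this tool's
pre-existing options:

    github-backup someowner --repository somerepo --token "$ACCESS_TOKEN" \
        --issues --pulls --attachments --skip-existing -o ./backup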

Author: Rodos
Date: 2025-11-03 13:36:15 +11:00
parent 8f859be355
commit a194fa48ce

2 changed files with 637 additions and 3 deletions

@@ -420,6 +420,12 @@ def parse_args(args=None):
dest="include_assets",
help="include assets alongside release information; only applies if including releases",
)
parser.add_argument(
"--attachments",
action="store_true",
dest="include_attachments",
help="download user-attachments from issues and pull requests",
)
parser.add_argument(
"--throttle-limit",
dest="throttle_limit",
@@ -814,7 +820,9 @@ class S3HTTPRedirectHandler(HTTPRedirectHandler):
request = super(S3HTTPRedirectHandler, self).redirect_request(
req, fp, code, msg, headers, newurl
)
        # Only delete the Authorization header if it exists (attachments may not have it)
        if "Authorization" in request.headers:
            del request.headers["Authorization"]
return request
@@ -867,6 +875,598 @@ def download_file(url, path, auth, as_app=False, fine=False):
)
def download_attachment_file(url, path, auth, as_app=False, fine=False):
"""Download attachment file directly (not via GitHub API).
Similar to download_file() but for direct file URLs, not API endpoints.
Attachment URLs (user-images, user-attachments) are direct downloads,
not API endpoints, so we skip _construct_request() which adds API params.
URL Format Support & Authentication Requirements:
| URL Format | Auth Required | Notes |
|----------------------------------------------|---------------|--------------------------|
| github.com/user-attachments/assets/* | Private only | Modern format (2024+) |
| github.com/user-attachments/files/* | Private only | Modern format (2024+) |
| user-images.githubusercontent.com/* | No (public) | Legacy CDN, all eras |
| private-user-images.githubusercontent.com/* | JWT in URL | Legacy private (5min) |
| github.com/{owner}/{repo}/files/* | Repo filter | Old repo files |
- Modern user-attachments: Requires GitHub token auth for private repos
- Legacy public CDN: No auth needed/accepted (returns 400 with auth header)
- Legacy private CDN: Uses JWT token embedded in URL, no GitHub token needed
- Repo files: Filtered to current repository only during extraction
Returns dict with metadata:
- success: bool
- http_status: int (200, 404, etc.)
- content_type: str or None
- original_filename: str or None (from Content-Disposition)
- size_bytes: int or None
- error: str or None
"""
import re
from datetime import datetime, timezone
metadata = {
"url": url,
"success": False,
"http_status": None,
"content_type": None,
"original_filename": None,
"size_bytes": None,
"downloaded_at": datetime.now(timezone.utc).isoformat(),
"error": None,
}
if os.path.exists(path):
metadata["success"] = True
metadata["http_status"] = 200 # Assume success if already exists
metadata["size_bytes"] = os.path.getsize(path)
return metadata
# Create simple request (no API query params)
request = Request(url)
request.add_header("Accept", "application/octet-stream")
# Add authentication header only for modern github.com/user-attachments URLs
# Legacy CDN URLs (user-images.githubusercontent.com) are public and don't need/accept auth
# Private CDN URLs (private-user-images) use JWT tokens embedded in the URL
if auth is not None and "github.com/user-attachments/" in url:
if not as_app:
if fine:
# Fine-grained token: plain token with "token " prefix
request.add_header("Authorization", "token " + auth)
else:
# Classic token: base64-encoded with "Basic " prefix
request.add_header("Authorization", "Basic ".encode("ascii") + auth)
else:
# App authentication
auth = auth.encode("ascii")
request.add_header("Authorization", "token ".encode("ascii") + auth)
# Reuse S3HTTPRedirectHandler from download_file()
opener = build_opener(S3HTTPRedirectHandler)
try:
response = opener.open(request)
metadata["http_status"] = response.getcode()
# Extract Content-Type
content_type = response.headers.get("Content-Type", "").split(";")[0].strip()
if content_type:
metadata["content_type"] = content_type
# Extract original filename from Content-Disposition header
# Format: attachment; filename=example.mov or attachment;filename="example.mov"
content_disposition = response.headers.get("Content-Disposition", "")
if content_disposition:
# Match: filename=something or filename="something" or filename*=UTF-8''something
match = re.search(r'filename\*?=["\']?([^"\';\r\n]+)', content_disposition)
if match:
original_filename = match.group(1).strip()
# Handle RFC 5987 encoding: filename*=UTF-8''example.mov
if "UTF-8''" in original_filename:
original_filename = original_filename.split("UTF-8''")[1]
metadata["original_filename"] = original_filename
# Fallback: Extract filename from final URL after redirects
# This handles user-attachments/assets URLs which redirect to S3 with filename.ext
if not metadata["original_filename"]:
from urllib.parse import urlparse, unquote
final_url = response.geturl()
parsed = urlparse(final_url)
# Get filename from path (last component before query string)
path_parts = parsed.path.split("/")
if path_parts:
# URL might be encoded, decode it
filename_from_url = unquote(path_parts[-1])
# Only use if it has an extension
if "." in filename_from_url:
metadata["original_filename"] = filename_from_url
# Download file
chunk_size = 16 * 1024
bytes_downloaded = 0
with open(path, "wb") as f:
while True:
chunk = response.read(chunk_size)
if not chunk:
break
f.write(chunk)
bytes_downloaded += len(chunk)
metadata["size_bytes"] = bytes_downloaded
metadata["success"] = True
except HTTPError as exc:
metadata["http_status"] = exc.code
metadata["error"] = str(exc.reason)
logger.warning(
"Skipping download of attachment {0} due to HTTPError: {1}".format(
url, exc.reason
)
)
except URLError as e:
metadata["error"] = str(e.reason)
logger.warning(
"Skipping download of attachment {0} due to URLError: {1}".format(
url, e.reason
)
)
except socket.error as e:
metadata["error"] = str(e.strerror) if hasattr(e, "strerror") else str(e)
logger.warning(
"Skipping download of attachment {0} due to socket error: {1}".format(
url, e.strerror if hasattr(e, "strerror") else str(e)
)
)
except Exception as e:
metadata["error"] = str(e)
logger.warning(
"Skipping download of attachment {0} due to error: {1}".format(url, str(e))
)
return metadata
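# A sketch of the metadata dict this returns; all values below are
# illustrative only (hypothetical URL and path):
#
#   download_attachment_file(url, "backup/issues/attachments/7/screenshot", auth)
#   -> {"url": url, "success": True, "http_status": 200,
#       "content_type": "image/png", "original_filename": "screenshot.png",
#       "size_bytes": 48213,
#       "downloaded_at": "2025-11-03T02:36:15+00:00", "error": None}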
def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None):
"""Extract GitHub-hosted attachment URLs from issue/PR body and comments.
What qualifies as an attachment?
There is no "attachment" concept in the GitHub API - it's a user behavior pattern
we've identified through analysis of real-world repositories. We define attachments as:
- User-uploaded files hosted on GitHub's CDN domains
- Found outside of code blocks (not examples/documentation)
- Matches known GitHub attachment URL patterns
This intentionally captures bare URLs pasted by users, not just markdown/HTML syntax.
Some false positives (example URLs in documentation) may occur - these fail gracefully
with HTTP 404 and are logged in the manifest.
Supported URL formats:
- Modern: github.com/user-attachments/{assets,files}/*
- Legacy: user-images.githubusercontent.com/* (including private-user-images)
- Repo files: github.com/{owner}/{repo}/files/* (filtered to current repo)
- Repo assets: github.com/{owner}/{repo}/assets/* (filtered to current repo)
Repository filtering (repo files/assets only):
- Direct match: URL is for current repository → included
- Redirect match: URL redirects to current repository → included (handles renames/transfers)
- Different repo: URL is for different repository → excluded
Code block filtering:
- Removes fenced code blocks (```) and inline code (`) before extraction
- Prevents extracting URLs from code examples and documentation snippets
Args:
item_data: Issue or PR data dict
issue_number: Issue/PR number for logging
repository_full_name: Full repository name (owner/repo) for filtering repo-scoped URLs
"""
import re
urls = []
# Define all GitHub attachment patterns
# Stop at markdown punctuation: whitespace, ), `, ", >, <
# Trailing sentence punctuation (. ! ? , ; : ' ") is stripped in post-processing
patterns = [
r'https://github\.com/user-attachments/(?:assets|files)/[^\s\)`"<>]+', # Modern
r'https://(?:private-)?user-images\.githubusercontent\.com/[^\s\)`"<>]+', # Legacy CDN
]
# Add repo-scoped patterns (will be filtered by repository later)
# These patterns match ANY repo, then we filter to current repo with redirect checking
repo_files_pattern = r'https://github\.com/[^/]+/[^/]+/files/\d+/[^\s\)`"<>]+'
repo_assets_pattern = r'https://github\.com/[^/]+/[^/]+/assets/\d+/[^\s\)`"<>]+'
patterns.append(repo_files_pattern)
patterns.append(repo_assets_pattern)
def clean_url(url):
"""Remove trailing sentence and markdown punctuation that's not part of the URL."""
return url.rstrip(".!?,;:'\")")
def remove_code_blocks(text):
"""Remove markdown code blocks (fenced and inline) from text.
This prevents extracting URLs from code examples like:
- Fenced code blocks: ```code```
- Inline code: `code`
"""
# Remove fenced code blocks first (```...```)
# DOTALL flag makes . match newlines
text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)
# Remove inline code (`...`)
# Non-greedy match between backticks
text = re.sub(r"`[^`]*`", "", text)
return text
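    # Illustrative behavior (hypothetical URLs): given
    # "see `https://github.com/user-attachments/files/1/a.txt` and
    # https://github.com/user-attachments/files/2/b.txt", only the second,
    # bare URL survives for extraction; the inline-code one is stripped.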
def is_repo_scoped_url(url):
"""Check if URL is a repo-scoped attachment (files or assets)."""
return bool(
re.match(r"https://github\.com/[^/]+/[^/]+/(?:files|assets)/\d+/", url)
)
def check_redirect_to_current_repo(url, current_repo):
"""Check if URL redirects to current repository.
Returns True if:
- URL is already for current repo
- URL redirects (301/302) to current repo (handles renames/transfers)
Returns False otherwise (URL is for a different repo).
"""
# Extract owner/repo from URL
match = re.match(r"https://github\.com/([^/]+)/([^/]+)/", url)
if not match:
return False
url_owner, url_repo = match.groups()
url_repo_full = f"{url_owner}/{url_repo}"
# Direct match - no need to check redirect
if url_repo_full.lower() == current_repo.lower():
return True
# Different repo - check if it redirects to current repo
# This handles repository transfers and renames
try:
import urllib.request
import urllib.error
# Make HEAD request with redirect following disabled
# We need to manually handle redirects to see the Location header
request = urllib.request.Request(url, method="HEAD")
request.add_header("User-Agent", "python-github-backup")
# Create opener that does NOT follow redirects
class NoRedirectHandler(urllib.request.HTTPRedirectHandler):
def redirect_request(self, req, fp, code, msg, headers, newurl):
return None # Don't follow redirects
opener = urllib.request.build_opener(NoRedirectHandler)
try:
_ = opener.open(request, timeout=10)
# Got 200 - URL works as-is but for different repo
return False
except urllib.error.HTTPError as e:
# Check if it's a redirect (301, 302, 307, 308)
if e.code in (301, 302, 307, 308):
location = e.headers.get("Location", "")
# Check if redirect points to current repo
if location:
redirect_match = re.match(
r"https://github\.com/([^/]+)/([^/]+)/", location
)
if redirect_match:
redirect_owner, redirect_repo = redirect_match.groups()
redirect_repo_full = f"{redirect_owner}/{redirect_repo}"
return redirect_repo_full.lower() == current_repo.lower()
return False
except Exception:
# On any error (timeout, network issue, etc.), be conservative
# and exclude the URL to avoid downloading from wrong repos
return False
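    # Example scenario (hypothetical names): "oldowner/tool" was transferred
    # to "newowner/tool". A comment still links
    # https://github.com/oldowner/tool/files/123/log.txt; the HEAD request
    # gets a 301 with a Location header pointing at newowner/tool, so the URL
    # is kept when current_repo == "newowner/tool".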
# Extract from body
body = item_data.get("body") or ""
# Remove code blocks before searching for URLs
body_cleaned = remove_code_blocks(body)
for pattern in patterns:
found_urls = re.findall(pattern, body_cleaned)
urls.extend([clean_url(url) for url in found_urls])
# Extract from issue comments
if "comment_data" in item_data:
for comment in item_data["comment_data"]:
comment_body = comment.get("body") or ""
# Remove code blocks before searching for URLs
comment_cleaned = remove_code_blocks(comment_body)
for pattern in patterns:
found_urls = re.findall(pattern, comment_cleaned)
urls.extend([clean_url(url) for url in found_urls])
# Extract from PR regular comments
if "comment_regular_data" in item_data:
for comment in item_data["comment_regular_data"]:
comment_body = comment.get("body") or ""
# Remove code blocks before searching for URLs
comment_cleaned = remove_code_blocks(comment_body)
for pattern in patterns:
found_urls = re.findall(pattern, comment_cleaned)
urls.extend([clean_url(url) for url in found_urls])
regex_urls = list(set(urls)) # dedupe
# Filter repo-scoped URLs to current repository only
# This handles repository transfers/renames via redirect checking
if repository_full_name:
filtered_urls = []
for url in regex_urls:
if is_repo_scoped_url(url):
# Check if URL belongs to current repo (or redirects to it)
if check_redirect_to_current_repo(url, repository_full_name):
filtered_urls.append(url)
# else: skip URLs from other repositories
else:
# Non-repo-scoped URLs (user-attachments, CDN) - always include
filtered_urls.append(url)
regex_urls = filtered_urls
return regex_urls
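# Illustrative call (hypothetical data): a body containing
#   "Log: https://github.com/user-attachments/files/100/crash.txt"
# plus the same style of URL inside a fenced code block yields only
#   ["https://github.com/user-attachments/files/100/crash.txt"]
# when called as extract_attachment_urls(item_data, 7, "owner/repo").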
def extract_and_apply_extension(filepath, original_filename):
"""Extract extension from original filename and rename file if needed.
Args:
filepath: Current file path (may have no extension)
original_filename: Original filename from Content-Disposition (has extension)
Returns:
Final filepath with extension applied
"""
if not original_filename or not os.path.exists(filepath):
return filepath
# Get extension from original filename
original_ext = os.path.splitext(original_filename)[1]
if not original_ext:
return filepath
# Check if current file already has this extension
current_ext = os.path.splitext(filepath)[1]
if current_ext == original_ext:
return filepath
    # Rename file to add extension, resolving any collision first
    # (otherwise os.rename could silently overwrite an existing file on POSIX)
    new_filepath = resolve_filename_collision(filepath + original_ext)
try:
os.rename(filepath, new_filepath)
logger.debug("Renamed {0} to {1}".format(filepath, new_filepath))
return new_filepath
except Exception as e:
logger.warning("Could not rename {0}: {1}".format(filepath, str(e)))
return filepath
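# For example (illustrative): an assets download saved as
# "attachments/7/9a8b7c6d" whose Content-Disposition names "demo.mov" is
# renamed to "attachments/7/9a8b7c6d.mov"; a file already ending in ".mov"
# is left untouched.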
def get_attachment_filename(url):
"""Get filename from attachment URL, handling all GitHub formats.
Formats:
- github.com/user-attachments/assets/{uuid} → uuid (add extension later)
- github.com/user-attachments/files/{id}/{filename} → filename
- github.com/{owner}/{repo}/files/{id}/{filename} → filename
- user-images.githubusercontent.com/{user}/{hash}.{ext} → hash.ext
- private-user-images.githubusercontent.com/...?jwt=... → extract from path
"""
from urllib.parse import urlparse
parsed = urlparse(url)
path_parts = parsed.path.split("/")
# Modern: /user-attachments/files/{id}/{filename}
if "user-attachments/files" in parsed.path:
return path_parts[-1]
# Modern: /user-attachments/assets/{uuid}
elif "user-attachments/assets" in parsed.path:
return path_parts[-1] # extension added later via detect_and_add_extension
# Repo files: /{owner}/{repo}/files/{id}/{filename}
elif "/files/" in parsed.path and len(path_parts) >= 2:
return path_parts[-1]
# Legacy: user-images.githubusercontent.com/{user}/{hash-with-ext}
elif "githubusercontent.com" in parsed.netloc:
return path_parts[-1] # Already has extension usually
# Fallback: use last path component
return path_parts[-1] if path_parts[-1] else "unknown_attachment"
def resolve_filename_collision(filepath):
"""Resolve filename collisions using counter suffix pattern.
If filepath exists, returns a new filepath with counter suffix.
Pattern: report.pdf → report_1.pdf → report_2.pdf
Also protects against manifest.json collisions by treating it as reserved.
Args:
filepath: Full path to file that might exist
Returns:
filepath that doesn't collide (may be same as input if no collision)
"""
directory = os.path.dirname(filepath)
filename = os.path.basename(filepath)
# Protect manifest.json - it's a reserved filename
if filename == "manifest.json":
name, ext = os.path.splitext(filename)
counter = 1
while True:
new_filename = f"{name}_{counter}{ext}"
new_filepath = os.path.join(directory, new_filename)
if not os.path.exists(new_filepath):
return new_filepath
counter += 1
if not os.path.exists(filepath):
return filepath
name, ext = os.path.splitext(filename)
counter = 1
while True:
new_filename = f"{name}_{counter}{ext}"
new_filepath = os.path.join(directory, new_filename)
if not os.path.exists(new_filepath):
return new_filepath
counter += 1
def download_attachments(args, item_cwd, item_data, number, repository, item_type="issue"):
"""Download user-attachments from issue/PR body and comments with manifest.
Args:
args: Command line arguments
item_cwd: Working directory (issue_cwd or pulls_cwd)
item_data: Issue or PR data dict
number: Issue or PR number
repository: Repository dict
item_type: "issue" or "pull" for logging/manifest
"""
import json
from datetime import datetime, timezone
item_type_display = "issue" if item_type == "issue" else "pull request"
urls = extract_attachment_urls(
item_data, issue_number=number, repository_full_name=repository["full_name"]
)
if not urls:
return
attachments_dir = os.path.join(item_cwd, "attachments", str(number))
manifest_path = os.path.join(attachments_dir, "manifest.json")
# Load existing manifest if skip_existing is enabled
existing_urls = set()
existing_metadata = []
if args.skip_existing and os.path.exists(manifest_path):
try:
with open(manifest_path, "r") as f:
existing_manifest = json.load(f)
all_metadata = existing_manifest.get("attachments", [])
# Only skip URLs that were successfully downloaded OR failed with permanent errors
# Retry transient failures (5xx, timeouts, network errors)
for item in all_metadata:
if item.get("success"):
existing_urls.add(item["url"])
else:
# Check if this is a permanent failure (don't retry) or transient (retry)
http_status = item.get("http_status")
if http_status in [404, 410, 451]:
# Permanent failures - don't retry
existing_urls.add(item["url"])
# Transient failures (5xx, auth errors, timeouts) will be retried
existing_metadata = all_metadata
except (json.JSONDecodeError, IOError):
# If manifest is corrupted, re-download everything
logger.warning(
"Corrupted manifest for {0} #{1}, will re-download".format(
item_type_display, number
)
)
existing_urls = set()
existing_metadata = []
# Filter to only new URLs
new_urls = [url for url in urls if url not in existing_urls]
if not new_urls and existing_urls:
logger.debug(
"Skipping attachments for {0} #{1} (all {2} already downloaded)".format(
item_type_display, number, len(urls)
)
)
return
if new_urls:
logger.info(
"Downloading {0} new attachment(s) for {1} #{2}".format(
len(new_urls), item_type_display, number
)
)
mkdir_p(item_cwd, attachments_dir)
# Collect metadata for manifest (start with existing)
attachment_metadata_list = existing_metadata[:]
for url in new_urls:
filename = get_attachment_filename(url)
filepath = os.path.join(attachments_dir, filename)
# Check for collision BEFORE downloading
filepath = resolve_filename_collision(filepath)
# Download and get metadata
metadata = download_attachment_file(
url,
filepath,
get_auth(args, encode=not args.as_app),
as_app=args.as_app,
fine=args.token_fine is not None,
)
# Apply extension from Content-Disposition if available
if metadata["success"] and metadata.get("original_filename"):
final_filepath = extract_and_apply_extension(
filepath, metadata["original_filename"]
)
            # extract_and_apply_extension resolves any collision before renaming,
            # so final_filepath is the attachment's actual on-disk name
            metadata["saved_as"] = os.path.basename(final_filepath)
else:
metadata["saved_as"] = (
os.path.basename(filepath) if metadata["success"] else None
)
attachment_metadata_list.append(metadata)
# Write manifest
if attachment_metadata_list:
manifest = {
"issue_number": number,
"issue_type": item_type,
"repository": f"{args.user}/{args.repository}"
if hasattr(args, "repository") and args.repository
else args.user,
"manifest_updated_at": datetime.now(timezone.utc).isoformat(),
"attachments": attachment_metadata_list,
}
with open(manifest_path, "w") as f:
json.dump(manifest, f, indent=2)
logger.debug(
"Wrote manifest for {0} #{1}: {2} attachments".format(
item_type_display, number, len(attachment_metadata_list)
)
)
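# Shape of the manifest.json written above (all values illustrative):
#
#   {
#     "issue_number": 7,
#     "issue_type": "issue",
#     "repository": "owner/repo",
#     "manifest_updated_at": "2025-11-03T02:40:00+00:00",
#     "attachments": [
#       {"url": "https://github.com/user-attachments/files/100/crash.txt",
#        "success": true, "http_status": 200, "content_type": "text/plain",
#        "original_filename": "crash.txt", "size_bytes": 1024,
#        "downloaded_at": "2025-11-03T02:39:58+00:00", "error": null,
#        "saved_as": "crash.txt"}
#     ]
#   }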
def get_authenticated_user(args):
template = "https://{0}/user".format(get_github_api_host(args))
data = retrieve_data(args, template, single_request=True)
@@ -1157,6 +1757,10 @@ def backup_issues(args, repo_cwd, repository, repos_template):
if args.include_issue_events or args.include_everything:
template = events_template.format(number)
issues[number]["event_data"] = retrieve_data(args, template)
if args.include_attachments:
download_attachments(
args, issue_cwd, issues[number], number, repository, item_type="issue"
)
with codecs.open(issue_file + ".temp", "w", encoding="utf-8") as f:
json_dump(issue, f)
@@ -1228,6 +1832,10 @@ def backup_pulls(args, repo_cwd, repository, repos_template):
if args.include_pull_commits or args.include_everything:
template = commits_template.format(number)
pulls[number]["commit_data"] = retrieve_data(args, template)
if args.include_attachments:
download_attachments(
args, pulls_cwd, pulls[number], number, repository, item_type="pull"
)
with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f:
json_dump(pull, f)