"""Behavioral tests for attachment functionality."""

import json
import os
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from github_backup import github_backup


def _write_manifest(issue_cwd, entries, issue_number=123):
    """Create ``attachments/<issue_number>/manifest.json`` under *issue_cwd*.

    Args:
        issue_cwd: Directory that holds the per-issue ``attachments`` tree.
        entries: List of manifest entry dicts stored under the
            ``"attachments"`` key.
        issue_number: Issue number used as the sub-directory name.

    Returns:
        The path of the manifest file that was written.
    """
    attachments_dir = os.path.join(issue_cwd, "attachments", str(issue_number))
    os.makedirs(attachments_dir)
    manifest_path = os.path.join(attachments_dir, "manifest.json")
    # Explicit encoding keeps the fixture file identical across platforms.
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump({"attachments": entries}, f)
    return manifest_path


@pytest.fixture
def attachment_test_setup(tmp_path):
    """Fixture providing setup and helper for attachment download tests.

    Returns a dict with:
        ``issue_cwd``: path (str) of a fresh ``issues`` directory,
        ``args``: a ``Mock`` standing in for the parsed CLI arguments,
        ``repository``: a minimal repository dict,
        ``call_download``: helper that runs ``download_attachments`` with
            the HTTP download function patched out.
    """
    issue_cwd = tmp_path / "issues"
    issue_cwd.mkdir()

    # Mock args. Every attribute is set explicitly: a bare Mock would hand
    # back a truthy child Mock for any attribute that is read.
    args = Mock()
    args.as_app = False
    args.token_fine = None
    args.token_classic = None
    args.username = None
    args.password = None
    args.osx_keychain_item_name = None
    args.osx_keychain_item_account = None
    args.user = "testuser"
    args.repository = "testrepo"

    repository = {"full_name": "testuser/testrepo"}

    def call_download(issue_data, issue_number=123):
        """Call download_attachments with mocked HTTP downloads.

        Returns list of URLs that were actually downloaded.
        """
        downloaded_urls = []

        def mock_download(url, path, auth, as_app, fine):
            # Record the request and report success without touching the
            # network or the filesystem.
            downloaded_urls.append(url)
            return {
                "success": True,
                "saved_as": os.path.basename(path),
                "url": url,
            }

        with patch(
            "github_backup.github_backup.download_attachment_file",
            side_effect=mock_download,
        ):
            github_backup.download_attachments(
                args, str(issue_cwd), issue_data, issue_number, repository
            )

        return downloaded_urls

    return {
        "issue_cwd": str(issue_cwd),
        "args": args,
        "repository": repository,
        "call_download": call_download,
    }


class TestURLExtraction:
    """Test URL extraction with realistic issue content."""

    def test_mixed_urls(self):
        """Attachment URLs are extracted; code samples and other links are not."""
        issue_data = {
            "body": """
            ## Bug Report

            When uploading files, I see this error. Here's a screenshot:
            https://github.com/user-attachments/assets/abc123def456

            The logs show: https://github.com/user-attachments/files/789/error-log.txt

            This is similar to https://github.com/someorg/somerepo/issues/42 but different.

            You can also see the video at https://user-images.githubusercontent.com/12345/video-demo.mov

            Here's how to reproduce:
            ```bash
            # Don't extract this example URL:
            curl https://github.com/user-attachments/assets/example999
            ```

            More info at https://docs.example.com/guide

            Also see this inline code `https://github.com/user-attachments/files/111/inline.pdf` should not extract.

            Final attachment: https://github.com/user-attachments/files/222/report.pdf.
            """,
            "comment_data": [
                {
                    "body": "Here's another attachment: https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123"
                },
                {
                    "body": """
                    Example code:
                    ```python
                    url = "https://github.com/user-attachments/assets/code-example"
                    ```

                    But this is real: https://github.com/user-attachments/files/333/actual.zip
                    """
                },
            ],
        }

        # Extract URLs
        urls = github_backup.extract_attachment_urls(issue_data)

        expected_urls = [
            "https://github.com/user-attachments/assets/abc123def456",
            "https://github.com/user-attachments/files/789/error-log.txt",
            "https://user-images.githubusercontent.com/12345/video-demo.mov",
            "https://github.com/user-attachments/files/222/report.pdf",
            "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123",
            "https://github.com/user-attachments/files/333/actual.zip",
        ]

        # Order is not part of the contract being tested here.
        assert set(urls) == set(expected_urls)

    def test_trailing_punctuation_stripped(self):
        """URLs with trailing punctuation should have punctuation stripped."""
        issue_data = {
            "body": """
            See this file: https://github.com/user-attachments/files/1/doc.pdf.
            And this one (https://github.com/user-attachments/files/2/image.png).
            Check it out! https://github.com/user-attachments/files/3/data.csv!
            """
        }

        urls = github_backup.extract_attachment_urls(issue_data)

        expected = [
            "https://github.com/user-attachments/files/1/doc.pdf",
            "https://github.com/user-attachments/files/2/image.png",
            "https://github.com/user-attachments/files/3/data.csv",
        ]
        assert set(urls) == set(expected)

    def test_deduplication_across_body_and_comments(self):
        """Same URL in body and comments should only appear once."""
        duplicate_url = "https://github.com/user-attachments/assets/abc123"

        issue_data = {
            "body": f"First mention: {duplicate_url}",
            "comment_data": [
                {"body": f"Second mention: {duplicate_url}"},
                {"body": f"Third mention: {duplicate_url}"},
            ],
        }

        urls = github_backup.extract_attachment_urls(issue_data)

        # Compare as a list, not a set: converting to a set would hide
        # duplicates and make this test pass even without deduplication.
        assert list(urls) == [duplicate_url]


class TestFilenameExtraction:
    """Test filename extraction from different URL types."""

    def test_modern_assets_url(self):
        """Modern assets URL returns UUID."""
        url = "https://github.com/user-attachments/assets/abc123def456"
        filename = github_backup.get_attachment_filename(url)
        assert filename == "abc123def456"

    def test_modern_files_url(self):
        """Modern files URL returns filename."""
        url = "https://github.com/user-attachments/files/12345/report.pdf"
        filename = github_backup.get_attachment_filename(url)
        assert filename == "report.pdf"

    def test_legacy_cdn_url(self):
        """Legacy CDN URL returns filename with extension."""
        url = "https://user-images.githubusercontent.com/123456/abc-def.png"
        filename = github_backup.get_attachment_filename(url)
        assert filename == "abc-def.png"

    def test_private_cdn_url(self):
        """Private CDN URL returns filename."""
        url = "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123"
        filename = github_backup.get_attachment_filename(url)
        assert filename == "secret.png"

    def test_repo_files_url(self):
        """Repo-scoped files URL returns filename."""
        url = "https://github.com/owner/repo/files/789/document.txt"
        filename = github_backup.get_attachment_filename(url)
        assert filename == "document.txt"


class TestFilenameCollision:
    """Test filename collision resolution."""

    def test_collision_behavior(self):
        """Test filename collision resolution with real files."""
        with tempfile.TemporaryDirectory() as tmpdir:
            target = os.path.join(tmpdir, "report.pdf")
            first_alternative = os.path.join(tmpdir, "report_1.pdf")
            second_alternative = os.path.join(tmpdir, "report_2.pdf")

            # No collision - file doesn't exist
            assert github_backup.resolve_filename_collision(target) == target

            # Create the file, now collision exists
            Path(target).touch()
            assert (
                github_backup.resolve_filename_collision(target)
                == first_alternative
            )

            # Create report_1.pdf too
            Path(first_alternative).touch()
            assert (
                github_backup.resolve_filename_collision(target)
                == second_alternative
            )

    def test_manifest_reserved(self):
        """manifest.json is always treated as reserved."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # Even if manifest.json doesn't exist, should get manifest_1.json
            result = github_backup.resolve_filename_collision(
                os.path.join(tmpdir, "manifest.json")
            )
            assert result == os.path.join(tmpdir, "manifest_1.json")


class TestManifestDuplicatePrevention:
    """Test that manifest prevents duplicate downloads (the bug fix)."""

    def test_manifest_filters_existing_urls(self, attachment_test_setup):
        """URLs in manifest are not re-downloaded."""
        setup = attachment_test_setup

        # Create manifest with existing URLs
        _write_manifest(
            setup["issue_cwd"],
            [
                {
                    "url": "https://github.com/user-attachments/assets/old1",
                    "success": True,
                    "saved_as": "old1.pdf",
                },
                {
                    "url": "https://github.com/user-attachments/assets/old2",
                    "success": True,
                    "saved_as": "old2.pdf",
                },
            ],
        )

        # Issue data with 2 old URLs and 1 new URL
        issue_data = {
            "body": """
            Old: https://github.com/user-attachments/assets/old1
            Old: https://github.com/user-attachments/assets/old2
            New: https://github.com/user-attachments/assets/new1
            """
        }

        downloaded_urls = setup["call_download"](issue_data)

        # Should only download the NEW URL (old ones filtered by manifest)
        assert downloaded_urls == [
            "https://github.com/user-attachments/assets/new1"
        ]

    def test_no_manifest_downloads_all(self, attachment_test_setup):
        """Without manifest, all URLs should be downloaded."""
        setup = attachment_test_setup

        # Issue data with 2 URLs
        issue_data = {
            "body": """
            https://github.com/user-attachments/assets/url1
            https://github.com/user-attachments/assets/url2
            """
        }

        downloaded_urls = setup["call_download"](issue_data)

        # Should download ALL URLs (no manifest to filter), each exactly once;
        # sorting makes the check independent of download order.
        assert sorted(downloaded_urls) == [
            "https://github.com/user-attachments/assets/url1",
            "https://github.com/user-attachments/assets/url2",
        ]

    def test_manifest_skips_permanent_failures(self, attachment_test_setup):
        """Manifest skips permanent failures (404, 410) but retries transient (503)."""
        setup = attachment_test_setup

        # Create manifest with different failure types
        _write_manifest(
            setup["issue_cwd"],
            [
                {
                    "url": "https://github.com/user-attachments/assets/success",
                    "success": True,
                    "saved_as": "success.pdf",
                },
                {
                    "url": "https://github.com/user-attachments/assets/notfound",
                    "success": False,
                    "http_status": 404,
                },
                {
                    "url": "https://github.com/user-attachments/assets/gone",
                    "success": False,
                    "http_status": 410,
                },
                {
                    "url": "https://github.com/user-attachments/assets/unavailable",
                    "success": False,
                    "http_status": 503,
                },
            ],
        )

        # Issue data has all 4 URLs
        issue_data = {
            "body": """
            https://github.com/user-attachments/assets/success
            https://github.com/user-attachments/assets/notfound
            https://github.com/user-attachments/assets/gone
            https://github.com/user-attachments/assets/unavailable
            """
        }

        downloaded_urls = setup["call_download"](issue_data)

        # Should only retry 503 (transient failure)
        # Success, 404, and 410 should be skipped
        assert downloaded_urls == [
            "https://github.com/user-attachments/assets/unavailable"
        ]