@@ -33,6 +33,13 @@ try:
 except ImportError:
     VERSION = "unknown"

+from .graphql_queries import (
+    DISCUSSION_DETAIL_QUERY,
+    DISCUSSION_LIST_QUERY,
+    DISCUSSION_PAGE_SIZE,
+    DISCUSSION_REPLIES_QUERY,
+)
+
 FNULL = open(os.devnull, "w")
 FILE_URI_PREFIX = "file://"
 logger = logging.getLogger(__name__)
@@ -322,6 +329,12 @@ def parse_args(args=None):
         dest="include_security_advisories",
         help="include security advisories in backup",
     )
+    parser.add_argument(
+        "--discussions",
+        action="store_true",
+        dest="include_discussions",
+        help="include discussions in backup",
+    )
     parser.add_argument(
         "--repositories",
         action="store_true",
@@ -469,7 +482,7 @@ def parse_args(args=None):
         "--attachments",
         action="store_true",
         dest="include_attachments",
-        help="download user-attachments from issues and pull requests",
+        help="download user-attachments from issues, pull requests, and discussions",
     )
     parser.add_argument(
         "--throttle-limit",
@@ -579,6 +592,31 @@ def get_github_api_host(args):
     return host


+def get_github_graphql_url(args):
+    if args.github_host:
+        return "https://{0}/api/graphql".format(args.github_host)
+
+    return "https://api.github.com/graphql"
+
+
+def get_graphql_auth(args):
+    auth = get_auth(args, encode=False)
+    if not auth:
+        return None
+
+    # GraphQL expects a bearer token. Classic tokens and keychain tokens use
+    # "token:x-oauth-basic" for REST Basic auth, so strip the synthetic
+    # password before sending the GraphQL Authorization header.
+    if (
+        not getattr(args, "as_app", False)
+        and getattr(args, "token_fine", None) is None
+        and ":" in auth
+    ):
+        auth = auth.split(":", 1)[0]
+
+    return auth
+
+
 def get_github_host(args):
     if args.github_host:
         host = args.github_host
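To make the stripping rule above concrete, here is a standalone sketch; the helper name and sample tokens are hypothetical, and the real function obtains the credential from args via get_auth:

def strip_rest_password(auth, as_app=False, token_fine=None):
    # Mirrors the rule in get_graphql_auth: classic/keychain credentials look
    # like "token:x-oauth-basic" for REST Basic auth, and only the token part
    # is usable as a GraphQL bearer token.
    if not as_app and token_fine is None and ":" in auth:
        return auth.split(":", 1)[0]
    return auth

assert strip_rest_password("ghp_classic123:x-oauth-basic") == "ghp_classic123"
# App JWTs and fine-grained tokens pass through untouched:
assert strip_rest_password("jwt:part", as_app=True) == "jwt:part"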
@@ -810,6 +848,87 @@ def retrieve_data(args, template, query_args=None, paginated=True):
     return list(fetch_all())


+def retrieve_graphql_data(args, query, variables=None, log_context=None):
+    """Fetch data from GitHub's GraphQL API."""
+    auth = get_graphql_auth(args)
+    if not auth:
+        raise Exception("GitHub GraphQL API requires authentication")
+
+    variables = variables or {}
+    payload = json.dumps(
+        {"query": query, "variables": variables}, ensure_ascii=False
+    ).encode("utf-8")
+    endpoint = get_github_graphql_url(args)
+
+    for attempt in range(args.max_retries + 1):
+        request = Request(endpoint, data=payload, method="POST")
+        request.add_header("Accept", "application/json")
+        request.add_header("Content-Type", "application/json")
+        request.add_header("Authorization", "bearer " + auth)
+        log_url = endpoint
+        if log_context:
+            log_url = "{0} ({1})".format(log_url, log_context)
+        logger.info("Requesting {0}".format(log_url))
+
+        http_response = make_request_with_retry(request, auth, args.max_retries)
+
+        status = http_response.getcode()
+        if status != 200:
+            raise Exception(
+                f"Unexpected HTTP {status} from {endpoint} "
+                f"(expected non-2xx to raise HTTPError)"
+            )
+
+        try:
+            response = json.loads(http_response.read().decode("utf-8"))
+        except (IncompleteRead, json.decoder.JSONDecodeError, TimeoutError) as e:
+            logger.warning(f"{type(e).__name__} reading GraphQL response")
+            if attempt < args.max_retries:
+                delay = calculate_retry_delay(attempt, {})
+                logger.warning(
+                    f"Retrying GraphQL read in {delay:.1f}s "
+                    f"(attempt {attempt + 1}/{args.max_retries + 1})"
+                )
+                time.sleep(delay)
+                continue
+            raise Exception(
+                f"Failed to read GraphQL response after {args.max_retries + 1} "
+                f"attempts for {endpoint}"
+            )
+
+        if (
+            remaining := int(http_response.headers.get("x-ratelimit-remaining", 0))
+        ) <= (args.throttle_limit or 0):
+            if args.throttle_limit:
+                logger.info(
+                    f"Throttling: {remaining} requests left, pausing {args.throttle_pause}s"
+                )
+                time.sleep(args.throttle_pause)
+
+        errors = response.get("errors") or []
+        if errors:
+            if any(error.get("type") == "RATE_LIMITED" for error in errors):
+                if attempt < args.max_retries:
+                    delay = calculate_retry_delay(attempt, http_response.headers)
+                    logger.warning(
+                        f"GraphQL rate limit hit, retrying in {delay:.1f}s "
+                        f"(attempt {attempt + 1}/{args.max_retries + 1})"
+                    )
+                    time.sleep(delay)
+                    continue
+
+            messages = "; ".join(
+                error.get("message", str(error)) for error in errors
+            )
+            raise Exception("GraphQL Error: {0}".format(messages))
+
+        return response.get("data", {})
+
+    raise Exception(
+        f"GraphQL request failed after {args.max_retries + 1} attempts"
+    )  # pragma: no cover
+
+
 def make_request_with_retry(request, auth, max_retries=5):
     """Make HTTP request with automatic retry for transient errors."""

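Note that GraphQL returns HTTP 200 even when the query fails, which is why the body has to be inspected for an "errors" array. The handling above treats rate-limit errors as retryable and raises on everything else, keyed off the error "type" field. A sketch of the two response shapes, with hypothetical message text:

rate_limited = {
    "data": None,
    "errors": [{"type": "RATE_LIMITED", "message": "API rate limit exceeded"}],
}
other_error = {
    "data": {"repository": None},
    "errors": [{"type": "NOT_FOUND", "message": "Could not resolve to a Repository"}],
}

for response in (rate_limited, other_error):
    errors = response.get("errors") or []
    print(any(error.get("type") == "RATE_LIMITED" for error in errors))
# True (retried with backoff), then False (raised as "GraphQL Error: ...")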
@@ -1193,7 +1312,7 @@ def get_jwt_signed_url_via_markdown_api(url, token, repo_context):


 def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None):
-    """Extract GitHub-hosted attachment URLs from issue/PR body and comments.
+    """Extract GitHub-hosted attachment URLs from issue/PR/discussion body and comments.

     What qualifies as an attachment?
     There is no "attachment" concept in the GitHub API - it's a user behavior pattern
@@ -1335,33 +1454,29 @@ def extract_attachment_urls(item_data, issue_number=None, repository_full_name=N
             # and exclude the URL to avoid downloading from wrong repos
            return False

-    # Extract from body
-    body = item_data.get("body") or ""
-    # Remove code blocks before searching for URLs
-    body_cleaned = remove_code_blocks(body)
-    for pattern in patterns:
-        found_urls = re.findall(pattern, body_cleaned)
-        urls.extend([clean_url(url) for url in found_urls])
+    def extract_from_text(text):
+        text_cleaned = remove_code_blocks(text or "")
+        for pattern in patterns:
+            found_urls = re.findall(pattern, text_cleaned)
+            urls.extend([clean_url(url) for url in found_urls])

-    # Extract from issue comments
+    def extract_from_comments(comments):
+        for comment in comments:
+            extract_from_text(comment.get("body") or "")
+            # GitHub Discussions support one level of replies. Issues and pull
+            # requests don't have reply_data, so this is a no-op for them.
+            extract_from_comments(comment.get("reply_data") or [])
+
+    # Extract from body
+    extract_from_text(item_data.get("body") or "")
+
+    # Extract from issue comments and discussion comments
     if "comment_data" in item_data:
-        for comment in item_data["comment_data"]:
-            comment_body = comment.get("body") or ""
-            # Remove code blocks before searching for URLs
-            comment_cleaned = remove_code_blocks(comment_body)
-            for pattern in patterns:
-                found_urls = re.findall(pattern, comment_cleaned)
-                urls.extend([clean_url(url) for url in found_urls])
+        extract_from_comments(item_data["comment_data"])

     # Extract from PR regular comments
     if "comment_regular_data" in item_data:
-        for comment in item_data["comment_regular_data"]:
-            comment_body = comment.get("body") or ""
-            # Remove code blocks before searching for URLs
-            comment_cleaned = remove_code_blocks(comment_body)
-            for pattern in patterns:
-                found_urls = re.findall(pattern, comment_cleaned)
-                urls.extend([clean_url(url) for url in found_urls])
+        extract_from_comments(item_data["comment_regular_data"])

     regex_urls = list(set(urls))  # dedupe
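A minimal sketch of the traversal order the two helpers produce, assuming the reply_data layout that retrieve_discussion builds later in this diff; sample bodies are hypothetical, and the real code runs the attachment regexes over each body instead of collecting it:

def walk_bodies(item_data):
    yield item_data.get("body") or ""
    for key in ("comment_data", "comment_regular_data"):
        for comment in item_data.get(key) or []:
            yield comment.get("body") or ""
            # GitHub Discussions nest replies one level deep; the real
            # helper recurses, which covers this case and deeper ones.
            for reply in comment.get("reply_data") or []:
                yield reply.get("body") or ""

item_data = {
    "body": "discussion body",
    "comment_data": [{"body": "comment", "reply_data": [{"body": "reply"}]}],
}
print(list(walk_bodies(item_data)))  # ['discussion body', 'comment', 'reply']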
@@ -1463,20 +1578,24 @@ def resolve_filename_collision(filepath):
 def download_attachments(
     args, item_cwd, item_data, number, repository, item_type="issue"
 ):
-    """Download user-attachments from issue/PR body and comments with manifest.
+    """Download user-attachments from issue/PR/discussion body and comments with manifest.

     Args:
         args: Command line arguments
-        item_cwd: Working directory (issue_cwd or pulls_cwd)
-        item_data: Issue or PR data dict
-        number: Issue or PR number
+        item_cwd: Working directory (issue_cwd, pulls_cwd, or discussion_cwd)
+        item_data: Issue, PR, or discussion data dict
+        number: Issue, PR, or discussion number
         repository: Repository dict
-        item_type: "issue" or "pull" for logging/manifest
+        item_type: "issue", "pull", or "discussion" for logging/manifest
     """
     import json
     from datetime import datetime, timezone

-    item_type_display = "issue" if item_type == "issue" else "pull request"
+    item_type_display = {
+        "issue": "issue",
+        "pull": "pull request",
+        "discussion": "discussion",
+    }.get(item_type, item_type)

     urls = extract_attachment_urls(
         item_data, issue_number=number, repository_full_name=repository["full_name"]
@@ -1621,6 +1740,8 @@ def download_attachments(
     # Write manifest
     if attachment_metadata_list:
         manifest = {
+            "item_number": number,
+            "item_type": item_type,
             "issue_number": number,
             "issue_type": item_type,
             "repository": (
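The manifest now records the item under type-neutral keys while, per the context lines above, the original issue_* keys remain alongside them, presumably so existing consumers of the manifest keep working. A hypothetical entry for a discussion:

manifest_header = {
    "item_number": 17,
    "item_type": "discussion",
    "issue_number": 17,           # legacy alias of item_number
    "issue_type": "discussion",   # legacy alias of item_type
}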
@@ -1888,6 +2009,9 @@ def backup_repositories(args, output_directory, repositories):
     if args.include_pulls or args.include_everything:
         backup_pulls(args, repo_cwd, repository, repos_template)

+    if args.include_discussions or args.include_everything:
+        backup_discussions(args, repo_cwd, repository)
+
     if args.include_milestones or args.include_everything:
         backup_milestones(args, repo_cwd, repository, repos_template)

@@ -1922,6 +2046,317 @@ def backup_repositories(args, output_directory, repositories):
         open(last_update_path, "w").write(last_update)


+def _repository_owner_name(repository):
+    return repository["full_name"].split("/", 1)
+
+
+def _connection_nodes(connection):
+    return [node for node in (connection or {}).get("nodes") or [] if node]
+
+
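Both helpers encode assumptions about the GraphQL response shape: full_name is always "owner/name", and a connection's nodes list may contain nulls (e.g. for entries the token cannot resolve), which _connection_nodes filters out. A sketch with hypothetical values:

owner, name = "josegonzalez/python-github-backup".split("/", 1)
assert (owner, name) == ("josegonzalez", "python-github-backup")

connection = {
    "totalCount": 2,
    "pageInfo": {"hasNextPage": False, "endCursor": "Y3Vyc29yOjI="},
    "nodes": [{"number": 1}, None, {"number": 2}],  # null entries are dropped
}
nodes = [node for node in (connection or {}).get("nodes") or [] if node]
assert nodes == [{"number": 1}, {"number": 2}]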
+def retrieve_discussion_summaries(args, repository, since=None):
+    owner, name = _repository_owner_name(repository)
+    after = None
+    page = 1
+    summaries = []
+    newest_seen = None
+    discussions_enabled = None
+    total_count = 0
+
+    while True:
+        data = retrieve_graphql_data(
+            args,
+            DISCUSSION_LIST_QUERY,
+            {
+                "owner": owner,
+                "name": name,
+                "after": after,
+                "pageSize": DISCUSSION_PAGE_SIZE,
+            },
+            log_context="discussion summaries {0} page {1}".format(
+                repository["full_name"], page
+            ),
+        )
+        repository_data = data.get("repository")
+        if repository_data is None:
+            raise Exception(
+                "Repository {0} not found in GraphQL response".format(
+                    repository["full_name"]
+                )
+            )
+
+        discussions_enabled = repository_data.get("hasDiscussionsEnabled")
+        if not discussions_enabled:
+            return [], None, False, 0
+
+        discussions = repository_data.get("discussions") or {}
+        total_count = discussions.get("totalCount", total_count)
+        stop = False
+
+        for discussion in _connection_nodes(discussions):
+            updated_at = discussion.get("updatedAt")
+            if updated_at and (newest_seen is None or updated_at > newest_seen):
+                newest_seen = updated_at
+
+            if since and updated_at and updated_at < since:
+                stop = True
+                break
+
+            summaries.append(discussion)
+
+        page_info = discussions.get("pageInfo") or {}
+        if stop or not page_info.get("hasNextPage"):
+            break
+
+        after = page_info.get("endCursor")
+        page += 1
+
+    return summaries, newest_seen, discussions_enabled, total_count
+
+
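The early stop on updated_at < since is only sound if DISCUSSION_LIST_QUERY orders discussions by updatedAt descending (the query text lives in graphql_queries.py, not in this diff); it also relies on ISO-8601 UTC timestamps comparing correctly as plain strings. A self-contained sketch of the loop against a canned two-page response:

pages = {
    None: {"nodes": [{"number": 9, "updatedAt": "2024-03-01T00:00:00Z"}],
           "pageInfo": {"hasNextPage": True, "endCursor": "c1"}},
    "c1": {"nodes": [{"number": 4, "updatedAt": "2024-01-01T00:00:00Z"}],
           "pageInfo": {"hasNextPage": False, "endCursor": None}},
}

since = "2024-02-01T00:00:00Z"
after, summaries, stop = None, [], False
while True:
    page = pages[after]
    for node in page["nodes"]:
        if since and node["updatedAt"] < since:
            stop = True  # older than the last backup; later pages are older still
            break
        summaries.append(node)
    if stop or not page["pageInfo"]["hasNextPage"]:
        break
    after = page["pageInfo"]["endCursor"]

print([d["number"] for d in summaries])  # [9]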
+def retrieve_discussion_comment_replies(args, comment_id, after=None, log_context=None):
+    data = retrieve_graphql_data(
+        args,
+        DISCUSSION_REPLIES_QUERY,
+        {
+            "commentId": comment_id,
+            "repliesCursor": after,
+            "pageSize": DISCUSSION_PAGE_SIZE,
+        },
+        log_context=log_context,
+    )
+    node = data.get("node") or {}
+    return node.get("replies") or {}
+
+
+def _discussion_comment_log_identifier(comment_node):
+    return (
+        comment_node.get("databaseId")
+        or comment_node.get("url")
+        or comment_node.get("id")
+    )
+
+
+def _discussion_comment_with_replies(
+    args, comment_node, repository_full_name=None, discussion_number=None
+):
+    replies_connection = comment_node.get("replies") or {}
+    replies = _connection_nodes(replies_connection)
+    reply_total_count = replies_connection.get("totalCount", len(replies))
+    page_info = replies_connection.get("pageInfo") or {}
+    reply_page = 2
+
+    while page_info.get("hasNextPage"):
+        log_context = None
+        if repository_full_name and discussion_number is not None:
+            log_context = "discussion {0}#{1} comment {2} replies page {3}".format(
+                repository_full_name,
+                discussion_number,
+                _discussion_comment_log_identifier(comment_node),
+                reply_page,
+            )
+
+        replies_connection = retrieve_discussion_comment_replies(
+            args,
+            comment_node["id"],
+            page_info.get("endCursor"),
+            log_context=log_context,
+        )
+        replies.extend(_connection_nodes(replies_connection))
+        page_info = replies_connection.get("pageInfo") or {}
+        reply_page += 1
+
+    comment = {key: value for key, value in comment_node.items() if key != "replies"}
+    comment["reply_count"] = reply_total_count
+    comment["reply_data"] = replies
+    return comment
+
+
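The net effect of _discussion_comment_with_replies is to flatten the GraphQL replies connection into the reply_count/reply_data fields consumed elsewhere in the backup. A sketch of that reshaping for a single, already-complete page (values hypothetical):

comment_node = {
    "id": "DC_kwDOabc123",
    "body": "first comment",
    "replies": {
        "totalCount": 1,
        "pageInfo": {"hasNextPage": False, "endCursor": None},
        "nodes": [{"body": "a reply"}],
    },
}

comment = {key: value for key, value in comment_node.items() if key != "replies"}
comment["reply_count"] = comment_node["replies"]["totalCount"]
comment["reply_data"] = [n for n in comment_node["replies"]["nodes"] if n]
assert comment == {"id": "DC_kwDOabc123", "body": "first comment",
                   "reply_count": 1, "reply_data": [{"body": "a reply"}]}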
+def retrieve_discussion(args, repository, number):
+    owner, name = _repository_owner_name(repository)
+    comments_cursor = None
+    comments_page = 1
+    discussion_data = None
+    comments = []
+    comment_total_count = 0
+
+    while True:
+        data = retrieve_graphql_data(
+            args,
+            DISCUSSION_DETAIL_QUERY,
+            {
+                "owner": owner,
+                "name": name,
+                "number": number,
+                "commentsCursor": comments_cursor,
+                "pageSize": DISCUSSION_PAGE_SIZE,
+            },
+            log_context="discussion {0}#{1} details/comments page {2}".format(
+                repository["full_name"], number, comments_page
+            ),
+        )
+        repository_data = data.get("repository") or {}
+        discussion = repository_data.get("discussion")
+        if discussion is None:
+            raise Exception(
+                "Discussion #{0} not found in {1}".format(
+                    number, repository["full_name"]
+                )
+            )
+
+        if discussion_data is None:
+            discussion_data = {
+                key: value for key, value in discussion.items() if key != "comments"
+            }
+
+        comments_connection = discussion.get("comments") or {}
+        comment_total_count = comments_connection.get(
+            "totalCount", comment_total_count
+        )
+        for comment_node in _connection_nodes(comments_connection):
+            comments.append(
+                _discussion_comment_with_replies(
+                    args, comment_node, repository["full_name"], number
+                )
+            )
+
+        page_info = comments_connection.get("pageInfo") or {}
+        if not page_info.get("hasNextPage"):
+            break
+
+        comments_cursor = page_info.get("endCursor")
+        comments_page += 1
+
+    discussion_data["comment_count"] = comment_total_count
+    discussion_data["comment_data"] = comments
+    return discussion_data
+
+
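Put together, each discussion is persisted as one JSON document whose top-level fields come from DISCUSSION_DETAIL_QUERY (so the exact field set isn't visible in this diff), plus the two synthesized comment fields. A hypothetical minimal example:

discussion_data = {
    "number": 7,                          # from the GraphQL discussion node
    "updatedAt": "2024-05-01T12:00:00Z",
    "body": "discussion body",
    "comment_count": 1,                   # synthesized from comments.totalCount
    "comment_data": [
        {"body": "comment", "reply_count": 1, "reply_data": [{"body": "reply"}]},
    ],
}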
+def backup_discussions(args, repo_cwd, repository):
+    discussion_cwd = os.path.join(repo_cwd, "discussions")
+    if args.skip_existing and os.path.isdir(discussion_cwd):
+        return
+
+    if not get_graphql_auth(args):
+        logger.info(
+            "Skipping {0} discussions since GitHub GraphQL API requires authentication".format(
+                repository["full_name"]
+            )
+        )
+        return
+
+    discussions_since = None
+    discussion_last_update_path = os.path.join(discussion_cwd, "last_update")
+    if args.incremental and os.path.exists(discussion_last_update_path):
+        discussions_since = open(discussion_last_update_path).read().strip()
+
+    logger.info("Retrieving {0} discussions".format(repository["full_name"]))
+    try:
+        (
+            summaries,
+            newest_seen,
+            discussions_enabled,
+            total_count,
+        ) = retrieve_discussion_summaries(args, repository, since=discussions_since)
+    except Exception as e:
+        logger.warning(
+            "Unable to retrieve discussions for {0}, skipping: {1}".format(
+                repository["full_name"], e
+            )
+        )
+        return
+
+    if not discussions_enabled:
+        logger.info(
+            "Discussions are not enabled for {0}, skipping".format(
+                repository["full_name"]
+            )
+        )
+        return
+
+    mkdir_p(repo_cwd, discussion_cwd)
+
+    if discussions_since:
+        logger.info(
+            "Saving {0} updated discussions to disk ({1} total)".format(
+                len(summaries), total_count
+            )
+        )
+    else:
+        logger.info("Saving {0} discussions to disk".format(len(summaries)))
+
+    written_count = 0
+    skipped_count = 0
+    had_errors = False
+    for summary in summaries:
+        number = summary["number"]
+        discussion_file = os.path.join(discussion_cwd, "{0}.json".format(number))
+
+        if args.incremental_by_files and os.path.isfile(discussion_file):
+            modified = os.path.getmtime(discussion_file)
+            modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
+            if modified > summary["updatedAt"]:
+                logger.info(
+                    "Skipping discussion {0} because it wasn't modified since last backup".format(
+                        number
+                    )
+                )
+                skipped_count += 1
+                continue
+
+        try:
+            discussion = retrieve_discussion(args, repository, number)
+        except Exception as e:
+            logger.warning(
+                "Unable to retrieve discussion {0}#{1}, skipping: {2}".format(
+                    repository["full_name"], number, e
+                )
+            )
+            had_errors = True
+            continue
+
+        if args.include_attachments:
+            download_attachments(
+                args,
+                discussion_cwd,
+                discussion,
+                number,
+                repository,
+                item_type="discussion",
+            )
+
+        if json_dump_if_changed(discussion, discussion_file):
+            written_count += 1
+
+    if (
+        args.incremental
+        and not had_errors
+        and newest_seen
+        and (not discussions_since or newest_seen > discussions_since)
+    ):
+        open(discussion_last_update_path, "w").write(newest_seen)
+
+    attempted_count = len(summaries) - skipped_count
+    if not summaries:
+        logger.info("No discussions to save")
+    elif attempted_count == 0:
+        logger.info("{0} discussions skipped".format(skipped_count))
+    elif written_count == attempted_count:
+        logger.info("Saved {0} discussions to disk".format(written_count))
+    elif written_count == 0:
+        logger.info(
+            "{0} discussions unchanged, skipped write".format(attempted_count)
+        )
+    else:
+        logger.info(
+            "Saved {0} discussions to disk ({1} unchanged, {2} skipped)".format(
+                written_count,
+                attempted_count - written_count,
+                skipped_count,
+            )
+        )
+
+
 def backup_issues(args, repo_cwd, repository, repos_template):
     has_issues_dir = os.path.isdir("{0}/issues/.git".format(repo_cwd))
     if args.skip_existing and has_issues_dir: