Merge pull request #502 from mrexodia/discussions

Add support for discussions
Jose Diaz-Gonzalez
2026-04-29 00:42:53 -04:00
committed by GitHub
7 changed files with 1042 additions and 41 deletions

View File

@@ -3,6 +3,8 @@ Changelog
Unreleased
----------
- Add GitHub Discussions backups via GraphQL, including comments, replies,
optional attachment downloads, and per-repository incremental checkpoints.
- Add ``--token-from-gh`` to read authentication from ``gh auth token``.

View File

@@ -4,7 +4,7 @@ github-backup
|PyPI| |Python Versions|
The package can be used to backup an *entire* `Github <https://github.com/>`_ organization, repository or user account, including starred repos, issues and wikis in the most appropriate format (clones for wikis, json files for issues).
The package can be used to backup an *entire* `Github <https://github.com/>`_ organization, repository or user account, including starred repos, issues, discussions and wikis in the most appropriate format (clones for wikis, json files for issues and discussions).
Requirements
============
@@ -44,8 +44,9 @@ CLI Help output::
[--issues] [--issue-comments] [--issue-events] [--pulls]
[--pull-comments] [--pull-commits] [--pull-details]
[--labels] [--hooks] [--milestones] [--security-advisories]
[--repositories] [--bare] [--no-prune] [--lfs] [--wikis]
[--gists] [--starred-gists] [--skip-archived] [--skip-existing]
[--discussions] [--repositories] [--bare] [--no-prune]
[--lfs] [--wikis] [--gists] [--starred-gists]
[--skip-archived] [--skip-existing]
[-L [LANGUAGES ...]] [-N NAME_REGEX] [-H GITHUB_HOST]
[-O] [-R REPOSITORY] [-P] [-F] [--prefer-ssh] [-v]
[--keychain-name OSX_KEYCHAIN_ITEM_NAME]
@@ -104,6 +105,7 @@ CLI Help output::
--milestones include milestones in backup
--security-advisories
include security advisories in backup
--discussions include discussions in backup
--repositories include repository clone in backup
--bare clone bare repositories
--no-prune disable prune option for git fetch
@@ -144,8 +146,8 @@ CLI Help output::
applies if including releases
--skip-assets-on [SKIP_ASSETS_ON ...]
skip asset downloads for these repositories
--attachments download user-attachments from issues and pull
requests
--attachments download user-attachments from issues, pull requests,
and discussions
--throttle-limit THROTTLE_LIMIT
start throttling of GitHub API requests after this
amount of API requests remain
@@ -184,7 +186,7 @@ Customise the permissions for your use case, but for a personal account full bac
**User permissions**: Read access to followers, starring, and watching.
**Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks.
**Repository permissions**: Read access to contents, discussions, issues, metadata, pull requests, and webhooks.
GitHub Apps
@@ -265,9 +267,9 @@ LFS objects are fetched for all refs, not just the current checkout, ensuring a
About Attachments
-----------------
When you use the ``--attachments`` option with ``--issues`` or ``--pulls``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue and pull request descriptions and comments. In some circumstances attachments contain valuable data related to the topic, and without their backup important information or context might be lost inadvertently.
When you use the ``--attachments`` option with ``--issues``, ``--pulls`` or ``--discussions``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue, pull request and discussion descriptions and comments. Attachments sometimes contain valuable data related to the topic, and without a backup that information or context can be lost inadvertently.
Attachments are saved to ``issues/attachments/{issue_number}/`` and ``pulls/attachments/{pull_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains:
Attachments are saved to ``issues/attachments/{issue_number}/``, ``pulls/attachments/{pull_number}/`` and ``discussions/attachments/{discussion_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains:
- The downloaded attachment files (named by their GitHub identifier with appropriate file extensions)
- If multiple attachments have the same filename, conflicts are resolved with numeric suffixes (e.g., ``report.pdf``, ``report_1.pdf``, ``report_2.pdf``)
@@ -287,6 +289,16 @@ The tool automatically extracts file extensions from HTTP headers to ensure file
**Fine-grained token limitation:** Due to a GitHub platform limitation, fine-grained personal access tokens (``github_pat_...``) cannot download attachments from private repositories directly. This affects both ``/assets/`` (images) and ``/files/`` (documents) URLs. The tool implements a workaround for image attachments using GitHub's Markdown API, which converts URLs to temporary JWT-signed URLs that can be downloaded. However, this workaround only works for images - document attachments (PDFs, text files, etc.) will fail with 404 errors when using fine-grained tokens on private repos. For full attachment support on private repositories, use a classic token (``-t``) instead of a fine-grained token (``-f``). See `#477 <https://github.com/josegonzalez/python-github-backup/issues/477>`_ for details.
About Discussions
-----------------
GitHub Discussions are backed up with GitHub's GraphQL API because the REST API does not expose discussions. Use ``--discussions`` to save each discussion as JSON under ``repositories/{repo}/discussions/{number}.json``. Discussion backups include the discussion body and metadata, category information, comments, and comment replies.
``--discussions`` is included in ``--all``. Unlike most resources, which are fetched over the REST API, discussions always require authentication because GitHub's GraphQL API requires a token. Fine-grained personal access tokens and GitHub Apps need the repository Discussions permission with read access.
Incremental backups use a per-repository checkpoint at ``repositories/{repo}/discussions/last_update`` based on discussion ``updatedAt`` timestamps. This is separate from the repository-level ``last_update`` file so discussion activity is not missed if the repository's own update timestamp does not change. If you enable ``--discussions`` on an existing incremental backup, the first run performs a full discussions backup for each repository and creates the discussions checkpoint for future runs.
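For example, after a run with ``--discussions`` a repository backup might contain (paths illustrative)::

    repositories/myrepo/discussions/1.json
    repositories/myrepo/discussions/2.json
    repositories/myrepo/discussions/attachments/2/
    repositories/myrepo/discussions/last_update

The ``attachments`` directory only appears when ``--attachments`` is also enabled and a discussion actually contains attachments.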
About security advisories
-------------------------
@@ -419,14 +431,14 @@ Quietly and incrementally backup useful Github user data (public and private rep
export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN
GH_USER=YOUR-GITHUB-USER
github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER
github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER
Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. ::
export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN
GH_USER=YOUR-GITHUB-USER
github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER
github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER
Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only)::
@@ -442,7 +454,7 @@ This tool creates backups only, there is no inbuilt restore command.
cd /tmp/white-house/repositories/petitions/repository
git push --mirror git@github.com:WhiteHouse/petitions.git
**Issues, pull requests, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully, creating issues via the API has limitations:
**Issues, pull requests, discussions, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully; creating issues via the API has limitations:
- New issue/PR numbers are assigned (original numbers cannot be set)
- Timestamps reflect creation time (original dates cannot be set)

View File

@@ -33,6 +33,13 @@ try:
except ImportError:
VERSION = "unknown"
from .graphql_queries import (
DISCUSSION_DETAIL_QUERY,
DISCUSSION_LIST_QUERY,
DISCUSSION_PAGE_SIZE,
DISCUSSION_REPLIES_QUERY,
)
FNULL = open(os.devnull, "w")
FILE_URI_PREFIX = "file://"
logger = logging.getLogger(__name__)
@@ -322,6 +329,12 @@ def parse_args(args=None):
dest="include_security_advisories",
help="include security advisories in backup",
)
parser.add_argument(
"--discussions",
action="store_true",
dest="include_discussions",
help="include discussions in backup",
)
parser.add_argument(
"--repositories",
action="store_true",
@@ -469,7 +482,7 @@ def parse_args(args=None):
"--attachments",
action="store_true",
dest="include_attachments",
help="download user-attachments from issues and pull requests",
help="download user-attachments from issues, pull requests, and discussions",
)
parser.add_argument(
"--throttle-limit",
@@ -579,6 +592,31 @@ def get_github_api_host(args):
return host
def get_github_graphql_url(args):
if args.github_host:
return "https://{0}/api/graphql".format(args.github_host)
return "https://api.github.com/graphql"
def get_graphql_auth(args):
auth = get_auth(args, encode=False)
if not auth:
return None
# GraphQL expects a bearer token. Classic tokens and keychain tokens use
# "token:x-oauth-basic" for REST Basic auth, so strip the synthetic
# password before sending the GraphQL Authorization header.
if (
not getattr(args, "as_app", False)
and getattr(args, "token_fine", None) is None
and ":" in auth
):
auth = auth.split(":", 1)[0]
return auth
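# A minimal sketch of the rule above (token value is made up): get_auth()
# returns "ghp_example:x-oauth-basic" for a classic token, and only
# "ghp_example" -- the part before the first ":" -- is sent as the GraphQL
# bearer token. Fine-grained tokens and app tokens pass through unchanged.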
def get_github_host(args):
if args.github_host:
host = args.github_host
@@ -810,6 +848,87 @@ def retrieve_data(args, template, query_args=None, paginated=True):
return list(fetch_all())
def retrieve_graphql_data(args, query, variables=None, log_context=None):
"""Fetch data from GitHub's GraphQL API."""
auth = get_graphql_auth(args)
if not auth:
raise Exception("GitHub GraphQL API requires authentication")
variables = variables or {}
payload = json.dumps(
{"query": query, "variables": variables}, ensure_ascii=False
).encode("utf-8")
endpoint = get_github_graphql_url(args)
for attempt in range(args.max_retries + 1):
request = Request(endpoint, data=payload, method="POST")
request.add_header("Accept", "application/json")
request.add_header("Content-Type", "application/json")
request.add_header("Authorization", "bearer " + auth)
log_url = endpoint
if log_context:
log_url = "{0} ({1})".format(log_url, log_context)
logger.info("Requesting {0}".format(log_url))
http_response = make_request_with_retry(request, auth, args.max_retries)
status = http_response.getcode()
if status != 200:
raise Exception(
f"Unexpected HTTP {status} from {endpoint} "
f"(expected non-2xx to raise HTTPError)"
)
try:
response = json.loads(http_response.read().decode("utf-8"))
except (IncompleteRead, json.decoder.JSONDecodeError, TimeoutError) as e:
logger.warning(f"{type(e).__name__} reading GraphQL response")
if attempt < args.max_retries:
delay = calculate_retry_delay(attempt, {})
logger.warning(
f"Retrying GraphQL read in {delay:.1f}s "
f"(attempt {attempt + 1}/{args.max_retries + 1})"
)
time.sleep(delay)
continue
raise Exception(
f"Failed to read GraphQL response after {args.max_retries + 1} "
f"attempts for {endpoint}"
)
if (
remaining := int(http_response.headers.get("x-ratelimit-remaining", 0))
) <= (args.throttle_limit or 0):
if args.throttle_limit:
logger.info(
f"Throttling: {remaining} requests left, pausing {args.throttle_pause}s"
)
time.sleep(args.throttle_pause)
errors = response.get("errors") or []
if errors:
if any(error.get("type") == "RATE_LIMITED" for error in errors):
if attempt < args.max_retries:
delay = calculate_retry_delay(attempt, http_response.headers)
logger.warning(
f"GraphQL rate limit hit, retrying in {delay:.1f}s "
f"(attempt {attempt + 1}/{args.max_retries + 1})"
)
time.sleep(delay)
continue
messages = "; ".join(
error.get("message", str(error)) for error in errors
)
raise Exception("GraphQL Error: {0}".format(messages))
return response.get("data", {})
raise Exception(
f"GraphQL request failed after {args.max_retries + 1} attempts"
) # pragma: no cover
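# Example usage (a sketch; the query and log context are illustrative):
#   data = retrieve_graphql_data(
#       args, "query { viewer { login } }", log_context="viewer"
#   )
#   login = (data.get("viewer") or {}).get("login")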
def make_request_with_retry(request, auth, max_retries=5):
"""Make HTTP request with automatic retry for transient errors."""
@@ -1193,7 +1312,7 @@ def get_jwt_signed_url_via_markdown_api(url, token, repo_context):
def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None):
"""Extract GitHub-hosted attachment URLs from issue/PR body and comments.
"""Extract GitHub-hosted attachment URLs from issue/PR/discussion body and comments.
What qualifies as an attachment?
There is no "attachment" concept in the GitHub API - it's a user behavior pattern
@@ -1335,33 +1454,29 @@ def extract_attachment_urls(item_data, issue_number=None, repository_full_name=N
# and exclude the URL to avoid downloading from wrong repos
return False
# Extract from body
body = item_data.get("body") or ""
# Remove code blocks before searching for URLs
body_cleaned = remove_code_blocks(body)
for pattern in patterns:
found_urls = re.findall(pattern, body_cleaned)
urls.extend([clean_url(url) for url in found_urls])
def extract_from_text(text):
text_cleaned = remove_code_blocks(text or "")
for pattern in patterns:
found_urls = re.findall(pattern, text_cleaned)
urls.extend([clean_url(url) for url in found_urls])
# Extract from issue comments
def extract_from_comments(comments):
for comment in comments:
extract_from_text(comment.get("body") or "")
# GitHub Discussions support one level of replies. Issues and pull
# requests don't have reply_data, so this is a no-op for them.
extract_from_comments(comment.get("reply_data") or [])
# Extract from body
extract_from_text(item_data.get("body") or "")
# Extract from issue comments and discussion comments
if "comment_data" in item_data:
for comment in item_data["comment_data"]:
comment_body = comment.get("body") or ""
# Remove code blocks before searching for URLs
comment_cleaned = remove_code_blocks(comment_body)
for pattern in patterns:
found_urls = re.findall(pattern, comment_cleaned)
urls.extend([clean_url(url) for url in found_urls])
extract_from_comments(item_data["comment_data"])
# Extract from PR regular comments
if "comment_regular_data" in item_data:
for comment in item_data["comment_regular_data"]:
comment_body = comment.get("body") or ""
# Remove code blocks before searching for URLs
comment_cleaned = remove_code_blocks(comment_body)
for pattern in patterns:
found_urls = re.findall(pattern, comment_cleaned)
urls.extend([clean_url(url) for url in found_urls])
extract_from_comments(item_data["comment_regular_data"])
regex_urls = list(set(urls)) # dedupe
@@ -1463,20 +1578,24 @@ def resolve_filename_collision(filepath):
def download_attachments(
args, item_cwd, item_data, number, repository, item_type="issue"
):
"""Download user-attachments from issue/PR body and comments with manifest.
"""Download user-attachments from issue/PR/discussion body and comments with manifest.
Args:
args: Command line arguments
item_cwd: Working directory (issue_cwd or pulls_cwd)
item_data: Issue or PR data dict
number: Issue or PR number
item_cwd: Working directory (issue_cwd, pulls_cwd, or discussion_cwd)
item_data: Issue, PR, or discussion data dict
number: Issue, PR, or discussion number
repository: Repository dict
item_type: "issue" or "pull" for logging/manifest
item_type: "issue", "pull", or "discussion" for logging/manifest
"""
import json
from datetime import datetime, timezone
item_type_display = "issue" if item_type == "issue" else "pull request"
item_type_display = {
"issue": "issue",
"pull": "pull request",
"discussion": "discussion",
}.get(item_type, item_type)
urls = extract_attachment_urls(
item_data, issue_number=number, repository_full_name=repository["full_name"]
@@ -1621,6 +1740,8 @@ def download_attachments(
# Write manifest
if attachment_metadata_list:
manifest = {
"item_number": number,
"item_type": item_type,
"issue_number": number,
"issue_type": item_type,
"repository": (
@@ -1888,6 +2009,9 @@ def backup_repositories(args, output_directory, repositories):
if args.include_pulls or args.include_everything:
backup_pulls(args, repo_cwd, repository, repos_template)
if args.include_discussions or args.include_everything:
backup_discussions(args, repo_cwd, repository)
if args.include_milestones or args.include_everything:
backup_milestones(args, repo_cwd, repository, repos_template)
@@ -1922,6 +2046,317 @@ def backup_repositories(args, output_directory, repositories):
open(last_update_path, "w").write(last_update)
def _repository_owner_name(repository):
return repository["full_name"].split("/", 1)
def _connection_nodes(connection):
return [node for node in (connection or {}).get("nodes") or [] if node]
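# e.g. _connection_nodes({"nodes": [{"id": 1}, None]}) == [{"id": 1}] and
# _connection_nodes(None) == [], so callers never have to null-check nodes.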
def retrieve_discussion_summaries(args, repository, since=None):
owner, name = _repository_owner_name(repository)
after = None
page = 1
summaries = []
newest_seen = None
discussions_enabled = None
total_count = 0
while True:
data = retrieve_graphql_data(
args,
DISCUSSION_LIST_QUERY,
{
"owner": owner,
"name": name,
"after": after,
"pageSize": DISCUSSION_PAGE_SIZE,
},
log_context="discussion summaries {0} page {1}".format(
repository["full_name"], page
),
)
repository_data = data.get("repository")
if repository_data is None:
raise Exception(
"Repository {0} not found in GraphQL response".format(
repository["full_name"]
)
)
discussions_enabled = repository_data.get("hasDiscussionsEnabled")
if not discussions_enabled:
return [], None, False, 0
discussions = repository_data.get("discussions") or {}
total_count = discussions.get("totalCount", total_count)
stop = False
for discussion in _connection_nodes(discussions):
updated_at = discussion.get("updatedAt")
if updated_at and (newest_seen is None or updated_at > newest_seen):
newest_seen = updated_at
if since and updated_at and updated_at < since:
stop = True
break
summaries.append(discussion)
page_info = discussions.get("pageInfo") or {}
if stop or not page_info.get("hasNextPage"):
break
after = page_info.get("endCursor")
page += 1
return summaries, newest_seen, discussions_enabled, total_count
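# Because DISCUSSION_LIST_QUERY orders by UPDATED_AT descending, the first
# summary older than `since` ends pagination. A sketch with made-up data:
#   since = "2026-01-01T00:00:00Z"
#   updatedAt values seen: 2026-02-01, 2026-01-10, 2025-12-01
#   -> the first two are collected, the third stops the loop, and no
#      further pages are requested.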
def retrieve_discussion_comment_replies(args, comment_id, after=None, log_context=None):
data = retrieve_graphql_data(
args,
DISCUSSION_REPLIES_QUERY,
{
"commentId": comment_id,
"repliesCursor": after,
"pageSize": DISCUSSION_PAGE_SIZE,
},
log_context=log_context,
)
node = data.get("node") or {}
return node.get("replies") or {}
def _discussion_comment_log_identifier(comment_node):
return (
comment_node.get("databaseId")
or comment_node.get("url")
or comment_node.get("id")
)
def _discussion_comment_with_replies(
args, comment_node, repository_full_name=None, discussion_number=None
):
replies_connection = comment_node.get("replies") or {}
replies = _connection_nodes(replies_connection)
reply_total_count = replies_connection.get("totalCount", len(replies))
page_info = replies_connection.get("pageInfo") or {}
reply_page = 2
while page_info.get("hasNextPage"):
log_context = None
if repository_full_name and discussion_number is not None:
log_context = "discussion {0}#{1} comment {2} replies page {3}".format(
repository_full_name,
discussion_number,
_discussion_comment_log_identifier(comment_node),
reply_page,
)
replies_connection = retrieve_discussion_comment_replies(
args,
comment_node["id"],
page_info.get("endCursor"),
log_context=log_context,
)
replies.extend(_connection_nodes(replies_connection))
page_info = replies_connection.get("pageInfo") or {}
reply_page += 1
comment = {key: value for key, value in comment_node.items() if key != "replies"}
comment["reply_count"] = reply_total_count
comment["reply_data"] = replies
return comment
def retrieve_discussion(args, repository, number):
owner, name = _repository_owner_name(repository)
comments_cursor = None
comments_page = 1
discussion_data = None
comments = []
comment_total_count = 0
while True:
data = retrieve_graphql_data(
args,
DISCUSSION_DETAIL_QUERY,
{
"owner": owner,
"name": name,
"number": number,
"commentsCursor": comments_cursor,
"pageSize": DISCUSSION_PAGE_SIZE,
},
log_context="discussion {0}#{1} details/comments page {2}".format(
repository["full_name"], number, comments_page
),
)
repository_data = data.get("repository") or {}
discussion = repository_data.get("discussion")
if discussion is None:
raise Exception(
"Discussion #{0} not found in {1}".format(
number, repository["full_name"]
)
)
if discussion_data is None:
discussion_data = {
key: value for key, value in discussion.items() if key != "comments"
}
comments_connection = discussion.get("comments") or {}
comment_total_count = comments_connection.get(
"totalCount", comment_total_count
)
for comment_node in _connection_nodes(comments_connection):
comments.append(
_discussion_comment_with_replies(
args, comment_node, repository["full_name"], number
)
)
page_info = comments_connection.get("pageInfo") or {}
if not page_info.get("hasNextPage"):
break
comments_cursor = page_info.get("endCursor")
comments_page += 1
discussion_data["comment_count"] = comment_total_count
discussion_data["comment_data"] = comments
return discussion_data
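# The returned dict roughly has this shape (values illustrative):
#   {
#       "number": 42,
#       "title": "...",
#       "comment_count": 2,
#       "comment_data": [
#           {"body": "...", "reply_count": 1, "reply_data": [{...}]},
#       ],
#   }
# plus the remaining discussion fields selected by DISCUSSION_DETAIL_QUERY.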
def backup_discussions(args, repo_cwd, repository):
discussion_cwd = os.path.join(repo_cwd, "discussions")
if args.skip_existing and os.path.isdir(discussion_cwd):
return
if not get_graphql_auth(args):
logger.info(
"Skipping {0} discussions since GitHub GraphQL API requires authentication".format(
repository["full_name"]
)
)
return
discussions_since = None
discussion_last_update_path = os.path.join(discussion_cwd, "last_update")
if args.incremental and os.path.exists(discussion_last_update_path):
discussions_since = open(discussion_last_update_path).read().strip()
logger.info("Retrieving {0} discussions".format(repository["full_name"]))
try:
(
summaries,
newest_seen,
discussions_enabled,
total_count,
) = retrieve_discussion_summaries(args, repository, since=discussions_since)
except Exception as e:
logger.warning(
"Unable to retrieve discussions for {0}, skipping: {1}".format(
repository["full_name"], e
)
)
return
if not discussions_enabled:
logger.info(
"Discussions are not enabled for {0}, skipping".format(
repository["full_name"]
)
)
return
mkdir_p(repo_cwd, discussion_cwd)
if discussions_since:
logger.info(
"Saving {0} updated discussions to disk ({1} total)".format(
len(summaries), total_count
)
)
else:
logger.info("Saving {0} discussions to disk".format(len(summaries)))
written_count = 0
skipped_count = 0
had_errors = False
for summary in summaries:
number = summary["number"]
discussion_file = os.path.join(discussion_cwd, "{0}.json".format(number))
if args.incremental_by_files and os.path.isfile(discussion_file):
modified = os.path.getmtime(discussion_file)
modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ")
if modified > summary["updatedAt"]:
logger.info(
"Skipping discussion {0} because it wasn't modified since last backup".format(
number
)
)
skipped_count += 1
continue
try:
discussion = retrieve_discussion(args, repository, number)
except Exception as e:
logger.warning(
"Unable to retrieve discussion {0}#{1}, skipping: {2}".format(
repository["full_name"], number, e
)
)
had_errors = True
continue
if args.include_attachments:
download_attachments(
args,
discussion_cwd,
discussion,
number,
repository,
item_type="discussion",
)
if json_dump_if_changed(discussion, discussion_file):
written_count += 1
if (
args.incremental
and not had_errors
and newest_seen
and (not discussions_since or newest_seen > discussions_since)
):
open(discussion_last_update_path, "w").write(newest_seen)
attempted_count = len(summaries) - skipped_count
if not summaries:
logger.info("No discussions to save")
elif attempted_count == 0:
logger.info("{0} discussions skipped".format(skipped_count))
elif written_count == attempted_count:
logger.info("Saved {0} discussions to disk".format(written_count))
elif written_count == 0:
logger.info(
"{0} discussions unchanged, skipped write".format(attempted_count)
)
else:
logger.info(
"Saved {0} discussions to disk ({1} unchanged, {2} skipped)".format(
written_count,
attempted_count - written_count,
skipped_count,
)
)
def backup_issues(args, repo_cwd, repository, repos_template):
has_issues_dir = os.path.isdir("{0}/issues/.git".format(repo_cwd))
if args.skip_existing and has_issues_dir:

View File

@@ -0,0 +1,292 @@
"""GraphQL query templates used by github-backup."""
DISCUSSION_PAGE_SIZE = 100
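# GitHub's GraphQL API caps `first` at 100 for connections like these, so
# this is the largest page size a single request can fetch.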
DISCUSSION_LIST_QUERY = """
query($owner: String!, $name: String!, $after: String, $pageSize: Int!) {
repository(owner: $owner, name: $name) {
hasDiscussionsEnabled
discussions(
first: $pageSize,
after: $after,
orderBy: {field: UPDATED_AT, direction: DESC}
) {
totalCount
nodes {
id
number
title
updatedAt
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
"""
DISCUSSION_DETAIL_QUERY = """
query(
$owner: String!,
$name: String!,
$number: Int!,
$commentsCursor: String,
$pageSize: Int!
) {
repository(owner: $owner, name: $name) {
discussion(number: $number) {
activeLockReason
answer {
id
databaseId
url
}
answerChosenAt
answerChosenBy {
...ActorFields
}
author {
...ActorFields
}
authorAssociation
body
bodyHTML
bodyText
category {
createdAt
description
emoji
emojiHTML
id
isAnswerable
name
slug
updatedAt
}
closed
closedAt
createdAt
createdViaEmail
databaseId
editor {
...ActorFields
}
id
includesCreatedEdit
isAnswered
labels(first: 100) {
totalCount
nodes {
id
name
color
description
}
}
lastEditedAt
locked
number
poll {
id
question
totalVoteCount
options(first: 100) {
totalCount
nodes {
id
option
totalVoteCount
}
}
}
publishedAt
reactionGroups {
...ReactionGroupFields
}
resourcePath
stateReason
title
updatedAt
upvoteCount
url
comments(first: $pageSize, after: $commentsCursor) {
totalCount
nodes {
...DiscussionCommentFields
replies(first: $pageSize) {
totalCount
nodes {
...DiscussionReplyFields
}
pageInfo {
hasNextPage
endCursor
}
}
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
}
fragment ActorFields on Actor {
avatarUrl
login
resourcePath
url
}
fragment ReactionGroupFields on ReactionGroup {
content
reactors {
totalCount
}
}
fragment DiscussionCommentFields on DiscussionComment {
author {
...ActorFields
}
authorAssociation
body
bodyHTML
bodyText
createdAt
createdViaEmail
databaseId
deletedAt
editor {
...ActorFields
}
id
includesCreatedEdit
isAnswer
isMinimized
lastEditedAt
minimizedReason
publishedAt
reactionGroups {
...ReactionGroupFields
}
replyTo {
id
databaseId
url
}
resourcePath
updatedAt
upvoteCount
url
}
fragment DiscussionReplyFields on DiscussionComment {
author {
...ActorFields
}
authorAssociation
body
bodyHTML
bodyText
createdAt
createdViaEmail
databaseId
deletedAt
editor {
...ActorFields
}
id
includesCreatedEdit
isAnswer
isMinimized
lastEditedAt
minimizedReason
publishedAt
reactionGroups {
...ReactionGroupFields
}
replyTo {
id
databaseId
url
}
resourcePath
updatedAt
upvoteCount
url
}
"""
DISCUSSION_REPLIES_QUERY = """
query($commentId: ID!, $repliesCursor: String, $pageSize: Int!) {
node(id: $commentId) {
... on DiscussionComment {
replies(first: $pageSize, after: $repliesCursor) {
totalCount
nodes {
...DiscussionReplyFields
}
pageInfo {
hasNextPage
endCursor
}
}
}
}
}
fragment ActorFields on Actor {
avatarUrl
login
resourcePath
url
}
fragment ReactionGroupFields on ReactionGroup {
content
reactors {
totalCount
}
}
fragment DiscussionReplyFields on DiscussionComment {
author {
...ActorFields
}
authorAssociation
body
bodyHTML
bodyText
createdAt
createdViaEmail
databaseId
deletedAt
editor {
...ActorFields
}
id
includesCreatedEdit
isAnswer
isMinimized
lastEditedAt
minimizedReason
publishedAt
reactionGroups {
...ReactionGroupFields
}
replyTo {
id
databaseId
url
}
resourcePath
updatedAt
upvoteCount
url
}
"""

View File

@@ -56,6 +56,16 @@ def test_token_from_gh_is_cached(create_args):
mock_check_output.assert_called_once()
def test_graphql_auth_strips_basic_auth_suffix_for_gh_cli_token(create_args):
args = create_args(token_from_gh=True)
with patch(
"github_backup.github_backup.subprocess.check_output",
return_value=b"gho_graphql_token\n",
):
assert github_backup.get_graphql_auth(args) == "gho_graphql_token"
def test_token_from_gh_rejects_as_app(create_args):
args = create_args(token_from_gh=True, as_app=True)

tests/test_discussions.py (new file, 222 lines)
View File

@@ -0,0 +1,222 @@
"""Tests for GitHub Discussions backup support."""
import json
import os
from unittest.mock import patch
from github_backup import github_backup
def test_parse_args_discussions_flag():
args = github_backup.parse_args(["--discussions", "testuser"])
assert args.include_discussions is True
def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args):
args = create_args()
repository = {"full_name": "owner/repo"}
page = {
"repository": {
"hasDiscussionsEnabled": True,
"discussions": {
"totalCount": 3,
"nodes": [
{"number": 3, "title": "new", "updatedAt": "2026-02-01T00:00:00Z"},
{"number": 2, "title": "also new", "updatedAt": "2026-01-10T00:00:00Z"},
{"number": 1, "title": "old", "updatedAt": "2025-12-01T00:00:00Z"},
],
"pageInfo": {"hasNextPage": True, "endCursor": "NEXT"},
},
}
}
with patch(
"github_backup.github_backup.retrieve_graphql_data", return_value=page
) as mock_retrieve:
summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries(
args, repository, since="2026-01-01T00:00:00Z"
)
assert enabled is True
assert total == 3
assert newest == "2026-02-01T00:00:00Z"
assert [item["number"] for item in summaries] == [3, 2]
# The old discussion stops pagination, so the next page is not requested.
assert mock_retrieve.call_count == 1
assert (
mock_retrieve.call_args.kwargs["log_context"]
== "discussion summaries owner/repo page 1"
)
def test_retrieve_discussion_summaries_disabled_discussions(create_args):
args = create_args()
repository = {"full_name": "owner/repo"}
with patch(
"github_backup.github_backup.retrieve_graphql_data",
return_value={"repository": {"hasDiscussionsEnabled": False}},
):
summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries(
args, repository
)
assert summaries == []
assert newest is None
assert enabled is False
assert total == 0
def _comment(comment_id, body, replies=None, replies_has_next=False):
replies = replies or []
return {
"id": comment_id,
"body": body,
"replies": {
"totalCount": len(replies) + (1 if replies_has_next else 0),
"nodes": replies,
"pageInfo": {
"hasNextPage": replies_has_next,
"endCursor": "REPLIES2" if replies_has_next else None,
},
},
}
def _discussion_page(comment_nodes, has_next=False):
return {
"repository": {
"discussion": {
"number": 42,
"title": "Discussion title",
"updatedAt": "2026-02-01T00:00:00Z",
"comments": {
"totalCount": 2,
"nodes": comment_nodes,
"pageInfo": {
"hasNextPage": has_next,
"endCursor": "COMMENTS2" if has_next else None,
},
},
}
}
}
def test_retrieve_discussion_paginates_comments_and_replies(create_args):
args = create_args()
repository = {"full_name": "owner/repo"}
reply_1 = {"id": "reply-1", "body": "first reply"}
reply_2 = {"id": "reply-2", "body": "second reply"}
comment_1 = _comment("comment-1", "first comment", [reply_1], replies_has_next=True)
comment_2 = _comment("comment-2", "second comment")
responses = [
_discussion_page([comment_1], has_next=True),
{
"node": {
"replies": {
"totalCount": 2,
"nodes": [reply_2],
"pageInfo": {"hasNextPage": False, "endCursor": None},
}
}
},
_discussion_page([comment_2], has_next=False),
]
with patch(
"github_backup.github_backup.retrieve_graphql_data", side_effect=responses
) as mock_retrieve:
discussion = github_backup.retrieve_discussion(args, repository, 42)
assert discussion["number"] == 42
assert discussion["comment_count"] == 2
assert len(discussion["comment_data"]) == 2
assert discussion["comment_data"][0]["body"] == "first comment"
assert discussion["comment_data"][0]["reply_count"] == 2
assert [r["body"] for r in discussion["comment_data"][0]["reply_data"]] == [
"first reply",
"second reply",
]
assert discussion["comment_data"][1]["body"] == "second comment"
assert mock_retrieve.call_count == 3
assert [
call.kwargs["log_context"] for call in mock_retrieve.call_args_list
] == [
"discussion owner/repo#42 details/comments page 1",
"discussion owner/repo#42 comment comment-1 replies page 2",
"discussion owner/repo#42 details/comments page 2",
]
def test_backup_discussions_uses_incremental_checkpoint(create_args, tmp_path):
args = create_args(token_classic="fake_token", include_discussions=True, incremental=True)
repository = {"full_name": "owner/repo"}
discussions_dir = tmp_path / "discussions"
discussions_dir.mkdir()
(discussions_dir / "last_update").write_text("2026-01-01T00:00:00Z")
def fake_summaries(passed_args, passed_repository, since=None):
assert passed_args is args
assert passed_repository == repository
assert since == "2026-01-01T00:00:00Z"
return (
[{"number": 7, "title": "updated", "updatedAt": "2026-02-01T00:00:00Z"}],
"2026-02-01T00:00:00Z",
True,
1,
)
with patch(
"github_backup.github_backup.retrieve_discussion_summaries",
side_effect=fake_summaries,
), patch(
"github_backup.github_backup.retrieve_discussion",
return_value={"number": 7, "title": "updated"},
):
github_backup.backup_discussions(args, tmp_path, repository)
with open(discussions_dir / "7.json", encoding="utf-8") as f:
assert json.load(f) == {"number": 7, "title": "updated"}
assert (discussions_dir / "last_update").read_text() == "2026-02-01T00:00:00Z"
def test_backup_discussions_does_not_advance_checkpoint_on_discussion_error(
create_args, tmp_path
):
args = create_args(token_classic="fake_token", include_discussions=True, incremental=True)
repository = {"full_name": "owner/repo"}
discussions_dir = tmp_path / "discussions"
discussions_dir.mkdir()
(discussions_dir / "last_update").write_text("2026-01-01T00:00:00Z")
with patch(
"github_backup.github_backup.retrieve_discussion_summaries",
return_value=(
[{"number": 7, "title": "updated", "updatedAt": "2026-02-01T00:00:00Z"}],
"2026-02-01T00:00:00Z",
True,
1,
),
), patch(
"github_backup.github_backup.retrieve_discussion",
side_effect=Exception("temporary GraphQL error"),
):
github_backup.backup_discussions(args, tmp_path, repository)
assert (discussions_dir / "last_update").read_text() == "2026-01-01T00:00:00Z"
assert not os.path.exists(discussions_dir / "7.json")
def test_backup_discussions_skips_without_auth(create_args, tmp_path):
args = create_args(include_discussions=True)
repository = {"full_name": "owner/repo"}
with patch("github_backup.github_backup.retrieve_discussion_summaries") as mock_retrieve:
github_backup.backup_discussions(args, tmp_path, repository)
assert not mock_retrieve.called
assert not os.path.exists(tmp_path / "discussions")

View File

@@ -1,6 +1,7 @@
"""Tests for retrieve_data function."""
import json
import logging
import socket
from unittest.mock import Mock, patch
from urllib.error import HTTPError, URLError
@@ -355,6 +356,33 @@ class TestMakeRequestWithRetry:
) # 1 initial + 5 retries = 6 attempts
class TestRetrieveGraphqlDataLogging:
"""Tests for GraphQL request logging."""
def test_logs_graphql_context(self, create_args, caplog):
args = create_args(token_classic="fake_token")
mock_response = Mock()
mock_response.getcode.return_value = 200
mock_response.read.return_value = json.dumps({"data": {}}).encode("utf-8")
mock_response.headers = {"x-ratelimit-remaining": "5000"}
caplog.set_level(logging.INFO, logger="github_backup.github_backup")
with patch(
"github_backup.github_backup.make_request_with_retry",
return_value=mock_response,
):
github_backup.retrieve_graphql_data(
args,
"query { viewer { login } }",
log_context="discussion owner/repo#1",
)
assert (
"Requesting https://api.github.com/graphql (discussion owner/repo#1)"
in caplog.text
)
class TestRetrieveDataThrottling:
"""Tests for throttling behavior in retrieve_data."""