agent-skills/sn-search-academic/scripts/pmc_paper.py

#!/usr/bin/env python3
"""
PMC 论文全文章节阅读器。

通过 NCBI E-utilities 获取 PubMed Central 全文 XML（JATS 格式），支持：
  - 列出论文所有章节结构（含子章节层级）
  - 按章节名称提取正文内容（大小写不敏感，支持部分匹配）
  - 通过 PMID 自动解析到 PMC ID

用法：
  python3 pmc_paper.py PMC11119143                          # 列出章节
  python3 pmc_paper.py 11119143                             # 同上（自动补 PMC 前缀）
  python3 pmc_paper.py PMC11119143 --section introduction   # 读取指定章节
  python3 pmc_paper.py --pmid 38786024 --section method     # 从 PMID 出发
"""
from __future__ import annotations

import argparse
import re
import sys
import xml.etree.ElementTree as ET
from typing import Any

from search_utils import get_client, print_json

EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
ELINK_URL  = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"

# ── ID 处理 ───────────────────────────────────────────────────────────────────

def normalize_pmc_id(raw: str) -> str:
    """规范化 PMC ID：去掉 'PMC' 前缀，只保留数字部分。"""
    return re.sub(r"^[Pp][Mm][Cc]", "", raw.strip())


def pmid_to_pmc(pmid: str, api_key: str | None = None) -> str | None:
    """通过 elink 将 PMID 转换为 PMC ID（数字形式）。"""
    params: dict[str, Any] = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "retmode": "json",
    }
    if api_key:
        params["api_key"] = api_key

    with get_client(timeout=20) as client:
        resp = client.get(ELINK_URL, params=params)
        resp.raise_for_status()

    data = resp.json()
    for linkset in data.get("linksets", []):
        for db in linkset.get("linksetdbs", []):
            if db.get("dbto") == "pmc" and db.get("linkname") == "pubmed_pmc":
                links = db.get("links", [])
                if links:
                    return str(links[0])
    return None


# ── XML 拉取 ──────────────────────────────────────────────────────────────────

def fetch_pmc_xml(pmc_num: str, api_key: str | None = None) -> ET.Element:
    """获取 PMC 全文 XML，返回根元素。"""
    params: dict[str, Any] = {
        "db": "pmc",
        "id": pmc_num,
        "rettype": "xml",
        "retmode": "xml",
    }
    if api_key:
        params["api_key"] = api_key

    with get_client(timeout=45) as client:
        resp = client.get(EFETCH_URL, params=params)
        resp.raise_for_status()

    root = ET.fromstring(resp.text)

    # 检查是否找到论文
    article = root.find(".//article")
    if article is None:
        raise ValueError(
            f"PMC{pmc_num} 未找到全文。"
            "可能原因：该论文不在 PMC 开放获取库中，或 ID 有误。"
        )
    return root


# ── JATS XML 文本提取 ─────────────────────────────────────────────────────────

# 跳过这些标签的全部内容（噪音节点）
_SKIP_TAGS = {"ref", "ref-list", "fn", "fn-group", "permissions", "author-notes",
              "glossary", "ack"}  # ack=Acknowledgements，可按需保留

# 转为占位符的标签
_FORMULA_TAGS = {"disp-formula", "inline-formula", "mml:math", "tex-math"}


def _elem_to_text(elem: ET.Element, depth: int = 0) -> str:
    """
    将 JATS XML 元素递归转为可读文本。

    处理规则：
    - <p>: 段落，末尾加换行
    - <title>: 跳过（章节标题在上层已处理）
    - <sec>: 子章节，递归（用缩进区分层级）
    - <list>/<list-item>: 转为 bullet 列表
    - <disp-formula>/<inline-formula>: 替换为 [FORMULA]
    - <fig>: 跳过图像内容，保留 caption
    - <table-wrap>: 保留 label+caption
    - <xref>/<ext-link>: 直接取文本内容
    - <bold>/<italic>/<underline>: 取文本内容
    """
    tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag  # 去 namespace

    if tag in _SKIP_TAGS:
        return ""

    if tag in _FORMULA_TAGS:
        return " [FORMULA] "

    if tag == "title":
        return ""  # 由调用方处理

    if tag == "p":
        text = _collect_text(elem)
        return text.strip() + "\n\n" if text.strip() else ""

    if tag in ("bold", "italic", "underline", "named-content", "styled-content",
               "ext-link", "uri", "xref", "sup", "sub", "monospace"):
        return _collect_text(elem)

    if tag == "list":
        parts = []
        for li in elem.findall("list-item"):
            item_text = "".join(_elem_to_text(c) for c in li).strip()
            if item_text:
                parts.append(f"• {item_text}")
        return "\n".join(parts) + "\n\n" if parts else ""

    if tag == "disp-quote":
        text = "".join(_elem_to_text(c) for c in elem).strip()
        return f"> {text}\n\n" if text else ""

    if tag == "fig":
        # 只保留 caption
        caption = elem.find(".//caption")
        if caption is not None:
            cap_text = "".join(_elem_to_text(c) for c in caption).strip()
            label = elem.findtext("label", "Figure")
            return f"[{label}: {cap_text}]\n\n" if cap_text else ""
        return ""

    if tag == "table-wrap":
        label = elem.findtext("label", "Table")
        caption = elem.find(".//caption")
        cap_text = ""
        if caption is not None:
            cap_text = "".join(_elem_to_text(c) for c in caption).strip()
        return f"[{label}: {cap_text}]\n\n" if cap_text else f"[{label}]\n\n"

    if tag == "sec":
        # 子章节：递归处理，标题加缩进
        sub_title_elem = elem.find("title")
        sub_title = ""
        if sub_title_elem is not None:
            sub_title = _collect_text(sub_title_elem).strip()

        parts = []
        if sub_title:
            indent = "  " * depth
            parts.append(f"\n{indent}### {sub_title}\n\n")
        for child in elem:
            child_tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
            if child_tag == "title":
                continue
            parts.append(_elem_to_text(child, depth + 1))
        return "".join(parts)

    # 默认：递归子节点
    return "".join(_elem_to_text(c, depth) for c in elem)


def _collect_text(elem: ET.Element) -> str:
    """收集元素的所有文本（含子节点，跳过公式）。"""
    parts = []
    if elem.text:
        parts.append(elem.text)
    for child in elem:
        child_tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
        if child_tag in _FORMULA_TAGS:
            parts.append("[FORMULA]")
        elif child_tag in _SKIP_TAGS:
            pass
        else:
            parts.append(_collect_text(child))
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)


# ── 章节提取 ──────────────────────────────────────────────────────────────────

def _extract_sections_from(container: ET.Element, level: int = 1) -> list[dict[str, Any]]:
    """递归提取 sec 节点，返回扁平章节列表。"""
    sections: list[dict[str, Any]] = []
    for sec in container.findall("sec"):
        title_elem = sec.find("title")
        title = _collect_text(title_elem).strip() if title_elem is not None else f"Section {len(sections)+1}"

        # 正文：本 sec 的直接子节点（排除 sec 和 title）
        text_parts = []
        for child in sec:
            child_tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
            if child_tag in ("title", "sec"):
                continue
            text_parts.append(_elem_to_text(child))

        text = "".join(text_parts).strip()

        # 子章节递归
        subsections = _extract_sections_from(sec, level + 1)

        sections.append({
            "name": title,
            "level": level,
            "text": text,
            "subsections": subsections,
        })
    return sections


def extract_all_sections(root: ET.Element) -> list[dict[str, Any]]:
    """
    从 PMC JATS XML 提取所有章节。
    顺序：Abstract → Body sections（含子章节）
    """
    sections: list[dict[str, Any]] = []

    article = root.find(".//article")
    if article is None:
        return sections

    # ── 摘要 ──
    abstract = article.find(".//abstract")
    if abstract is not None:
        # 结构化摘要（含 sec）
        if abstract.findall("sec"):
            abs_parts = []
            for sec in abstract.findall("sec"):
                sec_title = sec.findtext("title", "")
                sec_text_parts = []
                for child in sec:
                    if child.tag != "title":
                        sec_text_parts.append(_elem_to_text(child))
                part = "".join(sec_text_parts).strip()
                if sec_title:
                    abs_parts.append(f"{sec_title}: {part}")
                else:
                    abs_parts.append(part)
            abs_text = "\n\n".join(abs_parts)
        else:
            abs_text = "".join(_elem_to_text(c) for c in abstract).strip()

        if abs_text:
            sections.append({"name": "Abstract", "level": 0, "text": abs_text, "subsections": []})

    # ── Body ──
    body = article.find(".//body")
    if body is not None:
        sections.extend(_extract_sections_from(body, level=1))

    return sections


# ── 章节匹配 ──────────────────────────────────────────────────────────────────

def _flatten_sections(sections: list[dict], result: list | None = None) -> list[dict]:
    """将嵌套章节扁平化，便于搜索。"""
    if result is None:
        result = []
    for s in sections:
        result.append(s)
        _flatten_sections(s.get("subsections", []), result)
    return result


def match_section(sections: list[dict], query: str) -> dict | None:
    """大小写不敏感 + 去数字前缀的模糊匹配（搜索所有层级）。"""
    q = query.lower().strip()
    flat = _flatten_sections(sections)

    def clean(name: str) -> str:
        return re.sub(r"^\d+[\.\s]+", "", name).lower().strip()

    # 精确匹配
    for s in flat:
        if s["name"].lower() == q or clean(s["name"]) == q:
            return s

    # 包含/前缀匹配
    for s in flat:
        c = clean(s["name"])
        if c.startswith(q) or q in c:
            return s

    return None


# ── 对外接口 ──────────────────────────────────────────────────────────────────

def _section_outline(sections: list[dict], depth: int = 0) -> list[dict]:
    """生成章节目录（只含 name 和 level，递归）。"""
    outline = []
    for s in sections:
        outline.append({"name": s["name"], "level": s["level"]})
        if s.get("subsections"):
            outline.extend(_section_outline(s["subsections"], depth + 1))
    return outline


def cmd_list_sections(pmc_num: str, api_key: str | None = None) -> dict[str, Any]:
    """列出 PMC 论文所有章节目录。"""
    root = fetch_pmc_xml(pmc_num, api_key)
    sections = extract_all_sections(root)

    # 从 XML 拿标题
    title = root.findtext(".//article-title", "")
    pmid = root.findtext(".//article-id[@pub-id-type='pmid']", "")

    return {
        "success": True,
        "pmc_id": f"PMC{pmc_num}",
        "pmid": pmid or None,
        "title": title,
        "pmc_url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_num}/",
        "section_count": len(_flatten_sections(sections)),
        "sections": _section_outline(sections),
        "error": None,
    }


def cmd_read_section(pmc_num: str, section_name: str, api_key: str | None = None) -> dict[str, Any]:
    """读取指定章节的正文内容（含子章节文本）。"""
    root = fetch_pmc_xml(pmc_num, api_key)
    sections = extract_all_sections(root)
    matched = match_section(sections, section_name)

    if matched is None:
        flat = _flatten_sections(sections)
        available = [s["name"] for s in flat]
        return {
            "success": False,
            "pmc_id": f"PMC{pmc_num}",
            "section": section_name,
            "content": None,
            "error": f"未找到章节 '{section_name}'，可用章节：{available}",
        }

    # 合并本节文本 + 子章节文本
    def collect_text(s: dict) -> str:
        parts = [s["text"]]
        for sub in s.get("subsections", []):
            sub_text = collect_text(sub)
            if sub_text.strip():
                parts.append(f"\n### {sub['name']}\n\n{sub_text}")
        return "\n\n".join(p for p in parts if p.strip())

    content = collect_text(matched)

    return {
        "success": True,
        "pmc_id": f"PMC{pmc_num}",
        "pmc_url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_num}/",
        "section": matched["name"],
        "level": matched["level"],
        "content": content,
        "char_count": len(content),
        "error": None,
    }


# ── CLI ───────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="PMC 论文全文章节阅读器",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例：
  python3 pmc_paper.py PMC11119143                           列出所有章节
  python3 pmc_paper.py 11119143                              同上（自动补前缀）
  python3 pmc_paper.py PMC11119143 --section introduction    读取 Introduction
  python3 pmc_paper.py PMC11119143 --section method          读取 Methods
  python3 pmc_paper.py --pmid 38786024                       从 PMID 列章节
  python3 pmc_paper.py --pmid 38786024 --section conclusion  从 PMID 读章节
""",
    )
    parser.add_argument(
        "pmc_id", nargs="?",
        help="PMC ID（如 PMC11119143 或 11119143）。与 --pmid 二选一。",
    )
    parser.add_argument(
        "--pmid",
        help="PubMed ID，自动转换为 PMC ID（需要论文在 PMC 开放获取库中）",
    )
    parser.add_argument(
        "--section", "-s",
        metavar="SECTION_NAME",
        help="要读取的章节名（大小写不敏感，支持部分匹配）。不指定则列出所有章节。",
    )
    parser.add_argument(
        "--api-key",
        help="NCBI API Key（可选，提升限额从 3 req/s 到 10 req/s）",
    )
    args = parser.parse_args()

    api_key = getattr(args, "api_key", None)

    try:
        # 解析 PMC 数字 ID
        if args.pmid:
            pmc_num = pmid_to_pmc(args.pmid, api_key)
            if not pmc_num:
                print_json({
                    "success": False,
                    "pmid": args.pmid,
                    "error": f"PMID {args.pmid} 在 PMC 中无对应全文。该论文可能未开放获取。",
                })
                sys.exit(1)
        elif args.pmc_id:
            pmc_num = normalize_pmc_id(args.pmc_id)
        else:
            parser.error("请提供 PMC ID 或使用 --pmid 指定 PubMed ID")

        if args.section:
            result = cmd_read_section(pmc_num, args.section.strip(), api_key)
        else:
            result = cmd_list_sections(pmc_num, api_key)

        print_json(result)

    except Exception as e:
        print_json({
            "success": False,
            "pmc_id": f"PMC{pmc_num}" if "pmc_num" in dir() else None,
            "error": str(e),
        })
        sys.exit(1)


if __name__ == "__main__":
    main()