first commit

2026-05-10 13:52:46 +08:00
commit ccc63d1e70
4583 changed files with 584341 additions and 0 deletions
--- a/sn-search-academic/scripts/pubmed_search.py
+++ b/sn-search-academic/scripts/pubmed_search.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""PubMed 生物医学文献搜索。通过 NCBI E-utilities API。"""
+from __future__ import annotations
+
+import sys
+import xml.etree.ElementTree as ET
+
+from search_utils import build_parser, get_client, make_item, make_result, print_json
+
+ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+
+def search(query: str, limit: int, api_key: str | None = None) -> list[dict]:
+    """执行 PubMed 搜索（两步：esearch 获取 PMID，efetch 获取完整记录含摘要）。"""
+    base_params: dict = {"api_key": api_key} if api_key else {}
+
+    # Step 1: esearch 获取 PMID 列表
+    with get_client(timeout=30) as client:
+        resp = client.get(ESEARCH_URL, params={
+            **base_params,
+            "db": "pubmed",
+            "term": query,
+            "retmax": min(limit, 100),
+            "retmode": "json",
+            "sort": "relevance",
+        })
+        resp.raise_for_status()
+        pmids = resp.json().get("esearchresult", {}).get("idlist", [])
+
+    if not pmids:
+        return []
+
+    # Step 2: efetch 获取完整 XML 记录（含摘要）
+    with get_client(timeout=30) as client:
+        resp = client.get(EFETCH_URL, params={
+            **base_params,
+            "db": "pubmed",
+            "id": ",".join(pmids[:limit]),
+            "rettype": "xml",
+            "retmode": "xml",
+        })
+        resp.raise_for_status()
+
+    root = ET.fromstring(resp.text)
+    items = []
+
+    for article in root.findall(".//PubmedArticle"):
+        medline = article.find("MedlineCitation")
+        if medline is None:
+            continue
+
+        pmid_elem = medline.find("PMID")
+        pmid = pmid_elem.text if pmid_elem is not None else ""
+
+        article_data = medline.find("Article")
+        if article_data is None:
+            continue
+
+        # 标题
+        title_elem = article_data.find("ArticleTitle")
+        title = "".join(title_elem.itertext()) if title_elem is not None else ""
+
+        # 摘要（支持结构化摘要，如 BACKGROUND/METHODS/RESULTS/CONCLUSIONS）
+        abstract_parts = []
+        abstract_elem = article_data.find("Abstract")
+        if abstract_elem is not None:
+            for ab in abstract_elem.findall("AbstractText"):
+                label = ab.get("Label")
+                text = "".join(ab.itertext()).strip()
+                if label:
+                    abstract_parts.append(f"{label}: {text}")
+                else:
+                    abstract_parts.append(text)
+        abstract = " ".join(abstract_parts)
+
+        # 作者
+        authors = []
+        author_list = article_data.find("AuthorList")
+        if author_list is not None:
+            for author in author_list.findall("Author"):
+                last = author.findtext("LastName", "")
+                fore = author.findtext("ForeName", "")
+                name = f"{fore} {last}".strip() if fore else last
+                if name:
+                    authors.append(name)
+
+        # 期刊信息
+        journal = article_data.find("Journal")
+        journal_name = ""
+        pub_date = ""
+        volume = ""
+        issue = ""
+        if journal is not None:
+            journal_name = journal.findtext("Title", "") or journal.findtext("ISOAbbreviation", "")
+            ji = journal.find("JournalIssue")
+            if ji is not None:
+                volume = ji.findtext("Volume", "")
+                issue = ji.findtext("Issue", "")
+                pd = ji.find("PubDate")
+                if pd is not None:
+                    year = pd.findtext("Year", "")
+                    month = pd.findtext("Month", "")
+                    day = pd.findtext("Day", "")
+                    pub_date = " ".join(filter(None, [year, month, day]))
+
+        # 页码
+        pages = article_data.findtext(".//MedlinePgn", "")
+
+        # DOI 和 PMC ID（从 ArticleIdList 提取）
+        doi = None
+        pmc_id = None
+        for id_elem in article.findall(".//ArticleId"):
+            id_type = id_elem.get("IdType", "")
+            if id_type == "doi":
+                doi = id_elem.text
+            elif id_type == "pmc" and id_elem.text:
+                # 规范化：去掉 "PMC" 前缀，只保留数字
+                pmc_id = id_elem.text.lstrip("PMCpmc").strip() or id_elem.text
+
+        # MeSH 关键词
+        keywords = [kw.text for kw in medline.findall(".//Keyword") if kw.text]
+
+        # 文献类型
+        pub_types = [pt.text for pt in article_data.findall(".//PublicationType") if pt.text]
+
+        url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
+        pmc_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/" if pmc_id else None
+
+        items.append(make_item(
+            title=title,
+            url=url,
+            snippet=abstract,
+            authors=authors,
+            pmid=pmid,
+            pmc_id=f"PMC{pmc_id}" if pmc_id else None,
+            pmc_url=pmc_url,
+            journal=journal_name if journal_name else None,
+            pub_date=pub_date if pub_date else None,
+            volume=volume if volume else None,
+            issue=issue if issue else None,
+            pages=pages if pages else None,
+            keywords=keywords if keywords else None,
+            pub_types=pub_types if pub_types else None,
+            doi=doi,
+        ))
+
+    return items
+
+
+def main():
+    parser = build_parser("搜索 PubMed 生物医学文献")
+    parser.add_argument("--api-key", help="NCBI API Key（可选，限额从 3 req/s 提升至 10 req/s）")
+    args = parser.parse_args()
+
+    try:
+        items = search(args.query, args.limit, getattr(args, "api_key", None))
+        print_json(make_result(True, args.query, "pubmed", items))
+    except Exception as e:
+        print_json(make_result(False, args.query, "pubmed", [], str(e)))
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()