first commit

2026-05-10 13:52:46 +08:00
commit ccc63d1e70
4583 changed files with 584341 additions and 0 deletions
--- a/sn-search-academic/scripts/arxiv_search.py
+++ b/sn-search-academic/scripts/arxiv_search.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+"""
+ArXiv 论文搜索。通过 ArXiv API（返回 Atom XML）。
+
+支持：
+  - 全文 / 标题 / 摘要 / 作者字段搜索
+  - 分类过滤、排序
+  - 按 ID 列表直接拉取论文元数据
+  - 布尔组合查询（AND / OR / ANDNOT）
+
+示例：
+  python3 arxiv_search.py "attention mechanism"
+  python3 arxiv_search.py "transformer" --category cs.CL --sort date
+  python3 arxiv_search.py "diffusion model" --author "ho jonathan"
+  python3 arxiv_search.py "ViT" --title-only
+  python3 arxiv_search.py --id-list 2409.05591,2301.00001
+"""
+from __future__ import annotations
+
+import sys
+import xml.etree.ElementTree as ET
+
+from search_utils import build_parser, get_client, make_item, make_result, print_json
+
+# Endpoint that search() and fetch_by_ids() send their GET requests to.
+API_URL = "https://export.arxiv.org/api/query"
+
+# Namespace prefixes for the Atom XML response; passed to every
+# find()/findall() call so tags can be written as "atom:..." / "arxiv:...".
+NS = {
+    "atom": "http://www.w3.org/2005/Atom",
+    "arxiv": "http://arxiv.org/schemas/atom",
+}
+
+
+def build_search_query(
+    query: str,
+    category: str | None = None,
+    author: str | None = None,
+    title_only: bool = False,
+) -> str:
+    """
+    Build the ``search_query`` string for the arXiv API.
+
+    Field prefixes understood by arXiv:
+      all:  every field (default here)
+      ti:   title only
+      au:   author (wildcards allowed, e.g. au:smi*)
+      abs:  abstract
+      cat:  subject category
+    Boolean operators must be upper-case: AND / OR / ANDNOT.
+
+    Args:
+        query: Free-text terms; prefixed with ``ti:`` or ``all:``. Leading and
+            trailing whitespace is ignored, and a blank query contributes no
+            clause at all.
+        category: Optional category filter, emitted as ``cat:<category>``.
+        author: Optional comma-separated author names; they are OR-ed
+            together inside one parenthesised group.
+        title_only: Search the title field instead of all fields.
+
+    Returns:
+        The clauses joined with ``" AND "``; an empty string when no clause
+        could be built.
+    """
+    field = "ti" if title_only else "all"
+    terms = (query or "").strip()
+
+    # Collect the optional filters first so we know whether the main clause
+    # will be combined with anything.
+    filters = []
+    if author:
+        names = (name.strip() for name in author.split(","))
+        author_terms = [f"au:{name}" for name in names if name]
+        if author_terms:
+            filters.append(f"({' OR '.join(author_terms)})")
+    if category:
+        filters.append(f"cat:{category}")
+
+    parts = []
+    if terms:
+        main_clause = f"{field}:{terms}"
+        # A multi-term query (possibly containing OR / ANDNOT) must be grouped
+        # before filters are AND-ed on, otherwise the filter binds only to
+        # the query's last term instead of the whole expression.
+        if filters and len(terms.split()) > 1:
+            main_clause = f"({main_clause})"
+        parts.append(main_clause)
+    # A blank query is skipped entirely rather than emitting a dangling
+    # "all:" clause, which would make the combined expression malformed.
+    parts.extend(filters)
+
+    return " AND ".join(parts)
+
+
+def fetch_by_ids(id_list: list[str], limit: int) -> list[dict]:
+    """Fetch paper metadata for explicit arXiv IDs (no text search involved).
+
+    Args:
+        id_list: arXiv identifiers to look up.
+        limit: Maximum number of papers to return; this module never requests
+            more than 100 per call.
+
+    Returns:
+        Parsed result items, or an empty list when there is nothing to fetch
+        (no IDs, or a non-positive ``limit``).
+
+    Raises:
+        Whatever the HTTP client raises from ``raise_for_status()`` on a
+        non-success response, and ``xml.etree.ElementTree.ParseError`` when
+        the response body is not well-formed XML.
+    """
+    # Use one cap for both the IDs sent and max_results so they cannot
+    # disagree; a negative limit is treated as zero.
+    wanted = id_list[: max(0, min(limit, 100))]
+    if not wanted:
+        # Nothing to look up: skip the network round-trip entirely.
+        return []
+
+    params = {
+        "id_list": ",".join(wanted),
+        "max_results": len(wanted),
+    }
+    with get_client(timeout=30, headers={"Accept": "application/xml"}) as client:
+        resp = client.get(API_URL, params=params)
+        resp.raise_for_status()
+    return _parse_entries(ET.fromstring(resp.text), limit)
+
+
+def search(
+    query: str,
+    limit: int,
+    category: str | None = None,
+    sort_by: str = "relevance",
+    author: str | None = None,
+    title_only: bool = False,
+) -> list[dict]:
+    """Run a keyword search against the arXiv API and return parsed items.
+
+    Args:
+        query: Search terms, handed to ``build_search_query``.
+        limit: Maximum number of items to return; at most 100 are requested.
+        category: Optional category filter.
+        sort_by: ``"relevance"``, ``"date"`` (last updated) or
+            ``"submitted"``; any other value falls back to relevance.
+        author: Optional comma-separated author filter.
+        title_only: Restrict the search to titles.
+
+    Returns:
+        The entries of the response feed converted by ``_parse_entries``.
+    """
+    expression = build_search_query(query, category, author, title_only)
+
+    # CLI sort keyword -> value of the API's sortBy parameter.
+    order_fields = dict(
+        relevance="relevance",
+        date="lastUpdatedDate",
+        submitted="submittedDate",
+    )
+    order_field = order_fields[sort_by] if sort_by in order_fields else "relevance"
+
+    request_params = {
+        "search_query": expression,
+        "start": 0,
+        "max_results": min(limit, 100),
+        "sortBy": order_field,
+        "sortOrder": "descending",
+    }
+
+    xml_headers = {"Accept": "application/xml"}
+    with get_client(timeout=30, headers=xml_headers) as http:
+        response = http.get(API_URL, params=request_params)
+        response.raise_for_status()
+
+    feed_root = ET.fromstring(response.text)
+    return _parse_entries(feed_root, limit)
+
+
+def _parse_entries(root: ET.Element, limit: int) -> list[dict]:
+    """Convert the ``atom:entry`` elements of an Atom feed into result items.
+
+    Args:
+        root: Root element of the parsed Atom response.
+        limit: Maximum number of entries to convert (applied as a slice).
+
+    Returns:
+        One ``make_item(...)`` result per entry, in feed order. Optional
+        fields that are absent from the entry are passed as ``None``.
+    """
+    items = []
+
+    for entry in root.findall("atom:entry", NS)[:limit]:
+        # Feed titles and abstracts can be hard-wrapped with a newline plus
+        # indentation. Splitting on any whitespace and re-joining collapses
+        # that to single spaces instead of leaving runs of blanks behind.
+        title = " ".join(_text(entry, "atom:title").split())
+        summary = " ".join(_text(entry, "atom:summary").split())
+        published = _text(entry, "atom:published")
+        updated = _text(entry, "atom:updated")
+        raw_id = _text(entry, "atom:id")
+
+        # Pick the landing-page link (HTML / "/abs/") and the PDF link; when
+        # several links match, the last one in document order wins.
+        url = ""
+        pdf_url = ""
+        for link in entry.findall("atom:link", NS):
+            href = link.get("href", "")
+            if link.get("title") == "pdf":
+                pdf_url = href
+            elif link.get("type") == "text/html" or "/abs/" in href:
+                url = href
+        if not url:
+            # No usable link element: fall back to the entry's id value.
+            url = raw_id
+
+        # Derive the arXiv identifier from the entry id: everything after
+        # "/abs/" when present, otherwise the last path segment of a URL.
+        arxiv_id = ""
+        if "/abs/" in raw_id:
+            arxiv_id = raw_id.split("/abs/")[-1]
+        elif raw_id.startswith("http"):
+            arxiv_id = raw_id.split("/")[-1]
+
+        authors = [_text(a, "atom:name") for a in entry.findall("atom:author", NS)]
+        categories = [c.get("term", "") for c in entry.findall("atom:category", NS)]
+
+        comment = _text(entry, "arxiv:comment")
+        journal_ref = _text(entry, "arxiv:journal_ref")
+        doi = _text(entry, "arxiv:doi")
+        primary_category = entry.find("arxiv:primary_category", NS)
+        primary_cat = primary_category.get("term", "") if primary_category is not None else ""
+
+        # Link to the HTML rendering; built from the ID without checking
+        # that the page actually exists for this paper.
+        html_url = f"https://arxiv.org/html/{arxiv_id}" if arxiv_id else None
+
+        items.append(make_item(
+            title=title,
+            url=url,
+            snippet=summary,
+            arxiv_id=arxiv_id or None,
+            authors=authors,
+            published=published,
+            updated=updated,
+            pdf_url=pdf_url,
+            html_url=html_url,
+            categories=categories,
+            primary_category=primary_cat or None,
+            comment=comment or None,
+            journal_ref=journal_ref or None,
+            doi=doi or None,
+        ))
+
+    return items
+
+
+def _text(elem: ET.Element, tag: str) -> str:
+    """Return the stripped text of the first child matching ``tag``.
+
+    Yields an empty string when the child is missing or has no text.
+    """
+    node = elem.find(tag, NS)
+    if node is None:
+        return ""
+    content = node.text
+    if not content:
+        return ""
+    return content.strip()
+
+
+def main():
+    """Command-line entry point: parse arguments, query arXiv, print JSON.
+
+    With ``--id-list`` the given arXiv IDs are fetched directly; otherwise the
+    positional query is used for a keyword search. Any exception raised while
+    fetching or parsing is reported as a JSON error result and the process
+    exits with status 1. The error result carries the same query label as the
+    success result would have, including in ``--id-list`` mode.
+    """
+    parser = build_parser("搜索 ArXiv 学术论文")
+    parser.add_argument("--category", "-c", help="ArXiv 分类过滤（如 cs.AI, cs.CL, math.CO）")
+    parser.add_argument(
+        "--sort", default="relevance",
+        choices=["relevance", "date", "submitted"],
+        help="排序方式（默认 relevance）",
+    )
+    parser.add_argument(
+        "--author", "-a",
+        help="按作者过滤（如 'hinton'，多个作者用逗号分隔）",
+    )
+    parser.add_argument(
+        "--title-only", action="store_true",
+        help="仅在标题中搜索（默认搜索全字段）",
+    )
+    parser.add_argument(
+        "--id-list",
+        help="直接按 arXiv ID 获取元数据，逗号分隔（如 2409.05591,2301.00001）。指定此项时 query 参数可留空。",
+    )
+    parser.prog = "arxiv_search.py"
+
+    # HACK: reaches into argparse internals to make the "query" positional
+    # (expected to be defined by build_parser) optional, so that --id-list
+    # can be used without supplying a query.
+    for action in parser._positionals._group_actions:
+        if action.dest == "query":
+            action.nargs = "?"
+            action.default = ""
+            break
+
+    args = parser.parse_args()
+
+    # Work out the query label before any network call so that success and
+    # failure output report the same value.
+    if args.id_list:
+        query_str = f"id_list:{args.id_list}"
+    else:
+        if not args.query:
+            parser.error("请提供搜索关键词，或使用 --id-list 按 ID 查询")
+        query_str = args.query
+
+    try:
+        if args.id_list:
+            id_list = [i.strip() for i in args.id_list.split(",") if i.strip()]
+            items = fetch_by_ids(id_list, args.limit)
+        else:
+            items = search(
+                args.query,
+                args.limit,
+                category=args.category,
+                sort_by=args.sort,
+                author=args.author,
+                title_only=args.title_only,
+            )
+
+        print_json(make_result(True, query_str, "arxiv", items))
+    except Exception as e:
+        # Top-level boundary: turn any failure into a machine-readable result.
+        print_json(make_result(False, query_str, "arxiv", [], str(e)))
+        sys.exit(1)
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()