#!/usr/bin/env python3 """ ArXiv 论文搜索。通过 ArXiv API(返回 Atom XML)。 支持: - 全文 / 标题 / 摘要 / 作者字段搜索 - 分类过滤、排序 - 按 ID 列表直接拉取论文元数据 - 布尔组合查询(AND / OR / ANDNOT) 示例: python3 arxiv_search.py "attention mechanism" python3 arxiv_search.py "transformer" --category cs.CL --sort date python3 arxiv_search.py "diffusion model" --author "ho jonathan" python3 arxiv_search.py "ViT" --title-only python3 arxiv_search.py --id-list 2409.05591,2301.00001 """ from __future__ import annotations import sys import xml.etree.ElementTree as ET from search_utils import build_parser, get_client, make_item, make_result, print_json API_URL = "https://export.arxiv.org/api/query" # Atom XML 命名空间 NS = { "atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom", } def build_search_query( query: str, category: str | None = None, author: str | None = None, title_only: bool = False, ) -> str: """ 构建 arXiv 查询字符串。 字段前缀: all: 全字段(默认) ti: 仅标题 au: 作者(支持通配 au:smi*) abs: 摘要 cat: 分类 布尔运算符必须大写:AND / OR / ANDNOT """ # 主查询字段 field = "ti" if title_only else "all" parts = [f"{field}:{query}"] if author: # 多个作者用 OR 连接,支持 "lastname firstname" 格式 author_terms = [f"au:{a.strip()}" for a in author.split(",") if a.strip()] if author_terms: parts.append(f"({' OR '.join(author_terms)})") if category: parts.append(f"cat:{category}") return " AND ".join(parts) def fetch_by_ids(id_list: list[str], limit: int) -> list[dict]: """通过 ID 列表直接获取论文元数据(不做文本搜索)。""" params = { "id_list": ",".join(id_list[:limit]), "max_results": min(len(id_list), limit, 100), } with get_client(timeout=30, headers={"Accept": "application/xml"}) as client: resp = client.get(API_URL, params=params) resp.raise_for_status() return _parse_entries(ET.fromstring(resp.text), limit) def search( query: str, limit: int, category: str | None = None, sort_by: str = "relevance", author: str | None = None, title_only: bool = False, ) -> list[dict]: """执行 ArXiv 关键词搜索。""" search_query = build_search_query(query, category, author, title_only) sort_map = { "relevance": "relevance", "date": "lastUpdatedDate", "submitted": "submittedDate", } params = { "search_query": search_query, "start": 0, "max_results": min(limit, 100), "sortBy": sort_map.get(sort_by, "relevance"), "sortOrder": "descending", } with get_client(timeout=30, headers={"Accept": "application/xml"}) as client: resp = client.get(API_URL, params=params) resp.raise_for_status() return _parse_entries(ET.fromstring(resp.text), limit) def _parse_entries(root: ET.Element, limit: int) -> list[dict]: """从 Atom XML 解析论文条目。""" items = [] for entry in root.findall("atom:entry", NS)[:limit]: title = _text(entry, "atom:title").replace("\n", " ").strip() summary = _text(entry, "atom:summary").replace("\n", " ").strip() published = _text(entry, "atom:published") updated = _text(entry, "atom:updated") # 获取论文链接(优先 abs 页面) url = "" pdf_url = "" for link in entry.findall("atom:link", NS): href = link.get("href", "") if link.get("title") == "pdf": pdf_url = href elif link.get("type") == "text/html" or "/abs/" in href: url = href if not url: url = _text(entry, "atom:id") # 从 abs URL 或 id 提取 arxiv_id arxiv_id = "" raw_id = _text(entry, "atom:id") if "/abs/" in raw_id: arxiv_id = raw_id.split("/abs/")[-1] elif raw_id.startswith("http"): arxiv_id = raw_id.split("/")[-1] # 获取作者 authors = [_text(a, "atom:name") for a in entry.findall("atom:author", NS)] # 获取分类 categories = [c.get("term", "") for c in entry.findall("atom:category", NS)] comment = _text(entry, "arxiv:comment") journal_ref = _text(entry, "arxiv:journal_ref") doi = _text(entry, "arxiv:doi") primary_category = entry.find("arxiv:primary_category", NS) primary_cat = primary_category.get("term", "") if primary_category is not None else "" # HTML 版本链接(较新论文有) html_url = f"https://arxiv.org/html/{arxiv_id}" if arxiv_id else None items.append(make_item( title=title, url=url, snippet=summary, arxiv_id=arxiv_id if arxiv_id else None, authors=authors, published=published, updated=updated, pdf_url=pdf_url, html_url=html_url, categories=categories, primary_category=primary_cat if primary_cat else None, comment=comment if comment else None, journal_ref=journal_ref if journal_ref else None, doi=doi if doi else None, )) return items def _text(elem: ET.Element, tag: str) -> str: """安全获取子元素文本。""" child = elem.find(tag, NS) return child.text.strip() if child is not None and child.text else "" def main(): parser = build_parser("搜索 ArXiv 学术论文") parser.add_argument("--category", "-c", help="ArXiv 分类过滤(如 cs.AI, cs.CL, math.CO)") parser.add_argument( "--sort", default="relevance", choices=["relevance", "date", "submitted"], help="排序方式(默认 relevance)", ) parser.add_argument( "--author", "-a", help="按作者过滤(如 'hinton',多个作者用逗号分隔)", ) parser.add_argument( "--title-only", action="store_true", help="仅在标题中搜索(默认搜索全字段)", ) parser.add_argument( "--id-list", help="直接按 arXiv ID 获取元数据,逗号分隔(如 2409.05591,2301.00001)。指定此项时 query 参数可留空。", ) # 当使用 --id-list 时 query 可选 parser.prog = "arxiv_search.py" # 为了支持 --id-list 时 query 可省略,临时让 query 可选 for action in parser._positionals._group_actions: if action.dest == "query": action.nargs = "?" action.default = "" break args = parser.parse_args() try: if args.id_list: id_list = [i.strip() for i in args.id_list.split(",") if i.strip()] items = fetch_by_ids(id_list, args.limit) query_str = f"id_list:{args.id_list}" else: if not args.query: parser.error("请提供搜索关键词,或使用 --id-list 按 ID 查询") items = search( args.query, args.limit, category=args.category, sort_by=args.sort, author=args.author, title_only=args.title_only, ) query_str = args.query print_json(make_result(True, query_str, "arxiv", items)) except Exception as e: print_json(make_result(False, getattr(args, "query", "") or "", "arxiv", [], str(e))) sys.exit(1) if __name__ == "__main__": main()