305 lines
11 KiB
Python
305 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
ArXiv 论文章节阅读器。
|
||
|
||
通过解析 arXiv HTML 版本(LaTeXML 转换),支持:
|
||
- 列出论文所有章节结构
|
||
- 按章节名称提取正文内容(大小写不敏感,支持部分匹配)
|
||
|
||
用法:
|
||
python3 arxiv_paper.py 2409.05591 # 列出章节
|
||
python3 arxiv_paper.py 2409.05591 --section introduction # 读取指定章节
|
||
python3 arxiv_paper.py 2409.05591 --section method
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
import sys
|
||
from typing import Any
|
||
|
||
from search_utils import get_client, print_json
|
||
|
||
BeautifulSoup: Any = None
|
||
NavigableString: Any = None
|
||
Tag: Any = None
|
||
|
||
|
||
def ensure_bs4() -> None:
|
||
"""Load BeautifulSoup only when the script needs to parse paper HTML."""
|
||
global BeautifulSoup, NavigableString, Tag
|
||
if BeautifulSoup is not None:
|
||
return
|
||
|
||
try:
|
||
from bs4 import BeautifulSoup as Bs4BeautifulSoup
|
||
from bs4 import NavigableString as Bs4NavigableString
|
||
from bs4 import Tag as Bs4Tag
|
||
except ImportError:
|
||
print_json({
|
||
"success": False,
|
||
"error": "缺少 beautifulsoup4,请运行:python3 -m pip install -r skills/sn-search-academic/requirements.txt",
|
||
})
|
||
sys.exit(1)
|
||
|
||
BeautifulSoup = Bs4BeautifulSoup
|
||
NavigableString = Bs4NavigableString
|
||
Tag = Bs4Tag
|
||
|
||
HTML_BASE = "https://arxiv.org/html"
|
||
ABS_BASE = "https://arxiv.org/abs"
|
||
PDF_BASE = "https://arxiv.org/pdf"
|
||
|
||
# ── HTML 获取 ─────────────────────────────────────────────────────────────────
|
||
|
||
def fetch_html(arxiv_id: str) -> str:
|
||
"""获取 arXiv HTML 版本,不存在时抛出有意义的错误。"""
|
||
url = f"{HTML_BASE}/{arxiv_id}"
|
||
with get_client(timeout=45, headers={"Accept": "text/html,application/xhtml+xml"}) as client:
|
||
resp = client.get(url)
|
||
|
||
if resp.status_code == 404:
|
||
raise ValueError(
|
||
f"论文 {arxiv_id} 暂无 HTML 版本。"
|
||
"可能原因:论文较老(2018 年前)、非 LaTeX 来源或尚未转换。"
|
||
f"请直接阅读 PDF:{PDF_BASE}/{arxiv_id}"
|
||
)
|
||
resp.raise_for_status()
|
||
return resp.text
|
||
|
||
|
||
# ── 文本清洗 ──────────────────────────────────────────────────────────────────
|
||
|
||
def _elem_to_text(elem: Tag) -> str:
|
||
"""
|
||
将 HTML 元素转为可读文本。
|
||
- math 元素:优先用 LaTeX 注解,否则用 alttext,再降级为 [MATH]
|
||
- 图表标题:保留
|
||
- 跳过 .ltx_note(脚注编号)等噪音节点
|
||
"""
|
||
parts: list[str] = []
|
||
|
||
for node in elem.descendants:
|
||
if not isinstance(node, NavigableString):
|
||
continue
|
||
|
||
parent = node.parent
|
||
if parent is None:
|
||
continue
|
||
|
||
tag = parent.name
|
||
|
||
# 跳过脚注编号、引用上标等噪音
|
||
parent_classes = parent.get("class") or []
|
||
if any(c in parent_classes for c in ("ltx_note_mark", "ltx_ref_tag", "ltx_tag")):
|
||
continue
|
||
|
||
# math 元素:取 LaTeX 注解
|
||
if tag == "annotation":
|
||
encoding = parent.get("encoding", "")
|
||
if "tex" in encoding.lower() or "latex" in encoding.lower():
|
||
latex = node.strip()
|
||
if latex:
|
||
parts.append(f"${latex}$")
|
||
continue
|
||
|
||
# 跳过 math 内部的非注解文本(MathML 结构文本很乱)
|
||
in_math = False
|
||
for ancestor in parent.parents:
|
||
if ancestor.name == "math":
|
||
in_math = True
|
||
break
|
||
if in_math:
|
||
continue
|
||
|
||
text = str(node)
|
||
if text.strip():
|
||
parts.append(text)
|
||
|
||
raw = "".join(parts)
|
||
# 合并多余空白,保留段落换行
|
||
raw = re.sub(r"[ \t]+", " ", raw)
|
||
raw = re.sub(r"\n{3,}", "\n\n", raw)
|
||
return raw.strip()
|
||
|
||
|
||
# ── 章节提取 ──────────────────────────────────────────────────────────────────
|
||
|
||
def extract_sections(html: str) -> list[dict[str, Any]]:
|
||
"""
|
||
从 arXiv HTML 提取所有章节(含摘要)。
|
||
|
||
返回列表,每项:
|
||
name - 章节标题(含编号,如 "1 Introduction")
|
||
level - 层级(0=摘要, 1=h2, 2=h3)
|
||
text - 正文文本
|
||
"""
|
||
ensure_bs4()
|
||
soup = BeautifulSoup(html, "html.parser")
|
||
sections: list[dict[str, Any]] = []
|
||
|
||
# ── 摘要 ──
|
||
abstract_elem = soup.find(class_=re.compile(r"\bltx_abstract\b"))
|
||
if abstract_elem:
|
||
# 去掉 "Abstract" 标题行
|
||
for h in abstract_elem.find_all(["h2", "h6"], class_=re.compile(r"ltx_title")):
|
||
h.decompose()
|
||
abstract_text = _elem_to_text(abstract_elem)
|
||
if abstract_text:
|
||
sections.append({"name": "Abstract", "level": 0, "text": abstract_text})
|
||
|
||
# ── 正文各 section ──
|
||
for sec in soup.find_all("section", class_=re.compile(r"\bltx_section\b|\bltx_appendix\b")):
|
||
# 找本层标题(不要子 section 的标题)
|
||
heading: Tag | None = None
|
||
for h_tag in ["h2", "h3", "h4"]:
|
||
candidate = sec.find(h_tag, class_=re.compile(r"\bltx_title\b"), recursive=False)
|
||
if candidate:
|
||
heading = candidate
|
||
break
|
||
|
||
if heading is None:
|
||
# 有些 section 标题在首个 div 里
|
||
for h_tag in ["h2", "h3", "h4"]:
|
||
candidate = sec.find(h_tag, class_=re.compile(r"\bltx_title\b"))
|
||
if candidate:
|
||
heading = candidate
|
||
break
|
||
|
||
if heading is None:
|
||
continue
|
||
|
||
# 清理标题(去尾部 ¶ permalink、多余空白)
|
||
heading_text = heading.get_text(" ", strip=True).rstrip("¶").strip()
|
||
heading_text = re.sub(r"\s+", " ", heading_text)
|
||
level = {"h2": 1, "h3": 2, "h4": 3}.get(heading.name, 1)
|
||
|
||
# 提取本 section 的文本(排除子 section,避免重复)
|
||
sec_copy = BeautifulSoup(str(sec), "html.parser").find("section")
|
||
# 移除子 section
|
||
for child_sec in sec_copy.find_all("section", recursive=False):
|
||
child_sec.decompose()
|
||
# 移除标题自身
|
||
for h in sec_copy.find_all(["h2", "h3", "h4"], class_=re.compile(r"\bltx_title\b"), recursive=False):
|
||
h.decompose()
|
||
|
||
text = _elem_to_text(sec_copy)
|
||
|
||
if not text.strip():
|
||
continue
|
||
|
||
sections.append({"name": heading_text, "level": level, "text": text})
|
||
|
||
return sections
|
||
|
||
|
||
# ── 匹配章节名 ────────────────────────────────────────────────────────────────
|
||
|
||
def _match_section(sections: list[dict], query: str) -> dict | None:
|
||
"""大小写不敏感 + 去数字前缀的模糊匹配。"""
|
||
q = query.lower().strip()
|
||
|
||
def clean(name: str) -> str:
|
||
"""去掉 '1 ' / '1. ' 等数字前缀。"""
|
||
return re.sub(r"^\d+[\.\s]+", "", name).lower().strip()
|
||
|
||
# 精确匹配
|
||
for s in sections:
|
||
if s["name"].lower() == q or clean(s["name"]) == q:
|
||
return s
|
||
|
||
# 前缀 / 包含匹配
|
||
for s in sections:
|
||
if clean(s["name"]).startswith(q) or q in clean(s["name"]):
|
||
return s
|
||
|
||
return None
|
||
|
||
|
||
# ── 对外接口 ──────────────────────────────────────────────────────────────────
|
||
|
||
def cmd_list_sections(arxiv_id: str) -> dict[str, Any]:
|
||
"""列出论文所有章节(不含正文)。"""
|
||
html = fetch_html(arxiv_id)
|
||
sections = extract_sections(html)
|
||
return {
|
||
"success": True,
|
||
"arxiv_id": arxiv_id,
|
||
"abs_url": f"{ABS_BASE}/{arxiv_id}",
|
||
"html_url": f"{HTML_BASE}/{arxiv_id}",
|
||
"pdf_url": f"{PDF_BASE}/{arxiv_id}",
|
||
"section_count": len(sections),
|
||
"sections": [{"name": s["name"], "level": s["level"]} for s in sections],
|
||
"error": None,
|
||
}
|
||
|
||
|
||
def cmd_read_section(arxiv_id: str, section_name: str) -> dict[str, Any]:
|
||
"""读取指定章节的正文内容。"""
|
||
html = fetch_html(arxiv_id)
|
||
sections = extract_sections(html)
|
||
matched = _match_section(sections, section_name)
|
||
|
||
if matched is None:
|
||
available = [s["name"] for s in sections]
|
||
return {
|
||
"success": False,
|
||
"arxiv_id": arxiv_id,
|
||
"section": section_name,
|
||
"content": None,
|
||
"error": f"未找到章节 '{section_name}',可用章节:{available}",
|
||
}
|
||
|
||
return {
|
||
"success": True,
|
||
"arxiv_id": arxiv_id,
|
||
"abs_url": f"{ABS_BASE}/{arxiv_id}",
|
||
"section": matched["name"],
|
||
"level": matched["level"],
|
||
"content": matched["text"],
|
||
"char_count": len(matched["text"]),
|
||
"error": None,
|
||
}
|
||
|
||
|
||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(
|
||
description="ArXiv 论文章节阅读器",
|
||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||
epilog="""
|
||
示例:
|
||
python3 arxiv_paper.py 2409.05591 列出所有章节
|
||
python3 arxiv_paper.py 2409.05591 --section introduction 读取 Introduction
|
||
python3 arxiv_paper.py 2409.05591 --section method 读取 Method/Methods
|
||
python3 arxiv_paper.py 2409.05591 --section conclusion 读取 Conclusion
|
||
""",
|
||
)
|
||
parser.add_argument("arxiv_id", help="arXiv 论文 ID(如 2409.05591 或 2409.05591v2)")
|
||
parser.add_argument(
|
||
"--section", "-s",
|
||
metavar="SECTION_NAME",
|
||
help="要读取的章节名(大小写不敏感,支持部分匹配)。不指定则列出所有章节。",
|
||
)
|
||
args = parser.parse_args()
|
||
|
||
try:
|
||
if args.section:
|
||
result = cmd_read_section(args.arxiv_id.strip(), args.section.strip())
|
||
else:
|
||
result = cmd_list_sections(args.arxiv_id.strip())
|
||
print_json(result)
|
||
except Exception as e:
|
||
print_json({
|
||
"success": False,
|
||
"arxiv_id": args.arxiv_id,
|
||
"error": str(e),
|
||
})
|
||
sys.exit(1)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|