first commit

This commit is contained in:
Hermes Agent
2026-05-10 13:52:46 +08:00
commit ccc63d1e70
4583 changed files with 584341 additions and 0 deletions

View File

@@ -0,0 +1,304 @@
#!/usr/bin/env python3
"""
ArXiv 论文章节阅读器。
通过解析 arXiv HTML 版本(LaTeXML 转换),支持:
- 列出论文所有章节结构
- 按章节名称提取正文内容(大小写不敏感,支持部分匹配)
用法:
python3 arxiv_paper.py 2409.05591 # 列出章节
python3 arxiv_paper.py 2409.05591 --section introduction # 读取指定章节
python3 arxiv_paper.py 2409.05591 --section method
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from typing import Any
from search_utils import get_client, print_json
# Placeholders for the bs4 names used by this module. They stay None until
# ensure_bs4() rebinds them, so bs4 is only imported when HTML is parsed.
BeautifulSoup: Any = None
NavigableString: Any = None
Tag: Any = None
def ensure_bs4() -> None:
    """Bind the module-level bs4 names the first time they are needed.

    Does nothing when ``BeautifulSoup`` is already bound. If ``bs4`` cannot
    be imported, a JSON failure payload is emitted through ``print_json``
    and the process exits with status 1.
    """
    global BeautifulSoup, NavigableString, Tag
    if BeautifulSoup is None:
        try:
            from bs4 import (
                BeautifulSoup as soup_cls,
                NavigableString as string_cls,
                Tag as tag_cls,
            )
        except ImportError:
            failure = {
                "success": False,
                "error": "缺少 beautifulsoup4请运行python3 -m pip install -r skills/sn-search-academic/requirements.txt",
            }
            print_json(failure)
            sys.exit(1)
        BeautifulSoup, NavigableString, Tag = soup_cls, string_cls, tag_cls
# Base URLs for the HTML, abstract and PDF views of a paper; the arXiv ID is
# appended as the final path segment when URLs are built.
HTML_BASE = "https://arxiv.org/html"
ABS_BASE = "https://arxiv.org/abs"
PDF_BASE = "https://arxiv.org/pdf"
# ── HTML 获取 ─────────────────────────────────────────────────────────────────
def fetch_html(arxiv_id: str) -> str:
    """Download the HTML rendering of a paper from ``HTML_BASE``.

    Args:
        arxiv_id: arXiv identifier, appended verbatim to the base URL.

    Returns:
        The response body as text.

    Raises:
        ValueError: the server answered 404; the message points the reader
            at the PDF URL instead.
        Exception: whatever ``raise_for_status()`` raises for any other
            unsuccessful status.
    """
    page_url = f"{HTML_BASE}/{arxiv_id}"
    accept_html = {"Accept": "text/html,application/xhtml+xml"}
    with get_client(timeout=45, headers=accept_html) as http:
        response = http.get(page_url)
        if response.status_code != 404:
            response.raise_for_status()
            return response.text
        raise ValueError(
            f"论文 {arxiv_id} 暂无 HTML 版本。"
            "可能原因论文较老2018 年前)、非 LaTeX 来源或尚未转换。"
            f"请直接阅读 PDF{PDF_BASE}/{arxiv_id}"
        )
# ── 文本清洗 ──────────────────────────────────────────────────────────────────
def _elem_to_text(elem: Tag) -> str:
    """Flatten an HTML element into readable plain text.

    Every string node below ``elem`` is visited in document order and
    handled as follows:

    - Only plain text nodes are considered. Comments, doctypes and the
      contents of ``<script>``/``<style>`` are ``NavigableString``
      subclasses in bs4 and are skipped.
    - Strings whose direct parent carries one of the classes
      ``ltx_note_mark``, ``ltx_ref_tag`` or ``ltx_tag`` (footnote marks,
      reference/number tags) are dropped.
    - Inside ``<math>`` only the ``<annotation>`` whose ``encoding``
      mentions "tex" is kept, wrapped as ``$...$``. All other MathML text is
      dropped; there is no ``alttext`` fallback, so a formula without such
      an annotation contributes nothing.
    - Whitespace-only strings between elements are kept, so adjacent inline
      elements do not fuse into one word and block boundaries survive as
      line breaks.

    Args:
        elem: Parsed element whose descendants are flattened.

    Returns:
        The text with runs of spaces/tabs collapsed to one space, spaces
        around line breaks removed, at most one blank line in a row, and
        leading/trailing whitespace stripped.
    """
    noise_classes = ("ltx_note_mark", "ltx_ref_tag", "ltx_tag")
    parts: list[str] = []
    for node in elem.descendants:
        # Exact type match on purpose: isinstance() would also accept
        # Comment/Doctype/Script/Stylesheet nodes and leak their content.
        if type(node) is not NavigableString:
            continue
        parent = node.parent
        if parent is None:
            continue

        # Skip footnote marks, reference tags and section-number tags.
        parent_classes = parent.get("class") or []
        if any(cls in parent_classes for cls in noise_classes):
            continue

        # Math: keep only the TeX annotation ("latex" also contains "tex").
        if parent.name == "annotation":
            encoding = parent.get("encoding", "")
            if "tex" in encoding.lower():
                latex = node.strip()
                if latex:
                    parts.append(f"${latex}$")
            continue

        # Any other text living inside <math> is MathML structure noise.
        if parent.name == "math" or any(
            ancestor.name == "math" for ancestor in parent.parents
        ):
            continue

        # Whitespace-only nodes are kept: they are the only separator
        # between sibling inline elements and between block elements.
        parts.append(str(node))

    raw = "".join(parts)
    raw = re.sub(r"[ \t]+", " ", raw)
    # Drop the single spaces left hugging line breaks so that blank-line
    # runs are contiguous and can be capped below.
    raw = re.sub(r" ?\n ?", "\n", raw)
    raw = re.sub(r"\n{3,}", "\n\n", raw)
    return raw.strip()
# ── 章节提取 ──────────────────────────────────────────────────────────────────
def extract_sections(html: str) -> list[dict[str, Any]]:
    """Split an arXiv (LaTeXML) HTML page into its abstract and sections.

    Sections, subsections, subsubsections and appendices are each returned
    as their own entry, in document order. The text of an entry excludes
    the nested sections that are listed separately, so no content is
    duplicated. Nested ``<section>`` elements that are not listed
    separately (for example paragraph-level ones) stay part of their
    parent's text.

    Args:
        html: Full HTML source of the paper page.

    Returns:
        A list of dicts with the keys:
          name  - heading text including its number, e.g. "1 Introduction"
                  ("Abstract" for the abstract)
          level - 0 for the abstract, 1 for h2, 2 for h3, 3 for h4
          text  - body text of the entry
        Entries without a heading or without any own text are omitted.
    """
    ensure_bs4()
    title_cls = re.compile(r"\bltx_title\b")
    section_cls = re.compile(
        r"\bltx_(?:section|subsection|subsubsection|appendix)\b"
    )
    heading_tags = ["h2", "h3", "h4"]
    heading_levels = {"h2": 1, "h3": 2, "h4": 3}

    soup = BeautifulSoup(html, "html.parser")
    sections: list[dict[str, Any]] = []

    # Abstract
    abstract_elem = soup.find(class_=re.compile(r"\bltx_abstract\b"))
    if abstract_elem:
        # Drop the "Abstract" heading so it does not appear in the text.
        for h in abstract_elem.find_all(["h2", "h6"], class_=re.compile(r"ltx_title")):
            h.decompose()
        abstract_text = _elem_to_text(abstract_elem)
        if abstract_text:
            sections.append({"name": "Abstract", "level": 0, "text": abstract_text})

    # Body sections at every listed depth
    for sec in soup.find_all("section", class_=section_cls):
        # Look at direct children first so a nested subsection's title is
        # never mistaken for this section's own; fall back to any depth
        # because some sections wrap their title in a div.
        heading = None
        for recursive in (False, True):
            for h_tag in heading_tags:
                heading = sec.find(h_tag, class_=title_cls, recursive=recursive)
                if heading is not None:
                    break
            if heading is not None:
                break
        if heading is None:
            continue

        # Clean the title: strip the trailing pilcrow permalink
        # (U+00B6, written as an escape so it survives re-encoding) and
        # collapse whitespace.
        heading_text = heading.get_text(" ", strip=True).rstrip("\u00b6").strip()
        heading_text = re.sub(r"\s+", " ", heading_text)
        level = heading_levels.get(heading.name, 1)

        # Work on a re-parsed copy so the main tree is left untouched.
        sec_copy = BeautifulSoup(str(sec), "html.parser").find("section")
        # Remove only the child sections that get their own entry; other
        # nested sections would otherwise lose their content entirely.
        for child_sec in sec_copy.find_all("section", class_=section_cls, recursive=False):
            child_sec.decompose()
        # Remove this section's own title.
        for h in sec_copy.find_all(heading_tags, class_=title_cls, recursive=False):
            h.decompose()

        text = _elem_to_text(sec_copy)
        if not text.strip():
            continue
        sections.append({"name": heading_text, "level": level, "text": text})
    return sections
# ── 匹配章节名 ────────────────────────────────────────────────────────────────
def _match_section(sections: list[dict], query: str) -> dict | None:
"""大小写不敏感 + 去数字前缀的模糊匹配。"""
q = query.lower().strip()
def clean(name: str) -> str:
"""去掉 '1 ' / '1. ' 等数字前缀。"""
return re.sub(r"^\d+[\.\s]+", "", name).lower().strip()
# 精确匹配
for s in sections:
if s["name"].lower() == q or clean(s["name"]) == q:
return s
# 前缀 / 包含匹配
for s in sections:
if clean(s["name"]).startswith(q) or q in clean(s["name"]):
return s
return None
# ── 对外接口 ──────────────────────────────────────────────────────────────────
def cmd_list_sections(arxiv_id: str) -> dict[str, Any]:
    """Build the outline payload for a paper: section names and levels only.

    Args:
        arxiv_id: arXiv identifier of the paper.

    Returns:
        A result dict with ``success`` set to True, the abs/html/pdf URLs,
        the number of sections and a ``sections`` list of
        ``{"name", "level"}`` entries (no body text).
    """
    parsed = extract_sections(fetch_html(arxiv_id))
    outline = [{"name": entry["name"], "level": entry["level"]} for entry in parsed]
    abs_url = f"{ABS_BASE}/{arxiv_id}"
    html_url = f"{HTML_BASE}/{arxiv_id}"
    pdf_url = f"{PDF_BASE}/{arxiv_id}"
    return {
        "success": True,
        "arxiv_id": arxiv_id,
        "abs_url": abs_url,
        "html_url": html_url,
        "pdf_url": pdf_url,
        "section_count": len(parsed),
        "sections": outline,
        "error": None,
    }
def cmd_read_section(arxiv_id: str, section_name: str) -> dict[str, Any]:
    """Fetch a paper and return the body text of one section.

    Args:
        arxiv_id: arXiv identifier of the paper.
        section_name: Name or fragment passed on to ``_match_section``.

    Returns:
        On a match, a result dict with ``success`` True, the matched
        section's name, level, text and character count. Otherwise a dict
        with ``success`` False, ``content`` None and an error message that
        lists the available section names.
    """
    all_sections = extract_sections(fetch_html(arxiv_id))
    hit = _match_section(all_sections, section_name)
    if hit is not None:
        body = hit["text"]
        return {
            "success": True,
            "arxiv_id": arxiv_id,
            "abs_url": f"{ABS_BASE}/{arxiv_id}",
            "section": hit["name"],
            "level": hit["level"],
            "content": body,
            "char_count": len(body),
            "error": None,
        }

    # No match: report which section names could have been requested.
    available = [entry["name"] for entry in all_sections]
    return {
        "success": False,
        "arxiv_id": arxiv_id,
        "section": section_name,
        "content": None,
        "error": f"未找到章节 '{section_name}',可用章节:{available}",
    }
# ── CLI ───────────────────────────────────────────────────────────────────────
def main():
    """Command-line entry point.

    Parses ``arxiv_id`` and the optional ``--section``/``-s`` argument,
    then prints either the section outline or the requested section as
    JSON. Any exception raised while doing so is reported as a JSON
    failure payload and the process exits with status 1.
    """
    usage_examples = """
示例:
python3 arxiv_paper.py 2409.05591 列出所有章节
python3 arxiv_paper.py 2409.05591 --section introduction 读取 Introduction
python3 arxiv_paper.py 2409.05591 --section method 读取 Method/Methods
python3 arxiv_paper.py 2409.05591 --section conclusion 读取 Conclusion
"""
    cli = argparse.ArgumentParser(
        description="ArXiv 论文章节阅读器",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("arxiv_id", help="arXiv 论文 ID如 2409.05591 或 2409.05591v2")
    cli.add_argument(
        "--section", "-s",
        metavar="SECTION_NAME",
        help="要读取的章节名(大小写不敏感,支持部分匹配)。不指定则列出所有章节。",
    )
    args = cli.parse_args()

    try:
        if not args.section:
            # No section requested: show the outline only.
            result = cmd_list_sections(args.arxiv_id.strip())
        else:
            result = cmd_read_section(args.arxiv_id.strip(), args.section.strip())
        print_json(result)
    except Exception as exc:
        # Top-level boundary: turn any failure into a JSON error payload.
        print_json({
            "success": False,
            "arxiv_id": args.arxiv_id,
            "error": str(exc),
        })
        sys.exit(1)


if __name__ == "__main__":
    main()