first commit
This commit is contained in:
304
sn-search-academic/scripts/arxiv_paper.py
Normal file
304
sn-search-academic/scripts/arxiv_paper.py
Normal file
@@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ArXiv 论文章节阅读器。
|
||||
|
||||
通过解析 arXiv HTML 版本(LaTeXML 转换),支持:
|
||||
- 列出论文所有章节结构
|
||||
- 按章节名称提取正文内容(大小写不敏感,支持部分匹配)
|
||||
|
||||
用法:
|
||||
python3 arxiv_paper.py 2409.05591 # 列出章节
|
||||
python3 arxiv_paper.py 2409.05591 --section introduction # 读取指定章节
|
||||
python3 arxiv_paper.py 2409.05591 --section method
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
from search_utils import get_client, print_json
|
||||
|
||||
# Handles to the bs4 classes used by this script. They are declared here and
# start out as ``None``; ``ensure_bs4()`` rebinds them to the real classes the
# first time HTML actually has to be parsed.
BeautifulSoup: Any
NavigableString: Any
Tag: Any
BeautifulSoup = NavigableString = Tag = None
|
||||
|
||||
|
||||
def ensure_bs4() -> None:
    """Import bs4 on first use and publish its classes as module globals.

    Calls made after a successful import return immediately. When
    beautifulsoup4 cannot be imported, a JSON error payload is written via
    ``print_json`` and the process exits with status 1.
    """
    global BeautifulSoup, NavigableString, Tag

    if BeautifulSoup is None:
        try:
            from bs4 import (
                BeautifulSoup as _soup_cls,
                NavigableString as _string_cls,
                Tag as _tag_cls,
            )
        except ImportError:
            missing_dependency = {
                "success": False,
                "error": "缺少 beautifulsoup4,请运行:python3 -m pip install -r skills/sn-search-academic/requirements.txt",
            }
            print_json(missing_dependency)
            sys.exit(1)

        # Rebind all three handles together so they are never half-initialised.
        BeautifulSoup, NavigableString, Tag = _soup_cls, _string_cls, _tag_cls
|
||||
|
||||
# Base URLs of the three arXiv views of a paper. An arXiv ID is appended to
# each one to build the HTML-rendering, abstract-page and PDF links.
_ARXIV_ROOT = "https://arxiv.org"
HTML_BASE = f"{_ARXIV_ROOT}/html"
ABS_BASE = f"{_ARXIV_ROOT}/abs"
PDF_BASE = f"{_ARXIV_ROOT}/pdf"
|
||||
|
||||
# ── HTML 获取 ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def fetch_html(arxiv_id: str) -> str:
    """Download the HTML rendering of an arXiv paper and return its markup.

    Args:
        arxiv_id: Identifier appended to ``HTML_BASE`` to form the request URL.

    Returns:
        The response body as text.

    Raises:
        ValueError: The server answered 404; the message points the reader
            at the PDF URL instead.
        Exception: Whatever ``raise_for_status()`` raises for any other error
            status (the concrete type depends on the client returned by
            ``get_client``).
    """
    request_headers = {"Accept": "text/html,application/xhtml+xml"}
    with get_client(timeout=45, headers=request_headers) as http:
        response = http.get(f"{HTML_BASE}/{arxiv_id}")

    if response.status_code != 404:
        response.raise_for_status()
        return response.text

    # 404 gets a dedicated, actionable message rather than a bare HTTP error.
    raise ValueError(
        f"论文 {arxiv_id} 暂无 HTML 版本。"
        "可能原因:论文较老(2018 年前)、非 LaTeX 来源或尚未转换。"
        f"请直接阅读 PDF:{PDF_BASE}/{arxiv_id}"
    )
|
||||
|
||||
|
||||
# ── 文本清洗 ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _math_to_text(math: Tag) -> str:
    """Render one ``<math>`` element as a short inline string.

    Preference order: the first non-empty ``<annotation>`` whose ``encoding``
    attribute mentions TeX, then the element's ``alttext`` attribute, and
    finally the literal placeholder ``[MATH]``.
    """
    for annotation in math.find_all("annotation"):
        encoding = annotation.get("encoding") or ""
        # "latex" contains "tex", so a single substring test covers both.
        if "tex" in encoding.lower():
            latex = annotation.get_text().strip()
            if latex:
                return f"${latex}$"

    alttext = (math.get("alttext") or "").strip()
    if alttext:
        return f"${alttext}$"
    return "[MATH]"


def _elem_to_text(elem: Tag) -> str:
    """Convert an HTML element into readable plain text.

    - ``<math>`` elements are replaced by ``$<TeX>$`` taken from their TeX
      annotation, falling back to ``alttext`` and then to ``[MATH]``; the
      MathML markup inside them is never emitted.
    - Text whose direct parent carries one of the classes ``ltx_note_mark``,
      ``ltx_ref_tag`` or ``ltx_tag`` (footnote marks, reference/number tags)
      is skipped.
    - Comments, script/style content and other special string nodes are
      skipped; only plain text nodes contribute.
    - Block-level elements start on a new line so paragraphs, list items and
      captions do not run into each other; runs of blank lines are capped at
      one empty line.

    Must only be called after ``ensure_bs4()`` has populated the module-level
    ``NavigableString`` and ``Tag`` handles.

    Args:
        elem: The element whose descendants are rendered.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    noise_classes = ("ltx_note_mark", "ltx_ref_tag", "ltx_tag")
    block_tags = frozenset((
        "p", "div", "section", "figure", "figcaption", "table", "tr",
        "ul", "ol", "li", "dl", "dt", "dd", "blockquote", "br",
        "h1", "h2", "h3", "h4", "h5", "h6",
    ))
    cell_tags = frozenset(("td", "th"))

    parts: list[str] = []

    for node in elem.descendants:
        # Everything below a <math> element is MathML structure; the element
        # itself is rendered in one go when it is reached.
        if any(ancestor.name == "math" for ancestor in node.parents):
            continue

        if isinstance(node, Tag):
            if node.name == "math":
                parts.append(_math_to_text(node))
            elif node.name in block_tags:
                parts.append("\n")
            elif node.name in cell_tags:
                parts.append(" ")
            continue

        # Exact type check on purpose: Comment, Script, Stylesheet, etc. are
        # subclasses of NavigableString and must not end up in the output.
        if type(node) is not NavigableString:
            continue

        parent = node.parent
        if parent is None:
            continue

        parent_classes = parent.get("class") or []
        if any(c in parent_classes for c in noise_classes):
            continue

        # Whitespace-only nodes are kept: they are what separates adjacent
        # inline elements. Excess whitespace is collapsed below.
        parts.append(str(node))

    raw = "".join(parts)
    raw = re.sub(r"[ \t]+", " ", raw)
    raw = re.sub(r" ?\n ?", "\n", raw)
    raw = re.sub(r"\n{3,}", "\n\n", raw)
    return raw.strip()
|
||||
|
||||
|
||||
# ── 章节提取 ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _find_section_title(section: Tag, title_class: Any, recursive: bool) -> Any:
    """Return the first ``ltx_title`` heading (h2, then h3, then h4) in *section*, or None."""
    for tag_name in ("h2", "h3", "h4"):
        found = section.find(tag_name, class_=title_class, recursive=recursive)
        if found is not None:
            return found
    return None


def extract_sections(html: str) -> list[dict[str, Any]]:
    """Extract the abstract and every (sub)section from arXiv paper HTML.

    Sections, subsections, subsubsections and appendices each become their
    own entry, in document order. The text of an entry excludes the text of
    any nested entry, so nothing is reported twice. Other nested
    ``<section>`` elements that are not collected separately (LaTeXML appears
    to render ``\\paragraph`` blocks this way — confirm against real pages)
    stay part of their parent's text. Entries without a title or without any
    text of their own are omitted.

    Args:
        html: Full HTML document as returned by ``fetch_html``.

    Returns:
        A list of dicts with the keys:
            name  - heading text including its number, e.g. "1 Introduction"
                    ("Abstract" for the abstract)
            level - 0 for the abstract, 1/2/3 for an h2/h3/h4 heading
            text  - body text produced by ``_elem_to_text``
    """
    ensure_bs4()
    soup = BeautifulSoup(html, "html.parser")

    title_class = re.compile(r"\bltx_title\b")
    section_class = re.compile(
        r"\bltx_(?:section|subsection|subsubsection|appendix)\b"
    )
    heading_levels = {"h2": 1, "h3": 2, "h4": 3}

    sections: list[dict[str, Any]] = []

    # -- Abstract --
    abstract_elem = soup.find(class_=re.compile(r"\bltx_abstract\b"))
    if abstract_elem is not None:
        # Drop the "Abstract" title line so it is not repeated in the text.
        for title in abstract_elem.find_all(["h2", "h6"], class_=re.compile(r"ltx_title")):
            title.decompose()
        abstract_text = _elem_to_text(abstract_elem)
        if abstract_text:
            sections.append({"name": "Abstract", "level": 0, "text": abstract_text})

    # -- Body sections --
    for sec in soup.find_all("section", class_=section_class):
        # Work on a private copy so pruning never disturbs the main tree that
        # is still being iterated.
        body = BeautifulSoup(str(sec), "html.parser").find("section")

        # Remove nested entries first: each of them is visited on its own by
        # the outer loop. Searching again after every removal avoids touching
        # an element that was already destroyed together with its parent.
        nested = body.find("section", class_=section_class)
        while nested is not None:
            nested.decompose()
            nested = body.find("section", class_=section_class)

        # Prefer a title that is a direct child; some sections wrap it in a
        # leading div, so fall back to any remaining descendant.
        heading = _find_section_title(body, title_class, recursive=False)
        if heading is None:
            heading = _find_section_title(body, title_class, recursive=True)
        if heading is None:
            continue

        # Clean the title: strip a trailing permalink marker and extra spaces.
        heading_text = heading.get_text(" ", strip=True).rstrip("¶").strip()
        heading_text = re.sub(r"\s+", " ", heading_text)
        level = heading_levels.get(heading.name, 1)

        # The title must not be repeated at the top of the body text.
        heading.decompose()
        for extra_title in body.find_all(
            list(heading_levels), class_=title_class, recursive=False
        ):
            extra_title.decompose()

        text = _elem_to_text(body)
        if not text.strip():
            continue

        sections.append({"name": heading_text, "level": level, "text": text})

    return sections
|
||||
|
||||
|
||||
# ── 匹配章节名 ────────────────────────────────────────────────────────────────
|
||||
|
||||
def _match_section(sections: list[dict], query: str) -> dict | None:
|
||||
"""大小写不敏感 + 去数字前缀的模糊匹配。"""
|
||||
q = query.lower().strip()
|
||||
|
||||
def clean(name: str) -> str:
|
||||
"""去掉 '1 ' / '1. ' 等数字前缀。"""
|
||||
return re.sub(r"^\d+[\.\s]+", "", name).lower().strip()
|
||||
|
||||
# 精确匹配
|
||||
for s in sections:
|
||||
if s["name"].lower() == q or clean(s["name"]) == q:
|
||||
return s
|
||||
|
||||
# 前缀 / 包含匹配
|
||||
for s in sections:
|
||||
if clean(s["name"]).startswith(q) or q in clean(s["name"]):
|
||||
return s
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ── 对外接口 ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def cmd_list_sections(arxiv_id: str) -> dict[str, Any]:
    """Fetch a paper and return its section outline without any body text.

    Args:
        arxiv_id: arXiv identifier passed to ``fetch_html``.

    Returns:
        A result dict with ``success``, the three paper URLs, the number of
        sections and, for each section, its ``name`` and ``level``.
    """
    outline = [
        {"name": entry["name"], "level": entry["level"]}
        for entry in extract_sections(fetch_html(arxiv_id))
    ]
    abs_url = f"{ABS_BASE}/{arxiv_id}"
    html_url = f"{HTML_BASE}/{arxiv_id}"
    pdf_url = f"{PDF_BASE}/{arxiv_id}"

    return {
        "success": True,
        "arxiv_id": arxiv_id,
        "abs_url": abs_url,
        "html_url": html_url,
        "pdf_url": pdf_url,
        "section_count": len(outline),
        "sections": outline,
        "error": None,
    }
|
||||
|
||||
|
||||
def cmd_read_section(arxiv_id: str, section_name: str) -> dict[str, Any]:
    """Fetch a paper and return the body text of one named section.

    Args:
        arxiv_id: arXiv identifier passed to ``fetch_html``.
        section_name: Name to look up via ``_match_section``.

    Returns:
        On a match, a result dict with the section's real name, level, text
        and character count. Otherwise a dict with ``success`` set to False
        whose ``error`` message lists the section names that are available.
    """
    sections = extract_sections(fetch_html(arxiv_id))
    hit = _match_section(sections, section_name)

    if hit is not None:
        body = hit["text"]
        return {
            "success": True,
            "arxiv_id": arxiv_id,
            "abs_url": f"{ABS_BASE}/{arxiv_id}",
            "section": hit["name"],
            "level": hit["level"],
            "content": body,
            "char_count": len(body),
            "error": None,
        }

    # No match: report what can be requested instead.
    available = [entry["name"] for entry in sections]
    return {
        "success": False,
        "arxiv_id": arxiv_id,
        "section": section_name,
        "content": None,
        "error": f"未找到章节 '{section_name}',可用章节:{available}",
    }
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    Parses ``arxiv_id`` and the optional ``--section`` / ``-s`` argument, then
    prints one JSON document: the content of the requested section, or the
    section outline when no section name is given. Any exception raised while
    fetching or parsing is reported as a JSON error payload and the process
    exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="ArXiv 论文章节阅读器",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python3 arxiv_paper.py 2409.05591                          列出所有章节
  python3 arxiv_paper.py 2409.05591 --section introduction   读取 Introduction
  python3 arxiv_paper.py 2409.05591 --section method         读取 Method/Methods
  python3 arxiv_paper.py 2409.05591 --section conclusion     读取 Conclusion
""",
    )
    parser.add_argument("arxiv_id", help="arXiv 论文 ID(如 2409.05591 或 2409.05591v2)")
    parser.add_argument(
        "--section", "-s",
        metavar="SECTION_NAME",
        help="要读取的章节名(大小写不敏感,支持部分匹配)。不指定则列出所有章节。",
    )
    args = parser.parse_args()

    # Normalise once so the success and error payloads report the same ID, and
    # so a whitespace-only --section is treated like an omitted one.
    arxiv_id = args.arxiv_id.strip()
    section = (args.section or "").strip()

    try:
        if section:
            result = cmd_read_section(arxiv_id, section)
        else:
            result = cmd_list_sections(arxiv_id)
        print_json(result)
    except Exception as e:
        # Top-level boundary: callers consume JSON, so every failure is turned
        # into a JSON error document plus a non-zero exit status.
        print_json({
            "success": False,
            "arxiv_id": arxiv_id,
            "error": str(e),
        })
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user