first commit
This commit is contained in:
296
research/arxiv/SKILL.md
Normal file
296
research/arxiv/SKILL.md
Normal file
@@ -0,0 +1,296 @@
|
||||
---
|
||||
name: arxiv
|
||||
description: "搜索 arXiv 论文(关键词/作者/分类/ID)、获取摘要、生成 BibTeX。支持 Semantic Scholar 集成(引用数、相关论文、作者画像)。触发词:arXiv、论文搜索、学术论文、BibTeX、引用数。轻量专注,单源工具。"
|
||||
version: 1.0.0
|
||||
author: Hermes Agent
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [Research, Arxiv, Papers, Academic, Science, API]
|
||||
related_skills: [ocr-and-documents]
|
||||
---
|
||||
|
||||
# arXiv Research
|
||||
|
||||
Search and retrieve academic papers from arXiv via their free REST API. No API key, no dependencies — just curl.
|
||||
|
||||
## Quick Reference
|
||||
|
||||
| Action | Command |
|
||||
|--------|---------|
|
||||
| Search papers | `curl "https://export.arxiv.org/api/query?search_query=all:QUERY&max_results=5"` |
|
||||
| Get specific paper | `curl "https://export.arxiv.org/api/query?id_list=2402.03300"` |
|
||||
| Read abstract (web) | `web_extract(urls=["https://arxiv.org/abs/2402.03300"])` |
|
||||
| Read full paper (PDF) | `web_extract(urls=["https://arxiv.org/pdf/2402.03300"])` |
|
||||
|
||||
## Searching Papers
|
||||
|
||||
The API returns Atom XML. Parse with `grep`/`sed` or pipe through `python3` for clean output.
|
||||
|
||||
### Basic search
|
||||
|
||||
```bash
|
||||
curl -s "https://export.arxiv.org/api/query?search_query=all:GRPO+reinforcement+learning&max_results=5"
|
||||
```
|
||||
|
||||
### Clean output (parse XML to readable format)
|
||||
|
||||
```bash
|
||||
curl -s "https://export.arxiv.org/api/query?search_query=all:GRPO+reinforcement+learning&max_results=5&sortBy=submittedDate&sortOrder=descending" | python3 -c "
|
||||
import sys, xml.etree.ElementTree as ET
|
||||
ns = {'a': 'http://www.w3.org/2005/Atom'}
|
||||
root = ET.parse(sys.stdin).getroot()
|
||||
for i, entry in enumerate(root.findall('a:entry', ns)):
|
||||
title = entry.find('a:title', ns).text.strip().replace('\n', ' ')
|
||||
arxiv_id = entry.find('a:id', ns).text.strip().split('/abs/')[-1]
|
||||
published = entry.find('a:published', ns).text[:10]
|
||||
authors = ', '.join(a.find('a:name', ns).text for a in entry.findall('a:author', ns))
|
||||
summary = entry.find('a:summary', ns).text.strip()[:200]
|
||||
cats = ', '.join(c.get('term') for c in entry.findall('a:category', ns))
|
||||
print(f'{i+1}. [{arxiv_id}] {title}')
|
||||
print(f' Authors: {authors}')
|
||||
print(f' Published: {published} | Categories: {cats}')
|
||||
print(f' Abstract: {summary}...')
|
||||
print(f' PDF: https://arxiv.org/pdf/{arxiv_id}')
|
||||
print()
|
||||
"
|
||||
```
|
||||
|
||||
## Search Query Syntax
|
||||
|
||||
| Prefix | Searches | Example |
|
||||
|--------|----------|---------|
|
||||
| `all:` | All fields | `all:transformer+attention` |
|
||||
| `ti:` | Title | `ti:large+language+models` |
|
||||
| `au:` | Author | `au:vaswani` |
|
||||
| `abs:` | Abstract | `abs:reinforcement+learning` |
|
||||
| `cat:` | Category | `cat:cs.AI` |
|
||||
| `co:` | Comment | `co:accepted+NeurIPS` |
|
||||
|
||||
### Boolean operators
|
||||
|
||||
```
|
||||
# AND (default when using +)
|
||||
search_query=all:transformer+attention
|
||||
|
||||
# OR
|
||||
search_query=all:GPT+OR+all:BERT
|
||||
|
||||
# AND NOT
|
||||
search_query=all:language+model+ANDNOT+all:vision
|
||||
|
||||
# Exact phrase
|
||||
search_query=ti:"chain+of+thought"
|
||||
|
||||
# Combined
|
||||
search_query=au:hinton+AND+cat:cs.LG
|
||||
```
|
||||
|
||||
## Sort and Pagination
|
||||
|
||||
| Parameter | Options |
|
||||
|-----------|---------|
|
||||
| `sortBy` | `relevance`, `lastUpdatedDate`, `submittedDate` |
|
||||
| `sortOrder` | `ascending`, `descending` |
|
||||
| `start` | Result offset (0-based) |
|
||||
| `max_results` | Number of results (default 10; at most 2000 per request, up to 30000 total by paging with `start`) |
|
||||
|
||||
```bash
|
||||
# Latest 10 papers in cs.AI
|
||||
curl -s "https://export.arxiv.org/api/query?search_query=cat:cs.AI&sortBy=submittedDate&sortOrder=descending&max_results=10"
|
||||
```
|
||||
|
||||
## Fetching Specific Papers
|
||||
|
||||
```bash
|
||||
# By arXiv ID
|
||||
curl -s "https://export.arxiv.org/api/query?id_list=2402.03300"
|
||||
|
||||
# Multiple papers
|
||||
curl -s "https://export.arxiv.org/api/query?id_list=2402.03300,2401.12345,2403.00001"
|
||||
```
|
||||
|
||||
## BibTeX Generation
|
||||
|
||||
After fetching metadata for a paper, generate a BibTeX entry:
|
||||
|
||||
{% raw %}
|
||||
```bash
|
||||
curl -s "https://export.arxiv.org/api/query?id_list=1706.03762" | python3 -c "
|
||||
import sys, xml.etree.ElementTree as ET
|
||||
ns = {'a': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
|
||||
root = ET.parse(sys.stdin).getroot()
|
||||
entry = root.find('a:entry', ns)
|
||||
if entry is None: sys.exit('Paper not found')
|
||||
title = entry.find('a:title', ns).text.strip().replace('\n', ' ')
|
||||
authors = ' and '.join(a.find('a:name', ns).text for a in entry.findall('a:author', ns))
|
||||
year = entry.find('a:published', ns).text[:4]
|
||||
raw_id = entry.find('a:id', ns).text.strip().split('/abs/')[-1]
|
||||
cat = entry.find('arxiv:primary_category', ns)
|
||||
primary = cat.get('term') if cat is not None else 'cs.LG'
|
||||
last_name = entry.find('a:author', ns).find('a:name', ns).text.split()[-1]
|
||||
print(f'@article{{{last_name}{year}_{raw_id.replace(\".\", \"\")},')
|
||||
print(f' title = {{{title}}},')
|
||||
print(f' author = {{{authors}}},')
|
||||
print(f' year = {{{year}}},')
|
||||
print(f' eprint = {{{raw_id}}},')
|
||||
print(f' archivePrefix = {{arXiv}},')
|
||||
print(f' primaryClass = {{{primary}}},')
|
||||
print(f' url = {{https://arxiv.org/abs/{raw_id}}}')
|
||||
print('}')
|
||||
"
|
||||
```
|
||||
{% endraw %}
|
||||
|
||||
## Reading Paper Content
|
||||
|
||||
After finding a paper, read it:
|
||||
|
||||
```
|
||||
# Abstract page (fast, metadata + abstract)
|
||||
web_extract(urls=["https://arxiv.org/abs/2402.03300"])
|
||||
|
||||
# Full paper (PDF → markdown via Firecrawl)
|
||||
web_extract(urls=["https://arxiv.org/pdf/2402.03300"])
|
||||
```
|
||||
|
||||
For local PDF processing, see the `ocr-and-documents` skill.
|
||||
|
||||
## Common Categories
|
||||
|
||||
| Category | Field |
|
||||
|----------|-------|
|
||||
| `cs.AI` | Artificial Intelligence |
|
||||
| `cs.CL` | Computation and Language (NLP) |
|
||||
| `cs.CV` | Computer Vision |
|
||||
| `cs.LG` | Machine Learning |
|
||||
| `cs.CR` | Cryptography and Security |
|
||||
| `stat.ML` | Machine Learning (Statistics) |
|
||||
| `math.OC` | Optimization and Control |
|
||||
| `physics.comp-ph` | Computational Physics |
|
||||
|
||||
Full list: https://arxiv.org/category_taxonomy
|
||||
|
||||
## Helper Script
|
||||
|
||||
The `scripts/search_arxiv.py` script handles XML parsing and provides clean output:
|
||||
|
||||
```bash
|
||||
python scripts/search_arxiv.py "GRPO reinforcement learning"
|
||||
python scripts/search_arxiv.py "transformer attention" --max 10 --sort date
|
||||
python scripts/search_arxiv.py --author "Yann LeCun" --max 5
|
||||
python scripts/search_arxiv.py --category cs.AI --sort date
|
||||
python scripts/search_arxiv.py --id 2402.03300
|
||||
python scripts/search_arxiv.py --id 2402.03300,2401.12345
|
||||
```
|
||||
|
||||
No dependencies — uses only Python stdlib.
|
||||
|
||||
---
|
||||
|
||||
## Semantic Scholar (Citations, Related Papers, Author Profiles)
|
||||
|
||||
arXiv doesn't provide citation data or recommendations. Use the **Semantic Scholar API** for that — free, no key needed for basic use (1 req/sec), returns JSON.
|
||||
|
||||
### Get paper details + citations
|
||||
|
||||
```bash
|
||||
# By arXiv ID
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:2402.03300?fields=title,authors,citationCount,referenceCount,influentialCitationCount,year,abstract" | python3 -m json.tool
|
||||
|
||||
# By Semantic Scholar paper ID or DOI
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/paper/DOI:10.1234/example?fields=title,citationCount"
|
||||
```
|
||||
|
||||
### Get citations OF a paper (who cited it)
|
||||
|
||||
```bash
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:2402.03300/citations?fields=title,authors,year,citationCount&limit=10" | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Get references FROM a paper (what it cites)
|
||||
|
||||
```bash
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:2402.03300/references?fields=title,authors,year,citationCount&limit=10" | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Search papers (alternative to arXiv search, returns JSON)
|
||||
|
||||
```bash
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/paper/search?query=GRPO+reinforcement+learning&limit=5&fields=title,authors,year,citationCount,externalIds" | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Get paper recommendations
|
||||
|
||||
```bash
|
||||
curl -s -X POST "https://api.semanticscholar.org/recommendations/v1/papers/" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"positivePaperIds": ["arXiv:2402.03300"], "negativePaperIds": []}' | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Author profile
|
||||
|
||||
```bash
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/author/search?query=Yann+LeCun&fields=name,hIndex,citationCount,paperCount" | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Useful Semantic Scholar fields
|
||||
|
||||
`title`, `authors`, `year`, `abstract`, `citationCount`, `referenceCount`, `influentialCitationCount`, `isOpenAccess`, `openAccessPdf`, `fieldsOfStudy`, `publicationVenue`, `externalIds` (contains arXiv ID, DOI, etc.)
|
||||
|
||||
---
|
||||
|
||||
## Complete Research Workflow
|
||||
|
||||
1. **Discover**: `python scripts/search_arxiv.py "your topic" --sort date --max 10`
|
||||
2. **Assess impact**: `curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:ID?fields=citationCount,influentialCitationCount"`
|
||||
3. **Read abstract**: `web_extract(urls=["https://arxiv.org/abs/ID"])`
|
||||
4. **Read full paper**: `web_extract(urls=["https://arxiv.org/pdf/ID"])`
|
||||
5. **Find related work**: `curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:ID/references?fields=title,citationCount&limit=20"`
|
||||
6. **Get recommendations**: POST to Semantic Scholar recommendations endpoint
|
||||
7. **Track authors**: `curl -s "https://api.semanticscholar.org/graph/v1/author/search?query=NAME"`
|
||||
|
||||
## Rate Limits
|
||||
|
||||
| API | Rate | Auth |
|
||||
|-----|------|------|
|
||||
| arXiv | ~1 req / 3 seconds | None needed |
|
||||
| Semantic Scholar | 1 req / second | None (100/sec with API key) |
|
||||
|
||||
## Deep Paper Reading
|
||||
|
||||
For extracting full methodology from a paper (not just metadata/abstract):
|
||||
|
||||
1. **HTML version is best**: `curl https://arxiv.org/html/{id}v{N}` — structured sections, tables, appendices all parseable
|
||||
2. **Download first**: `curl -sL {url} -o /tmp/paper.html` then process locally
|
||||
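3. **Section IDs**: sections carry sequential IDs (`S1`, `S2`, …) whose meaning is paper-specific — a typical layout is S1=Intro, S2-S5=Body, S6=Related, S7=Conclusion; appendices indexed by letter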
|
||||
4. **Tables**: Extract with `re.findall(r'<table[^>]*>(.*?)</table>', html, re.DOTALL)`
|
||||
5. **Appendix content**: headings appear twice (TOC + body); use last occurrence for actual content
|
||||
6. **Check GitHub**: `curl -sL https://api.github.com/repos/{owner}/{repo}/contents` for released code/models
|
||||
|
||||
Full extraction patterns and methodology template: `references/paper-reading-methodology.md`
|
||||
|
||||
Example of a completed deep-read reference: `references/skillrouter-methodology.md`
|
||||
|
||||
## Notes
|
||||
|
||||
- arXiv returns Atom XML — use the helper script or parsing snippet for clean output
|
||||
- Semantic Scholar returns JSON — pipe through `python3 -m json.tool` for readability
|
||||
- arXiv IDs: old format (`hep-th/0601001`) vs new (`2402.03300`)
|
||||
- PDF: `https://arxiv.org/pdf/{id}` — Abstract: `https://arxiv.org/abs/{id}`
|
||||
- HTML (when available): `https://arxiv.org/html/{id}`
|
||||
- For local PDF processing, see the `ocr-and-documents` skill
|
||||
|
||||
## ID Versioning
|
||||
|
||||
- `arxiv.org/abs/1706.03762` always resolves to the **latest** version
|
||||
- `arxiv.org/abs/1706.03762v1` points to a **specific** immutable version
|
||||
- When generating citations, preserve the version suffix you actually read to prevent citation drift (a later version may substantially change content)
|
||||
- The API `<id>` field returns the versioned URL (e.g., `http://arxiv.org/abs/1706.03762v7`)
|
||||
|
||||
## Withdrawn Papers
|
||||
|
||||
Papers can be withdrawn after submission. When this happens:
|
||||
- The `<summary>` field contains a withdrawal notice (look for "withdrawn" or "retracted")
|
||||
- Metadata fields may be incomplete
|
||||
- Always check the summary before treating a result as a valid paper
|
||||
98
research/arxiv/references/paper-reading-methodology.md
Normal file
98
research/arxiv/references/paper-reading-methodology.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Deep Paper Reading Methodology
|
||||
|
||||
Extracting full technical detail from arXiv papers when PDF parsing tools are unavailable.
|
||||
|
||||
## Extraction Fallback Chain
|
||||
|
||||
1. **pymupdf (fitz)** — best quality, but needs `uv pip install pymupdf`
|
||||
2. **pdftotext** — `apt install poppler-utils`, then `pdftotext file.pdf -`
|
||||
3. **Raw regex on PDF bytes** — `re.findall(rb'\(([^)]+)\)', data)` extracts text streams; works for metadata/abstract but garbles body
|
||||
4. **HTML version** — `curl https://arxiv.org/html/{id}v{N}` — cleanest structured extraction; **preferred method**
|
||||
5. **Abstract page** — `curl https://arxiv.org/abs/{id}` + regex on `<blockquote class="abstract">`
|
||||
|
||||
## HTML Extraction Patterns (preferred)
|
||||
|
||||
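The arXiv HTML version (`/html/{id}v{N}`) has structured `<section>` elements with sequential IDs. Which ID holds which content varies from paper to paper, so check the headings first; the layout below is one example: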
|
||||
|
||||
```
|
||||
S1 = Introduction
|
||||
S2 = Problem definition
|
||||
S3 = Key findings
|
||||
S4 = Method
|
||||
S5 = Experiments (S5.SS1, S5.SS2 = subsections)
|
||||
S6 = Related work
|
||||
S7 = Conclusion
|
||||
Appendices: indexed by letter (A, B, C...) in ltx_tocentry
|
||||
```
|
||||
|
||||
### Section extraction pattern:
|
||||
|
||||
```python
|
||||
import re
|
||||
html = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.DOTALL)
|
||||
m = re.search(r'<section[^>]*id="S4"[^>]*>(.*?)(?=<section[^>]*id="S5"|$)', html, re.DOTALL)
|
||||
text = re.sub(r'<[^>]+>', ' ', m.group(1)).strip()
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
```
|
||||
|
||||
### Table extraction:
|
||||
|
||||
```python
|
||||
tables = re.findall(r'<table[^>]*>(.*?)</table>', html, re.DOTALL)
|
||||
for t in tables:
|
||||
text = re.sub(r'<[^>]+>', ' | ', t).strip()
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
```
|
||||
|
||||
### Appendix content (avoid TOC duplicates):
|
||||
|
||||
Appendix headings appear twice — once in TOC, once as actual content. Use `positions[-1]` (last occurrence) for the real content. Search by keyword rather than section ID for appendices.
|
||||
|
||||
### Targeted keyword search (when section IDs fail):
|
||||
|
||||
```python
|
||||
searches = ['keyword1', 'keyword2']
|
||||
for s in searches:
|
||||
positions = [m.start() for m in re.finditer(re.escape(s), html, re.IGNORECASE)]
|
||||
if positions:
|
||||
pos = positions[-1] # last occurrence = actual content, not TOC
|
||||
chunk = html[max(0,pos-200):pos+500]
|
||||
text = re.sub(r'<[^>]+>', ' ', chunk)
|
||||
```
|
||||
|
||||
## Structured Methodology Extraction Template
|
||||
|
||||
When the user asks to "learn the method" or do a deep read, extract:
|
||||
|
||||
1. **Architecture** — pipeline stages, model sizes, data flow
|
||||
2. **Training data** — how it's constructed, sources, sizes, prompts used
|
||||
3. **Negative mining** — strategy for hard negatives, filtering
|
||||
4. **Loss functions** — exact objective, temperature, why this choice
|
||||
5. **Training hyperparams** — LR, batch size, epochs, hardware
|
||||
6. **Inference flow** — online vs offline steps, latency, throughput
|
||||
7. **Key ablations** — what matters and by how much
|
||||
8. **Code/models released** — check GitHub repo structure
|
||||
|
||||
## GitHub Repo Inspection Pattern
|
||||
|
||||
```bash
|
||||
# Check if repo exists and get stats
|
||||
curl -sL "https://api.github.com/repos/{owner}/{repo}"
|
||||
|
||||
# List top-level structure
|
||||
curl -sL "https://api.github.com/repos/{owner}/{repo}/contents"
|
||||
|
||||
# Check subdirectories
|
||||
for d in src scripts; do
|
||||
curl -sL "https://api.github.com/repos/{owner}/{repo}/contents/$d"
|
||||
done
|
||||
|
||||
# Read README
|
||||
curl -sL "https://raw.githubusercontent.com/{owner}/{repo}/main/README.md"
|
||||
```
|
||||
|
||||
## Semantic Scholar for Citation Context
|
||||
|
||||
```bash
|
||||
curl -s "https://api.semanticscholar.org/graph/v1/paper/arXiv:{id}?fields=citationCount,influentialCitationCount"
|
||||
```
|
||||
62
research/arxiv/references/skillrouter-methodology.md
Normal file
62
research/arxiv/references/skillrouter-methodology.md
Normal file
@@ -0,0 +1,62 @@
|
||||
# SkillRouter: Key Takeaways for LLM Agent Skill Routing
|
||||
|
||||
Paper: https://arxiv.org/abs/2603.22455 (Apr 2026, Alibaba)
|
||||
Code: https://github.com/zhengyanzhao1997/SkillRouter
|
||||
Models: https://huggingface.co/pipizhao/SkillRouter-Embedding-0.6B, SkillRouter-Reranker-0.6B
|
||||
|
||||
## Core Finding
|
||||
|
||||
At ~80K skill scale with heavy overlap, exposing only name+description causes 31-44pp Hit@1 drop vs full skill text. Full body is THE critical routing signal, not metadata.
|
||||
|
||||
## Architecture (1.2B total)
|
||||
|
||||
```
|
||||
query → SR-Emb-0.6B (bi-encoder) → top-20 from 80K → SR-Rank-0.6B (cross-encoder) → final rank
|
||||
```
|
||||
|
||||
## Training Recipe
|
||||
|
||||
### Data: 37,979 synthetic (query, skill) pairs
|
||||
- Skills sampled with category stratification from ~80K pool
|
||||
- Queries generated by GPT-4o-mini; prompt forbids revealing skill name
|
||||
- Benchmark skills excluded from training
|
||||
|
||||
### Hard Negative Mining (10 per query)
|
||||
- 4 semantic neighbors (embedding NN)
|
||||
- 3 BM25 lexical matches
|
||||
- 2 same-category distractors
|
||||
- 1 random cross-category
|
||||
|
||||
### False Negative Filtering (critical — +4.0pp)
|
||||
Three-layer filter removes ~10% of mined negatives:
|
||||
1. Name dedup (24,879 pairs)
|
||||
2. Body trigram Jaccard > 0.6 (13,860 pairs)
|
||||
3. Embedding cosine > 0.92 (326 pairs)
|
||||
|
||||
### Loss: Listwise CE >> Pointwise BCE
|
||||
- Pointwise: 43.3% Hit@1 (fails because homogeneous candidates get similar scores)
|
||||
- Listwise: 74.0% Hit@1 (compares candidates against each other)
|
||||
- This is THE key training choice for reranker
|
||||
|
||||
### Hyperparams
|
||||
- Encoder: InfoNCE τ=0.05, LR 2e-5, batch 8, GA 4, 1 epoch, max 2048 tokens
|
||||
- Reranker: Listwise CE τ=1.0, LR 1e-5, 1 epoch, max 4096 tokens
|
||||
- Both: single GPU, Qwen3-Emb/Rank-0.6B base
|
||||
|
||||
### Input Templates
|
||||
- Encoder query: `Instruct: ...\nQuery: <text>` (1500 char cap)
|
||||
- Encoder skill: `<name> | <desc:300> | <body:2500>` (no instruction prefix)
|
||||
- Reranker: `<Instruct>: ...\n<Query>: ...\n<Document>: <name> | <desc:500> | <body:2000>`
|
||||
|
||||
## Results
|
||||
| System | Params | Avg Hit@1 | Speed |
|
||||
|--------|--------|-----------|-------|
|
||||
| Qwen3-Emb-8B + Qwen3-Rank-8B | 16B | 68.0% | 0.32 QPS |
|
||||
| SR-Emb-0.6B + SR-Rank-0.6B | 1.2B | 74.0% | 1.83 QPS |
|
||||
| SR-Emb-8B + SR-Rank-8B | 16B | 76.0% | - |
|
||||
|
||||
## Relevance to Hermes
|
||||
- Hermes currently exposes ~100 skills via name+desc in system prompt, full SKILL.md on demand
|
||||
- At current scale this works; at 1000+ skills, a routing layer becomes necessary
|
||||
- False-negative filtering concept applies to Hermes skill deduplication
|
||||
- Listwise reranking matters when many skills look similar (e.g., multiple research skills)
|
||||
114
research/arxiv/scripts/search_arxiv.py
Normal file
114
research/arxiv/scripts/search_arxiv.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Search arXiv and display results in a clean format.
|
||||
|
||||
Usage:
|
||||
python search_arxiv.py "GRPO reinforcement learning"
|
||||
python search_arxiv.py "GRPO reinforcement learning" --max 10
|
||||
python search_arxiv.py "GRPO reinforcement learning" --sort date
|
||||
python search_arxiv.py --author "Yann LeCun" --max 5
|
||||
python search_arxiv.py --category cs.AI --sort date --max 10
|
||||
python search_arxiv.py --id 2402.03300
|
||||
python search_arxiv.py --id 2402.03300,2401.12345
|
||||
"""
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
# Namespace prefix map passed to ElementTree find()/findall(); 'a' is the Atom namespace.
NS = {'a': 'http://www.w3.org/2005/Atom'}
|
||||
|
||||
def _split_version(full_id):
    """Split an arXiv identifier into ``(base_id, version_suffix)``.

    Only a trailing ``v<digits>`` counts as a version, so old-style IDs whose
    archive name contains a ``v`` (``solv-int/9901001v1``) are not truncated.
    The suffix is ``''`` when the identifier carries no version.
    """
    base, sep, digits = full_id.rpartition('v')
    if sep and base and digits.isdigit():
        return base, sep + digits
    return full_id, ''


def _entry_text(node, tag):
    """Return the stripped, single-line text of child *tag*, or ``''`` if it is missing or empty."""
    child = node.find(tag, NS)
    if child is None or child.text is None:
        return ''
    return child.text.strip().replace('\n', ' ')


def search(query=None, author=None, category=None, ids=None, max_results=5, sort="relevance"):
    """Query the arXiv API and print the matching papers to stdout.

    Args:
        query: Free-text terms, searched in all fields (``all:``).
        author: Author name (``au:``).
        category: arXiv category such as ``cs.AI`` (``cat:``).
        ids: Comma-separated arXiv IDs. When given, the other search
            criteria are ignored and the papers are fetched by ID.
        max_results: Maximum number of entries to request.
        sort: ``"relevance"``, ``"date"`` or ``"updated"``; any other value is
            forwarded unchanged as the API's ``sortBy`` parameter.

    Returns:
        None. Results (or "No results found.") are printed.

    Exits with status 1 when none of query/author/category/ids is provided.
    Network and XML parsing errors are not caught here and propagate to the
    caller.
    """
    params = {}

    if ids:
        # Accept "id1, id2": a raw space would make the request URL invalid.
        id_list = ','.join(part.strip() for part in ids.split(',') if part.strip())
        # Keep '/' (old-style IDs such as hep-th/0601001) and ',' readable.
        params['id_list'] = urllib.parse.quote(id_list, safe='/,')
    else:
        parts = []
        if query:
            parts.append(f'all:{urllib.parse.quote(query)}')
        if author:
            parts.append(f'au:{urllib.parse.quote(author)}')
        if category:
            parts.append(f'cat:{urllib.parse.quote(category)}')
        if not parts:
            print("Error: provide a query, --author, --category, or --id")
            sys.exit(1)
        params['search_query'] = '+AND+'.join(parts)

    params['max_results'] = str(max_results)

    sort_map = {"relevance": "relevance", "date": "submittedDate", "updated": "lastUpdatedDate"}
    params['sortBy'] = sort_map.get(sort, sort)
    params['sortOrder'] = 'descending'

    # Values are escaped individually above; urlencode() is avoided because it
    # would turn the '+' separators of search_query into %2B.
    url = "https://export.arxiv.org/api/query?" + "&".join(f"{k}={v}" for k, v in params.items())

    req = urllib.request.Request(url, headers={'User-Agent': 'HermesAgent/1.0'})
    with urllib.request.urlopen(req, timeout=15) as resp:
        data = resp.read()

    root = ET.fromstring(data)
    entries = root.findall('a:entry', NS)

    if not entries:
        print("No results found.")
        return

    total = root.find('{http://a9.com/-/spec/opensearch/1.1/}totalResults')
    if total is not None:
        print(f"Found {total.text} results (showing {len(entries)})\n")

    for i, entry in enumerate(entries):
        # Entries may lack fields (e.g. withdrawn papers), so every lookup
        # falls back to '' instead of raising AttributeError.
        title = _entry_text(entry, 'a:title')
        raw_id = _entry_text(entry, 'a:id')
        full_id = raw_id.split('/abs/')[-1] if '/abs/' in raw_id else raw_id
        arxiv_id, version = _split_version(full_id)  # base ID is used for links
        published = _entry_text(entry, 'a:published')[:10]
        updated = _entry_text(entry, 'a:updated')[:10]
        authors = ', '.join(_entry_text(a, 'a:name') for a in entry.findall('a:author', NS))
        summary = _entry_text(entry, 'a:summary')
        cats = ', '.join(c.get('term') or '' for c in entry.findall('a:category', NS))

        print(f"{i+1}. {title}")
        print(f"   ID: {arxiv_id}{version} | Published: {published} | Updated: {updated}")
        print(f"   Authors: {authors}")
        print(f"   Categories: {cats}")
        print(f"   Abstract: {summary[:300]}{'...' if len(summary) > 300 else ''}")
        print(f"   Links: https://arxiv.org/abs/{arxiv_id} | https://arxiv.org/pdf/{arxiv_id}")
        print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse sys.argv by hand and delegate to search().
    import urllib.error  # local import: only needed for CLI error reporting

    args = sys.argv[1:]
    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    # Flags that consume the next argument, mapped to search() keyword names.
    value_flags = {
        "--max": "max_results",
        "--sort": "sort",
        "--author": "author",
        "--category": "category",
        "--id": "ids",
    }
    options = {
        "author": None,
        "category": None,
        "ids": None,
        "max_results": 5,
        "sort": "relevance",
    }

    i = 0
    positional = []
    while i < len(args):
        arg = args[i]
        if arg in value_flags:
            # A flag with no value used to be swallowed into the query text.
            if i + 1 >= len(args):
                print(f"Error: {arg} requires a value")
                sys.exit(1)
            options[value_flags[arg]] = args[i + 1]
            i += 2
        else:
            positional.append(arg)
            i += 1

    try:
        options["max_results"] = int(options["max_results"])
    except ValueError:
        print(f"Error: --max expects an integer, got {options['max_results']!r}")
        sys.exit(1)

    # All bare arguments together form the free-text query.
    query = " ".join(positional) if positional else None

    try:
        search(query=query, **options)
    except (urllib.error.URLError, TimeoutError) as exc:
        # Covers HTTP errors, DNS/connection failures and timeouts.
        print(f"Error: arXiv request failed: {exc}")
        sys.exit(1)
    except ET.ParseError as exc:
        print(f"Error: could not parse arXiv response: {exc}")
        sys.exit(1)
|
||||
Reference in New Issue
Block a user