first commit
This commit is contained in:
79
sn-search-academic/scripts/wikipedia_search.py
Normal file
79
sn-search-academic/scripts/wikipedia_search.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Wikipedia 搜索。通过 MediaWiki API。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
from search_utils import build_parser, get_client, make_item, make_result, print_json
|
||||
|
||||
|
||||
def _api_url(lang: str) -> str:
|
||||
return f"https://{lang}.wikipedia.org/w/api.php"
|
||||
|
||||
|
||||
def search(query: str, limit: int, lang: str = "en") -> list[dict]:
    """Run a Wikipedia full-text search through the MediaWiki API.

    Args:
        query: Search string, sent as ``srsearch``.
        limit: Maximum number of items to return. At most 50 results are
            requested from the API; a non-positive value returns an empty
            list without sending a request.
        lang: Wikipedia language subdomain (default ``en``).

    Returns:
        One ``make_item(...)`` result per search hit, in API order.

    Raises:
        Exception: Whatever the client from ``get_client()`` raises for the
            request, ``raise_for_status()`` or JSON decoding.
    """
    from urllib.parse import quote

    # Slicing with ``[:limit]`` below would discard every hit anyway, so skip
    # the network round trip for non-positive limits.
    if limit <= 0:
        return []

    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": min(limit, 50),
        "srprop": "snippet|timestamp|wordcount|size|sectiontitle|sectionsnippet",
        "format": "json",
        "utf8": 1,
    }

    with get_client() as client:
        resp = client.get(_api_url(lang), params=params)
        resp.raise_for_status()
        data = resp.json()

    items = []
    for result in data.get("query", {}).get("search", [])[:limit]:
        title = result.get("title", "")
        # The snippet is an HTML fragment; reduce it to plain text.
        snippet = _strip_html(result.get("snippet", ""))
        page_id = result.get("pageid", "")
        # Percent-encode the title: characters such as "?", "%" or "#" would
        # otherwise be parsed as query/fragment delimiters and point at the
        # wrong page. Punctuation that is legal in a path stays readable.
        slug = quote(title.replace(" ", "_"), safe="/:@!$*(),;~")
        url = f"https://{lang}.wikipedia.org/wiki/{slug}"

        section_title = result.get("sectiontitle", "")
        section_snippet = _strip_html(result.get("sectionsnippet", ""))

        items.append(make_item(
            title=title,
            url=url,
            snippet=snippet,
            word_count=result.get("wordcount"),
            size=result.get("size"),
            timestamp=result.get("timestamp"),
            page_id=page_id,
            # Empty section fields are passed as None rather than "".
            section_title=section_title if section_title else None,
            section_snippet=section_snippet if section_snippet else None,
        ))

    return items
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
import re
|
||||
text = re.sub(r"<[^>]+>", "", html)
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: search Wikipedia and print a JSON result.

    Takes the parser returned by ``build_parser`` and adds ``--lang``/``-l``.
    The hits from ``search`` are wrapped with ``make_result(True, ...)`` and
    printed. If anything raises, a ``make_result(False, ...)`` payload with
    the exception text is printed instead and the process exits with status 1.
    """
    cli = build_parser("搜索 Wikipedia 百科文章")
    cli.add_argument(
        "--lang",
        "-l",
        default="en",
        help="语言版本(默认 en,可选 zh, ja, de 等)",
    )
    opts = cli.parse_args()

    try:
        hits = search(opts.query, opts.limit, opts.lang)
        report = make_result(True, opts.query, "wikipedia", hits)
        print_json(report)
    except Exception as exc:
        # Top-level boundary: every failure is reported as JSON, then
        # signalled through the exit status.
        failure = make_result(False, opts.query, "wikipedia", [], str(exc))
        print_json(failure)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user