first commit
This commit is contained in:
89
sn-search-code/scripts/hackernews_search.py
Normal file
89
sn-search-code/scripts/hackernews_search.py
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
"""Hacker News search, backed by the Algolia HN Search API."""
from __future__ import annotations

import sys

from search_utils import build_parser, get_client, make_item, make_result, print_json


# Base URL of the Algolia HN Search API; the endpoint name is appended to it.
API_URL = "https://hn.algolia.com/api/v1"
|
||||
|
||||
|
||||
def search(query: str, limit: int, sort: str = "relevance", tags: str | None = None) -> list[dict]:
    """Run a Hacker News search and return the hits as result items.

    Args:
        query: Search text, sent as the ``query`` request parameter.
        limit: Maximum number of items to return. The request asks for
            ``min(limit, 100)`` hits per page.
        sort: ``"relevance"`` selects the ``search`` endpoint; any other
            value selects ``search_by_date``.
        tags: Optional Algolia tag filter, e.g. "story", "comment",
            "ask_hn", "show_hn". Omitted from the request when falsy.

    Returns:
        One ``make_item(...)`` value per hit, in response order.

    Raises:
        Whatever ``raise_for_status()`` on the response raises
        (presumably an HTTP status error from the client library).
    """
    # Only these values from a hit's ``_tags`` describe the content type.
    content_kinds = ("story", "comment", "ask_hn", "show_hn", "job", "poll")

    # "search" ranks by relevance, "search_by_date" ranks by time.
    if sort == "relevance":
        endpoint = "search"
    else:
        endpoint = "search_by_date"

    request_params: dict = {"query": query, "hitsPerPage": min(limit, 100)}
    if tags:
        request_params["tags"] = tags

    with get_client() as client:
        response = client.get(f"{API_URL}/{endpoint}", params=request_params)
        response.raise_for_status()
        payload = response.json()

    results = []
    for hit in payload.get("hits", [])[:limit]:
        # Link to the HN discussion page for this hit.
        discussion_url = f"https://news.ycombinator.com/item?id={hit.get('objectID', '')}"

        # _tags looks like ["story", "author_xxx", "story_43998472"];
        # take the first one that names a content type, if any.
        first_kind = next(
            (tag for tag in (hit.get("_tags") or []) if tag in content_kinds),
            None,
        )

        body_html = hit.get("comment_text") or hit.get("story_text") or ""

        results.append(
            make_item(
                title=hit.get("title") or hit.get("story_title") or "",
                # Prefer the external link; fall back to the HN page.
                url=hit.get("url") or discussion_url,
                snippet=_strip_html(body_html),
                hn_url=discussion_url,
                points=hit.get("points"),
                num_comments=hit.get("num_comments"),
                author=hit.get("author"),
                created_at=hit.get("created_at"),
                type=first_kind,
            )
        )
    return results
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
    """Convert an HTML fragment into a single line of plain text.

    Tags are replaced by spaces, character references are decoded, and
    every run of whitespace is collapsed to one space with both ends
    trimmed.

    Args:
        html: HTML fragment; may be empty.

    Returns:
        The plain-text content, or "" when only markup/whitespace remains.
    """
    import html as html_mod
    import re

    # Remove tags before decoding so escaped markup such as "&lt;b&gt;"
    # survives as literal text instead of being stripped.
    text = re.sub(r"<[^>]+>", " ", html)
    text = html_mod.unescape(text)
    # Normalise whitespace only after decoding, so whitespace that arrived
    # as entities (&nbsp;, &#10;, ...) is collapsed and trimmed as well.
    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: parse arguments, search, print JSON.

    Extends the parser from ``build_parser`` with ``--sort`` and ``--tags``.
    On success prints ``make_result(True, ...)`` with the found items. If
    anything raises, prints a failure result carrying the exception text
    and exits with status 1.
    """
    cli = build_parser("搜索 Hacker News 新闻和讨论")
    cli.add_argument(
        "--sort",
        choices=["relevance", "date"],
        default="relevance",
        help="排序方式(默认 relevance)",
    )
    cli.add_argument("--tags", help="HN 标签过滤(story, comment, ask_hn, show_hn)")
    opts = cli.parse_args()

    try:
        found = search(opts.query, opts.limit, opts.sort, opts.tags)
        outcome = make_result(True, opts.query, "hackernews", found)
        print_json(outcome)
    except Exception as exc:
        # Top-level boundary: report the failure as JSON rather than a traceback.
        failure = make_result(False, opts.query, "hackernews", [], str(exc))
        print_json(failure)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user