first commit
This commit is contained in:
203
sn-search-social-cn/scripts/zhihu_search.py
Normal file
203
sn-search-social-cn/scripts/zhihu_search.py
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env python3
|
||||
"""知乎搜索。通过知乎内部 API(需要 cookie 认证)。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
|
||||
|
||||
# 正文内联截断长度(超出部分存文件)
|
||||
_CONTENT_INLINE_LIMIT = 2000
|
||||
|
||||
|
||||
SEARCH_URL = "https://www.zhihu.com/api/v4/search_v3"
|
||||
|
||||
# 广告类型,对研究无价值,直接过滤
|
||||
_AD_TYPES = {"education", "knowledge_ad"}
|
||||
|
||||
|
||||
def search(query: str, limit: int, cookie: str | None = None, search_type: str = "general") -> list[dict]:
|
||||
"""执行知乎搜索。"""
|
||||
if not cookie:
|
||||
raise ValueError("需要 ZHIHU_COOKIE 环境变量。请从浏览器开发者工具获取知乎 cookie。")
|
||||
|
||||
headers = {
|
||||
"Cookie": cookie,
|
||||
"Referer": "https://www.zhihu.com/search",
|
||||
"Origin": "https://www.zhihu.com",
|
||||
"Accept": "application/json",
|
||||
}
|
||||
|
||||
params = {
|
||||
"q": query,
|
||||
"t": search_type,
|
||||
"offset": 0,
|
||||
# 多请求一些以弥补过滤掉广告条目的损失
|
||||
"limit": min(limit * 2, 20),
|
||||
}
|
||||
|
||||
with get_client(headers=headers) as client:
|
||||
resp = client.get(SEARCH_URL, params=params)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
items = []
|
||||
for entry in data.get("data", []):
|
||||
if len(items) >= limit:
|
||||
break
|
||||
if entry.get("type") in _AD_TYPES:
|
||||
continue
|
||||
|
||||
obj = entry.get("object", {}) or entry
|
||||
obj_type = obj.get("type", "")
|
||||
|
||||
item = _parse_object(obj, obj_type)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
return items
|
||||
|
||||
|
||||
def _parse_object(obj: dict, obj_type: str) -> dict | None:
|
||||
"""将 API 返回的 object 解析为标准条目。"""
|
||||
title = _strip_html(obj.get("title") or obj.get("name") or "")
|
||||
snippet = _strip_html(obj.get("excerpt") or obj.get("description") or "")[:300]
|
||||
full_content = _strip_html(obj.get("content") or "")
|
||||
content, content_file = _maybe_save_content(full_content, obj_type, obj.get("id", ""))
|
||||
|
||||
url = _build_url(obj, obj_type)
|
||||
|
||||
# 作者信息
|
||||
author_obj = obj.get("author", {})
|
||||
author_name = author_obj.get("name", "") if isinstance(author_obj, dict) else ""
|
||||
author_headline = author_obj.get("headline", "") if isinstance(author_obj, dict) else ""
|
||||
author_followers = author_obj.get("follower_count") if isinstance(author_obj, dict) else None
|
||||
|
||||
# 互动数据
|
||||
voteup = obj.get("voteup_count") or 0
|
||||
comment = obj.get("comment_count") or 0
|
||||
favorites = obj.get("favorites_count") or obj.get("zfav_count") or 0
|
||||
visits = obj.get("visits_count") # answer 特有
|
||||
|
||||
# 时间(转为 ISO 8601,方便 agent 判断时效性)
|
||||
created_at = _ts_to_iso(obj.get("created_time"))
|
||||
updated_at = _ts_to_iso(obj.get("updated_time"))
|
||||
|
||||
# answer 专属:所属问题的标题和链接
|
||||
question_title = ""
|
||||
question_url = ""
|
||||
answer_count = None
|
||||
if obj_type == "answer":
|
||||
q = obj.get("question", {})
|
||||
question_title = _strip_html(q.get("name") or q.get("title") or "")
|
||||
qid = q.get("id", "")
|
||||
question_url = f"https://www.zhihu.com/question/{qid}" if qid else ""
|
||||
answer_count = obj.get("answer_count")
|
||||
# answer 没有独立 title,用问题标题补充
|
||||
if not title:
|
||||
title = question_title
|
||||
|
||||
# question 专属
|
||||
if obj_type == "question":
|
||||
answer_count = obj.get("answer_count")
|
||||
|
||||
if not title and not snippet:
|
||||
return None
|
||||
|
||||
return make_item(
|
||||
title=title,
|
||||
url=url,
|
||||
snippet=snippet,
|
||||
content=content,
|
||||
content_file=content_file,
|
||||
content_type=obj_type,
|
||||
author=author_name,
|
||||
author_headline=author_headline,
|
||||
author_followers=author_followers,
|
||||
voteup_count=voteup,
|
||||
comment_count=comment,
|
||||
favorites_count=favorites if favorites else None,
|
||||
visits_count=visits,
|
||||
answer_count=answer_count,
|
||||
question_title=question_title if obj_type == "answer" else None,
|
||||
question_url=question_url if obj_type == "answer" else None,
|
||||
created_at=created_at,
|
||||
updated_at=updated_at,
|
||||
)
|
||||
|
||||
|
||||
def _maybe_save_content(full_content: str, obj_type: str, obj_id: str) -> tuple[str, str | None]:
|
||||
"""处理正文:短内容直接返回,长内容截断并将完整版存为临时文件。
|
||||
|
||||
返回 (inline_content, file_path),file_path 为 None 表示未截断。
|
||||
"""
|
||||
if not full_content:
|
||||
return "", None
|
||||
|
||||
if len(full_content) <= _CONTENT_INLINE_LIMIT:
|
||||
return full_content, None
|
||||
|
||||
# 超出截断限制,写入临时文件
|
||||
suffix = f"_zhihu_{obj_type}_{obj_id}.txt"
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w", encoding="utf-8", suffix=suffix, delete=False
|
||||
) as f:
|
||||
f.write(full_content)
|
||||
fpath = f.name
|
||||
|
||||
inline = (
|
||||
full_content[:_CONTENT_INLINE_LIMIT]
|
||||
+ f"\n\n[内容已截断,共 {len(full_content)} 字,完整内容见: {fpath}]"
|
||||
)
|
||||
return inline, fpath
|
||||
|
||||
|
||||
def _build_url(obj: dict, obj_type: str) -> str:
|
||||
"""构造面向用户的 Web URL(而非 API URL)。"""
|
||||
oid = obj.get("id", "")
|
||||
if obj_type == "article":
|
||||
return f"https://zhuanlan.zhihu.com/p/{oid}" if oid else ""
|
||||
if obj_type == "answer":
|
||||
q = obj.get("question", {})
|
||||
qid = q.get("id", "")
|
||||
return f"https://www.zhihu.com/question/{qid}/answer/{oid}" if qid and oid else ""
|
||||
if obj_type == "question":
|
||||
return f"https://www.zhihu.com/question/{oid}" if oid else ""
|
||||
# 其他类型直接返回 obj 里的 url(若有)
|
||||
raw = obj.get("url", "")
|
||||
# 将 api.zhihu.com 替换为 www.zhihu.com
|
||||
return raw.replace("https://api.zhihu.com/", "https://www.zhihu.com/")
|
||||
|
||||
|
||||
def _strip_html(html: str) -> str:
|
||||
return re.sub(r"<[^>]+>", "", html).strip()
|
||||
|
||||
|
||||
def _ts_to_iso(ts: int | None) -> str | None:
|
||||
if not ts:
|
||||
return None
|
||||
return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def main():
|
||||
parser = build_parser("搜索知乎问答和文章")
|
||||
parser.add_argument("--cookie", help="知乎 Cookie(也可通过 ZHIHU_COOKIE 环境变量设置)")
|
||||
parser.add_argument("--type", default="general",
|
||||
choices=["general", "topic", "people", "zvideo"],
|
||||
help="搜索类型(默认 general)")
|
||||
args = parser.parse_args()
|
||||
|
||||
cookie = get_key("ZHIHU_COOKIE", args.cookie)
|
||||
try:
|
||||
items = search(args.query, args.limit, cookie, args.type)
|
||||
print_json(make_result(True, args.query, "zhihu", items))
|
||||
except Exception as e:
|
||||
print_json(make_result(False, args.query, "zhihu", [], str(e)))
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user