#!/usr/bin/env python3
"""Zhihu search via Zhihu's internal API (requires cookie authentication)."""

from __future__ import annotations

import re
import sys
import tempfile
from datetime import datetime, timezone

from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json

# Maximum number of characters of body text returned inline; anything longer
# is truncated and the full text is written to a file.
_CONTENT_INLINE_LIMIT = 2000

SEARCH_URL = "https://www.zhihu.com/api/v4/search_v3"

# Ad entry types: no value for research, filtered out directly.
_AD_TYPES = {"education", "knowledge_ad"}

# Compiled once: matches any HTML/XML tag.
_TAG_RE = re.compile(r"<[^>]+>")

# Characters that must not reach a file name built from API-supplied values.
_FILENAME_UNSAFE_RE = re.compile(r"[^\w.-]")


def _as_dict(value: object) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict.

    The API may send ``null`` (or another type) where a nested object is
    expected; normalising here keeps ``.get`` chains from raising.
    """
    return value if isinstance(value, dict) else {}


def search(query: str, limit: int, cookie: str | None = None, search_type: str = "general") -> list[dict]:
    """Run a Zhihu search and return at most *limit* parsed items.

    Args:
        query: Search keywords.
        limit: Maximum number of items to return. At most 20 entries are
            requested from the API, so fewer items may come back.
        cookie: Zhihu cookie string used for authentication.
        search_type: Value sent as the ``t`` query parameter.

    Returns:
        Items built by ``make_item``; ad entries and entries with neither
        title nor snippet are dropped.

    Raises:
        ValueError: If *cookie* is empty or ``None``.
        Exception: Whatever the HTTP client raises from the request or from
            ``raise_for_status()``.
    """
    if not cookie:
        raise ValueError("需要 ZHIHU_COOKIE 环境变量。请从浏览器开发者工具获取知乎 cookie。")

    headers = {
        "Cookie": cookie,
        "Referer": "https://www.zhihu.com/search",
        "Origin": "https://www.zhihu.com",
        "Accept": "application/json",
    }
    params = {
        "q": query,
        "t": search_type,
        "offset": 0,
        # Request extra entries to make up for the ads that get filtered out.
        "limit": min(limit * 2, 20),
    }

    with get_client(headers=headers) as client:
        resp = client.get(SEARCH_URL, params=params)
        resp.raise_for_status()
        data = resp.json()

    items: list[dict] = []
    # ``or []`` covers an explicit ``"data": null`` in the response.
    for entry in data.get("data") or []:
        if len(items) >= limit:
            break
        if not isinstance(entry, dict):
            continue
        if entry.get("type") in _AD_TYPES:
            continue
        # Some entries carry their payload under "object"; others are the payload.
        obj = entry.get("object") or entry
        if not isinstance(obj, dict):
            continue
        item = _parse_object(obj, obj.get("type", ""))
        if item:
            items.append(item)
    return items


def _parse_object(obj: dict, obj_type: str) -> dict | None:
    """Convert one API result object into a standard item.

    Args:
        obj: The result object from the search response.
        obj_type: The object's ``type`` field (e.g. ``"answer"``,
            ``"article"``, ``"question"``).

    Returns:
        The item built by ``make_item``, or ``None`` when the object has
        neither a title nor a snippet.
    """
    title = _strip_html(obj.get("title") or obj.get("name") or "")
    snippet = _strip_html(obj.get("excerpt") or obj.get("description") or "")[:300]

    # Answer-only fields: title and link of the question being answered.
    question_title = None
    question_url = None
    if obj_type == "answer":
        question = _as_dict(obj.get("question"))
        question_title = _strip_html(question.get("name") or question.get("title") or "")
        qid = question.get("id", "")
        question_url = f"https://www.zhihu.com/question/{qid}" if qid else ""
        # Answers have no title of their own; fall back to the question title.
        if not title:
            title = question_title

    # Decide whether to keep the entry BEFORE touching the filesystem, so a
    # discarded entry never leaves a temp file behind.
    if not title and not snippet:
        return None

    full_content = _strip_html(obj.get("content") or "")
    content, content_file = _maybe_save_content(full_content, obj_type, obj.get("id", ""))

    # Author info.
    author = _as_dict(obj.get("author"))

    # Engagement metrics.
    favorites = obj.get("favorites_count") or obj.get("zfav_count") or 0

    # Only answers and questions report an answer count.
    answer_count = obj.get("answer_count") if obj_type in ("answer", "question") else None

    return make_item(
        title=title,
        url=_build_url(obj, obj_type),
        snippet=snippet,
        content=content,
        content_file=content_file,
        content_type=obj_type,
        author=author.get("name", ""),
        author_headline=author.get("headline", ""),
        author_followers=author.get("follower_count"),
        voteup_count=obj.get("voteup_count") or 0,
        comment_count=obj.get("comment_count") or 0,
        favorites_count=favorites if favorites else None,
        visits_count=obj.get("visits_count"),
        answer_count=answer_count,
        question_title=question_title,
        question_url=question_url,
        # Timestamps as ISO 8601 so consumers can judge freshness.
        created_at=_ts_to_iso(obj.get("created_time")),
        updated_at=_ts_to_iso(obj.get("updated_time")),
    )


def _maybe_save_content(full_content: str, obj_type: str, obj_id: str | int) -> tuple[str, str | None]:
    """Return body text inline, spilling over-long text to a temp file.

    Args:
        full_content: Plain-text body.
        obj_type: Object type, used in the temp file name.
        obj_id: Object id, used in the temp file name.

    Returns:
        ``(inline_content, file_path)``. ``file_path`` is ``None`` when the
        text fit within ``_CONTENT_INLINE_LIMIT`` and nothing was written;
        otherwise ``inline_content`` is the truncated text plus a notice
        pointing at ``file_path``, which holds the full text.
    """
    if not full_content:
        return "", None
    if len(full_content) <= _CONTENT_INLINE_LIMIT:
        return full_content, None

    # Both values come from the API response; strip anything that could act
    # as a path separator before using them in a file name.
    safe_type = _FILENAME_UNSAFE_RE.sub("_", str(obj_type))
    safe_id = _FILENAME_UNSAFE_RE.sub("_", str(obj_id))
    suffix = f"_zhihu_{safe_type}_{safe_id}.txt"

    # delete=False: the file must outlive this process so the caller can read it.
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=suffix, delete=False
    ) as f:
        f.write(full_content)
        fpath = f.name

    inline = (
        full_content[:_CONTENT_INLINE_LIMIT]
        + f"\n\n[内容已截断,共 {len(full_content)} 字,完整内容见: {fpath}]"
    )
    return inline, fpath


def _build_url(obj: dict, obj_type: str) -> str:
    """Build the user-facing web URL (not the API URL) for a result object.

    Returns an empty string when the ids needed for the URL are missing.
    """
    oid = obj.get("id", "")
    if obj_type == "article":
        return f"https://zhuanlan.zhihu.com/p/{oid}" if oid else ""
    if obj_type == "answer":
        qid = _as_dict(obj.get("question")).get("id", "")
        return f"https://www.zhihu.com/question/{qid}/answer/{oid}" if qid and oid else ""
    if obj_type == "question":
        return f"https://www.zhihu.com/question/{oid}" if oid else ""

    # Other types: use the object's own url (if any), pointing it at the
    # website host instead of the API host.
    raw = obj.get("url")
    if not isinstance(raw, str):
        return ""
    return raw.replace("https://api.zhihu.com/", "https://www.zhihu.com/")


def _strip_html(html: str) -> str:
    """Remove HTML tags from *html* and trim surrounding whitespace."""
    return _TAG_RE.sub("", html).strip()


def _ts_to_iso(ts: int | float | None) -> str | None:
    """Convert a Unix timestamp (seconds) to a UTC ISO 8601 string.

    Returns ``None`` for a missing/zero timestamp, and also for a value that
    cannot be converted, so one malformed field does not abort the search.
    """
    if not ts:
        return None
    try:
        moment = datetime.fromtimestamp(ts, tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return None
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")


def main() -> None:
    """CLI entry point: parse arguments, run the search, print a JSON result.

    On any failure a failed result carrying the error message is printed and
    the process exits with status 1.
    """
    parser = build_parser("搜索知乎问答和文章")
    parser.add_argument("--cookie", help="知乎 Cookie(也可通过 ZHIHU_COOKIE 环境变量设置)")
    parser.add_argument(
        "--type",
        default="general",
        choices=["general", "topic", "people", "zvideo"],
        help="搜索类型(默认 general)",
    )
    args = parser.parse_args()

    cookie = get_key("ZHIHU_COOKIE", args.cookie)

    # Top-level boundary: report every failure as a structured JSON result.
    try:
        items = search(args.query, args.limit, cookie, args.type)
        print_json(make_result(True, args.query, "zhihu", items))
    except Exception as e:
        print_json(make_result(False, args.query, "zhihu", [], str(e)))
        sys.exit(1)


if __name__ == "__main__":
    main()