first commit

2026-05-10 13:52:46 +08:00
commit ccc63d1e70
4583 changed files with 584341 additions and 0 deletions
--- a/sn-search-social-cn/scripts/pycache/search_utils.cpython-311.pyc
+++ b/sn-search-social-cn/scripts/pycache/search_utils.cpython-311.pyc
--- a/sn-search-social-cn/scripts/bilibili_search.py
+++ b/sn-search-social-cn/scripts/bilibili_search.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""B站搜索。通过 Bilibili Web API（无需认证即可搜索）。"""
+from __future__ import annotations
+
+import sys
+
+from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
+
+
+# Bilibili web search endpoint requested by search().
+SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/all/v2"
+
+
+def search(query: str, limit: int, cookie: str | None = None, order: str = "") -> list[dict]:
+    """Run a Bilibili search and return normalized result items.
+
+    Args:
+        query: Search keyword, sent as the ``keyword`` request parameter.
+        limit: Maximum number of items to return. A non-positive value
+            returns an empty list without issuing a request.
+        cookie: Optional value for the ``Cookie`` request header.
+        order: Optional sort key; left out of the request when empty.
+
+    Returns:
+        At most ``limit`` items built with ``make_item``. Only the ``video``
+        and ``article`` result groups produce items.
+
+    Raises:
+        RuntimeError: If the ``code`` field of the response is not 0.
+        Any exception raised by the HTTP request or ``raise_for_status()``
+        propagates unchanged.
+    """
+    if limit <= 0:
+        return []
+
+    headers = {
+        "Referer": "https://www.bilibili.com",
+        "Origin": "https://www.bilibili.com",
+    }
+    if cookie:
+        headers["Cookie"] = cookie
+
+    params = {
+        "keyword": query,
+        "page": 1,
+        "page_size": min(limit, 50),
+    }
+    if order:
+        params["order"] = order
+
+    with get_client(headers=headers) as client:
+        resp = client.get(SEARCH_URL, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+
+    if data.get("code") != 0:
+        msg = data.get("message", "未知错误")
+        raise RuntimeError(f"B站 API 返回错误: {msg}")
+
+    items: list[dict] = []
+    # Results live in data.data.result, grouped by type. A JSON null comes
+    # back as None (the .get default is not used then), hence the `or`
+    # fallbacks below.
+    payload = data.get("data") or {}
+    for group in payload.get("result") or []:
+        result_type = group.get("result_type", "")
+        # NOTE(review): "media_bangumi" and "media_ft" used to pass this
+        # filter, but no item was ever built for them, so skipping them here
+        # gives the same output. Add a branch if those types are wanted.
+        if result_type not in ("video", "article"):
+            continue
+        for entry in group.get("data") or []:
+            if len(items) >= limit:
+                return items
+            title = _strip_html(entry.get("title") or "")
+            if result_type == "video":
+                bvid = entry.get("bvid", "")
+                url = f"https://www.bilibili.com/video/{bvid}" if bvid else entry.get("arcurl", "")
+                items.append(make_item(
+                    title=title,
+                    url=url,
+                    snippet=(entry.get("description") or "")[:300],
+                    author=entry.get("author", ""),
+                    play=entry.get("play", 0),
+                    like=entry.get("like", 0),
+                    pubdate=entry.get("pubdate"),
+                    type="video",
+                ))
+            else:  # article
+                url = f"https://www.bilibili.com/read/cv{entry.get('id', '')}"
+                items.append(make_item(
+                    title=title,
+                    url=url,
+                    snippet=(entry.get("desc") or "")[:300],
+                    author=entry.get("author_name", ""),
+                    view=entry.get("view", 0),
+                    type="article",
+                ))
+
+    return items[:limit]
+
+
+def _strip_html(html: str) -> str:
+    """Return *html* as plain text.
+
+    Tags are removed, HTML entities (``&amp;``, ``&lt;``, ``&#39;`` ...) are
+    decoded, and surrounding whitespace is trimmed. Empty or ``None`` input
+    returns an empty string.
+    """
+    import re
+    # Imported under another name because the parameter shadows the module.
+    from html import unescape
+
+    if not html:
+        return ""
+    # Strip tags first so that a decoded "&lt;b&gt;" stays literal text.
+    return unescape(re.sub(r"<[^>]+>", "", html)).strip()
+
+
+def main():
+    """CLI entry point: parse arguments, run the Bilibili search, print JSON.
+
+    On any exception a failure result is printed and the process exits
+    with status 1.
+    """
+    cli = build_parser("搜索 B站视频和文章")
+    cli.add_argument("--cookie", help="B站 Cookie（也可通过 BILIBILI_COOKIE 环境变量设置，可选）")
+    cli.add_argument(
+        "--order",
+        default="",
+        choices=["", "totalrank", "click", "pubdate", "dm", "stow"],
+        help="排序：空=综合, totalrank=最佳匹配, click=播放, pubdate=最新, dm=弹幕, stow=收藏",
+    )
+    opts = cli.parse_args()
+
+    # The CLI flag takes precedence over the BILIBILI_COOKIE variable.
+    cookie = get_key("BILIBILI_COOKIE", opts.cookie)
+    try:
+        found = search(opts.query, opts.limit, cookie, opts.order)
+        print_json(make_result(True, opts.query, "bilibili", found))
+    except Exception as exc:
+        failure = make_result(False, opts.query, "bilibili", [], str(exc))
+        print_json(failure)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/sn-search-social-cn/scripts/douyin_search.py
+++ b/sn-search-social-cn/scripts/douyin_search.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""抖音搜索。通过抖音 Web API（需要 cookie 认证，稳定性较低）。"""
+from __future__ import annotations
+
+import sys
+
+from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
+
+
+# Douyin web search endpoint requested by search().
+SEARCH_URL = "https://www.douyin.com/aweme/v1/web/general/search/single/"
+
+
+def search(query: str, limit: int, cookie: str | None = None) -> list[dict]:
+    """Run a Douyin search and return normalized result items.
+
+    Note: Douyin's anti-scraping is strict, so this script is not very
+    stable and the cookie may need to be refreshed often.
+
+    Args:
+        query: Search keyword.
+        limit: Maximum number of items to return. A non-positive value
+            returns an empty list without issuing a request.
+        cookie: Value for the ``Cookie`` request header (required).
+
+    Returns:
+        At most ``limit`` items built with ``make_item``.
+
+    Raises:
+        ValueError: If ``cookie`` is missing or empty.
+        RuntimeError: If the ``status_code`` field of the response is not 0.
+        Any exception raised by the HTTP request or ``raise_for_status()``
+        propagates unchanged.
+    """
+    from urllib.parse import quote
+
+    if not cookie:
+        raise ValueError("需要 DOUYIN_COOKIE 环境变量。请从浏览器开发者工具获取抖音 cookie。")
+    if limit <= 0:
+        return []
+
+    headers = {
+        "Cookie": cookie,
+        # Percent-encode the keyword: a raw non-ASCII query (e.g. Chinese)
+        # cannot be encoded as an HTTP header value and makes the request
+        # fail before it is sent.
+        "Referer": "https://www.douyin.com/search/" + quote(query, safe=""),
+        "Origin": "https://www.douyin.com",
+    }
+
+    params = {
+        "keyword": query,
+        "search_channel": "aweme_general",
+        "sort_type": 0,  # 0=comprehensive, 1=most liked, 2=newest
+        "publish_time": 0,  # 0=any time, 1=past day, 7=past week, 182=past half year
+        "count": min(limit, 20),
+        "offset": 0,
+        "need_filter_settings": 0,
+        "device_platform": "webapp",
+        "aid": 6383,
+    }
+
+    with get_client(timeout=20, headers=headers) as client:
+        resp = client.get(SEARCH_URL, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+
+    status_code = data.get("status_code", -1)
+    if status_code != 0:
+        msg = data.get("status_msg") or f"status_code={status_code}"
+        raise RuntimeError(f"抖音 API 错误: {msg}")
+
+    items = []
+    # `or []` / `or ""` also cover explicit JSON nulls, for which the
+    # .get default would not be used.
+    for entry in (data.get("data") or [])[:limit]:
+        # Fall back to the entry itself when there is no "aweme_info" key.
+        aweme = entry.get("aweme_info", entry)
+        if not aweme:
+            continue
+
+        desc = aweme.get("desc") or ""
+        aweme_id = aweme.get("aweme_id", "")
+        author = aweme.get("author", {}) or {}
+        stats = aweme.get("statistics", {}) or {}
+
+        items.append(make_item(
+            title=desc[:100],
+            url=f"https://www.douyin.com/video/{aweme_id}" if aweme_id else "",
+            snippet=desc[:300],
+            author=author.get("nickname", ""),
+            digg_count=stats.get("digg_count", 0),
+            comment_count=stats.get("comment_count", 0),
+            share_count=stats.get("share_count", 0),
+            play_count=stats.get("play_count", 0),
+            create_time=aweme.get("create_time"),
+        ))
+
+    return items
+
+
+def main():
+    """CLI entry point: parse arguments, run the Douyin search, print JSON.
+
+    On any exception a failure result is printed and the process exits
+    with status 1.
+    """
+    cli = build_parser("搜索抖音视频")
+    cli.add_argument("--cookie", help="抖音 Cookie（也可通过 DOUYIN_COOKIE 环境变量设置）")
+    opts = cli.parse_args()
+
+    # The CLI flag takes precedence over the DOUYIN_COOKIE variable.
+    cookie = get_key("DOUYIN_COOKIE", opts.cookie)
+    try:
+        found = search(opts.query, opts.limit, cookie)
+        print_json(make_result(True, opts.query, "douyin", found))
+    except Exception as exc:
+        failure = make_result(False, opts.query, "douyin", [], str(exc))
+        print_json(failure)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/sn-search-social-cn/scripts/search_utils.py
+++ b/sn-search-social-cn/scripts/search_utils.py
@@ -0,0 +1,150 @@
+"""
+搜索 Skill 共享工具库。
+
+提供标准 JSON 输出、CLI 脚手架、httpx helper 和配置读取。
+所有搜索脚本通过 sys.path 导入此模块。
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from typing import Any
+
+# httpx is needed by get_client(). If it cannot be imported, write a JSON
+# error object (same "success"/"error" keys as the normal output) to stdout
+# and exit with status 1 instead of surfacing a traceback.
+try:
+    import httpx
+except ImportError:
+    json.dump(
+        {
+            "success": False,
+            "error": "缺少 httpx，请运行：python3 -m pip install -r skills/sn-search-social-cn/requirements.txt",
+        },
+        sys.stdout,
+        ensure_ascii=False,
+    )
+    sys.stdout.write("\n")
+    sys.exit(1)
+
+# ---------------------------------------------------------------------------
+# 标准输出
+# ---------------------------------------------------------------------------
+
+def make_result(
+    success: bool,
+    query: str,
+    provider: str,
+    items: list[dict[str, Any]],
+    error: str | None = None,
+) -> dict[str, Any]:
+    """Build the standard search-result envelope.
+
+    The returned dict always has the keys ``success``, ``query``,
+    ``provider``, ``items`` and ``error``, in that order.
+    """
+    envelope: dict[str, Any] = dict(
+        success=success,
+        query=query,
+        provider=provider,
+        items=items,
+        error=error,
+    )
+    return envelope
+
+
+def make_item(
+    title: str,
+    url: str,
+    snippet: str = "",
+    **extra: Any,
+) -> dict[str, Any]:
+    """Build one standard search-result entry.
+
+    ``title``, ``url`` and ``snippet`` are always present. Extra keyword
+    fields are added only when their value is not ``None``, ``""``, ``[]``
+    or ``{}``; values such as ``0`` and ``False`` are kept.
+    """
+    blanks = (None, "", [], {})
+    entry: dict[str, Any] = {"title": title, "url": url, "snippet": snippet}
+    entry.update(
+        {name: value for name, value in extra.items() if value not in blanks}
+    )
+    return entry
+
+
+def print_json(data: dict[str, Any]) -> None:
+    """Write *data* to stdout as indented JSON, then a newline, then flush.
+
+    Non-ASCII characters are written as-is rather than escaped.
+    """
+    out = sys.stdout
+    json.dump(data, out, indent=2, ensure_ascii=False)
+    print(file=out, flush=True)
+
+
+# ---------------------------------------------------------------------------
+# CLI 脚手架
+# ---------------------------------------------------------------------------
+
+def build_parser(description: str) -> argparse.ArgumentParser:
+    """Create an ArgumentParser preloaded with the shared arguments.
+
+    The parser has a positional ``query`` and an integer ``--limit`` / ``-n``
+    option that defaults to 10. Callers may add their own options.
+    """
+    cli = argparse.ArgumentParser(description=description)
+    cli.add_argument("query", help="搜索关键词")
+    cli.add_argument(
+        "--limit",
+        "-n",
+        type=int,
+        default=10,
+        help="返回结果数量（默认 10）",
+    )
+    return cli
+
+
+# ---------------------------------------------------------------------------
+# httpx helper
+# ---------------------------------------------------------------------------
+
+# Default value of get_client()'s ``timeout`` argument.
+_DEFAULT_TIMEOUT = 15
+# Browser-style User-Agent string sent by get_client() unless the caller
+# overrides the "User-Agent" header.
+_DEFAULT_UA = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/125.0.0.0 Safari/537.36"
+)
+
+
+def get_client(
+    timeout: int = _DEFAULT_TIMEOUT,
+    headers: dict[str, str] | None = None,
+    **kwargs: Any,
+) -> httpx.Client:
+    """Return a pre-configured ``httpx.Client``.
+
+    The client follows redirects and sends a default ``User-Agent`` and
+    ``Accept: application/json``. Entries in *headers* are merged over those
+    defaults, and any extra keyword arguments are forwarded to
+    ``httpx.Client``.
+    """
+    # Caller-supplied headers override the defaults on key collision.
+    merged_headers = {
+        "User-Agent": _DEFAULT_UA,
+        "Accept": "application/json",
+        **(headers or {}),
+    }
+    client = httpx.Client(
+        timeout=timeout,
+        headers=merged_headers,
+        follow_redirects=True,
+        **kwargs,
+    )
+    return client
+
+
+# ---------------------------------------------------------------------------
+# 配置读取
+# ---------------------------------------------------------------------------
+
+def get_key(env_var: str, cli_arg: str | None = None) -> str | None:
+    """Resolve a credential: the CLI argument wins over the environment.
+
+    Returns *cli_arg* when it is non-empty, otherwise the value of the
+    environment variable *env_var*, or ``None`` if that is unset.
+    """
+    return cli_arg or os.environ.get(env_var)
+
+
+# ---------------------------------------------------------------------------
+# 脚本入口辅助
+# ---------------------------------------------------------------------------
+
+def run_search(
+    provider: str,
+    search_fn,  # Callable[[str, int, ...], list[dict]]
+    parser: argparse.ArgumentParser | None = None,
+    extra_kwargs_fn=None,  # Callable[[Namespace], dict]: pulls extra kwargs out of args
+) -> None:
+    """Generic script entry: parse arguments, run the search, print JSON.
+
+    Args:
+        provider: Provider name placed in the result envelope.
+        search_fn: Called as ``search_fn(query, limit, **extra)``.
+        parser: Parser to use; a default one is built when ``None``.
+        extra_kwargs_fn: Optional callable mapping the parsed args to the
+            extra keyword arguments for ``search_fn``.
+
+    If ``search_fn`` (or printing the result) raises, a failure result is
+    printed and the process exits with status 1.
+    """
+    cli = parser if parser is not None else build_parser(f"Search {provider}")
+    args = cli.parse_args()
+
+    extra = extra_kwargs_fn(args) if extra_kwargs_fn else {}
+
+    try:
+        found = search_fn(args.query, args.limit, **extra)
+        print_json(make_result(True, args.query, provider, found))
+    except Exception as exc:
+        failure = make_result(False, args.query, provider, [], str(exc))
+        print_json(failure)
+        sys.exit(1)
--- a/sn-search-social-cn/scripts/zhihu_search.py
+++ b/sn-search-social-cn/scripts/zhihu_search.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""知乎搜索。通过知乎内部 API（需要 cookie 认证）。"""
+from __future__ import annotations
+
+import re
+import sys
+import tempfile
+from datetime import datetime, timezone
+
+from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
+
+# Maximum number of characters of body text returned inline (longer bodies
+# are truncated and the full text is saved to a file).
+_CONTENT_INLINE_LIMIT = 2000
+
+
+# Zhihu search endpoint requested by search().
+SEARCH_URL = "https://www.zhihu.com/api/v4/search_v3"
+
+# Ad entry types: of no value for research, so they are filtered out.
+_AD_TYPES = {"education", "knowledge_ad"}
+
+
+def search(query: str, limit: int, cookie: str | None = None, search_type: str = "general") -> list[dict]:
+    """Run a Zhihu search and return normalized result items.
+
+    Args:
+        query: Search keyword, sent as the ``q`` request parameter.
+        limit: Maximum number of items to return. A non-positive value
+            returns an empty list without issuing a request.
+        cookie: Value for the ``Cookie`` request header (required).
+        search_type: Sent as the ``t`` request parameter.
+
+    Returns:
+        At most ``limit`` items; ad entries and objects that
+        ``_parse_object`` rejects are skipped.
+
+    Raises:
+        ValueError: If ``cookie`` is missing or empty.
+        Any exception raised by the HTTP request or ``raise_for_status()``
+        propagates unchanged.
+    """
+    if not cookie:
+        raise ValueError("需要 ZHIHU_COOKIE 环境变量。请从浏览器开发者工具获取知乎 cookie。")
+    if limit <= 0:
+        return []
+
+    headers = {
+        "Cookie": cookie,
+        "Referer": "https://www.zhihu.com/search",
+        "Origin": "https://www.zhihu.com",
+        "Accept": "application/json",
+    }
+
+    params = {
+        "q": query,
+        "t": search_type,
+        "offset": 0,
+        # Request more than needed to make up for filtered-out ad entries.
+        "limit": min(limit * 2, 20),
+    }
+
+    with get_client(headers=headers) as client:
+        resp = client.get(SEARCH_URL, params=params)
+        resp.raise_for_status()
+        data = resp.json()
+
+    items = []
+    # `or []` / `or ""` also cover explicit JSON nulls, for which the
+    # .get default would not be used.
+    for entry in data.get("data") or []:
+        if len(items) >= limit:
+            break
+        if entry.get("type") in _AD_TYPES:
+            continue
+
+        # Fall back to the entry itself when it has no (or an empty) "object".
+        obj = entry.get("object", {}) or entry
+        obj_type = obj.get("type") or ""
+
+        item = _parse_object(obj, obj_type)
+        if item:
+            items.append(item)
+
+    return items
+
+
+def _parse_object(obj: dict, obj_type: str) -> dict | None:
+    """Convert one API result object into a standard item.
+
+    Args:
+        obj: Result object taken from the search response.
+        obj_type: The object's ``type`` value; ``"answer"`` and
+            ``"question"`` get extra fields.
+
+    Returns:
+        An item built with ``make_item``, or ``None`` when the object has
+        neither a title nor a snippet.
+    """
+    title = _strip_html(obj.get("title") or obj.get("name") or "")
+    snippet = _strip_html(obj.get("excerpt") or obj.get("description") or "")[:300]
+
+    # Answer-specific: title and link of the question it belongs to.
+    question_title = ""
+    question_url = ""
+    answer_count = None
+    if obj_type == "answer":
+        # `or {}` also covers an explicit JSON null.
+        q = obj.get("question") or {}
+        question_title = _strip_html(q.get("name") or q.get("title") or "")
+        qid = q.get("id", "")
+        question_url = f"https://www.zhihu.com/question/{qid}" if qid else ""
+        answer_count = obj.get("answer_count")
+        # Answers carry no title of their own; fall back to the question's.
+        if not title:
+            title = question_title
+
+    # Question-specific.
+    if obj_type == "question":
+        answer_count = obj.get("answer_count")
+
+    if not title and not snippet:
+        return None
+
+    # Handle the body only after the object is known to be kept:
+    # _maybe_save_content may write a temp file, which would be orphaned if
+    # the object were discarded afterwards.
+    full_content = _strip_html(obj.get("content") or "")
+    content, content_file = _maybe_save_content(full_content, obj_type, obj.get("id", ""))
+
+    url = _build_url(obj, obj_type)
+
+    # Author info; anything that is not a dict is treated as "no author".
+    author_obj = obj.get("author", {})
+    if not isinstance(author_obj, dict):
+        author_obj = {}
+    author_name = author_obj.get("name", "")
+    author_headline = author_obj.get("headline", "")
+    author_followers = author_obj.get("follower_count")
+
+    # Engagement numbers.
+    voteup = obj.get("voteup_count") or 0
+    comment = obj.get("comment_count") or 0
+    favorites = obj.get("favorites_count") or obj.get("zfav_count") or 0
+    visits = obj.get("visits_count")  # answer-specific per the original note
+
+    # Timestamps as ISO 8601 so an agent can judge freshness.
+    created_at = _ts_to_iso(obj.get("created_time"))
+    updated_at = _ts_to_iso(obj.get("updated_time"))
+
+    return make_item(
+        title=title,
+        url=url,
+        snippet=snippet,
+        content=content,
+        content_file=content_file,
+        content_type=obj_type,
+        author=author_name,
+        author_headline=author_headline,
+        author_followers=author_followers,
+        voteup_count=voteup,
+        comment_count=comment,
+        favorites_count=favorites if favorites else None,
+        visits_count=visits,
+        answer_count=answer_count,
+        question_title=question_title if obj_type == "answer" else None,
+        question_url=question_url if obj_type == "answer" else None,
+        created_at=created_at,
+        updated_at=updated_at,
+    )
+
+
+def _maybe_save_content(full_content: str, obj_type: str, obj_id: str) -> tuple[str, str | None]:
+    """Return the body inline, spilling long bodies to a temporary file.
+
+    Bodies of at most ``_CONTENT_INLINE_LIMIT`` characters are returned
+    unchanged. Longer bodies are written in full to a temporary file that is
+    not deleted automatically; the inline text is the truncated body plus a
+    note giving the total length and the file path.
+
+    Args:
+        full_content: Plain-text body.
+        obj_type: Object type, used in the file name suffix.
+        obj_id: Object id, used in the file name suffix.
+
+    Returns:
+        ``(inline_content, file_path)``; ``file_path`` is ``None`` when
+        nothing was truncated.
+    """
+    if not full_content:
+        return "", None
+
+    if len(full_content) <= _CONTENT_INLINE_LIMIT:
+        return full_content, None
+
+    # obj_type / obj_id come from the API response. Keep only word characters
+    # and "-" so the suffix can never contain a path separator, which would
+    # make the file land outside the temp directory or fail to be created.
+    safe_type = re.sub(r"[^\w-]+", "_", str(obj_type))
+    safe_id = re.sub(r"[^\w-]+", "_", str(obj_id))
+    suffix = f"_zhihu_{safe_type}_{safe_id}.txt"
+    with tempfile.NamedTemporaryFile(
+        mode="w", encoding="utf-8", suffix=suffix, delete=False
+    ) as f:
+        f.write(full_content)
+        fpath = f.name
+
+    inline = (
+        full_content[:_CONTENT_INLINE_LIMIT]
+        + f"\n\n[内容已截断，共 {len(full_content)} 字，完整内容见: {fpath}]"
+    )
+    return inline, fpath
+
+
+def _build_url(obj: dict, obj_type: str) -> str:
+    """Build the user-facing web URL (not the API URL) for a result object.
+
+    Articles, answers and questions get a URL built from their ids. Any
+    other type uses the object's own ``url`` with the ``api.zhihu.com`` host
+    prefix replaced by ``www.zhihu.com``. Returns ``""`` when the required
+    id or URL is missing.
+    """
+    oid = obj.get("id", "")
+    if obj_type == "article":
+        return f"https://zhuanlan.zhihu.com/p/{oid}" if oid else ""
+    if obj_type == "answer":
+        # `or {}` also covers an explicit JSON null for "question".
+        q = obj.get("question") or {}
+        qid = q.get("id", "")
+        return f"https://www.zhihu.com/question/{qid}/answer/{oid}" if qid and oid else ""
+    if obj_type == "question":
+        return f"https://www.zhihu.com/question/{oid}" if oid else ""
+    # Other types: use the url carried by the object, if any (a null url is
+    # treated as missing).
+    raw = obj.get("url") or ""
+    # Swap the API host for the public web host.
+    return raw.replace("https://api.zhihu.com/", "https://www.zhihu.com/")
+
+
+def _strip_html(html: str) -> str:
+    """Return *html* as plain text.
+
+    Tags are removed, HTML entities (``&amp;``, ``&lt;``, ``&#34;`` ...) are
+    decoded, and surrounding whitespace is trimmed. Empty or ``None`` input
+    returns an empty string.
+    """
+    # Imported under another name because the parameter shadows the module.
+    from html import unescape
+
+    if not html:
+        return ""
+    # Strip tags first so that a decoded "&lt;b&gt;" stays literal text.
+    return unescape(re.sub(r"<[^>]+>", "", html)).strip()
+
+
+def _ts_to_iso(ts: int | None) -> str | None:
+    """Format a Unix timestamp as a UTC ``YYYY-MM-DDTHH:MM:SSZ`` string.
+
+    Returns ``None`` for a falsy timestamp (``None`` or ``0``).
+    """
+    if ts:
+        moment = datetime.fromtimestamp(ts, tz=timezone.utc)
+        return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
+    return None
+
+
+def main():
+    """CLI entry point: parse arguments, run the Zhihu search, print JSON.
+
+    On any exception a failure result is printed and the process exits
+    with status 1.
+    """
+    cli = build_parser("搜索知乎问答和文章")
+    cli.add_argument("--cookie", help="知乎 Cookie（也可通过 ZHIHU_COOKIE 环境变量设置）")
+    cli.add_argument(
+        "--type",
+        default="general",
+        choices=["general", "topic", "people", "zvideo"],
+        help="搜索类型（默认 general）",
+    )
+    opts = cli.parse_args()
+
+    # The CLI flag takes precedence over the ZHIHU_COOKIE variable.
+    cookie = get_key("ZHIHU_COOKIE", opts.cookie)
+    try:
+        found = search(opts.query, opts.limit, cookie, opts.type)
+        print_json(make_result(True, opts.query, "zhihu", found))
+    except Exception as exc:
+        failure = make_result(False, opts.query, "zhihu", [], str(exc))
+        print_json(failure)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()