first commit

This commit is contained in:
Hermes Agent
2026-05-10 13:52:46 +08:00
commit ccc63d1e70
4583 changed files with 584341 additions and 0 deletions

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""B站搜索。通过 Bilibili Web API无需认证即可搜索"""
from __future__ import annotations
import sys
from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
# Bilibili web search endpoint; search() issues a GET request against it.
SEARCH_URL = "https://api.bilibili.com/x/web-interface/search/all/v2"
def search(query: str, limit: int, cookie: str | None = None, order: str = "") -> list[dict]:
    """Search Bilibili and return normalized result items.

    Sends a single GET request to ``SEARCH_URL`` and flattens the grouped
    response (``data.data.result``) into a list of items built by
    ``make_item``.

    Args:
        query: Search keyword.
        limit: Maximum number of items to return; at most 50 are requested.
        cookie: Optional raw ``Cookie`` header value; sent only when truthy.
        order: Optional sort order, forwarded as the ``order`` query
            parameter; omitted when empty.

    Returns:
        Up to ``limit`` items taken from the ``video`` and ``article`` groups.

    Raises:
        RuntimeError: If the response's ``code`` field is not 0.
        Exception: Whatever ``resp.raise_for_status()`` raises for an HTTP
            error status.
    """
    headers = {
        "Referer": "https://www.bilibili.com",
        "Origin": "https://www.bilibili.com",
    }
    if cookie:
        headers["Cookie"] = cookie

    params = {
        "keyword": query,
        "page": 1,
        "page_size": min(limit, 50),
    }
    if order:
        params["order"] = order

    with get_client(headers=headers) as client:
        resp = client.get(SEARCH_URL, params=params)
        resp.raise_for_status()
        data = resp.json()

    if data.get("code") != 0:
        msg = data.get("message", "未知错误")
        raise RuntimeError(f"B站 API 返回错误: {msg}")

    items: list[dict] = []
    # Results live in data.data.result, grouped by type. Any level may be
    # present but null, so fall back to an empty container instead of crashing.
    result_groups = (data.get("data") or {}).get("result") or []
    for group in result_groups:
        result_type = group.get("result_type", "")
        # NOTE: media_bangumi / media_ft pass this filter, but no branch below
        # converts them, so those groups currently contribute no items.
        if result_type not in ("video", "media_bangumi", "media_ft", "article"):
            continue
        for entry in group.get("data") or []:
            title = _strip_html(entry.get("title") or "")
            if result_type == "video":
                bvid = entry.get("bvid", "")
                url = f"https://www.bilibili.com/video/{bvid}" if bvid else entry.get("arcurl", "")
                items.append(make_item(
                    title=title,
                    url=url,
                    snippet=(entry.get("description") or "")[:300],
                    author=entry.get("author", ""),
                    play=entry.get("play", 0),
                    like=entry.get("like", 0),
                    pubdate=entry.get("pubdate"),
                    type="video",
                ))
            elif result_type == "article":
                article_id = entry.get("id")
                # Without an id the link would be the meaningless ".../read/cv".
                url = f"https://www.bilibili.com/read/cv{article_id}" if article_id else ""
                items.append(make_item(
                    title=title,
                    url=url,
                    snippet=(entry.get("desc") or "")[:300],
                    author=entry.get("author_name", ""),
                    view=entry.get("view", 0),
                    type="article",
                ))
            if len(items) >= limit:
                break
        if len(items) >= limit:
            break
    return items[:limit]
def _strip_html(html: str) -> str:
import re
return re.sub(r"<[^>]+>", "", html).strip()
def main():
    """Command-line entry point for the Bilibili search script.

    Parses the shared arguments plus ``--cookie`` and ``--order``, runs
    ``search`` and prints the result as JSON. On any exception a failure
    result is printed and the process exits with status 1.
    """
    cli = build_parser("搜索 B站视频和文章")
    cli.add_argument("--cookie", help="B站 Cookie也可通过 BILIBILI_COOKIE 环境变量设置,可选)")
    sort_choices = ["", "totalrank", "click", "pubdate", "dm", "stow"]
    cli.add_argument(
        "--order",
        default="",
        choices=sort_choices,
        help="排序:空=综合, totalrank=最佳匹配, click=播放, pubdate=最新, dm=弹幕, stow=收藏",
    )
    opts = cli.parse_args()
    keyword = opts.query
    # An explicit --cookie takes precedence over the BILIBILI_COOKIE variable.
    cookie = get_key("BILIBILI_COOKIE", opts.cookie)
    try:
        found = search(keyword, opts.limit, cookie, opts.order)
        print_json(make_result(True, keyword, "bilibili", found))
    except Exception as exc:
        failure = make_result(False, keyword, "bilibili", [], str(exc))
        print_json(failure)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""抖音搜索。通过抖音 Web API需要 cookie 认证,稳定性较低)。"""
from __future__ import annotations
import sys
from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
# Douyin web search endpoint; search() issues a GET request against it.
SEARCH_URL = "https://www.douyin.com/aweme/v1/web/general/search/single/"
def search(query: str, limit: int, cookie: str | None = None) -> list[dict]:
    """Search Douyin and return normalized result items.

    The original author notes that Douyin's anti-scraping measures are
    strict, so this script is fragile and the cookie may need frequent
    refreshing.

    Args:
        query: Search keyword.
        limit: Maximum number of items to return; at most 20 are requested.
        cookie: Raw ``Cookie`` header value. Required.

    Returns:
        One item per usable entry among the first ``limit`` response entries.

    Raises:
        ValueError: If ``cookie`` is missing or empty.
        RuntimeError: If the response's ``status_code`` field is not 0.
        Exception: Whatever ``resp.raise_for_status()`` raises for an HTTP
            error status.
    """
    from urllib.parse import quote

    if not cookie:
        raise ValueError("需要 DOUYIN_COOKIE 环境变量。请从浏览器开发者工具获取抖音 cookie。")

    headers = {
        "Cookie": cookie,
        # Header values must be encodable; a raw non-ASCII keyword (e.g. Chinese)
        # would fail when the header is encoded, so percent-encode the query.
        "Referer": "https://www.douyin.com/search/" + quote(query, safe=""),
        "Origin": "https://www.douyin.com",
    }
    params = {
        "keyword": query,
        "search_channel": "aweme_general",
        "sort_type": 0,  # 0 = comprehensive, 1 = most liked, 2 = newest
        "publish_time": 0,  # 0 = any, 1 = last day, 7 = last week, 182 = last half year
        "count": min(limit, 20),
        "offset": 0,
        "need_filter_settings": 0,
        "device_platform": "webapp",
        "aid": 6383,
    }

    with get_client(timeout=20, headers=headers) as client:
        resp = client.get(SEARCH_URL, params=params)
        resp.raise_for_status()
        data = resp.json()

    status_code = data.get("status_code", -1)
    if status_code != 0:
        msg = data.get("status_msg") or f"status_code={status_code}"
        raise RuntimeError(f"抖音 API 错误: {msg}")

    items = []
    # "data" may be present but null; treat that as an empty result list.
    entries = data.get("data") or []
    for entry in entries[:limit]:
        # Fall back to the entry itself when there is no "aweme_info" key.
        aweme = entry.get("aweme_info", entry)
        if not aweme:
            continue
        desc = aweme.get("desc") or ""
        aweme_id = aweme.get("aweme_id", "")
        author = aweme.get("author", {}) or {}
        stats = aweme.get("statistics", {}) or {}
        items.append(make_item(
            title=desc[:100],
            url=f"https://www.douyin.com/video/{aweme_id}" if aweme_id else "",
            snippet=desc[:300],
            author=author.get("nickname", ""),
            digg_count=stats.get("digg_count", 0),
            comment_count=stats.get("comment_count", 0),
            share_count=stats.get("share_count", 0),
            play_count=stats.get("play_count", 0),
            create_time=aweme.get("create_time"),
        ))
    return items
def main():
    """Command-line entry point for the Douyin search script.

    Parses the shared arguments plus ``--cookie``, runs ``search`` and prints
    the result as JSON. On any exception a failure result is printed and the
    process exits with status 1.
    """
    cli = build_parser("搜索抖音视频")
    cli.add_argument("--cookie", help="抖音 Cookie也可通过 DOUYIN_COOKIE 环境变量设置)")
    opts = cli.parse_args()
    keyword = opts.query
    # An explicit --cookie takes precedence over the DOUYIN_COOKIE variable.
    cookie = get_key("DOUYIN_COOKIE", opts.cookie)
    try:
        found = search(keyword, opts.limit, cookie)
        print_json(make_result(True, keyword, "douyin", found))
    except Exception as exc:
        failure = make_result(False, keyword, "douyin", [], str(exc))
        print_json(failure)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,150 @@
"""
搜索 Skill 共享工具库。
提供标准 JSON 输出、CLI 脚手架、httpx helper 和配置读取。
所有搜索脚本通过 sys.path 导入此模块。
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from typing import Any
try:
import httpx
except ImportError:
json.dump(
{
"success": False,
"error": "缺少 httpx请运行python3 -m pip install -r skills/sn-search-social-cn/requirements.txt",
},
sys.stdout,
ensure_ascii=False,
)
sys.stdout.write("\n")
sys.exit(1)
# ---------------------------------------------------------------------------
# 标准输出
# ---------------------------------------------------------------------------
def make_result(
    success: bool,
    query: str,
    provider: str,
    items: list[dict[str, Any]],
    error: str | None = None,
) -> dict[str, Any]:
    """Build the standard result envelope emitted by every search script.

    The returned dict always carries the keys ``success``, ``query``,
    ``provider``, ``items`` and ``error``, in that order.
    """
    envelope: dict[str, Any] = dict(
        success=success,
        query=query,
        provider=provider,
        items=items,
        error=error,
    )
    return envelope
def make_item(
    title: str,
    url: str,
    snippet: str = "",
    **extra: Any,
) -> dict[str, Any]:
    """Build one standard result item.

    ``title``, ``url`` and ``snippet`` are always present. Extra keyword
    fields are appended in call order, except those equal to ``None``,
    ``""``, ``[]`` or ``{}``, which are dropped. Zero values are kept.
    """
    empty_values = (None, "", [], {})
    kept = {name: value for name, value in extra.items() if value not in empty_values}
    return {"title": title, "url": url, "snippet": snippet, **kept}
def print_json(data: dict[str, Any]) -> None:
    """Write ``data`` to stdout as indented, non-ASCII-preserving JSON plus a newline.

    The document is serialized completely before anything is written. A value
    that cannot be serialized therefore raises without leaving a partial JSON
    document on stdout, so a caller that reports the failure by printing a
    second document still produces parseable output.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
    """
    text = json.dumps(data, ensure_ascii=False, indent=2)
    sys.stdout.write(text + "\n")
    sys.stdout.flush()
# ---------------------------------------------------------------------------
# CLI 脚手架
# ---------------------------------------------------------------------------
def build_parser(description: str) -> argparse.ArgumentParser:
    """Create an ArgumentParser preloaded with the arguments every script shares.

    Adds the positional ``query`` and the optional ``--limit`` / ``-n``
    (integer, default 10).
    """
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("query", help="搜索关键词")
    limit_options = dict(type=int, default=10, help="返回结果数量(默认 10")
    cli.add_argument("--limit", "-n", **limit_options)
    return cli
# ---------------------------------------------------------------------------
# httpx helper
# ---------------------------------------------------------------------------
# Default `timeout` value that get_client() hands to httpx.Client.
_DEFAULT_TIMEOUT = 15
# Default User-Agent header sent by clients created through get_client().
_DEFAULT_UA = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/125.0.0.0 Safari/537.36"
)
def get_client(
    timeout: int = _DEFAULT_TIMEOUT,
    headers: dict[str, str] | None = None,
    **kwargs: Any,
) -> httpx.Client:
    """Return a preconfigured ``httpx.Client``.

    The client follows redirects and sends a default ``User-Agent`` and
    ``Accept: application/json``. Entries in ``headers`` override those
    defaults, and any other keyword arguments are passed through to
    ``httpx.Client``.
    """
    merged_headers = {
        "User-Agent": _DEFAULT_UA,
        "Accept": "application/json",
    }
    merged_headers.update(headers or {})
    client_options = dict(timeout=timeout, headers=merged_headers, follow_redirects=True)
    return httpx.Client(**client_options, **kwargs)
# ---------------------------------------------------------------------------
# 配置读取
# ---------------------------------------------------------------------------
def get_key(env_var: str, cli_arg: str | None = None) -> str | None:
    """Resolve a credential: a non-empty CLI value wins, else the environment variable.

    Returns ``None`` when the CLI value is empty or missing and ``env_var``
    is not set.
    """
    return cli_arg or os.environ.get(env_var)
# ---------------------------------------------------------------------------
# 脚本入口辅助
# ---------------------------------------------------------------------------
def run_search(
    provider: str,
    search_fn,  # Callable[[str, int, ...], list[dict]]
    parser: argparse.ArgumentParser | None = None,
    extra_kwargs_fn=None,  # Callable[[Namespace], dict]; extracts extra kwargs from args
) -> None:
    """Generic script entry point: parse arguments, run the search, print JSON.

    Args:
        provider: Provider name placed in the printed result.
        search_fn: Called as ``search_fn(query, limit, **extra)``.
        parser: Parser to use; a default one from ``build_parser`` is created
            when omitted.
        extra_kwargs_fn: Optional callable that maps the parsed arguments to
            extra keyword arguments for ``search_fn``.

    On any exception from the search or from printing, a failure result is
    printed and the process exits with status 1.
    """
    cli = build_parser(f"Search {provider}") if parser is None else parser
    args = cli.parse_args()
    extra = extra_kwargs_fn(args) if extra_kwargs_fn else {}
    try:
        found = search_fn(args.query, args.limit, **extra)
        print_json(make_result(True, args.query, provider, found))
    except Exception as exc:
        failure = make_result(False, args.query, provider, [], str(exc))
        print_json(failure)
        sys.exit(1)

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""知乎搜索。通过知乎内部 API需要 cookie 认证)。"""
from __future__ import annotations
import re
import sys
import tempfile
from datetime import datetime, timezone
from search_utils import build_parser, get_client, get_key, make_item, make_result, print_json
# Maximum number of body-text characters returned inline; longer bodies are
# truncated and the full text is written to a file.
_CONTENT_INLINE_LIMIT = 2000
SEARCH_URL = "https://www.zhihu.com/api/v4/search_v3"
# Entry types treated as ads; search() skips entries whose "type" is in this set.
_AD_TYPES = {"education", "knowledge_ad"}
def search(query: str, limit: int, cookie: str | None = None, search_type: str = "general") -> list[dict]:
    """Search Zhihu and return normalized result items.

    Args:
        query: Search keyword.
        limit: Maximum number of items to return.
        cookie: Raw ``Cookie`` header value. Required.
        search_type: Value sent as the ``t`` query parameter.

    Returns:
        Up to ``limit`` items; ad entries and entries that cannot be parsed
        are skipped.

    Raises:
        ValueError: If ``cookie`` is missing or empty.
        Exception: Whatever ``resp.raise_for_status()`` raises for an HTTP
            error status.
    """
    if not cookie:
        raise ValueError("需要 ZHIHU_COOKIE 环境变量。请从浏览器开发者工具获取知乎 cookie。")

    headers = {
        "Cookie": cookie,
        "Referer": "https://www.zhihu.com/search",
        "Origin": "https://www.zhihu.com",
        "Accept": "application/json",
    }
    params = {
        "q": query,
        "t": search_type,
        "offset": 0,
        # Request extra entries to make up for the ad entries filtered out below.
        "limit": min(limit * 2, 20),
    }

    with get_client(headers=headers) as client:
        resp = client.get(SEARCH_URL, params=params)
        resp.raise_for_status()
        data = resp.json()

    items = []
    # "data" may be present but null; treat that as an empty result list.
    for entry in data.get("data") or []:
        if len(items) >= limit:
            break
        if not isinstance(entry, dict):
            continue
        if entry.get("type") in _AD_TYPES:
            continue
        # Fall back to the entry itself when "object" is missing or empty.
        obj = entry.get("object") or entry
        if not isinstance(obj, dict):
            continue
        item = _parse_object(obj, obj.get("type", ""))
        if item:
            items.append(item)
    return items
def _parse_object(obj: dict, obj_type: str) -> dict | None:
    """Convert one API result object into a standard item.

    Args:
        obj: The result object from the search response.
        obj_type: The object's ``type`` value (e.g. ``answer``, ``question``,
            ``article``).

    Returns:
        The item built by ``make_item``, or ``None`` when the object has
        neither a title nor a snippet.
    """
    title = _strip_html(obj.get("title") or obj.get("name") or "")
    snippet = _strip_html(obj.get("excerpt") or obj.get("description") or "")[:300]

    # Answer-only fields: title and link of the question the answer belongs to.
    question_title = ""
    question_url = ""
    answer_count = None
    if obj_type == "answer":
        question = obj.get("question")
        if not isinstance(question, dict):
            # "question" may be null or missing; treat it as empty.
            question = {}
        question_title = _strip_html(question.get("name") or question.get("title") or "")
        qid = question.get("id", "")
        question_url = f"https://www.zhihu.com/question/{qid}" if qid else ""
        answer_count = obj.get("answer_count")
        # An answer has no title of its own; use the question title instead.
        if not title:
            title = question_title
    elif obj_type == "question":
        answer_count = obj.get("answer_count")

    # Decide whether to drop the object before touching the body text, so
    # that no temporary file is written for an item that is discarded.
    if not title and not snippet:
        return None

    full_content = _strip_html(obj.get("content") or "")
    content, content_file = _maybe_save_content(full_content, obj_type, obj.get("id", ""))
    url = _build_url(obj, obj_type)

    # Author info; anything that is not a dict is treated as "no author".
    author = obj.get("author")
    if not isinstance(author, dict):
        author = {}
    author_name = author.get("name", "")
    author_headline = author.get("headline", "")
    author_followers = author.get("follower_count")

    # Engagement counters.
    voteup = obj.get("voteup_count") or 0
    comment = obj.get("comment_count") or 0
    favorites = obj.get("favorites_count") or obj.get("zfav_count") or 0
    visits = obj.get("visits_count")  # answer-specific, per the original author

    # Timestamps are converted to ISO 8601 so freshness is easy to judge.
    created_at = _ts_to_iso(obj.get("created_time"))
    updated_at = _ts_to_iso(obj.get("updated_time"))

    return make_item(
        title=title,
        url=url,
        snippet=snippet,
        content=content,
        content_file=content_file,
        content_type=obj_type,
        author=author_name,
        author_headline=author_headline,
        author_followers=author_followers,
        voteup_count=voteup,
        comment_count=comment,
        favorites_count=favorites if favorites else None,
        visits_count=visits,
        answer_count=answer_count,
        question_title=question_title if obj_type == "answer" else None,
        question_url=question_url if obj_type == "answer" else None,
        created_at=created_at,
        updated_at=updated_at,
    )
def _maybe_save_content(full_content: str, obj_type: str, obj_id: str) -> tuple[str, str | None]:
"""处理正文:短内容直接返回,长内容截断并将完整版存为临时文件。
返回 (inline_content, file_path)file_path 为 None 表示未截断。
"""
if not full_content:
return "", None
if len(full_content) <= _CONTENT_INLINE_LIMIT:
return full_content, None
# 超出截断限制,写入临时文件
suffix = f"_zhihu_{obj_type}_{obj_id}.txt"
with tempfile.NamedTemporaryFile(
mode="w", encoding="utf-8", suffix=suffix, delete=False
) as f:
f.write(full_content)
fpath = f.name
inline = (
full_content[:_CONTENT_INLINE_LIMIT]
+ f"\n\n[内容已截断,共 {len(full_content)} 字,完整内容见: {fpath}]"
)
return inline, fpath
def _build_url(obj: dict, obj_type: str) -> str:
"""构造面向用户的 Web URL而非 API URL"""
oid = obj.get("id", "")
if obj_type == "article":
return f"https://zhuanlan.zhihu.com/p/{oid}" if oid else ""
if obj_type == "answer":
q = obj.get("question", {})
qid = q.get("id", "")
return f"https://www.zhihu.com/question/{qid}/answer/{oid}" if qid and oid else ""
if obj_type == "question":
return f"https://www.zhihu.com/question/{oid}" if oid else ""
# 其他类型直接返回 obj 里的 url若有
raw = obj.get("url", "")
# 将 api.zhihu.com 替换为 www.zhihu.com
return raw.replace("https://api.zhihu.com/", "https://www.zhihu.com/")
def _strip_html(html: str) -> str:
return re.sub(r"<[^>]+>", "", html).strip()
def _ts_to_iso(ts: int | None) -> str | None:
if not ts:
return None
return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def main():
    """Command-line entry point for the Zhihu search script.

    Parses the shared arguments plus ``--cookie`` and ``--type``, runs
    ``search`` and prints the result as JSON. On any exception a failure
    result is printed and the process exits with status 1.
    """
    cli = build_parser("搜索知乎问答和文章")
    cli.add_argument("--cookie", help="知乎 Cookie也可通过 ZHIHU_COOKIE 环境变量设置)")
    type_choices = ["general", "topic", "people", "zvideo"]
    cli.add_argument(
        "--type",
        default="general",
        choices=type_choices,
        help="搜索类型(默认 general",
    )
    opts = cli.parse_args()
    keyword = opts.query
    # An explicit --cookie takes precedence over the ZHIHU_COOKIE variable.
    cookie = get_key("ZHIHU_COOKIE", opts.cookie)
    try:
        found = search(keyword, opts.limit, cookie, opts.type)
        print_json(make_result(True, keyword, "zhihu", found))
    except Exception as exc:
        failure = make_result(False, keyword, "zhihu", [], str(exc))
        print_json(failure)
        sys.exit(1)


if __name__ == "__main__":
    main()