Refactor AI daily report pipeline

This commit is contained in:
Mimikko-zeus
2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions

View File

@@ -0,0 +1,2 @@
"""Core package for the AI daily report pipeline."""

View File

@@ -0,0 +1,77 @@
from __future__ import annotations
import re
from typing import Any
from .classify import SECTION_ORDER
from .models import NewsItem
from .validate import validate_markdown
END_PUNCTUATION = "。!?;.!?;"
def _clean_text(text: str) -> str:
value = re.sub(r"^```(?:\w+)?\s*\n?", "", (text or "").strip())
value = re.sub(r"\n?```\s*$", "", value)
value = re.sub(r"^\s*>\s*", "", value)
value = re.sub(r"\[\d+\]|\[N\]", "", value)
value = re.sub(r"主线判断[:]\s*", "", value)
value = re.sub(r"\s+", " ", value).strip()
return value
def _ensure_sentence(text: str) -> str:
value = _clean_text(text)
if value and value[-1] not in END_PUNCTUATION:
value += ""
return value
def _source_link(item: NewsItem) -> str:
source = item.source_label or item.source_group or "来源"
if item.url:
return f"[{source} ↗]({item.url})"
return source
def assemble_markdown(items: list[NewsItem], guide: dict[str, Any] | None = None) -> tuple[str, dict[str, Any]]:
    """Render classified items and an optional guide into the report Markdown.

    Output order: an optional "导览" theme quote, one numbered entry per item
    grouped by section in ``SECTION_ORDER`` order, then an optional "今日脉络"
    thread list. Items whose ``section`` is not in ``SECTION_ORDER`` are not
    rendered.

    Returns:
        ``(markdown, report)`` where ``report`` is whatever
        ``validate_markdown(markdown, items)`` returns.
    """
    if not guide:
        guide = {"theme": "", "threads": []}
    out: list[str] = []

    theme = _clean_text(str(guide.get("theme") or ""))
    if theme:
        out += ["## 导览", "", f"> {theme}", ""]

    # Numbering runs continuously across sections.
    number = 1
    for section in SECTION_ORDER:
        members = [entry for entry in items if entry.section == section]
        if not members:
            continue
        out += [f"## {section}", ""]
        for entry in members:
            heading = _clean_text(entry.title or entry.title_raw)
            body = _ensure_sentence(entry.summary or entry.summary_raw or "该条目暂无摘要。")
            out += [f"**{number}. {heading}**", "", f"> {body}{_source_link(entry)}", ""]
            number += 1

    threads = guide.get("threads", []) or []
    if threads:
        # The heading is emitted even if every thread below ends up skipped.
        out += ["## 今日脉络", ""]
        for thread in threads:
            heading = _clean_text(str(thread.get("title") or ""))
            body = _ensure_sentence(str(thread.get("text") or ""))
            if heading and body:
                out += [f"- **{heading}**", f" {body}", ""]

    markdown = "\n".join(out).strip()
    return markdown, validate_markdown(markdown, items)

109
ai_daily_report/classify.py Normal file
View File

@@ -0,0 +1,109 @@
from __future__ import annotations
from collections import Counter
from typing import Any
from .models import NewsItem
# Canonical section names. Their position in this list is the sort rank used
# by classify_and_order_items; a section outside this list counts as invalid.
SECTION_ORDER = [
"模型与能力",
"产品与应用",
"开发与基础设施",
"公司与资本",
"政策与安全",
"论文与研究",
"观点与教程",
"人物与动态",
]
# Alternate section-hint labels mapped to a canonical SECTION_ORDER name.
# normalize_section_hint consults this only when the hint is not already canonical,
# so the identity entries below are never reached through that function.
SECTION_ALIASES = {
"模型发布/更新": "模型与能力",
"产品发布/更新": "产品与应用",
"产品与工具": "产品与应用",
"开发与工程": "开发与基础设施",
"行业动态": "公司与资本",
"行业与公司": "公司与资本",
"论文研究": "论文与研究",
"论文与研究": "论文与研究",
"技巧与观点": "观点与教程",
"观点与教程": "观点与教程",
"人物与花絮": "人物与动态",
}
# Keyword rules for rule_classify as (section, keywords) pairs. Rules are tried
# in list order and the first one with a keyword found (case-insensitively, as a
# substring) in the item's title + summary wins, so earlier rules take precedence.
RULES = [
("政策与安全", ("监管", "政策", "安全", "风险", "滥用", "攻击", "合规", "版权")),
("论文与研究", ("论文", "研究", "arxiv", "cvpr", "benchmark", "评测", "实验")),
("开发与基础设施", ("sdk", "api", "mcp", "kubernetes", "框架", "开源", "github", "部署", "基础设施")),
("公司与资本", ("融资", "ipo", "上市", "招股书", "合作", "估值", "收购", "资本")),
("模型与能力", ("模型", "gpt", "claude", "gemini", "grok", "token", "参数", "多模态", "语音", "推理")),
("产品与应用", ("agent", "应用", "产品", "平台", "上线", "工具", "智能体")),
("观点与教程", ("教程", "观点", "方法论", "guide", "实践", "技巧")),
("人物与动态", ("黄仁勋", "纳德拉", "访谈", "演讲", "人物")),
]
def normalize_section_hint(section_hint: str) -> str:
    """Map a source-provided section hint to a canonical section name.

    Returns the stripped hint when it is already in ``SECTION_ORDER``, its
    ``SECTION_ALIASES`` target when it is a known alias, and ``""`` otherwise.
    """
    cleaned = (section_hint or "").strip()
    return cleaned if cleaned in SECTION_ORDER else SECTION_ALIASES.get(cleaned, "")
def rule_classify(item: NewsItem) -> str:
    """Pick a section for *item* by keyword matching against ``RULES``.

    The lower-cased title and summary (rewritten values preferred over raw
    ones) are searched for each rule's keywords; the first matching rule's
    section is returned, and "公司与资本" when nothing matches.
    """
    haystack = f"{item.title or item.title_raw} {item.summary or item.summary_raw}".lower()
    return next(
        (
            section
            for section, keywords in RULES
            if any(keyword.lower() in haystack for keyword in keywords)
        ),
        "公司与资本",
    )
def rank_score(item: NewsItem) -> int:
    """Compute the within-section ranking score of *item* (higher sorts first).

    Base is ``max(0, 200 - source_priority)``; +10 each for a primary source,
    a canonical URL and any digit in the title/summary; +5 per duplicate source
    capped at 20; -10 per quality flag.
    """
    combined = f"{item.title or item.title_raw} {item.summary or item.summary_raw}"
    bonuses = (
        item.source_role == "primary",
        bool(item.canonical_url),
        any(ch.isdigit() for ch in combined),
    )
    total = max(0, 200 - item.source_priority) + 10 * sum(bonuses)
    if item.duplicate_sources:
        total += min(20, len(item.duplicate_sources) * 5)
    return total - 10 * len(item.quality_flags)
def classify_and_order_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
    """Assign a section to every item (in place) and return them in report order.

    The section comes from the item's ``section_hint`` when it normalizes to a
    known section, otherwise from ``rule_classify``. Items are then sorted by
    section rank, descending ``rank_score``, and title.

    Returns:
        ``(ordered_items, report)``; the report carries per-section counts and
        how many items were classified by hint versus by rule.
    """
    tally = {"hint": 0, "rule": 0}
    for entry in items:
        from_hint = normalize_section_hint(entry.section_hint)
        if from_hint:
            entry.section, origin = from_hint, "hint"
        else:
            entry.section, origin = rule_classify(entry), "rule"
        tally[origin] += 1

    position = {name: rank for rank, name in enumerate(SECTION_ORDER)}
    unknown_rank = len(SECTION_ORDER)  # unknown sections sort after every known one

    def sort_key(entry: NewsItem) -> tuple[int, int, str]:
        return (
            position.get(entry.section or "", unknown_rank),
            -rank_score(entry),
            entry.title or entry.title_raw,
        )

    ordered = sorted(items, key=sort_key)
    per_section = Counter(entry.section for entry in ordered if entry.section)
    invalid = sum(entry.section not in SECTION_ORDER for entry in ordered)
    report = {
        "input_count": len(items),
        "section_counts": dict(per_section),
        "hint_classified": tally["hint"],
        "rule_classified": tally["rule"],
        "llm_classified": 0,
        "fallback_classified": 0,
        "invalid_section_count": invalid,
    }
    return ordered, report

40
ai_daily_report/cli.py Normal file
View File

@@ -0,0 +1,40 @@
from __future__ import annotations
import argparse
from pathlib import Path
from .runner import run_daily_report
def build_parser() -> argparse.ArgumentParser:
    """Build the ``ai-daily-report`` argument parser with its single ``run`` subcommand."""
    parser = argparse.ArgumentParser(prog="ai-daily-report")
    run_command = parser.add_subparsers(dest="command").add_parser("run")
    option_specs = (
        ("--date", {"default": "today"}),
        ("--mode", {"choices": ["dry-run", "draft", "publish"], "default": "dry-run"}),
        ("--source-mode", {"choices": ["mock", "live"], "default": "mock"}),
        ("--llm-mode", {"choices": ["mock", "live"], "default": "mock"}),
        ("--out-dir", {"default": "runs"}),
        ("--base-url", {"default": "https://blog.ephron.ren"}),
        ("--sources-path", {"default": None}),
    )
    for flag, options in option_specs:
        run_command.add_argument(flag, **options)
    return parser
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the requested command; always returns exit code 0.

    Only the ``run`` subcommand does any work; with any other (or no)
    subcommand the function returns without doing anything.
    """
    args = build_parser().parse_args(argv)
    if args.command != "run":
        return 0
    sources_path = Path(args.sources_path) if args.sources_path else None
    run_daily_report(
        run_date=args.date,
        mode=args.mode,
        source_mode=args.source_mode,
        llm_mode=args.llm_mode,
        out_dir=Path(args.out_dir),
        base_url=args.base_url,
        sources_path=sources_path,
    )
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,64 @@
from __future__ import annotations
import json
import urllib.request
from typing import Any
# User-Agent header value sent by fetch_text and by BlogApiClient requests.
UA = "Mozilla/5.0 (compatible; ai-daily-report/1.0)"
def fetch_text(url: str, timeout_seconds: int) -> str:
    """GET *url* with the module User-Agent and return the body decoded as UTF-8.

    Undecodable bytes are dropped. Errors raised by ``urllib`` propagate.
    """
    request = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
        raw = response.read()
    return raw.decode("utf-8", "ignore")
class OpenAICompatibleClient:
    """Minimal client for a ``/chat/completions`` endpoint using Bearer auth."""

    def __init__(self, *, api_key: str, base_url: str, model: str, timeout_seconds: int = 600):
        """Store connection settings; trailing slashes are stripped from *base_url*."""
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout_seconds = timeout_seconds

    def chat(self, prompt: str) -> str:
        """Send *prompt* as a single user message and return the stripped reply text.

        Reads ``choices[0].message.content`` from the JSON response; network
        and parsing errors propagate to the caller.
        """
        body = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2,
            "max_tokens": 8000,
        }
        request = urllib.request.Request(
            f"{self.base_url}/chat/completions",
            data=json.dumps(body, ensure_ascii=False).encode("utf-8"),
            headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=self.timeout_seconds) as response:
            reply = json.loads(response.read().decode("utf-8"))
        first_choice = reply["choices"][0]
        return first_choice["message"]["content"].strip()
class BlogApiClient:
    """Client for the blog service API, authenticated with a Bearer token."""

    def __init__(self, *, base_url: str, token: str, timeout_seconds: int = 25):
        """Store connection settings; trailing slashes are stripped from *base_url*."""
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.timeout_seconds = timeout_seconds

    def _request(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any]:
        """Send one request and return the decoded JSON response.

        Args:
            method: HTTP method name.
            path: Path appended to ``base_url`` (expected to start with "/").
            payload: Optional JSON body; when given it is sent as UTF-8 JSON.

        Returns:
            The parsed JSON body, or ``{}`` when the response body is empty.

        Raises:
            Whatever ``urllib`` raises for transport/HTTP errors, and
            ``json.JSONDecodeError`` for a non-empty body that is not JSON.
        """
        data = None
        headers = {"Authorization": f"Bearer {self.token}", "User-Agent": UA}
        if payload is not None:
            data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
            headers["Content-Type"] = "application/json"
        req = urllib.request.Request(f"{self.base_url}{path}", data=data, headers=headers, method=method)
        with urllib.request.urlopen(req, timeout=self.timeout_seconds) as response:
            body = response.read().decode("utf-8")
        # A successful call may carry no body (e.g. a 204-style reply to publish);
        # treat that as an empty object instead of failing in json.loads.
        if not body.strip():
            return {}
        return json.loads(body)

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* to ``/api/service/posts`` and return the JSON response."""
        return self._request("POST", "/api/service/posts", payload)

    def publish_post(self, slug: str) -> None:
        """POST to ``/api/service/posts/{slug}/publish``; the response is discarded."""
        self._request("POST", f"/api/service/posts/{slug}/publish")

View File

@@ -0,0 +1,95 @@
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from time import perf_counter
from typing import Callable, Iterable, Any
from .models import SourceConfig, SourceResult
# Callable that fetches the raw item dicts of one source; it is invoked as
# fetcher(config, run_date).
Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]
def _status_from_exception(exc: Exception) -> str:
if isinstance(exc, TimeoutError):
return "timeout"
return "error"
def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
    """Fetch one source and wrap the outcome in a ``SourceResult``; never raises for fetch errors.

    Status is "disabled" for a disabled config (the fetcher is not called),
    "ok" when the fetcher returns a non-empty result, "empty" when it returns
    nothing, and otherwise the value of ``_status_from_exception``.
    """
    shared = {
        "source": config.name,
        "role": config.role,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
    }
    if not config.enabled:
        return SourceResult(ok=False, status="disabled", **shared)

    def elapsed_since(start: float) -> int:
        """Whole milliseconds elapsed since *start* (a perf_counter reading)."""
        return int((perf_counter() - start) * 1000)

    started = perf_counter()
    try:
        fetched = fetcher(config, run_date)
        elapsed_ms = elapsed_since(started)
        status = "ok" if fetched else "empty"
        return SourceResult(
            ok=status == "ok",
            status=status,
            items=fetched,
            elapsed_ms=elapsed_ms,
            **shared,
        )
    except Exception as exc:
        # Deliberately broad: one failing source must not abort the whole collection.
        elapsed_ms = elapsed_since(started)
        return SourceResult(
            ok=False,
            status=_status_from_exception(exc),
            error=f"{type(exc).__name__}: {exc}",
            elapsed_ms=elapsed_ms,
            **shared,
        )
def collect_sources(
    configs: Iterable[SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    max_workers: int | None = None,
) -> tuple[list[SourceResult], dict[str, Any]]:
    """Fetch every configured source concurrently and summarize the outcome.

    Args:
        configs: Source configurations; results are returned in this order.
        run_date: Passed through to *fetcher*.
        fetcher: Callable invoked as ``fetcher(config, run_date)`` per source.
        max_workers: Thread count; defaults to ``min(8, number of configs)``.

    Returns:
        ``(results, report)``. ``results`` has exactly one entry per config,
        even when several configs share a name. The report always contains the
        same keys, including for empty input.
    """
    ordered_configs = list(configs)
    if not ordered_configs:
        return [], {
            "input_source_count": 0,
            "ok_source_count": 0,
            "failed_source_count": 0,
            "raw_item_count": 0,
            "source_counts": {},
            "statuses": {},
        }
    workers = max_workers or min(8, len(ordered_configs))
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # map() yields results in submission order, so no name-keyed lookup is
        # needed and configs with duplicate names keep their own result.
        results = list(
            executor.map(lambda config: _collect_one(config, run_date, fetcher), ordered_configs)
        )
    report = {
        "input_source_count": len(results),
        "ok_source_count": sum(1 for result in results if result.ok),
        "failed_source_count": sum(1 for result in results if not result.ok),
        "raw_item_count": sum(len(result.items) for result in results),
        "source_counts": {result.source: len(result.items) for result in results},
        "statuses": {result.source: result.status for result in results},
    }
    return results, report

19
ai_daily_report/config.py Normal file
View File

@@ -0,0 +1,19 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from .models import SourceConfig
from .pipeline import _source_config_from_dict
def load_json(path: Path) -> Any:
    """Read *path* as UTF-8 and return the parsed JSON value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def load_source_configs(path: Path) -> list[SourceConfig]:
    """Load source configurations from the JSON file at *path*.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    entries = load_json(path)
    if isinstance(entries, list):
        return list(map(_source_config_from_dict, entries))
    raise ValueError("sources config must be a list")

100
ai_daily_report/dedupe.py Normal file
View File

@@ -0,0 +1,100 @@
from __future__ import annotations
import difflib
from typing import Any
from .models import NewsItem
def _item_score(item: NewsItem) -> int:
score = 0
score += max(0, 200 - item.source_priority)
if item.canonical_url:
score += 20
if item.summary_raw:
score += min(40, len(item.summary_raw))
if item.section_hint:
score += 10
if item.source_role == "primary":
score += 10
score -= len(item.quality_flags) * 10
return score
def _merge_group(group: list[NewsItem], reason: str) -> tuple[NewsItem, list[NewsItem], dict[str, Any]]:
    """Collapse a duplicate group onto its best-scoring member.

    The winner (highest ``_item_score``; first one on ties) is mutated: one
    entry per dropped item is appended to its ``duplicate_sources``.

    Returns:
        ``(kept_item, dropped_items, report_entry)``.
    """
    winner = max(group, key=_item_score)
    losers = [candidate for candidate in group if candidate is not winner]
    winner.duplicate_sources.extend(
        {
            "id": loser.id,
            "source_group": loser.source_group,
            "source_label": loser.source_label,
            "url": loser.url,
            "reason": reason,
        }
        for loser in losers
    )
    summary = {
        "reason": reason,
        "keep_id": winner.id,
        "removed_ids": [loser.id for loser in losers],
        "confidence": "high",
    }
    return winner, losers, summary
def _group_by_key(items: list[NewsItem], key_name: str) -> dict[str, list[NewsItem]]:
groups: dict[str, list[NewsItem]] = {}
for item in items:
key = getattr(item, key_name)
if key:
groups.setdefault(key, []).append(item)
return {key: group for key, group in groups.items() if len(group) > 1}
def _possible_duplicates(items: list[NewsItem]) -> list[dict[str, Any]]:
possible: list[dict[str, Any]] = []
for index, left in enumerate(items):
for right in items[index + 1 :]:
if not left.title_norm or not right.title_norm:
continue
ratio = difflib.SequenceMatcher(None, left.title_norm, right.title_norm).ratio()
if ratio >= 0.65:
possible.append(
{
"item_ids": [left.id, right.id],
"reason": "title_similarity",
"similarity": round(ratio, 3),
"confidence": "medium",
}
)
return possible
def hard_dedup_items(items: list[NewsItem]) -> tuple[list[NewsItem], dict[str, Any]]:
    """Remove exact duplicates, first by canonical URL and then by normalized title.

    Each duplicate group is merged onto one kept item via ``_merge_group``.
    Removal is tracked by object identity, so items removed in the URL pass are
    not considered in the title pass.

    Returns:
        ``(deduped_items, report)``; the report also lists near-duplicate
        title pairs among the survivors under ``possible_duplicates``.
    """
    candidates = list(items)
    dropped: set[int] = set()
    merge_log: list[dict[str, Any]] = []

    def survivors(pool: list[NewsItem]) -> list[NewsItem]:
        """Members of *pool* that have not been removed so far."""
        return [entry for entry in pool if id(entry) not in dropped]

    passes = (
        ("canonical_url", "same_canonical_url"),
        ("title_norm", "same_title_norm"),
    )
    for attribute, reason in passes:
        for members in _group_by_key(survivors(candidates), attribute).values():
            live = survivors(members)
            if len(live) < 2:
                continue
            _kept, losers, entry = _merge_group(live, reason)
            dropped.update(id(loser) for loser in losers)
            merge_log.append(entry)

    deduped = survivors(candidates)
    report = {
        "input_count": len(items),
        "output_count": len(deduped),
        "removed_count": len(dropped),
        "groups": merge_log,
        "possible_duplicates": _possible_duplicates(deduped),
    }
    return deduped, report

143
ai_daily_report/env.py Normal file
View File

@@ -0,0 +1,143 @@
from __future__ import annotations
import os
import json
from pathlib import Path
# Parent of the directory that contains this module (resolved path);
# load_env reads the project-level ".env" file from here.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
def read_env_file(env_path: Path) -> dict[str, str]:
    """Parse a ``KEY=VALUE`` file into a dict; a missing file yields ``{}``.

    Blank lines, lines starting with "#" and lines without "=" are skipped.
    Keys and values are stripped, and surrounding double then single quotes
    are removed from values.
    """
    parsed: dict[str, str] = {}
    if not env_path.exists():
        return parsed
    for raw in env_path.read_text(encoding="utf-8", errors="ignore").splitlines():
        entry = raw.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _, raw_value = entry.partition("=")
        parsed[name.strip()] = raw_value.strip().strip('"').strip("'")
    return parsed
def load_env() -> dict[str, str]:
    """Merge configuration from the env files and the process environment.

    Later layers override earlier ones: the project ``.env``, then
    ``~/.hermes/.env``, then non-empty process environment variables.
    """
    layers = (
        read_env_file(PROJECT_ROOT / ".env"),
        read_env_file(Path.home() / ".hermes" / ".env"),
        {key: value for key, value in os.environ.items() if value},
    )
    merged: dict[str, str] = {}
    for layer in layers:
        merged.update(layer)
    return merged
def first_env(env: dict[str, str], *names: str) -> str:
    """Return the first non-blank, stripped value among *names* in *env*, or ``""``."""
    candidates = ((env.get(name) or "").strip() for name in names)
    return next((value for value in candidates if value), "")
def _load_simple_yaml(path: Path) -> dict[str, object]:
if not path.exists():
return {}
root: dict[str, object] = {}
stack: list[tuple[int, dict[str, object]]] = [(-1, root)]
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
if not raw_line.strip() or raw_line.lstrip().startswith("#") or ":" not in raw_line:
continue
indent = len(raw_line) - len(raw_line.lstrip(" "))
key, value = raw_line.strip().split(":", 1)
key = key.strip()
value = value.strip().strip('"').strip("'")
while stack and indent <= stack[-1][0]:
stack.pop()
current = stack[-1][1]
if value:
current[key] = value
else:
child: dict[str, object] = {}
current[key] = child
stack.append((indent, child))
return root
def _env_with_hermes(env: dict[str, str], hermes_dir: Path) -> dict[str, str]:
    """Return a new dict of ``<hermes_dir>/.env`` values overlaid with *env* (``env`` wins)."""
    return {**read_env_file(hermes_dir / ".env"), **env}
def _provider_env_names(provider: str) -> tuple[str, str, str]:
prefix = provider.upper().replace("-", "_")
return f"{prefix}_API_KEY", f"{prefix}_BASE_URL", f"{prefix}_MODEL"
def _auth_json_key(env: dict[str, str], hermes_dir: Path, provider: str) -> str:
auth_path = hermes_dir / "auth.json"
if not auth_path.exists() or not provider:
return ""
try:
auth = json.loads(auth_path.read_text(encoding="utf-8"))
except Exception:
return ""
pool = auth.get("credential_pool", {}) or {}
provider_keys = [provider, provider.replace("-", "_")]
for key in provider_keys:
creds = pool.get(key, []) or []
if not creds:
continue
cred = creds[0]
source = str(cred.get("source") or "")
if source.startswith("env:"):
resolved = first_env(env, source[4:])
if resolved:
return resolved
token = str(cred.get("access_token") or "").strip()
if token:
return token
return ""
def resolve_llm_config(env: dict[str, str], *, hermes_dir: Path | None = None) -> dict[str, str]:
    """Resolve the LLM ``api_key``, ``base_url`` and ``model`` settings.

    Each setting is taken from the first source that provides it:
    the generic ``LLM_*`` variable; then, when ``config.yaml`` in *hermes_dir*
    names a provider, that provider's variable followed by ``auth.json`` (key)
    or the config file (base URL, default model); finally the hard-coded
    SUB2API / XIAOMI / OPENROUTER fallback variables.

    Args:
        env: Environment mapping; values from ``<hermes_dir>/.env`` are used
            only for names missing from it.
        hermes_dir: Directory holding the hermes files; defaults to ``~/.hermes``.

    Raises:
        ValueError: ``missing_llm_config: ...`` listing every unresolved setting.
    """
    hermes_dir = hermes_dir or Path.home() / ".hermes"
    env = _env_with_hermes(env, hermes_dir)

    model_section = _load_simple_yaml(hermes_dir / "config.yaml").get("model")
    model_config = model_section if isinstance(model_section, dict) else {}
    provider = str(model_config.get("provider") or "").strip()

    settings = {
        "api_key": first_env(env, "LLM_API_KEY"),
        "base_url": first_env(env, "LLM_BASE_URL"),
        "model": first_env(env, "LLM_MODEL"),
    }

    if provider:
        key_name, url_name, model_name = _provider_env_names(provider)
        if not settings["api_key"]:
            settings["api_key"] = first_env(env, key_name) or _auth_json_key(env, hermes_dir, provider)
        if not settings["base_url"]:
            settings["base_url"] = first_env(env, url_name) or str(model_config.get("base_url") or "").strip()
        if not settings["model"]:
            settings["model"] = first_env(env, model_name) or str(model_config.get("default") or "").strip()

    fallbacks = {
        "api_key": ("SUB2API_API_KEY", "XIAOMI_API_KEY", "OPENROUTER_API_KEY"),
        "base_url": ("SUB2API_BASE_URL", "XIAOMI_BASE_URL", "OPENROUTER_BASE_URL"),
        "model": ("SUB2API_MODEL", "XIAOMI_MODEL"),
    }
    for setting, names in fallbacks.items():
        if not settings[setting]:
            settings[setting] = first_env(env, *names)

    labels = (("LLM_API_KEY", "api_key"), ("LLM_BASE_URL", "base_url"), ("LLM_MODEL", "model"))
    missing = [label for label, setting in labels if not settings[setting]]
    if missing:
        raise ValueError("missing_llm_config: " + ",".join(missing))
    return settings
def resolve_blog_token(env: dict[str, str]) -> str:
    """Return the blog service token from *env*, or ``""`` when none is set.

    ``BLOG_SERVICE_TOKEN`` takes precedence over ``EPHRON_SERVICE_TOKEN``.
    """
    token_names = ("BLOG_SERVICE_TOKEN", "EPHRON_SERVICE_TOKEN")
    return first_env(env, *token_names)

113
ai_daily_report/guide.py Normal file
View File

@@ -0,0 +1,113 @@
from __future__ import annotations
import json
import re
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# Callable that receives the guide prompt string and returns the model's raw
# text reply, which generate_guide then parses as a JSON object.
GuideLlmCall = Callable[[str], str]
def _clean_text(text: str, limit: int | None = None) -> str:
value = re.sub(r"^\s*>\s*", "", text or "").strip()
value = re.sub(r"\[\d+\]|\[N\]", "", value)
value = re.sub(r"\s+", " ", value).strip()
if limit and len(value) > limit:
value = value[:limit].rstrip()
return value
def _build_prompt(items: list[NewsItem]) -> str:
payload = {
"task": (
"Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
"Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
),
"items": [
{
"id": item.id,
"title": item.title or item.title_raw,
"summary": item.summary or item.summary_raw,
"section": item.section,
"source": item.source_label,
}
for item in items
],
"output_schema": {
"theme": "one sentence under 120 Chinese characters",
"threads": [
{
"title": "thread title",
"text": "one or two sentences",
"item_ids": ["existing item id"],
"kind": "thread|uncertain",
}
],
},
}
return json.dumps(payload, ensure_ascii=False)
def generate_guide(
    items: list[NewsItem],
    *,
    llm_call: GuideLlmCall,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Ask the LLM for a report guide (theme + threads) and validate its reply.

    Args:
        items: Items the guide may reference; with no items the LLM is not called.
        llm_call: Callable taking the prompt string and returning the raw reply.

    Returns:
        ``(guide, report)``. ``guide`` is ``{"theme": str, "threads": [...]}``.
        A thread is kept only if it is a mapping, references at least one
        existing item id, and has a non-empty title and text; everything else
        is counted in ``dropped_thread_count``. If the call or the JSON parsing
        fails, an empty guide is returned with ``fallback_used`` set and the
        error recorded. Malformed reply shapes never raise.
    """
    if not items:
        return {
            "theme": "",
            "threads": [],
        }, {
            "input_count": 0,
            "theme_present": False,
            "thread_count": 0,
            "dropped_thread_count": 0,
            "fallback_used": False,
            "errors": [],
        }
    try:
        obj = parse_json_object(llm_call(_build_prompt(items)))
    except Exception as exc:
        # Deliberately broad: the guide is optional, so any LLM/parse failure
        # degrades to an empty guide instead of aborting the report.
        return {
            "theme": "",
            "threads": [],
        }, {
            "input_count": len(items),
            "theme_present": False,
            "thread_count": 0,
            "dropped_thread_count": 0,
            "fallback_used": True,
            "errors": [f"{type(exc).__name__}: {exc}"],
        }

    valid_ids = {item.id for item in items}
    threads: list[dict[str, Any]] = []
    dropped = 0
    # The reply is untrusted model output: check every shape before using it.
    raw_threads = obj.get("threads") or []
    if not isinstance(raw_threads, list):
        raw_threads = []
    for thread in raw_threads:
        if not isinstance(thread, dict):
            dropped += 1
            continue
        raw_ids = thread.get("item_ids") or []
        if not isinstance(raw_ids, (list, tuple)):
            raw_ids = []
        # The isinstance check also guards against unhashable ids in the set lookup.
        item_ids = [item_id for item_id in raw_ids if isinstance(item_id, str) and item_id in valid_ids]
        if not item_ids:
            dropped += 1
            continue
        title = _clean_text(str(thread.get("title") or ""), limit=80)
        text = _clean_text(str(thread.get("text") or ""), limit=220)
        if not title or not text:
            dropped += 1
            continue
        raw_kind = thread.get("kind")
        kind = raw_kind if isinstance(raw_kind, str) and raw_kind in ("thread", "uncertain") else "thread"
        threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})

    theme = _clean_text(str(obj.get("theme") or ""), limit=120)
    guide = {"theme": theme, "threads": threads}
    report = {
        "input_count": len(items),
        "theme_present": bool(theme),
        "thread_count": len(threads),
        "dropped_thread_count": dropped,
        "fallback_used": False,
        "errors": [],
    }
    return guide, report

18
ai_daily_report/llm.py Normal file
View File

@@ -0,0 +1,18 @@
from __future__ import annotations
import json
import re
from typing import Any, Callable
# Type alias for a callable that takes one string and returns a string
# (by its name, a prompt-in / reply-out LLM call).
LlmCall = Callable[[str], str]
def parse_json_object(text: str) -> dict[str, Any]:
    """Extract and parse the JSON object contained in an LLM reply.

    A surrounding Markdown code fence is removed first. Parsing starts at the
    first "{" and stops at the end of that object, so explanatory text before
    or after the object is tolerated.

    Raises:
        ValueError: If the text contains no "{" at all, or (as the
            ``json.JSONDecodeError`` subclass) if the object is malformed.
    """
    cleaned = re.sub(r"^```(?:json)?\s*\n?", "", (text or "").strip())
    cleaned = re.sub(r"\n?```\s*$", "", cleaned)
    start = cleaned.find("{")
    if start == -1:
        raise ValueError("LLM output does not contain a JSON object")
    # raw_decode parses one JSON value at `start` and ignores whatever follows,
    # unlike json.loads which rejects trailing text. A value starting with "{"
    # always decodes to a dict.
    value, _end = json.JSONDecoder().raw_decode(cleaned, start)
    return value

53
ai_daily_report/models.py Normal file
View File

@@ -0,0 +1,53 @@
from dataclasses import dataclass, field
from typing import Any
@dataclass(frozen=True)
class SourceConfig:
    """Immutable configuration of one news source."""

    # Required identification.
    name: str
    type: str

    # Ranking metadata; "supplement" and 100 are the defaults.
    role: str = "supplement"
    priority: int = 100

    # Collection switches and limits.
    required: bool = False
    enabled: bool = True
    timeout_seconds: int = 25
    retries: int = 0
    min_items: int = 0

    # Source location; empty when not set.
    url: str = ""
@dataclass
class SourceResult:
source: str
role: str
ok: bool
status: str
items: list[dict[str, Any]] = field(default_factory=list)
error: str | None = None
elapsed_ms: int = 0
retry_count: int = 0
fetched_at: str = ""
@dataclass
class NewsItem:
id: str
source_group: str
source_label: str
source_role: str
source_priority: int
title_raw: str
title_norm: str
summary_raw: str
url: str
canonical_url: str
published_at: str | None = None
collected_at: str = ""
origin_type: str = ""
section_hint: str = ""
language_hint: str = ""
title: str | None = None
summary: str | None = None
section: str | None = None
quality_flags: list[str] = field(default_factory=list)
duplicate_sources: list[dict[str, Any]] = field(default_factory=list)

View File

@@ -0,0 +1,132 @@
from __future__ import annotations
import hashlib
import html
import re
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from typing import Any
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from .models import NewsItem, SourceResult
# Query parameters dropped by canonicalize_url: any key starting with one of
# these prefixes, or exactly equal to one of these keys (compared lower-cased).
TRACKING_QUERY_PREFIXES = ("utm_",)
TRACKING_QUERY_KEYS = {"fbclid", "gclid", "spm", "from", "ref"}
def clean_text(value: str) -> str:
    """Turn an HTML fragment into plain text.

    HTML entities are unescaped, tags are replaced by a space, and whitespace
    runs are collapsed to a single space. ``None`` or empty input yields ``""``.
    """
    unescaped = html.unescape(value or "")
    without_tags = re.sub(r"<[^>]+>", " ", unescaped)
    return re.sub(r"\s+", " ", without_tags).strip()
def canonicalize_url(url: str) -> str:
    """Reduce *url* to a canonical form for duplicate detection.

    Lower-cases the scheme (defaulting to "https") and host, strips a leading
    "www.", maps twitter.com to x.com, removes tracking query parameters,
    drops the fragment and params, and trims trailing slashes from any path
    longer than one character. An empty *url* yields ``""``.
    """
    if not url:
        return ""
    parts = urlparse(url.strip())

    host = (parts.netloc or "").lower()
    if host.startswith("www."):
        host = host[4:]
    if host == "twitter.com":
        host = "x.com"

    def is_tracking(name: str) -> bool:
        """True when the query key *name* is a known tracking parameter."""
        lowered = name.lower()
        return lowered in TRACKING_QUERY_KEYS or lowered.startswith(TRACKING_QUERY_PREFIXES)

    kept = [
        (name, value)
        for name, value in parse_qsl(parts.query, keep_blank_values=True)
        if not is_tracking(name)
    ]

    path = parts.path or ""
    if len(path) > 1:
        path = path.rstrip("/")
    scheme = (parts.scheme or "https").lower()
    return urlunparse((scheme, host, path, "", urlencode(kept), ""))
def normalize_title(title: str) -> str:
    """Return a comparison key for *title*.

    The title is NFKC-normalized and lower-cased, then every character that is
    neither a word character nor in the U+4E00-U+9FFF range is removed.
    """
    folded = unicodedata.normalize("NFKC", title or "").lower()
    return re.sub(r"[^\w\u4e00-\u9fff]+", "", folded)
def _item_id(canonical_url: str, source_group: str, title_norm: str, published_at: str | None) -> str:
seed = canonical_url or "|".join([source_group, title_norm, published_at or ""])
digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
return f"item_{digest}"
def _quality_flags(title: str, summary: str, url: str) -> list[str]:
    """Return the quality problems of an item, in a fixed order.

    Possible flags: "missing_url", "missing_summary", and "short_title" when
    the normalized title has fewer than three characters.
    """
    checks = (
        ("missing_url", not url),
        ("missing_summary", not summary),
        ("short_title", len(normalize_title(title)) < 3),
    )
    return [flag for flag, failed in checks if failed]
def normalize_items(
    source_results: list[SourceResult],
    *,
    run_date: str,
    source_priorities: dict[str, int] | None = None,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Convert the raw item dicts of every source result into ``NewsItem`` objects.

    Args:
        source_results: Collection results whose ``items`` are raw dicts.
        run_date: Echoed into the report.
        source_priorities: Priority per source name; sources not listed get 100.

    Returns:
        ``(items, report)``. Text fields are cleaned, the URL is canonicalized,
        quality flags are computed, and ids that collide get a ``_<n>`` suffix
        from the second occurrence on. ``published_at`` is stored as a string
        (or ``None``) regardless of the raw value's type.
    """
    source_priorities = source_priorities or {}
    collected_at = datetime.now(timezone.utc).isoformat()
    items: list[NewsItem] = []
    flag_counts: Counter[str] = Counter()
    id_counts: Counter[str] = Counter()
    input_count = 0
    for source_result in source_results:
        for raw in source_result.items:
            input_count += 1
            title = clean_text(str(raw.get("title_raw") or raw.get("title") or ""))
            summary = clean_text(str(raw.get("summary_raw") or raw.get("summary") or ""))
            url = str(raw.get("url") or "").strip()
            canonical_url = canonicalize_url(url)
            title_norm = normalize_title(title)
            flags = _quality_flags(title, summary, canonical_url)
            flag_counts.update(flags)
            source_label = clean_text(str(raw.get("source_label") or source_result.source))
            # Fetchers may hand over numbers or datetime objects; _item_id joins
            # this value into a string seed, so coerce it to text up front.
            raw_published = raw.get("published_at")
            published_at = None if raw_published is None else str(raw_published)
            base_id = _item_id(canonical_url, source_result.source, title_norm, published_at)
            id_counts[base_id] += 1
            item_id = base_id if id_counts[base_id] == 1 else f"{base_id}_{id_counts[base_id]}"
            items.append(
                NewsItem(
                    id=item_id,
                    source_group=source_result.source,
                    source_label=source_label,
                    source_role=source_result.role,
                    source_priority=source_priorities.get(source_result.source, 100),
                    title_raw=title,
                    title_norm=title_norm,
                    summary_raw=summary,
                    url=url,
                    canonical_url=canonical_url,
                    published_at=published_at,
                    collected_at=collected_at,
                    origin_type=str(raw.get("origin_type") or ""),
                    section_hint=str(raw.get("section_hint") or ""),
                    language_hint=str(raw.get("language_hint") or ""),
                    quality_flags=flags,
                )
            )
    report = {
        "run_date": run_date,
        "input_count": input_count,
        "output_count": len(items),
        "quality_flag_counts": dict(flag_counts),
    }
    return items, report

219
ai_daily_report/pipeline.py Normal file
View File

@@ -0,0 +1,219 @@
from __future__ import annotations
from typing import Any
from .assemble import assemble_markdown
from .classify import classify_and_order_items
from .collect import Fetcher, collect_sources
from .dedupe import hard_dedup_items
from .guide import GuideLlmCall, generate_guide
from .models import SourceConfig
from .normalize import normalize_items
from .publish import BlogClient, publish_markdown
from .rewrite import RewriteLlmCall, rewrite_items
from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
    """Build a ``SourceConfig`` from a plain mapping.

    ``name`` and ``type`` are required (``KeyError`` when absent); every other
    field falls back to its default and is coerced to the field's type.
    """
    lookup = value.get
    return SourceConfig(
        name=value["name"],
        type=value["type"],
        role=lookup("role", "supplement"),
        priority=int(lookup("priority", 100)),
        required=bool(lookup("required", False)),
        enabled=bool(lookup("enabled", True)),
        timeout_seconds=int(lookup("timeout_seconds", 25)),
        retries=int(lookup("retries", 0)),
        min_items=int(lookup("min_items", 0)),
        url=lookup("url", ""),
    )
def run_stage0_to_stage2(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
) -> dict[str, Any]:
    """Run collection (stage 0), normalization (stage 1) and hard dedup (stage 2).

    Config entries given as dicts are converted to ``SourceConfig`` first.

    Returns:
        A dict with ``source_results``, the deduplicated ``items`` and the
        per-stage ``reports``.
    """
    configs: list[SourceConfig] = []
    for entry in source_configs:
        configs.append(entry if isinstance(entry, SourceConfig) else _source_config_from_dict(entry))

    source_results, collect_report = collect_sources(configs, run_date, fetcher=fetcher)
    priorities = {cfg.name: cfg.priority for cfg in configs}
    normalized, normalize_report = normalize_items(
        source_results,
        run_date=run_date,
        source_priorities=priorities,
    )
    deduped, dedupe_report = hard_dedup_items(normalized)
    reports = {
        "stage0": collect_report,
        "stage1": normalize_report,
        "stage2": dedupe_report,
    }
    return {"source_results": source_results, "items": deduped, "reports": reports}
def run_stage0_to_stage4(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Run stages 0-2, then semantic dedup (stage 3) and rewriting (stage 4).

    The stage 2 ``possible_duplicates`` list is what the semantic dedup stage
    receives as candidates.
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
    candidates = upstream["reports"]["stage2"].get("possible_duplicates", [])
    merged, semantic_report = semantic_dedup_items(
        upstream["items"],
        candidates,
        llm_call=semantic_llm_call,
    )
    rewritten, rewrite_report = rewrite_items(merged, llm_call=rewrite_llm_call)
    return {
        "source_results": upstream["source_results"],
        "items": rewritten,
        "reports": {**upstream["reports"], "stage3": semantic_report, "stage4": rewrite_report},
    }
def run_stage0_to_stage5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
) -> dict[str, Any]:
    """Run stages 0-4, then classify and order the items (stage 5)."""
    upstream = run_stage0_to_stage4(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    ordered, classify_report = classify_and_order_items(upstream["items"])
    return {
        "source_results": upstream["source_results"],
        "items": ordered,
        "reports": {**upstream["reports"], "stage5": classify_report},
    }
def run_stage0_to_stage6(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Run stages 0-5, then generate the guide (stage 6); adds a ``guide`` key."""
    upstream = run_stage0_to_stage5(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
    )
    guide, guide_report = generate_guide(upstream["items"], llm_call=guide_llm_call)
    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": guide,
        "reports": {**upstream["reports"], "stage6": guide_report},
    }
def run_stage0_to_stage7(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
) -> dict[str, Any]:
    """Run stages 0-6, then assemble the Markdown (stage 7); adds a ``markdown`` key."""
    upstream = run_stage0_to_stage6(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    markdown, assemble_report = assemble_markdown(upstream["items"], upstream["guide"])
    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": markdown,
        "reports": {**upstream["reports"], "stage7": assemble_report},
    }
def run_stage0_to_stage8(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    semantic_llm_call: SemanticLlmCall,
    rewrite_llm_call: RewriteLlmCall,
    guide_llm_call: GuideLlmCall,
    mode: str,
    base_url: str,
    client: BlogClient | None,
) -> dict[str, Any]:
    """Run the pipeline through stage 7, then publish the Markdown (stage 8).

    The post slug is ``ai-<run_date>``. ``mode``, ``base_url`` and ``client``
    are handed to ``publish_markdown`` along with the stage-7 report.

    Returns:
        A dict with ``source_results``, ``items``, ``guide`` and ``markdown``
        from stage 7, the ``publish`` result object, and ``reports`` (the
        stage-7 reports plus a ``"stage8"`` summary of the publish result).
    """
    upstream = run_stage0_to_stage7(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
    )
    markdown = upstream["markdown"]
    publish_result = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=markdown,
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=f"ai-{run_date}",
        base_url=base_url,
        mode=mode,
        markdown_report=upstream["reports"]["stage7"],
        client=client,
    )
    # Flatten the publish result into plain values for the run report.
    reported_fields = ("mode", "status", "slug", "blog_url", "public_ok", "error")
    stage8_report = {name: getattr(publish_result, name) for name in reported_fields}
    return {
        "source_results": upstream["source_results"],
        "items": upstream["items"],
        "guide": upstream["guide"],
        "markdown": markdown,
        "publish": publish_result,
        "reports": {**upstream["reports"], "stage8": stage8_report},
    }

View File

@@ -0,0 +1,90 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Protocol
@dataclass
class PublishResult:
    """Outcome of a single publish attempt."""

    # Publish mode the attempt ran under, as supplied by the caller.
    mode: str
    # Outcome label; this module emits "ok", "blocked" and "failed".
    status: str
    # Slug of the post the result refers to.
    slug: str
    # URL built as "<base_url>/posts/<slug>".
    blog_url: str
    # Defaults to False; set explicitly by the code that builds the result.
    public_ok: bool = False
    # Error description when the attempt did not succeed, otherwise None.
    error: str | None = None
class BlogClient(Protocol):
    """Structural interface of the blog backend used for publishing."""

    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Create a post from ``payload`` and return the backend's response mapping."""
        ...

    def publish_post(self, slug: str) -> None:
        """Publish the post identified by ``slug``."""
        ...
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
    """Return the result of a dry run without contacting any blog backend.

    The result always reports mode ``"dry-run"``, status ``"ok"`` and
    ``public_ok=True``; the URL is ``<base_url>/posts/<slug>`` with trailing
    slashes stripped from ``base_url``.
    """
    blog_root = base_url.rstrip("/")
    post_url = f"{blog_root}/posts/{slug}"
    return PublishResult(mode="dry-run", status="ok", slug=slug, blog_url=post_url, public_ok=True)
def publish_markdown(
    *,
    title: str,
    markdown: str,
    tags: list[str],
    slug: str,
    base_url: str,
    mode: str,
    markdown_report: dict[str, Any],
    client: BlogClient | None,
) -> PublishResult:
    """Publish assembled Markdown according to ``mode``.

    Decision order:
      1. Any ``blocking_errors`` in ``markdown_report`` yields a ``"blocked"``
         result without touching the client.
      2. ``mode == "dry-run"`` delegates to ``dry_run_publish``.
      3. A missing client yields a ``"failed"`` result.
      4. Otherwise the post is created and, only when ``mode == "publish"``,
         published.

    Any exception raised by the client is captured into a ``"failed"``
    result rather than propagated.
    """
    blocking_errors = markdown_report.get("blocking_errors", []) or []
    blog_root = base_url.rstrip("/")
    default_url = f"{blog_root}/posts/{slug}"

    def not_published(status: str, error: str) -> PublishResult:
        # Shared shape of every unsuccessful outcome.
        return PublishResult(
            mode=mode,
            status=status,
            slug=slug,
            blog_url=default_url,
            public_ok=False,
            error=error,
        )

    if blocking_errors:
        return not_published("blocked", ";".join(blocking_errors))
    if mode == "dry-run":
        return dry_run_publish(slug, base_url)
    if client is None:
        return not_published("failed", "missing_blog_client")

    post_payload = {"title": title, "content": markdown, "tags": tags, "slug": slug}
    try:
        response = client.create_post(post_payload)
        # The backend may assign its own slug; fall back to the requested one.
        final_slug = response.get("slug") or slug
        go_public = mode == "publish"
        if go_public:
            client.publish_post(final_slug)
        return PublishResult(
            mode=mode,
            status="ok",
            slug=final_slug,
            blog_url=f"{blog_root}/posts/{final_slug}",
            public_ok=go_public,
        )
    except Exception as exc:
        return not_published("failed", f"{type(exc).__name__}: {exc}")

103
ai_daily_report/rewrite.py Normal file
View File

@@ -0,0 +1,103 @@
from __future__ import annotations
import json
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# Callable that takes a prompt string and returns the model's raw text reply.
RewriteLlmCall = Callable[[str], str]
def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
    """Split ``items`` into consecutive slices of at most ``size`` elements."""
    batches: list[list[NewsItem]] = []
    for start in range(0, len(items), size):
        batches.append(items[start : start + size])
    return batches
def _build_prompt(batch: list[NewsItem]) -> str:
    """Serialize a rewrite request for ``batch`` as a JSON string.

    The JSON object carries the task description, one entry per item (id,
    raw title, raw summary, source label, language hint) and the expected
    output schema. Non-ASCII characters are emitted verbatim.
    """
    task = (
        "Rewrite AI news titles and summaries into concise Chinese. Preserve brand/model/API names "
        "such as GPT-5, Codex, Gemini, Claude, API, MCP. Do not add facts."
    )
    item_entries = []
    for news in batch:
        item_entries.append(
            {
                "id": news.id,
                "title_raw": news.title_raw,
                "summary_raw": news.summary_raw,
                "source": news.source_label,
                "language_hint": news.language_hint,
            }
        )
    rewrite_template = {
        "id": "item id",
        "title": "display title",
        "summary": "display summary",
        "flags": [],
    }
    request = {
        "task": task,
        "items": item_entries,
        "output_schema": {"rewrites": [rewrite_template]},
    }
    return json.dumps(request, ensure_ascii=False)
def _fallback(item: NewsItem) -> None:
    """Fill the display fields of ``item`` from its raw fields, in place.

    The title is copied as-is; an empty raw summary is replaced by a fixed
    placeholder sentence.
    """
    raw_summary = item.summary_raw
    item.title = item.title_raw
    item.summary = raw_summary if raw_summary else "该条目暂无摘要。"
def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int:
    """Ask the LLM to rewrite ``batch`` and apply the result to the items in place.

    Args:
        batch: Items whose ``title``/``summary`` should be filled in.
        llm_call: Callable that receives the prompt and returns the raw reply.

    Returns:
        The number of items that received a rewrite (equal to ``len(batch)``
        on success, counting distinct ids).

    Raises:
        ValueError: If ``rewrites`` is not a list, or if any item in the
            batch has no usable rewrite (non-empty title and summary).
        Exception: Whatever ``llm_call`` or ``parse_json_object`` raises.
    """
    obj = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = obj.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")

    by_id = {item.id: item for item in batch}
    seen_ids: set[str] = set()
    for entry in rewrites:
        # Model output is untrusted: ignore stray non-object entries and
        # non-string ids instead of letting them abort the whole batch.
        if not isinstance(entry, dict):
            continue
        item_id = entry.get("id")
        if not isinstance(item_id, str) or item_id not in by_id:
            continue
        title = str(entry.get("title") or "").strip()
        summary = str(entry.get("summary") or "").strip()
        if title and summary:
            target = by_id[item_id]
            target.title = title
            target.summary = summary
            seen_ids.add(item_id)

    # Every item must have been rewritten; the caller falls back otherwise.
    for item in batch:
        if item.id not in seen_ids:
            raise ValueError(f"missing_rewrite_for_item: {item.id}")
    return len(seen_ids)
def rewrite_items(
    items: list[NewsItem],
    *,
    llm_call: RewriteLlmCall,
    batch_size: int = 10,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Rewrite titles and summaries of ``items`` in batches, in place.

    Each batch is sent to the LLM once. If that fails for any reason, every
    item of the batch is retried on its own; an item whose individual retry
    also fails keeps its raw title/summary via ``_fallback``.

    Args:
        items: Items to rewrite (mutated in place and returned).
        llm_call: Callable that receives a prompt and returns the raw reply.
        batch_size: Items per LLM call; values below 1 are treated as 1.

    Returns:
        ``(items, report)`` where ``report`` holds the input, rewritten,
        fallback and batch counts plus the collected error strings.
    """
    batches = _chunks(items, max(1, batch_size))
    rewritten_count = 0
    fallback_count = 0
    errors: list[str] = []

    for batch in batches:
        try:
            rewritten_count += _apply_rewrite_batch(batch, llm_call)
            continue
        except Exception as exc:
            errors.append(f"batch:{type(exc).__name__}: {exc}")

        # Batch failed: retry item by item so one bad entry cannot sink the rest.
        for item in batch:
            try:
                rewritten_count += _apply_rewrite_batch([item], llm_call)
            except Exception as item_exc:
                errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                _fallback(item)
                fallback_count += 1

    return items, {
        "input_count": len(items),
        "rewritten_count": rewritten_count,
        "fallback_count": fallback_count,
        "batch_count": len(batches),
        "errors": errors,
    }

156
ai_daily_report/runner.py Normal file
View File

@@ -0,0 +1,156 @@
from __future__ import annotations
import json
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any
from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as default_fetch_text
from .config import load_source_configs
from .env import load_env, resolve_blog_token, resolve_llm_config
from .models import SourceConfig
from .pipeline import run_stage0_to_stage8
from .sources.registry import get_source_fetcher
def _json_default(value: Any):
    """``json.dumps`` fallback: turn dataclass instances into plain dicts.

    Raises:
        TypeError: For any value that is not a dataclass.
    """
    if not is_dataclass(value):
        raise TypeError(f"Object is not JSON serializable: {type(value).__name__}")
    return asdict(value)
def _mock_source_configs() -> list[SourceConfig]:
    """Return the single source configuration used in mock source mode."""
    mock_source = SourceConfig(name="Mock AI HOT", type="mock", role="primary", priority=10)
    return [mock_source]
def _mock_fetcher(config: SourceConfig, run_date: str) -> list[dict[str, Any]]:
    """Return one fixed raw news entry; both arguments are ignored."""
    sample_entry = {
        "title_raw": "GPT-5 API 发布",
        "summary_raw": "OpenAI 发布 GPT-5 API用于本地 mock 测试。",
        "url": "https://example.com/gpt5",
        "source_label": "OpenAIBlog",
        "section_hint": "模型发布/更新",
        "origin_type": "mock",
        "language_hint": "zh",
    }
    return [sample_entry]
def _mock_semantic_llm(prompt: str) -> str:
    """Return a JSON reply reporting no duplicates; ``prompt`` is ignored."""
    empty_verdict = {"duplicate_groups": [], "not_duplicates": [], "uncertain": []}
    return json.dumps(empty_verdict, ensure_ascii=False)
def _mock_rewrite_llm(prompt: str) -> str:
    """Echo each prompt item back as its own rewrite.

    ``prompt`` must be the JSON produced for a rewrite request; every item's
    raw title and summary are returned unchanged as the display values.
    """
    request = json.loads(prompt)
    echoed = []
    for entry in request["items"]:
        echoed.append(
            {
                "id": entry["id"],
                "title": entry["title_raw"],
                "summary": entry["summary_raw"],
                "flags": [],
            }
        )
    return json.dumps({"rewrites": echoed}, ensure_ascii=False)
def _mock_guide_llm(prompt: str) -> str:
    """Return a fixed guide whose single thread references the first three items.

    ``prompt`` must be JSON with an ``items`` list whose entries carry ``id``.
    """
    request = json.loads(prompt)
    leading_ids = [entry["id"] for entry in request["items"][:3]]
    thread = {
        "title": "本地链路验证",
        "text": "采集、改写、分类、导览、Markdown 和发布报告都已通过 mock 数据串联。",
        "item_ids": leading_ids,
        "kind": "thread",
    }
    guide = {
        "theme": "本地 mock 模式已生成 AI 日报,用于验证流水线。",
        "threads": [thread],
    }
    return json.dumps(guide, ensure_ascii=False)
def run_daily_report(
    *,
    run_date: str,
    mode: str,
    source_mode: str,
    llm_mode: str,
    out_dir: Path,
    base_url: str,
    sources_path: Path | None = None,
    fetch_text=None,
    env: dict[str, str] | None = None,
    llm_client_factory=OpenAICompatibleClient,
    blog_client_factory=BlogApiClient,
) -> dict[str, Any]:
    """Wire up sources, LLM callables and blog client, run the pipeline, save outputs.

    Args:
        run_date: Date string passed to the pipeline and used as the output
            sub-directory name.
        mode: Publish mode; ``"draft"`` and ``"publish"`` require a blog token.
        source_mode: ``"mock"`` for built-in sample data or ``"live"`` to load
            source configs from ``sources_path``.
        llm_mode: ``"mock"`` for canned replies or ``"live"`` for a real client.
        out_dir: Directory under which ``<run_date>/`` is created.
        base_url: Blog base URL.
        sources_path: Source config file; defaults to ``config/sources.json``.
        fetch_text: Text fetcher handed to source adapters; defaults to the
            package's own fetcher.
        env: Environment mapping; loaded via ``load_env`` when ``None``.
        llm_client_factory: Factory called with the resolved LLM config.
        blog_client_factory: Factory called with ``base_url`` and ``token``.

    Returns:
        A dict with ``run_dir``, ``markdown``, ``reports`` and ``publish``.

    Raises:
        ValueError: For an unknown ``source_mode`` or ``llm_mode``, or when a
            blog token is required but missing.
    """
    if not fetch_text:
        fetch_text = default_fetch_text
    if env is None:
        env = load_env()

    # Source side: configs plus a fetcher with the (config, date) signature.
    if source_mode == "mock":
        source_configs = _mock_source_configs()
        fetcher = _mock_fetcher
    elif source_mode == "live":
        config_file = sources_path if sources_path is not None else Path("config") / "sources.json"
        source_configs = load_source_configs(config_file)

        def fetcher(config: SourceConfig, current_date: str) -> list[dict[str, Any]]:
            # Dispatch on the source type and inject the shared text fetcher.
            return get_source_fetcher(config.type)(config, current_date, fetch_text)

    else:
        raise ValueError("source_mode must be 'mock' or 'live'")

    # LLM side: three callables, one per LLM-backed stage.
    if llm_mode == "mock":
        semantic_llm_call = _mock_semantic_llm
        rewrite_llm_call = _mock_rewrite_llm
        guide_llm_call = _mock_guide_llm
    elif llm_mode == "live":
        chat = llm_client_factory(**resolve_llm_config(env)).chat
        semantic_llm_call = rewrite_llm_call = guide_llm_call = chat
    else:
        raise ValueError("llm_mode must be 'mock' or 'live'")

    # Only modes that really talk to the blog need credentials.
    blog_client = None
    if mode == "draft" or mode == "publish":
        token = resolve_blog_token(env)
        if not token:
            raise ValueError("missing_blog_token: set BLOG_SERVICE_TOKEN or EPHRON_SERVICE_TOKEN")
        blog_client = blog_client_factory(base_url=base_url, token=token)

    result = run_stage0_to_stage8(
        source_configs,
        run_date,
        fetcher=fetcher,
        semantic_llm_call=semantic_llm_call,
        rewrite_llm_call=rewrite_llm_call,
        guide_llm_call=guide_llm_call,
        mode=mode,
        base_url=base_url,
        client=blog_client,
    )

    # Persist the Markdown and the per-stage reports next to each other.
    run_dir = out_dir / run_date
    run_dir.mkdir(parents=True, exist_ok=True)
    markdown_path = run_dir / "blog_markdown.md"
    report_path = run_dir / "run_report.json"
    markdown_path.write_text(result["markdown"], encoding="utf-8")
    report_json = json.dumps(result["reports"], ensure_ascii=False, indent=2, default=_json_default)
    report_path.write_text(report_json, encoding="utf-8")

    return {
        "run_dir": str(run_dir),
        "markdown": result["markdown"],
        "reports": result["reports"],
        "publish": result["publish"],
    }

View File

@@ -0,0 +1,167 @@
from __future__ import annotations
import json
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# Callable that takes a prompt string and returns the model's raw text reply.
SemanticLlmCall = Callable[[str], str]
def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> str:
    """Serialize a semantic-dedup request as a JSON string.

    Each item contributes its id, display title/summary (falling back to the
    raw values when the display ones are empty), source label and section
    hint. ``candidates`` is embedded unchanged.
    """
    described_items = []
    for news in items:
        described_items.append(
            {
                "id": news.id,
                "title": news.title or news.title_raw,
                "summary": news.summary or news.summary_raw,
                "source": news.source_label,
                "section_hint": news.section_hint,
            }
        )
    group_template = {
        "keep_id": "item id",
        "remove_ids": ["item id"],
        "confidence": "high|medium|low",
        "reason": "same concrete event reason",
    }
    request = {
        "task": "Identify only high-confidence semantic duplicates. Do not curate or remove by importance.",
        "items": described_items,
        "candidates": candidates,
        "output_schema": {
            "duplicate_groups": [group_template],
            "not_duplicates": [],
            "uncertain": [],
        },
    }
    return json.dumps(request, ensure_ascii=False)
def _score(item: NewsItem) -> int:
    """Rank an item for the "which duplicate do we keep" decision.

    Higher is better. The score is the sum of:
      * ``200 - source_priority`` (never below 0),
      * 10 if the source role is ``"primary"``,
      * the raw summary length, capped at 40,
      * 20 if a canonical URL is present,
      * minus 10 per quality flag.
    """
    raw_summary = item.summary_raw
    components = (
        max(0, 200 - item.source_priority),
        10 if item.source_role == "primary" else 0,
        min(40, len(raw_summary)) if raw_summary else 0,
        20 if item.canonical_url else 0,
        -10 * len(item.quality_flags),
    )
    return sum(components)
def _choose_keep(group_items: list[NewsItem], suggested_keep_id: str) -> NewsItem:
    """Pick the item of a duplicate group that survives.

    The suggested item wins as long as its score is at most 10 points below
    the best score in the group; otherwise (or when the suggested id is not
    in the group) the best-scoring item is returned.
    """
    best = max(group_items, key=_score)
    for candidate in group_items:
        if candidate.id == suggested_keep_id:
            # Only the first item carrying the suggested id is considered.
            within_tolerance = _score(candidate) >= _score(best) - 10
            return candidate if within_tolerance else best
    return best
def _dedup_report(
    items: list[NewsItem],
    candidates: list[dict[str, Any]],
    *,
    removed_count: int = 0,
    duplicate_groups: list[dict[str, Any]] | None = None,
    uncertain: list[Any] | None = None,
    errors: list[str] | None = None,
    skipped_for_deletion_ratio: bool = False,
) -> dict[str, Any]:
    """Build the stage report dict shared by every exit path of the dedup stage."""
    return {
        "input_count": len(items),
        "candidate_group_count": len(candidates),
        "removed_count": removed_count,
        "duplicate_groups": duplicate_groups if duplicate_groups is not None else [],
        "uncertain": uncertain if uncertain is not None else [],
        "errors": errors if errors is not None else [],
        "skipped_for_deletion_ratio": skipped_for_deletion_ratio,
    }


def semantic_dedup_items(
    items: list[NewsItem],
    candidates: list[dict[str, Any]],
    *,
    llm_call: SemanticLlmCall,
    max_deletion_ratio: float = 0.5,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Remove high-confidence semantic duplicates as judged by an LLM.

    Only groups that the LLM marks ``"high"`` confidence, whose ids all exist
    in ``items`` and that fall entirely inside one candidate group are acted
    on. The surviving item of each group is chosen by ``_choose_keep`` and
    receives a provenance entry in ``duplicate_sources`` for every removed
    item.

    Safety rules:
      * The LLM reply is untrusted: malformed groups are recorded in
        ``errors`` and skipped instead of raising.
      * An item kept by an earlier group is never removed by a later one, and
        an item already scheduled for removal is ignored, so contradictory
        groups cannot delete every copy of a story.
      * Groups that end up removing nothing are not reported.
      * If the removals would exceed ``max_deletion_ratio`` of the input,
        nothing is removed and the report flags
        ``skipped_for_deletion_ratio``.

    Args:
        items: Items to deduplicate. Kept items are mutated
            (``duplicate_sources``); the list itself is not.
        candidates: Candidate groups, each a dict with an ``item_ids`` list.
        llm_call: Callable that receives the prompt and returns the raw reply.
        max_deletion_ratio: Upper bound on removed/input before bailing out.

    Returns:
        ``(remaining_items, report)``.
    """
    if not items or not candidates:
        return items, _dedup_report(items, candidates)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items, candidates)))
    except Exception as exc:
        # Dedup is best-effort: on any LLM/parse failure keep everything.
        return items, _dedup_report(items, candidates, errors=[f"{type(exc).__name__}: {exc}"])

    errors: list[str] = []
    by_id = {item.id: item for item in items}
    candidate_sets = {
        frozenset(item_id for item_id in candidate.get("item_ids", []) if isinstance(item_id, str))
        for candidate in candidates
    }
    uncertain = obj.get("uncertain", []) or []

    raw_groups = obj.get("duplicate_groups", []) or []
    if not isinstance(raw_groups, list):
        errors.append(f"invalid_duplicate_groups: {raw_groups}")
        raw_groups = []

    kept_ids: set[str] = set()
    candidate_removals: set[str] = set()
    valid_groups: list[dict[str, Any]] = []
    for group in raw_groups:
        if not isinstance(group, dict):
            errors.append(f"invalid_group: {group}")
            continue
        if group.get("confidence") != "high":
            continue

        remove_ids = group.get("remove_ids") or []
        if not isinstance(remove_ids, list):
            errors.append(f"invalid_ids_in_group: {group}")
            continue
        ids = [group.get("keep_id"), *remove_ids]
        if any(not isinstance(item_id, str) or item_id not in by_id for item_id in ids):
            errors.append(f"invalid_ids_in_group: {group}")
            continue

        group_set = frozenset(ids)
        if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
            errors.append(f"group_outside_candidates: {group}")
            continue

        # dict.fromkeys drops repeated ids while keeping their order; members
        # already scheduled for removal by an earlier group are ignored.
        live_items = [
            by_id[item_id] for item_id in dict.fromkeys(ids) if item_id not in candidate_removals
        ]
        if len(live_items) < 2:
            continue

        # A survivor of an earlier group stays the survivor here as well.
        earlier_keeps = [item for item in live_items if item.id in kept_ids]
        if earlier_keeps:
            keep = earlier_keeps[0]
        else:
            keep = _choose_keep(live_items, str(group.get("keep_id")))
        remove_items = [
            item for item in live_items if item is not keep and item.id not in kept_ids
        ]
        if not remove_items:
            continue

        kept_ids.add(keep.id)
        candidate_removals.update(item.id for item in remove_items)
        valid_groups.append(
            {
                "keep_id": keep.id,
                "remove_ids": [item.id for item in remove_items],
                "confidence": "high",
                "reason": str(group.get("reason") or "semantic_duplicate"),
            }
        )

    deletion_ratio = len(candidate_removals) / len(items) if items else 0
    if deletion_ratio > max_deletion_ratio:
        # Suspiciously aggressive verdict: report it but remove nothing.
        return items, _dedup_report(
            items,
            candidates,
            duplicate_groups=valid_groups,
            uncertain=uncertain,
            errors=errors,
            skipped_for_deletion_ratio=True,
        )

    removed_ids: set[str] = set()
    for group in valid_groups:
        keep = by_id[group["keep_id"]]
        for remove_id in group["remove_ids"]:
            removed = by_id[remove_id]
            # Remember where the dropped copy came from on the surviving item.
            keep.duplicate_sources.append(
                {
                    "id": removed.id,
                    "source_group": removed.source_group,
                    "source_label": removed.source_label,
                    "url": removed.url,
                    "reason": group["reason"],
                }
            )
            removed_ids.add(remove_id)

    deduped = [item for item in items if item.id not in removed_ids]
    report = _dedup_report(
        items,
        candidates,
        removed_count=len(removed_ids),
        duplicate_groups=valid_groups,
        uncertain=uncertain,
        errors=errors,
    )
    return deduped, report

View File

@@ -0,0 +1,2 @@
"""Source adapters for the AI daily report pipeline."""

View File

@@ -0,0 +1,32 @@
from __future__ import annotations
import json
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
# Text fetcher: called with (url, timeout_seconds), returns the response body.
FetchText = Callable[[str, int], str]
def fetch_aihot(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Fetch the AI HOT daily JSON for ``run_date`` and flatten it to raw items.

    Every item of every section becomes one dict. Missing text fields become
    empty strings, the source label falls back to ``config.name``, and all
    items share the document-level ``generatedAt`` value as ``published_at``.
    """
    endpoint = f"https://aihot.virxact.com/api/public/daily/{run_date}"
    data = json.loads(fetch_text(endpoint, config.timeout_seconds))
    generated = data.get("generatedAt")
    return [
        {
            "source_group": config.name,
            "source_label": raw.get("sourceName") or config.name,
            "title_raw": raw.get("title") or "",
            "summary_raw": raw.get("summary") or "",
            "url": raw.get("sourceUrl") or "",
            "published_at": generated,
            "origin_type": "aihot_json",
            "section_hint": section.get("label") or "",
            "language_hint": "zh",
        }
        for section in data.get("sections", []) or []
        for raw in section.get("items", []) or []
    ]

View File

@@ -0,0 +1,58 @@
from __future__ import annotations
import re
import xml.etree.ElementTree as ET
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
from ai_daily_report.sources.labels import source_label_from_url
# Text fetcher: called with (url, timeout_seconds), returns the response body.
FetchText = Callable[[str, int], str]
def parse_juya_rss(config: SourceConfig, xml_text: str, run_date: str) -> list[dict[str, Any]]:
    """Extract the news entries of one daily issue from the Juya RSS feed.

    The feed item whose ``<title>`` equals ``run_date`` is located and its
    ``content:encoded`` HTML is split into blocks, one per
    ``<h2>…<code>#N</code></h2>`` heading. Each block with a non-empty title
    becomes one raw item. The URL is the first link in the block body,
    falling back to the heading's own link.

    Args:
        config: Source configuration; only ``config.name`` is read here.
        xml_text: Raw RSS XML.
        run_date: Issue title to look for.

    Returns:
        Raw item dicts, or an empty list when no matching issue (or no
        content) is found.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    # NOTE(review): xml.etree is not hardened against malicious XML; the feed
    # is assumed to be trusted.
    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    raw_items = channel.findall("item") if channel is not None else []

    article_html = ""
    for raw in raw_items:
        if (raw.findtext("title") or "").strip() != run_date:
            continue
        content_el = raw.find("{http://purl.org/rss/1.0/modules/content/}encoded")
        article_html = content_el.text if content_el is not None and content_el.text else ""
        break
    if not article_html:
        return []

    # Both the opening <a …> and the closing </a> are optional as whole
    # groups, so headings without a link are matched too. A block body runs
    # until the next "<hr><h2", the trailing notice paragraph, or the end.
    block_pattern = re.compile(
        r'<h2[^>]*>\s*(?:<a[^>]*href="(?P<title_url>[^"]+)"[^>]*>)?(?P<title_html>[^<]*?)(?:</a>)?\s*<code>#(?P<num>\d+)</code>\s*</h2>(?P<body>.*?)(?=<hr\s*/?>\s*<h2|<p><strong>提示</strong>|$)',
        re.S | re.I,
    )
    items: list[dict[str, Any]] = []
    for match in block_pattern.finditer(article_html):
        title = clean_text(match.group("title_html") or "")
        if not title:
            continue
        body_html = match.group("body") or ""
        links = re.findall(r'<a[^>]*href="([^"]+)"[^>]*>', body_html, re.I)
        raw_url = links[0] if links else (match.group("title_url") or "")
        # href values are HTML-escaped; decode "&amp;" for both URL origins.
        url = raw_url.replace("&amp;", "&").strip()
        summary = clean_text(re.sub(r"<[^>]+>", " ", body_html))
        items.append(
            {
                "source_group": config.name,
                "source_label": source_label_from_url(url, fallback=config.name),
                "title_raw": title,
                "summary_raw": summary[:500],
                "url": url,
                "published_at": None,
                "origin_type": "juya_issue",
                "section_hint": "",
                "language_hint": "zh",
            }
        )
    return items
def fetch_juya(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the Juya feed at ``config.url`` and parse the issue for ``run_date``."""
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_juya_rss(config, feed_xml, run_date)

View File

@@ -0,0 +1,78 @@
from __future__ import annotations
from urllib.parse import urlparse
# Display label per known domain. A host matches an entry when it equals the
# domain or is a sub-domain of it; entries are tried in the order listed.
DOMAIN_LABELS = {
    "anthropic.com": "Anthropic",
    "arxiv.org": "arXiv",
    "bloomberg.com": "Bloomberg",
    "deepseek.com": "DeepSeek",
    "github.blog": "GitHub Blog",
    "github.com": "GitHub",
    "huggingface.co": "Hugging Face",
    "infoq.com": "InfoQ",
    "mp.weixin.qq.com": "微信公众号",
    "openai.com": "OpenAI",
    "platform.minimaxi.com": "MiniMaxDocs",
    "qbitai.com": "量子位",
    "techcrunch.com": "TechCrunch",
    "technologyreview.com": "MIT科技评论AI",
    "theverge.com": "The Verge",
    # Both X host names share one label.
    "x.com": "X",
    "twitter.com": "X",
}
# Display name per X handle (the lookup key is case-sensitive); handles not
# listed here are shown as-is.
X_DISPLAY_NAMES = {
    "MiniMax_AI": "MiniMax",
    "OpenAIDevs": "OpenAI Developers",
    "openai": "OpenAI",
    "openclaw": "OpenClaw",
    "xai": "xAI",
    "krea_ai": "Krea AI",
    "nvidia": "NVIDIA",
    "NVIDIAAI": "NVIDIA AI",
    "alibaba_cloud": "阿里云 / Alibaba Cloud",
    "cb_doge": "cb_doge",
}
def _host(url: str) -> str:
    """Return the lower-cased host name of ``url`` without a leading ``www.``.

    Port and user-info parts are excluded so the result can be compared with
    bare domain names. An empty string is returned when the URL has no
    network location (for example when the scheme is missing).
    """
    # ``hostname`` (unlike ``netloc``) drops "user:pass@" and ":port".
    host = (urlparse(url).hostname or "").lower()
    return host[4:] if host.startswith("www.") else host
def _domain_label(host: str) -> str:
    """Map ``host`` to its display label, or return ``host`` itself if unknown.

    A host matches a known domain when it is that domain or one of its
    sub-domains; the first matching entry of ``DOMAIN_LABELS`` wins.
    """
    matching_labels = (
        label
        for domain, label in DOMAIN_LABELS.items()
        if host == domain or host.endswith("." + domain)
    )
    return next(matching_labels, host)
def _x_handle(url: str) -> str:
    """Return the account handle from an X/Twitter URL, or ``""`` if there is none.

    The handle is the first non-empty path segment, unless that segment is
    one of the site's own reserved routes.
    """
    reserved_routes = {"i", "search", "explore", "settings", "notifications", "home", "compose"}
    segments = [segment for segment in urlparse(url).path.split("/") if segment]
    if segments and segments[0] not in reserved_routes:
        return segments[0]
    return ""
def source_label_from_url(url: str, *, fallback: str = "来源") -> str:
    """Derive a human-readable source label from ``url``.

    * Empty URL: ``fallback``.
    * X/Twitter URL: ``"X"`` plus display name and handle when the URL names
      an account, otherwise just ``"X"``.
    * Anything else: the domain label, with ``"Blog"`` appended when the host
      or path looks like a blog or research page, unless the label already
      ends in "blog" (so ``github.blog`` does not become "GitHub BlogBlog").

    Args:
        url: Link of the news item; may be empty.
        fallback: Label used when nothing can be derived from ``url``.
    """
    if not url:
        return fallback

    host = _host(url)
    if host in {"x.com", "twitter.com"}:
        handle = _x_handle(url)
        if handle:
            display = X_DISPLAY_NAMES.get(handle, handle)
            return f"X{display} (@{handle})"
        return "X"

    label = _domain_label(host)
    if not label:
        return fallback
    path = (urlparse(url).path or "").lower()
    looks_like_blog = "blog" in host or "/blog" in path or "/research" in path
    # Skip the suffix when the label already says "Blog".
    if looks_like_blog and not label.lower().endswith("blog"):
        return f"{label}Blog"
    return label

View File

@@ -0,0 +1,24 @@
from __future__ import annotations
from typing import Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.sources.aihot import fetch_aihot
from ai_daily_report.sources.juya import fetch_juya
from ai_daily_report.sources.rss import fetch_rss
# A source adapter: called with (config, run_date, fetch_text), returns raw item dicts.
SourceFetcher = Callable[[SourceConfig, str, Callable[[str, int], str]], list[dict]]

# Registry mapping a source config's ``type`` string to its adapter.
SOURCE_FETCHERS: dict[str, SourceFetcher] = {
    "aihot": fetch_aihot,
    "rss": fetch_rss,
    "juya_rss": fetch_juya,
}
def get_source_fetcher(source_type: str) -> SourceFetcher:
    """Look up the adapter registered for ``source_type``.

    Raises:
        KeyError: If no adapter is registered under ``source_type``.
    """
    try:
        return SOURCE_FETCHERS[source_type]
    except KeyError:
        # Re-raise with a descriptive message instead of the bare key.
        raise KeyError(f"Unknown source type: {source_type}") from None

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from typing import Any, Callable
from ai_daily_report.models import SourceConfig
from ai_daily_report.normalize import clean_text
# Text fetcher: called with (url, timeout_seconds), returns the response body.
FetchText = Callable[[str, int], str]
def _parse_pubdate(value: str) -> str | None:
    """Convert an RFC 2822 style date string to ISO 8601, or ``None``.

    ``None`` is returned for an empty value and for anything the e-mail date
    parser cannot handle.
    """
    if not value:
        return None
    try:
        iso_stamp = parsedate_to_datetime(value).isoformat()
    except Exception:
        # Unparseable dates are tolerated; the item simply has no timestamp.
        iso_stamp = None
    return iso_stamp
def parse_rss_items(config: SourceConfig, xml_text: str, *, limit: int = 20) -> list[dict[str, Any]]:
    """Parse an RSS document into raw item dicts.

    Only the first ``limit`` ``<item>`` elements of the ``<channel>`` are
    looked at; items without a title are dropped. The language hint is
    ``"en"`` for pure-ASCII titles and ``"zh"`` otherwise.

    Args:
        config: Source configuration; ``config.name`` is used as both source
            group and source label.
        xml_text: Raw RSS XML.
        limit: Maximum number of feed items to inspect.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    channel = ET.fromstring(xml_text).find("channel")
    if channel is None:
        return []

    entries: list[dict[str, Any]] = []
    for node in channel.findall("item")[:limit]:
        title = clean_text(node.findtext("title") or "")
        if title:
            summary = clean_text(node.findtext("description") or "")
            title_is_ascii = title.encode("utf-8").isascii()
            entries.append(
                {
                    "source_group": config.name,
                    "source_label": config.name,
                    "title_raw": title,
                    "summary_raw": summary,
                    "url": (node.findtext("link") or "").strip(),
                    "published_at": _parse_pubdate(node.findtext("pubDate") or ""),
                    "origin_type": "rss",
                    "section_hint": "",
                    "language_hint": "en" if title_is_ascii else "zh",
                }
            )
    return entries
def fetch_rss(config: SourceConfig, run_date: str, fetch_text: FetchText) -> list[dict[str, Any]]:
    """Download the feed at ``config.url`` and parse its items.

    ``run_date`` is accepted for signature compatibility with the other
    adapters and is not used.
    """
    feed_xml = fetch_text(config.url, config.timeout_seconds)
    return parse_rss_items(config, feed_xml)

View File

@@ -0,0 +1,46 @@
from __future__ import annotations
import re
from typing import Any
from .classify import SECTION_ORDER
from .models import NewsItem
def validate_report_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Alias of ``validate_markdown``; returns its report unchanged."""
    report = validate_markdown(markdown, items)
    return report
def validate_markdown(markdown: str, items: list[NewsItem]) -> dict[str, Any]:
    """Check assembled Markdown and its items before publishing.

    Blocking errors (each reported at most once):
      * ``no_items``: the item list is empty.
      * ``markdown_too_short``: fewer than 80 non-blank-padded characters.
      * ``no_sections``: there are items but no ``"## "`` heading.
      * ``json_fragment_detected``: a ``{...}`` span is present.
      * ``invalid_section``: some item's section is not in ``SECTION_ORDER``.

    ``auto_fixes`` lists cosmetic problems that were detected (a doubled
    blockquote marker, leftover reference markers). ``warnings`` contains
    one ``missing_url`` entry for every item without a URL.

    Args:
        markdown: Assembled Markdown; ``None`` is treated as empty text.
        items: Items the Markdown was built from.

    Returns:
        A report dict with counts, ``auto_fixes``, ``warnings`` and
        ``blocking_errors``.
    """
    # Normalize once so every check tolerates a missing document.
    text = markdown or ""
    blocking_errors: list[str] = []
    auto_fixes: list[str] = []
    warnings: list[dict[str, str]] = []

    if not items:
        blocking_errors.append("no_items")
    if len(text.strip()) < 80:
        blocking_errors.append("markdown_too_short")
    if items and "## " not in text:
        blocking_errors.append("no_sections")
    if re.search(r"\{[^{}]*\}", text):
        blocking_errors.append("json_fragment_detected")
    if "> >" in text:
        auto_fixes.append("double_blockquote_detected")
    if re.search(r"\[\d+\]|\[N\]", text):
        auto_fixes.append("reference_marker_detected")

    invalid_section_reported = False
    for item in items:
        if not item.url:
            warnings.append({"type": "missing_url", "item_id": item.id})
        # Report the error once but keep scanning so that later items still
        # get their missing-URL warnings.
        if not invalid_section_reported and item.section not in SECTION_ORDER:
            blocking_errors.append("invalid_section")
            invalid_section_reported = True

    return {
        "item_count": len(items),
        "section_count": len({item.section for item in items if item.section}),
        "markdown_length": len(text),
        "auto_fixes": auto_fixes,
        "warnings": warnings,
        "blocking_errors": blocking_errors,
    }