Refactor AI daily report pipeline

This commit is contained in:
Mimikko-zeus
2026-06-04 15:21:56 +08:00
parent 94e18ce22d
commit 5a98696255
64 changed files with 4778 additions and 1316 deletions

113
ai_daily_report/guide.py Normal file
View File

@@ -0,0 +1,113 @@
from __future__ import annotations
import json
import re
from typing import Any, Callable
from .llm import parse_json_object
from .models import NewsItem
# Shape of the injected LLM client: it receives the prompt string and returns
# the raw reply text (which generate_guide hands to parse_json_object).
GuideLlmCall = Callable[[str], str]
def _clean_text(text: str, limit: int | None = None) -> str:
    """Normalize a fragment of LLM-produced text into a single clean line.

    Steps, in order:
      1. drop one leading blockquote marker (``>``) and surrounding whitespace;
      2. remove citation-style markers such as ``[3]`` or the literal ``[N]``;
      3. collapse every whitespace run into a single space and trim the ends;
      4. if ``limit`` is truthy and the result is longer, cut it to ``limit``
         characters and strip trailing whitespace left by the cut.

    Args:
        text: Raw text; a falsy value (``None``/``""``) is treated as empty.
        limit: Maximum length of the result. ``None`` or ``0`` disables
            truncation.

    Returns:
        The cleaned (and possibly truncated) string.
    """
    unquoted = re.sub(r"^\s*>\s*", "", text or "").strip()
    without_markers = re.sub(r"\[\d+\]|\[N\]", "", unquoted)
    collapsed = re.sub(r"\s+", " ", without_markers).strip()

    # A falsy limit (None or 0) means "no truncation".
    if not limit or len(collapsed) <= limit:
        return collapsed
    return collapsed[:limit].rstrip()
def _build_prompt(items: list[NewsItem]) -> str:
    """Serialize the guide-generation request for the LLM as a JSON string.

    The payload has three keys, in this order: ``task`` (the instructions),
    ``items`` (one compact record per news item) and ``output_schema`` (a
    description of the JSON the model is asked to return).

    Args:
        items: News items to summarize. For each item the processed ``title``
            and ``summary`` are used, falling back to ``title_raw`` and
            ``summary_raw`` when the processed value is falsy.

    Returns:
        The payload encoded with ``ensure_ascii=False`` so non-ASCII text is
        emitted verbatim rather than as ``\\uXXXX`` escapes.
    """
    instructions = (
        "Generate a concise AI daily report guide. Return JSON only. Do not use 强信号/中信号/待验证. "
        "Use a short theme and 2-4 daily threads. Every thread must reference existing item_ids."
    )

    records = []
    for news in items:
        records.append(
            {
                "id": news.id,
                "title": news.title or news.title_raw,
                "summary": news.summary or news.summary_raw,
                "section": news.section,
                "source": news.source_label,
            }
        )

    thread_schema = {
        "title": "thread title",
        "text": "one or two sentences",
        "item_ids": ["existing item id"],
        "kind": "thread|uncertain",
    }
    schema = {
        "theme": "one sentence under 120 Chinese characters",
        "threads": [thread_schema],
    }

    request = {"task": instructions, "items": records, "output_schema": schema}
    return json.dumps(request, ensure_ascii=False)
def _empty_guide_result(
    input_count: int,
    *,
    fallback_used: bool = False,
    errors: list[str] | None = None,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Return an empty guide paired with a report explaining why it is empty."""
    guide: dict[str, Any] = {"theme": "", "threads": []}
    report: dict[str, Any] = {
        "input_count": input_count,
        "theme_present": False,
        "thread_count": 0,
        "dropped_thread_count": 0,
        "fallback_used": fallback_used,
        "errors": list(errors or []),
    }
    return guide, report


def generate_guide(
    items: list[NewsItem],
    *,
    llm_call: GuideLlmCall,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Ask the LLM for a daily-report guide and validate what it returns.

    The model reply is untrusted: every thread is checked against the ids of
    ``items`` and malformed threads are dropped (and counted) instead of
    raising, so one bad reply cannot crash the caller.

    Args:
        items: News items the guide should cover. When empty, the LLM is not
            called and an empty guide is returned.
        llm_call: Callable that takes the prompt string and returns the raw
            model reply.

    Returns:
        A ``(guide, report)`` tuple.

        ``guide`` is ``{"theme": str, "threads": [...]}`` where each thread is
        ``{"title", "text", "item_ids", "kind"}``; ``kind`` is ``"thread"`` or
        ``"uncertain"`` (anything else is coerced to ``"thread"``).

        ``report`` holds ``input_count``, ``theme_present``, ``thread_count``,
        ``dropped_thread_count``, ``fallback_used`` and ``errors``. When the
        LLM call or the parsing fails, ``fallback_used`` is ``True``, the
        guide is empty and ``errors`` contains ``"<ExceptionType>: <message>"``.
    """
    if not items:
        return _empty_guide_result(0)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items)))
        # The code below needs mapping access (``obj.get``). A non-dict result
        # is routed through the same fallback as any other generation failure
        # rather than raising AttributeError later, outside this handler.
        if not isinstance(obj, dict):
            raise TypeError(f"expected a JSON object, got {type(obj).__name__}")
    except Exception as exc:  # deliberate best-effort boundary: report, don't crash
        return _empty_guide_result(
            len(items),
            fallback_used=True,
            errors=[f"{type(exc).__name__}: {exc}"],
        )

    valid_ids = {item.id for item in items}

    raw_threads = obj.get("threads")
    if not isinstance(raw_threads, list):
        # Missing, null, or not an array: there is nothing to iterate over.
        raw_threads = []

    threads: list[dict[str, Any]] = []
    dropped = 0
    for thread in raw_threads:
        if not isinstance(thread, dict):
            # e.g. the model emitted a bare string instead of a thread object.
            dropped += 1
            continue

        raw_ids = thread.get("item_ids")
        if not isinstance(raw_ids, list):
            # Covers a missing key, an explicit null, and scalar values.
            raw_ids = []
        # Nested lists/dicts are unhashable and would make the set-membership
        # test raise TypeError, so they are filtered out before the lookup.
        item_ids = [
            item_id
            for item_id in raw_ids
            if not isinstance(item_id, (list, dict)) and item_id in valid_ids
        ]
        if not item_ids:
            dropped += 1
            continue

        title = _clean_text(str(thread.get("title") or ""), limit=80)
        text = _clean_text(str(thread.get("text") or ""), limit=220)
        if not title or not text:
            dropped += 1
            continue

        kind = thread.get("kind")
        if kind not in ("thread", "uncertain"):
            kind = "thread"
        threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})

    theme = _clean_text(str(obj.get("theme") or ""), limit=120)
    guide = {"theme": theme, "threads": threads}
    report = {
        "input_count": len(items),
        "theme_present": bool(theme),
        "thread_count": len(threads),
        "dropped_thread_count": dropped,
        "fallback_used": False,
        "errors": [],
    }
    return guide, report