# ai-daily-report/ai_daily_report/guide.py

from __future__ import annotations

import json
import re
from typing import Any, Callable

from .llm import parse_json_object
from .models import NewsItem


# Shape of the injected LLM client: it receives the prompt string and returns
# the model's raw text reply.
GuideLlmCall = Callable[[str], str]


def _clean_text(text: str, limit: int | None = None) -> str:
    """Normalize a free-text field into a single tidy line.

    Steps, in order: drop one leading ``>`` marker (with surrounding
    whitespace), remove ``[<digits>]`` and ``[N]`` markers, collapse every
    whitespace run to a single space, and trim both ends.

    Args:
        text: Raw text; a falsy value (``None``, ``""``) is treated as empty.
        limit: Maximum length of the result. A falsy value (``None`` or ``0``)
            disables truncation.

    Returns:
        The cleaned string, cut to ``limit`` characters and right-trimmed
        when it would otherwise be longer.
    """
    unquoted = re.sub(r"^\s*>\s*", "", text or "").strip()
    without_markers = re.sub(r"\[\d+\]|\[N\]", "", unquoted)
    collapsed = re.sub(r"\s+", " ", without_markers).strip()

    if not limit or len(collapsed) <= limit:
        return collapsed
    # Truncation may land right after a space, so trim the tail again.
    return collapsed[:limit].rstrip()


def _build_prompt(items: list[NewsItem]) -> str:
    """Serialize the guide-generation request as a JSON prompt string.

    The payload has three top-level keys, in this order: ``task`` (the
    instruction text), ``items`` (one entry per news item) and
    ``output_schema`` (a description of the JSON shape requested back).

    Args:
        items: News items to summarize. For each one the edited ``title`` /
            ``summary`` is used, falling back to ``title_raw`` /
            ``summary_raw`` when the edited value is falsy.

    Returns:
        The JSON-encoded payload; non-ASCII characters are kept verbatim
        rather than escaped.
    """
    instructions = (
        "Generate a concise Chinese AI daily report guide. Return JSON only. "
        "Do not use 强信号/中信号/待验证. Do not add facts. "
        "Write one opening intro, a short theme, 2-4 daily threads, and one closing conclusion. "
        "Every thread must reference existing item_ids."
    )

    entries = []
    for news in items:
        entries.append(
            {
                "id": news.id,
                "title": news.title or news.title_raw,
                "summary": news.summary or news.summary_raw,
                "section": news.section,
                "source": news.source_label,
            }
        )

    thread_shape = {
        "title": "thread title",
        "text": "one or two sentences",
        "item_ids": ["existing item id"],
        "kind": "thread|uncertain",
    }
    schema = {
        "intro": "one opening paragraph under 160 Chinese characters",
        "theme": "one sentence under 120 Chinese characters",
        "threads": [thread_shape],
        "conclusion": "one closing paragraph under 180 Chinese characters",
    }

    request = {"task": instructions, "items": entries, "output_schema": schema}
    return json.dumps(request, ensure_ascii=False)


def _empty_guide() -> dict[str, Any]:
    """Return a new guide with blank text fields and no threads.

    A fresh dict (and a fresh ``threads`` list) is created on every call, so
    callers may mutate the result freely.
    """
    blank: dict[str, Any] = dict(intro="", theme="", threads=[], conclusion="")
    return blank


def _guide_report(
    input_count: int,
    *,
    intro_present: bool = False,
    theme_present: bool = False,
    conclusion_present: bool = False,
    thread_count: int = 0,
    dropped_thread_count: int = 0,
    fallback_used: bool = False,
    errors: list[str] | None = None,
) -> dict[str, Any]:
    """Build the diagnostics dict that accompanies every generated guide."""
    return {
        "input_count": input_count,
        "intro_present": intro_present,
        "theme_present": theme_present,
        "conclusion_present": conclusion_present,
        "thread_count": thread_count,
        "dropped_thread_count": dropped_thread_count,
        "fallback_used": fallback_used,
        "errors": list(errors or []),
    }


def generate_guide(
    items: list[NewsItem],
    *,
    llm_call: GuideLlmCall,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Ask the LLM for a report guide and sanitize whatever it returns.

    Args:
        items: News items the guide should cover. An empty list short-circuits
            without calling the LLM.
        llm_call: Callable that receives the prompt string and returns the
            model's raw reply.

    Returns:
        A ``(guide, report)`` pair. ``guide`` has the keys ``intro``,
        ``theme``, ``threads`` and ``conclusion``; each kept thread has
        ``title``, ``text``, ``item_ids`` (only ids present in ``items``) and
        ``kind`` (``"thread"`` or ``"uncertain"``). ``report`` carries counts
        and flags describing the run: ``fallback_used`` is True and ``errors``
        is non-empty when the LLM call or parsing failed, in which case the
        guide is empty.

    Raises:
        Nothing from the LLM call or from the shape of its reply: call/parse
        failures are caught and reported via ``report["errors"]``, and
        structurally malformed threads are dropped and counted.
    """
    if not items:
        return _empty_guide(), _guide_report(0)

    try:
        obj = parse_json_object(llm_call(_build_prompt(items)))
    except Exception as exc:
        # Deliberate boundary catch: any LLM or parsing failure degrades to an
        # empty guide instead of aborting the report.
        return _empty_guide(), _guide_report(
            len(items),
            fallback_used=True,
            errors=[f"{type(exc).__name__}: {exc}"],
        )

    if not isinstance(obj, dict):
        # The code below relies on ``.get``; a non-object reply is unusable,
        # so treat it like a parse failure rather than crashing.
        return _empty_guide(), _guide_report(
            len(items),
            fallback_used=True,
            errors=[f"TypeError: expected a JSON object, got {type(obj).__name__}"],
        )

    valid_ids = {item.id for item in items}

    raw_threads = obj.get("threads")
    if not isinstance(raw_threads, list):
        # Missing, null, or wrongly-typed "threads" means there is nothing to iterate.
        raw_threads = []

    threads: list[dict[str, Any]] = []
    dropped = 0
    for thread in raw_threads:
        if not isinstance(thread, dict):
            dropped += 1
            continue

        raw_ids = thread.get("item_ids")
        if not isinstance(raw_ids, list):
            raw_ids = []
        # Lists and dicts are the only unhashable JSON values; skip them so
        # the set membership test cannot raise TypeError.
        item_ids = [
            item_id
            for item_id in raw_ids
            if not isinstance(item_id, (list, dict)) and item_id in valid_ids
        ]
        if not item_ids:
            dropped += 1
            continue

        title = _clean_text(str(thread.get("title") or ""), limit=80)
        text = _clean_text(str(thread.get("text") or ""), limit=220)
        if not title or not text:
            dropped += 1
            continue

        kind = thread.get("kind")
        if kind not in ("thread", "uncertain"):
            kind = "thread"
        threads.append({"title": title, "text": text, "item_ids": item_ids, "kind": kind})

    intro = _clean_text(str(obj.get("intro") or ""), limit=160)
    theme = _clean_text(str(obj.get("theme") or ""), limit=120)
    conclusion = _clean_text(str(obj.get("conclusion") or ""), limit=180)

    guide = {"intro": intro, "theme": theme, "threads": threads, "conclusion": conclusion}
    report = _guide_report(
        len(items),
        intro_present=bool(intro),
        theme_present=bool(theme),
        conclusion_present=bool(conclusion),
        thread_count=len(threads),
        dropped_thread_count=dropped,
    )
    return guide, report