Improve LLM rewrite classification pipeline

2026-06-04 17:12:59 +08:00
parent dd12755ff1
commit 22cdd71a08
9 changed files with 100 additions and 16 deletions
--- a/ai_daily_report/rewrite.py
+++ b/ai_daily_report/rewrite.py
@@ -4,6 +4,7 @@ import json
 from typing import Any, Callable
 from urllib.error import HTTPError

+from .classify import SECTION_ORDER
 from .llm import parse_json_object
 from .models import NewsItem

@@ -18,9 +19,21 @@ def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
 def _build_prompt(batch: list[NewsItem]) -> str:
    payload = {
        "task": (
-            "Rewrite AI news titles and summaries into concise Chinese. Preserve brand/model/API names "
-            "such as GPT-5, Codex, Gemini, Claude, API, MCP. Do not add facts."
+            "For each AI news item, translate when needed, rewrite the title and summary into concise Chinese, "
+            "and classify it into exactly one allowed section. Preserve brand/model/API names such as GPT-5, "
+            "Codex, Gemini, Claude, API, MCP. Do not add facts."
        ),
+        "allowed_sections": SECTION_ORDER,
+        "section_guidance": {
+            "模型与能力": "model releases, capability upgrades, modalities, context windows, inference, benchmarks tied to model ability",
+            "产品与应用": "end-user products, apps, agents, workflows, product launches, practical business or consumer use cases",
+            "开发与基础设施": "developer tools, APIs, SDKs, MCP, frameworks, deployment, chips, cloud, infra, open source engineering",
+            "公司与资本": "company strategy, financing, IPO, acquisitions, partnerships, revenue, business competition",
+            "政策与安全": "policy, regulation, safety, privacy, copyright, misuse, security incidents, governance",
+            "论文与研究": "papers, academic research, arXiv, methods, experiments, datasets, evaluations",
+            "观点与教程": "opinions, analysis, explainers, tutorials, guides, practices",
+            "人物与动态": "people-focused interviews, speeches, career moves, public appearances",
+        },
        "items": [
            {
                "id": item.id,
@@ -28,6 +41,7 @@ def _build_prompt(batch: list[NewsItem]) -> str:
                "summary_raw": item.summary_raw,
                "source": item.source_label,
                "language_hint": item.language_hint,
+                "source_section_hint": item.section_hint,
            }
            for item in batch
        ],
@@ -37,6 +51,8 @@ def _build_prompt(batch: list[NewsItem]) -> str:
                    "id": "item id",
                    "title": "display title",
                    "summary": "display summary",
+                    "section": "one allowed section",
+                    "confidence": 0.0,
                    "flags": [],
                }
            ]
@@ -58,13 +74,14 @@ def _is_transient_llm_error(exc: Exception) -> bool:
    return False


-def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int:
+def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> tuple[int, int]:
    obj = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = obj.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")
    by_id = {item.id: item for item in batch}
    seen_ids: set[str] = set()
+    section_count = 0
    for entry in rewrites:
        item_id = entry.get("id")
        title = str(entry.get("title") or "").strip()
@@ -72,8 +89,12 @@ def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> int
        if item_id in by_id and title and summary:
            by_id[item_id].title = title
            by_id[item_id].summary = summary
+            section = str(entry.get("section") or "").strip()
+            if section in SECTION_ORDER:
+                by_id[item_id].section = section
+                section_count += 1
            seen_ids.add(item_id)
-    return len(seen_ids)
+    return len(seen_ids), section_count


 def rewrite_items(
@@ -85,14 +106,16 @@ def rewrite_items(
    retry_single_items: bool = False,
 ) -> tuple[list[NewsItem], dict[str, Any]]:
    rewritten_count = 0
+    llm_section_count = 0
    fallback_count = 0
    missing_rewrite_count = 0
    errors: list[str] = []

    for batch in _chunks(items, max(1, batch_size)):
        try:
-            batch_rewritten_count = _apply_rewrite_batch(batch, llm_call)
+            batch_rewritten_count, batch_section_count = _apply_rewrite_batch(batch, llm_call)
            rewritten_count += batch_rewritten_count
+            llm_section_count += batch_section_count
            for item in batch:
                if item.title is None or item.summary is None:
                    errors.append(f"missing_rewrite_for_item: {item.id}")
@@ -113,7 +136,9 @@ def rewrite_items(
                continue
            for item in batch:
                try:
-                    rewritten_count += _apply_rewrite_batch([item], llm_call)
+                    item_rewritten_count, item_section_count = _apply_rewrite_batch([item], llm_call)
+                    rewritten_count += item_rewritten_count
+                    llm_section_count += item_section_count
                except Exception as item_exc:
                    errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                    _fallback(item)
@@ -127,6 +152,7 @@ def rewrite_items(
    report = {
        "input_count": len(items),
        "rewritten_count": rewritten_count,
+        "llm_section_count": llm_section_count,
        "fallback_count": fallback_count,
        "missing_rewrite_count": missing_rewrite_count,
        "fallback_ratio": round(fallback_ratio, 4),