165 lines
6.4 KiB
Python
165 lines
6.4 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any, Callable
|
|
from urllib.error import HTTPError
|
|
|
|
from .classify import SECTION_ORDER
|
|
from .llm import parse_json_object
|
|
from .models import NewsItem
|
|
|
|
|
|
# Callable supplied by the caller to query the LLM: it is invoked with the
# prompt string and its return value is parsed as the JSON response.
RewriteLlmCall = Callable[[str], str]
|
|
|
|
|
|
def _chunks(items: list[NewsItem], size: int) -> list[list[NewsItem]]:
    """Split *items* into consecutive slices of at most *size* elements.

    Order is preserved and the last slice may be shorter than *size*. An empty
    input produces an empty list.
    """
    groups = []
    for start in range(0, len(items), size):
        groups.append(items[start : start + size])
    return groups
|
|
|
|
|
|
def _build_prompt(batch: list[NewsItem]) -> str:
    """Serialize one batch of news items into the JSON prompt sent to the LLM.

    The prompt is a single JSON object holding, in this order: the task
    instructions, the allowed section names (``SECTION_ORDER``), a short
    guidance text per section, the raw fields of every item in *batch*, and a
    template of the expected response. Non-ASCII characters are written
    verbatim rather than escaped.
    """
    task_text = (
        "For each AI news item, translate when needed, rewrite the title and summary into concise Chinese, "
        "and classify it into exactly one allowed section. Preserve brand/model/API names such as GPT-5, "
        "Codex, Gemini, Claude, API, MCP. Do not add facts."
    )

    guidance_by_section = {
        "模型与能力": "model releases, capability upgrades, modalities, context windows, inference, benchmarks tied to model ability",
        "产品与应用": "end-user products, apps, agents, workflows, product launches, practical business or consumer use cases",
        "开发与基础设施": "developer tools, APIs, SDKs, MCP, frameworks, deployment, chips, cloud, infra, open source engineering",
        "公司与资本": "company strategy, financing, IPO, acquisitions, partnerships, revenue, business competition",
        "政策与安全": "policy, regulation, safety, privacy, copyright, misuse, security incidents, governance",
        "论文与研究": "papers, academic research, arXiv, methods, experiments, datasets, evaluations",
        "观点与教程": "opinions, analysis, explainers, tutorials, guides, practices",
        "人物与动态": "people-focused interviews, speeches, career moves, public appearances",
    }

    # One record per item, carrying only the raw fields the model needs.
    item_records = []
    for news in batch:
        item_records.append(
            {
                "id": news.id,
                "title_raw": news.title_raw,
                "summary_raw": news.summary_raw,
                "source": news.source_label,
                "language_hint": news.language_hint,
                "source_section_hint": news.section_hint,
            }
        )

    # Example of a single response entry; the values are placeholders.
    rewrite_template = {
        "id": "item id",
        "title": "display title",
        "summary": "display summary",
        "section": "one allowed section",
        "confidence": 0.0,
        "flags": [],
    }

    prompt = {
        "task": task_text,
        "allowed_sections": SECTION_ORDER,
        "section_guidance": guidance_by_section,
        "items": item_records,
        "output_schema": {"rewrites": [rewrite_template]},
    }
    return json.dumps(prompt, ensure_ascii=False)
|
|
|
|
|
|
def _fallback(item: NewsItem) -> None:
    """Fill the display fields of *item* from its raw fields, in place.

    ``title`` is copied from ``title_raw``. ``summary`` is copied from
    ``summary_raw`` unless that value is falsy, in which case a fixed
    placeholder sentence is stored instead.
    """
    item.title = item.title_raw
    raw_summary = item.summary_raw
    item.summary = raw_summary if raw_summary else "该条目暂无摘要。"
|
|
|
|
|
|
def _is_transient_llm_error(exc: Exception) -> bool:
    """Report whether *exc* is a failure this module classifies as transient.

    True for any ``TimeoutError`` and for an ``HTTPError`` whose status code is
    429, 500, 502, 503 or 504; False for every other exception.
    """
    retryable_statuses = {429, 500, 502, 503, 504}
    return isinstance(exc, TimeoutError) or (
        isinstance(exc, HTTPError) and exc.code in retryable_statuses
    )
|
|
|
|
|
|
def _apply_rewrite_batch(batch: list[NewsItem], llm_call: RewriteLlmCall) -> tuple[int, int]:
    """Send one batch to the LLM and write the returned rewrites onto the items.

    The prompt built from *batch* is passed to *llm_call*, the response is
    parsed with ``parse_json_object`` and its ``"rewrites"`` list is applied
    entry by entry. An entry is applied only when its ``id`` matches an item of
    *batch* and both ``title`` and ``summary`` are non-empty after stripping;
    the item's ``section`` is replaced only when the returned value is one of
    ``SECTION_ORDER``. Entries that do not qualify are ignored, so some items
    may be left untouched. If an id occurs more than once, later entries
    overwrite earlier ones.

    Returns:
        ``(rewritten, sectioned)``: the number of distinct items that received
        a title and summary, and the number of distinct items that received a
        section.

    Raises:
        ValueError: if ``"rewrites"`` is not a list, or one of its entries is
            not a JSON object.
        Exception: whatever *llm_call* or ``parse_json_object`` raises
            propagates unchanged.
    """
    response = parse_json_object(llm_call(_build_prompt(batch)))
    rewrites = response.get("rewrites", [])
    if not isinstance(rewrites, list):
        raise ValueError("rewrites is not a list")

    by_id = {item.id: item for item in batch}
    rewritten_ids = set()
    sectioned_ids = set()

    for entry in rewrites:
        if not isinstance(entry, dict):
            # Report malformed output explicitly, in line with the list check
            # above, instead of failing with an incidental AttributeError.
            raise ValueError("rewrite entry is not an object")

        item_id = entry.get("id")
        title = str(entry.get("title") or "").strip()
        summary = str(entry.get("summary") or "").strip()
        if item_id not in by_id or not title or not summary:
            continue

        target = by_id[item_id]
        target.title = title
        target.summary = summary
        rewritten_ids.add(item_id)

        section = str(entry.get("section") or "").strip()
        if section in SECTION_ORDER:
            target.section = section
            # Track ids rather than entries so a repeated id is counted once,
            # keeping this figure comparable to the rewritten-item count.
            sectioned_ids.add(item_id)

    return len(rewritten_ids), len(sectioned_ids)
|
|
|
|
|
|
def _fallback_if_missing(item: NewsItem, errors: list[str]) -> bool:
    """Apply the raw-text fallback when *item* still has no title or summary.

    Appends a ``missing_rewrite_for_item`` entry to *errors* and returns True
    when the fallback was applied; returns False when nothing was done.
    """
    if item.title is not None and item.summary is not None:
        return False
    errors.append(f"missing_rewrite_for_item: {item.id}")
    _fallback(item)
    return True


def rewrite_items(
    items: list[NewsItem],
    *,
    llm_call: RewriteLlmCall,
    batch_size: int = 30,
    max_fallback_ratio: float = 0.2,
    retry_single_items: bool = False,
) -> tuple[list[NewsItem], dict[str, Any]]:
    """Rewrite all items through the LLM in batches and report the outcome.

    Items are mutated in place. A batch whose call succeeds has its rewrites
    applied, and any item still lacking a title or summary afterwards gets the
    raw-text fallback. A batch whose call raises is handled as follows:

    * if the error is classified as transient, or *retry_single_items* is
      false, every item of the batch gets the fallback;
    * otherwise each item is retried on its own, and gets the fallback when
      its retry raises or yields no usable rewrite.

    Args:
        items: The news items to rewrite.
        llm_call: Callable that takes the prompt and returns the raw response.
        batch_size: Items per LLM call; values below 1 are treated as 1.
        max_fallback_ratio: Share of fallen-back items above which the report
            is marked as failing the quality gate.
        retry_single_items: Whether a batch that failed with a non-transient
            error is retried one item at a time.

    Returns:
        ``(items, report)``. *items* is the input list itself. *report* holds
        ``input_count``, ``rewritten_count``, ``llm_section_count``,
        ``fallback_count``, ``missing_rewrite_count``, ``fallback_ratio``
        (rounded to 4 decimals), ``batch_count``, ``errors``,
        ``blocking_errors`` and ``quality_gate_failed``.
    """
    rewritten_count = 0
    llm_section_count = 0
    fallback_count = 0
    missing_rewrite_count = 0
    errors: list[str] = []

    # Computed once and reused for the batch_count figure in the report.
    batches = _chunks(items, max(1, batch_size))

    for batch in batches:
        try:
            batch_rewritten, batch_sectioned = _apply_rewrite_batch(batch, llm_call)
            rewritten_count += batch_rewritten
            llm_section_count += batch_sectioned
            for item in batch:
                if _fallback_if_missing(item, errors):
                    fallback_count += 1
                    missing_rewrite_count += 1
        except Exception as exc:
            # Best-effort by design: a failed batch degrades to raw text
            # instead of aborting the whole run.
            errors.append(f"batch:{type(exc).__name__}: {exc}")

            if _is_transient_llm_error(exc) or not retry_single_items:
                for item in batch:
                    _fallback(item)
                    fallback_count += 1
                continue

            for item in batch:
                try:
                    item_rewritten, item_sectioned = _apply_rewrite_batch([item], llm_call)
                except Exception as item_exc:
                    errors.append(f"item:{item.id}:{type(item_exc).__name__}: {item_exc}")
                    _fallback(item)
                    fallback_count += 1
                else:
                    rewritten_count += item_rewritten
                    llm_section_count += item_sectioned
                    # A retry can succeed without returning a usable entry;
                    # run the same missing-rewrite check as the batch path so
                    # the item is not left with title/summary set to None.
                    if _fallback_if_missing(item, errors):
                        fallback_count += 1
                        missing_rewrite_count += 1

    fallback_ratio = fallback_count / len(items) if items else 0
    blocking_errors: list[str] = []
    if fallback_ratio > max_fallback_ratio:
        blocking_errors.append("rewrite_fallback_ratio_exceeded")

    report = {
        "input_count": len(items),
        "rewritten_count": rewritten_count,
        "llm_section_count": llm_section_count,
        "fallback_count": fallback_count,
        "missing_rewrite_count": missing_rewrite_count,
        "fallback_ratio": round(fallback_ratio, 4),
        "batch_count": len(batches),
        "errors": errors,
        "blocking_errors": blocking_errors,
        "quality_gate_failed": bool(blocking_errors),
    }
    return items, report
|