Improve AI daily report operations and dedupe observability

2026-06-10 21:55:29 +08:00
parent b46cef2c7b
commit 2159ee733b
23 changed files with 761 additions and 57 deletions
--- a/ai_daily_report/audit.py
+++ b/ai_daily_report/audit.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def load_run_report(path: Path) -> dict[str, Any] | None:
+    """Load a run report as a dict; return None if it is missing, unreadable, or not a JSON object."""
+    # A directory argument is resolved to the run_report.json file inside it.
+    report_path = path / "run_report.json" if path.is_dir() else path
+    try:
+        value = json.loads(report_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError, RecursionError):  # unreadable file, bad UTF-8/JSON, or absurdly deep nesting
+        return None
+    return value if isinstance(value, dict) else None
+
+
+def summarize_reports(out_dir: Path, *, limit_days: int = 7) -> dict[str, Any]:
+    """Aggregate the newest run reports under out_dir into per-run rows plus overall totals.
+
+    Only the limit_days newest subdirectories (sorted by path, descending) are inspected, and
+    those without a loadable report are skipped. A missing out_dir yields an empty summary.
+    """
+    counters = (
+        "source_failures", "duplicate_candidates", "final_items",
+        "fallback_items", "quality_warnings", "quality_blocks",
+    )
+    totals: dict[str, Any] = dict.fromkeys(counters, 0)
+    rows: list[dict[str, Any]] = []
+    # A missing output directory means there is nothing to audit; report zero runs instead of raising.
+    candidates = [path for path in out_dir.iterdir() if path.is_dir()] if out_dir.is_dir() else []
+    # Clamp the window: a negative limit would otherwise slice off only the oldest directory.
+    newest = sorted(candidates, reverse=True)[: max(limit_days, 0)]
+    for run_dir in reversed(newest):  # rows are listed oldest first
+        report = load_run_report(run_dir)
+        if not report:
+            continue
+        quality_gate = report.get("quality_gate", {}) or {}
+        stage2_8 = report.get("stage2_8", {}) or {}
+        stage4 = report.get("stage4", {}) or {}
+        stage5 = report.get("stage5", {}) or {}
+        stage8 = report.get("stage8", {}) or {}
+        # Either key may carry the fallback count; fallback_count wins when both are present.
+        fallback_count = int(stage4.get("fallback_count", stage4.get("fallback_item_count", 0)) or 0)
+        # stage5's output_count is preferred; stage4's is used only when stage5 has none.
+        final_count = int(stage5.get("output_count", stage4.get("output_count", 0)) or 0)
+        row = {
+            "date": run_dir.name,
+            "source_failures": len(quality_gate.get("source_failures", []) or []),
+            "duplicate_candidates": int(stage2_8.get("candidate_group_count", 0) or 0),
+            "final_items": final_count,
+            "fallback_items": fallback_count,
+            "fallback_ratio": round(fallback_count / final_count, 4) if final_count else 0,
+            "quality_warnings": len(quality_gate.get("warnings", []) or []),
+            "quality_blocks": len(quality_gate.get("blocking_errors", []) or []),
+            "publish_status": stage8.get("status"),
+            "publish_slug": stage8.get("slug"),
+        }
+        rows.append(row)
+        for key in counters:
+            totals[key] += row[key]
+    # The overall ratio is computed from the summed counts, not averaged over the rows.
+    totals["fallback_ratio"] = round(totals["fallback_items"] / totals["final_items"], 4) if totals["final_items"] else 0
+    return {"run_count": len(rows), "totals": totals, "runs": rows}
+
+
+def render_markdown(summary: dict[str, Any]) -> str:
+    """Render an audit summary as Markdown: headline totals followed by one table row per run."""
+    agg = summary.get("totals", {})
+    out = [
+        "# AI日报每周自动审计报告",
+        "",
+        f"- 覆盖运行数：{summary.get('run_count', 0)}",
+        f"- 源失败次数：{agg.get('source_failures', 0)}",
+        f"- 重复候选数：{agg.get('duplicate_candidates', 0)}",
+        f"- 最终条数：{agg.get('final_items', 0)}",
+        f"- fallback ratio：{agg.get('fallback_ratio', 0)}",
+        f"- 质量门禁 warning/block：{agg.get('quality_warnings', 0)}/{agg.get('quality_blocks', 0)}",
+        "",
+        "| 日期 | 源失败 | 重复候选 | 最终条数 | fallback | warning | block | 发布 | slug |",
+        "|---|---:|---:|---:|---:|---:|---:|---|---|",
+    ]
+    numeric = ("source_failures", "duplicate_candidates", "final_items", "fallback_ratio", "quality_warnings", "quality_blocks")
+    for run in summary.get("runs", []) or []:
+        cells = [str(run[name]) for name in ("date", *numeric)]  # these keys are mandatory in every row
+        cells += [str(run.get("publish_status") or ""), str(run.get("publish_slug") or "")]
+        out.append("| " + " | ".join(cells) + " |")
+    return "\n".join(out) + "\n"
--- a/ai_daily_report/cli.py
+++ b/ai_daily_report/cli.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import argparse
 from pathlib import Path

+from .audit import render_markdown, summarize_reports
 from .runner import run_daily_report


@@ -19,6 +20,9 @@ def build_parser() -> argparse.ArgumentParser:
    run.add_argument("--sources-path", default=None)
    run.add_argument("--pipeline-path", default=None)
    run.add_argument("--history-path", default=None)
+    audit = subcommands.add_parser("audit")
+    audit.add_argument("--out-dir", default=str(Path.home() / ".hermes" / "scripts" / "ai_morning_out"))
+    audit.add_argument("--limit-days", type=int, default=7)
    return parser


@@ -37,6 +41,8 @@ def main(argv: list[str] | None = None) -> int:
            pipeline_path=Path(args.pipeline_path) if args.pipeline_path else None,
            history_path=Path(args.history_path) if args.history_path else None,
        )
+    elif args.command == "audit":
+        print(render_markdown(summarize_reports(Path(args.out_dir), limit_days=args.limit_days)))
    return 0


--- a/ai_daily_report/clients.py
+++ b/ai_daily_report/clients.py
@@ -5,6 +5,7 @@ import socket
 import time
 from dataclasses import dataclass
 from urllib.error import HTTPError, URLError
+from urllib.parse import urlencode
 import urllib.request
 from typing import Any

@@ -115,17 +116,49 @@ class BlogApiClient:
    def create_post(self, payload: dict[str, Any]) -> dict[str, Any]:
        return self._request("POST", "/api/service/posts", payload)

-    def get_post_by_slug(self, slug: str) -> dict[str, Any] | None:
+    def _normalize_post_response(self, value: Any, slug: str) -> dict[str, Any] | None:
+        """Extract the post dict for slug from a response that is bare, wrapped in post/data, or a list; else None."""
+        listing: Any = value
+        if isinstance(value, dict):
+            wrapped = next((value[key] for key in ("post", "data") if isinstance(value.get(key), dict)), None)
+            if wrapped is None and isinstance(value.get("items"), list):
+                listing = value["items"]
+            else:
+                record = value if wrapped is None else wrapped
+                # A slug match is not required here: a dict with id, content or markdown is accepted as well.
+                accepted = record.get("slug") == slug or record.get("id") or record.get("content") or record.get("markdown")
+                return record if accepted else None
+        # Lists, bare or under "items", only ever yield an entry whose slug matches exactly.
+        if isinstance(listing, list):
+            for entry in listing:
+                if isinstance(entry, dict) and entry.get("slug") == slug:
+                    return entry
+        return None
+
+    def _request_optional(self, method: str, path: str, payload: dict[str, Any] | None = None) -> dict[str, Any] | list[Any] | None:  # _request variant that returns None on HTTP 403/404
        try:
-            return self._request("GET", f"/api/service/posts/{slug}")
+            return self._request(method, path, payload)  # same call as _request; only the 403/404 outcome differs
        except HTTPError as exc:
-            if exc.code == 404:
+            if exc.code in {403, 404}:  # forbidden is handled like not-found: the caller just gets None
                return None
            raise
        except FetchTextError as exc:
-            if exc.error_type == "http_404":
+            if exc.error_type in {"http_403", "http_404"}:  # same rule when the failure arrives as FetchTextError
                return None
            raise

+    def get_post_by_slug(self, slug: str) -> dict[str, Any] | None:
+        """Probe each known lookup endpoint in order and return the first post matching slug, else None."""
+        endpoints = (
+            f"/api/service/posts/{slug}",
+            "/api/service/posts?" + urlencode({"slug": slug}),
+            f"/api/service/posts/slug/{slug}",
+        )
+        for endpoint in endpoints:
+            found = self._normalize_post_response(self._request_optional("GET", endpoint), slug)
+            if found is not None:
+                return found
+        return None
+
    def publish_post(self, slug: str) -> None:
        self._request("POST", f"/api/service/posts/{slug}/publish")
--- a/ai_daily_report/collect.py
+++ b/ai_daily_report/collect.py
@@ -35,6 +35,7 @@ def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> Sourc
            ok=False,
            status="disabled",
            fetched_at=fetched_at,
+            error=f"failure_policy={config.failure_policy}; min_items={config.min_items}",
        )

    started = perf_counter()
@@ -42,12 +43,15 @@ def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> Sourc
        items = fetcher(config, run_date)
        elapsed_ms = int((perf_counter() - started) * 1000)
        status = "ok" if items else "empty"
+        if status == "ok" and config.min_items and len(items) < config.min_items:
+            status = "below_min_items"
        return SourceResult(
            source=config.name,
            role=config.role,
            ok=status == "ok",
            status=status,
            items=items,
+            error=None if status == "ok" else f"items={len(items)}; min_items={config.min_items}; failure_policy={config.failure_policy}",
            elapsed_ms=elapsed_ms,
            fetched_at=fetched_at,
        )
@@ -58,7 +62,7 @@ def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> Sourc
            role=config.role,
            ok=False,
            status=_status_from_exception(exc),
-            error=f"{type(exc).__name__}: {exc}",
+            error=f"{type(exc).__name__}: {exc}; failure_policy={config.failure_policy}; min_items={config.min_items}",
            elapsed_ms=elapsed_ms,
            retry_count=_retry_count_from_exception(exc),
            fetched_at=fetched_at,
--- a/ai_daily_report/models.py
+++ b/ai_daily_report/models.py
@@ -15,6 +15,7 @@ class SourceConfig:
    min_items: int = 0
    url: str = ""
    max_item_age_days: int | None = None
+    failure_policy: str = "warn"


@dataclass
--- a/ai_daily_report/observability.py
+++ b/ai_daily_report/observability.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import hashlib
+from dataclasses import dataclass, field
+from typing import Any, Callable
+
+
+def sha256_text(value: str) -> str:
+    return hashlib.sha256((value if value else "").encode("utf-8")).hexdigest()  # falsy input is hashed as ""
+
+
+def truncate_text(value: str, limit: int = 500) -> str:
+    """Cut value to at most limit characters and append a note with the number of characters dropped."""
+    text, limit = value or "", max(limit, 0)  # clamp: a negative limit would slice from the end instead
+    dropped = len(text) - limit
+    return text if dropped <= 0 else f"{text[:limit]}…[truncated {dropped} chars]"
+
+
+@dataclass
+class LlmCallObserver:
+    """Callable wrapper that forwards a prompt to call and records hashes, sizes and previews per call."""
+
+    call: Callable[[str], str]
+    stage: str
+    records: list[dict[str, Any]] = field(default_factory=list)
+    prompt_preview_chars: int = 500
+    response_preview_chars: int = 500
+
+    def __call__(self, prompt: str) -> str:
+        """Invoke the wrapped callable, append one record describing the exchange, and return the response."""
+        # The wrapped call runs first, so a call that raises leaves no record behind.
+        reply = self.call(prompt)
+        entry: dict[str, Any] = {"stage": self.stage, "call_index": len(self.records) + 1}
+        entry["prompt_hash"] = sha256_text(prompt)
+        entry["response_hash"] = sha256_text(reply)
+        entry["prompt_chars"] = len(prompt or "")
+        entry["response_chars"] = len(reply or "")
+        entry["prompt_preview"] = truncate_text(prompt, self.prompt_preview_chars)
+        entry["response_preview"] = truncate_text(reply, self.response_preview_chars)
+        self.records.append(entry)
+        return reply
+
+
+def summarize_observed_calls(observers: list[LlmCallObserver]) -> dict[str, Any]:
+    """Merge the records of all observers, in the order given, and count calls per stage."""
+    merged: list[dict[str, Any]] = []
+    per_stage: dict[str, int] = {}
+    for obs in observers:
+        merged += obs.records
+        # Several observers may share one stage name, so their counts accumulate under it.
+        per_stage.setdefault(obs.stage, 0)
+        per_stage[obs.stage] += len(obs.records)
+    # total_calls is simply the number of merged records across every observer.
+    return {"total_calls": len(merged), "by_stage": per_stage, "records": merged}
--- a/ai_daily_report/pipeline.py
+++ b/ai_daily_report/pipeline.py
@@ -30,6 +30,7 @@ def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
        min_items=int(value.get("min_items", 0)),
        url=value.get("url", ""),
        max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
+        failure_policy=str(value.get("failure_policy") or ("block" if bool(value.get("required", False)) else "warn")),
    )


@@ -347,19 +348,26 @@ def run_stage0_to_stage8(
        quality_gate_config=quality_gate_config,
    )
    slug = f"ai-{run_date}"
+    effective_mode = mode
+    quality_gate_report = stage7_result["reports"].get("quality_gate", {}) or {}
+    required_policy = str(quality_gate_report.get("required_source_failure_policy") or "block")
+    if quality_gate_report.get("required_source_failures") and required_policy in {"draft", "dry_run"}:
+        effective_mode = "dry-run" if required_policy == "dry_run" else "draft"
+
    publish_result = publish_markdown(
        title=f"AI日报 · {run_date}",
        markdown=stage7_result["markdown"],
        tags=["AI日报", "AI资讯", "人工智能"],
        slug=slug,
        base_url=base_url,
-        mode=mode,
+        mode=effective_mode,
        markdown_report=stage7_result["reports"]["stage7"],
        client=client,
        idempotency_config=publish_idempotency_config,
    )
    reports = dict(stage7_result["reports"])
    reports["stage8"] = {
+        "requested_mode": mode,
        "mode": publish_result.mode,
        "status": publish_result.status,
        "slug": publish_result.slug,
--- a/ai_daily_report/quality_gate.py
+++ b/ai_daily_report/quality_gate.py
@@ -8,6 +8,7 @@ from .models import NewsItem, SourceResult


 DEFAULT_CONFIG = {
+    "required_source_failure_policy": "block",  # block | draft | dry_run | warn
    "block_on_required_source_failure": True,
    "warn_on_enabled_source_failure": True,
    "warn_when_stage3_candidates_zero_min_items": 30,
@@ -73,10 +74,14 @@ def evaluate_quality_gate(
            warnings.append(f"enabled_source_failed:{failure['source']}:{failure['status']}")

    required_sources = set(config.get("required_sources") or [])
-    if bool(config["block_on_required_source_failure"]):
-        for failure in failures:
-            if failure["source"] in required_sources:
-                blocking_errors.append(f"required_source_failed:{failure['source']}:{failure['status']}")
+    required_failures = [failure for failure in failures if failure["source"] in required_sources]
+    policy = str(config.get("required_source_failure_policy") or "block")
+    if bool(config["block_on_required_source_failure"]) and policy == "block":
+        for failure in required_failures:
+            blocking_errors.append(f"required_source_failed:{failure['source']}:{failure['status']}")
+    elif required_failures:
+        for failure in required_failures:
+            warnings.append(f"required_source_failed:{failure['source']}:{failure['status']}:{policy}")

    title_threshold = float(config["warn_on_final_title_similarity"])
    if title_threshold > 0:
@@ -87,5 +92,7 @@ def evaluate_quality_gate(
        "warnings": warnings,
        "blocking_errors": blocking_errors,
        "source_failures": failures,
+        "required_source_failures": required_failures,
+        "required_source_failure_policy": policy,
        "quality_gate_failed": bool(blocking_errors),
    }
--- a/ai_daily_report/runner.py
+++ b/ai_daily_report/runner.py
@@ -9,6 +9,7 @@ from .clients import BlogApiClient, OpenAICompatibleClient, fetch_text as defaul
 from .config import load_pipeline_config, load_source_configs
 from .env import load_env, resolve_blog_token, resolve_llm_config
 from .models import SourceConfig
+from .observability import LlmCallObserver, summarize_observed_calls
 from .pipeline import run_stage0_to_stage8
 from .publish import load_published_urls, update_published_urls
 from .sources.registry import get_source_fetcher
@@ -135,15 +136,33 @@ def run_daily_report(
    else:
        raise ValueError("source_mode must be 'mock' or 'live'")

+    llm_observability_config = pipeline_config.get("llm_observability", {}) or {}
+    llm_observers: list[LlmCallObserver] = []
+    observe_llm = bool(llm_observability_config.get("enabled", True))
+    prompt_preview_chars = int(llm_observability_config.get("prompt_preview_chars", 500))
+    response_preview_chars = int(llm_observability_config.get("response_preview_chars", 500))
+
+    def maybe_observe(stage: str, call):
+        """Return call wrapped in a registered LlmCallObserver, or call itself when observation is disabled."""
+        if observe_llm:
+            wrapped = LlmCallObserver(
+                call, stage,
+                prompt_preview_chars=prompt_preview_chars,
+                response_preview_chars=response_preview_chars,
+            )
+            llm_observers.append(wrapped)  # kept so every observed call can be summarized after the run
+            return wrapped
+        return call
+
    if llm_mode == "mock":
-        semantic_llm_call = _mock_semantic_llm
-        rewrite_llm_call = _mock_rewrite_llm
-        guide_llm_call = _mock_guide_llm
+        semantic_llm_call = maybe_observe("stage3", _mock_semantic_llm)
+        rewrite_llm_call = maybe_observe("stage4", _mock_rewrite_llm)
+        guide_llm_call = maybe_observe("stage6", _mock_guide_llm)
    elif llm_mode == "live":
        llm_client = llm_client_factory(**resolve_llm_config(env))
-        semantic_llm_call = llm_client.chat
-        rewrite_llm_call = llm_client.chat
-        guide_llm_call = llm_client.chat
+        semantic_llm_call = maybe_observe("stage3", llm_client.chat)
+        rewrite_llm_call = maybe_observe("stage4", llm_client.chat)
+        guide_llm_call = maybe_observe("stage6", llm_client.chat)
    else:
        raise ValueError("llm_mode must be 'mock' or 'live'")

@@ -182,6 +201,9 @@ def run_daily_report(
            max_age_days=cross_day_max_age_days,
        )

+    llm_observability_report = summarize_observed_calls(llm_observers)
+    result["reports"]["llm_observability"] = llm_observability_report
+
    run_dir = out_dir / run_date
    run_dir.mkdir(parents=True, exist_ok=True)
    (run_dir / "blog_markdown.md").write_text(result["markdown"], encoding="utf-8")
--- a/ai_daily_report/semantic_dedupe.py
+++ b/ai_daily_report/semantic_dedupe.py
@@ -25,6 +25,11 @@ def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> st
        "task": "Identify only high-confidence semantic duplicates. Do not curate or remove by importance.",
        "items": item_payload,
        "candidates": candidates,
+        "dedupe_policy": [
+            "Use duplicate_groups only when items are substantially the same article/event and one can be removed.",
+            "Use merge_groups when items cover the same concrete event from different angles; keep the best item and attach the others as supplementary sources instead of dropping the event context.",
+            "Do not curate by importance. Do not merge unrelated follow-ups just because they mention the same company/model.",
+        ],
        "output_schema": {
            "duplicate_groups": [
                {
@@ -34,6 +39,14 @@ def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> st
                    "reason": "same concrete event reason",
                }
            ],
+            "merge_groups": [
+                {
+                    "keep_id": "item id",
+                    "merge_ids": ["item id"],
+                    "confidence": "high|medium|low",
+                    "reason": "same event, complementary angle/source",
+                }
+            ],
            "not_duplicates": [],
            "uncertain": [],
        },
@@ -75,6 +88,7 @@ def semantic_dedup_items(
            "candidate_group_count": len(candidates),
            "removed_count": 0,
            "duplicate_groups": [],
+            "merge_groups": [],
            "uncertain": [],
            "errors": [],
            "skipped_for_deletion_ratio": False,
@@ -89,6 +103,7 @@ def semantic_dedup_items(
            "candidate_group_count": len(candidates),
            "removed_count": 0,
            "duplicate_groups": [],
+            "merge_groups": [],
            "uncertain": [],
            "errors": [f"{type(exc).__name__}: {exc}"],
            "skipped_for_deletion_ratio": False,
@@ -101,19 +116,27 @@ def semantic_dedup_items(
    }
    candidate_removals: set[str] = set()
    valid_groups: list[dict[str, Any]] = []
+    valid_merge_groups: list[dict[str, Any]] = []
+
+    def _validate_group_ids(group: dict[str, Any], member_key: str) -> tuple[list[str], list[NewsItem]] | None:
+        # Returns (ids, items) with keep_id first, or None after recording the rejection reason in errors.
+        ids = [group.get("keep_id"), *(group.get(member_key) or [])]
+        if not all(isinstance(item_id, str) and item_id in by_id for item_id in ids):
+            errors.append(f"invalid_ids_in_group: {group}")
+            return None
+        members = frozenset(ids)
+        if all(not members.issubset(candidate_set) for candidate_set in candidate_sets):
+            errors.append(f"group_outside_candidates: {group}")
+            return None
+        return ids, [by_id[item_id] for item_id in ids]

    for group in obj.get("duplicate_groups", []) or []:
        if group.get("confidence") != "high":
            continue
-        ids = [group.get("keep_id")] + list(group.get("remove_ids") or [])
-        if any(not isinstance(item_id, str) or item_id not in by_id for item_id in ids):
-            errors.append(f"invalid_ids_in_group: {group}")
+        validated = _validate_group_ids(group, "remove_ids")
+        if validated is None:
            continue
-        group_set = frozenset(ids)
-        if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
-            errors.append(f"group_outside_candidates: {group}")
-            continue
-        group_items = [by_id[item_id] for item_id in ids]
+        ids, group_items = validated
        keep = _choose_keep(group_items, str(group.get("keep_id")))
        remove_items = [item for item in group_items if item is not keep]
        candidate_removals.update(item.id for item in remove_items)
@@ -126,6 +149,24 @@ def semantic_dedup_items(
            }
        )

+    for group in obj.get("merge_groups", []) or []:
+        if group.get("confidence") != "high":
+            continue
+        validated = _validate_group_ids(group, "merge_ids")
+        if validated is None:
+            continue
+        ids, group_items = validated
+        keep = _choose_keep(group_items, str(group.get("keep_id")))
+        merge_items = [item for item in group_items if item is not keep]
+        valid_merge_groups.append(
+            {
+                "keep_id": keep.id,
+                "merge_ids": [item.id for item in merge_items],
+                "confidence": "high",
+                "reason": str(group.get("reason") or "semantic_merge"),
+            }
+        )
+
    deletion_ratio = len(candidate_removals) / len(items) if items else 0
    if deletion_ratio > max_deletion_ratio:
        return items, {
@@ -133,33 +174,49 @@ def semantic_dedup_items(
            "candidate_group_count": len(candidates),
            "removed_count": 0,
            "duplicate_groups": valid_groups,
+            "merge_groups": valid_merge_groups,
            "uncertain": obj.get("uncertain", []) or [],
            "errors": errors,
            "skipped_for_deletion_ratio": True,
        }

    removed_ids: set[str] = set()
+
+    def append_supplement(keep: NewsItem, source_item: NewsItem, reason: str, action: str) -> None:
+        # Attach source_item to keep.duplicate_sources, tagged with the reason and the action taken.
+        entry = {
+            "id": source_item.id,
+            "source_group": source_item.source_group,
+            "source_label": source_item.source_label,
+            "url": source_item.url,
+            "title": source_item.title or source_item.title_raw,
+            "summary": source_item.summary or source_item.summary_raw,
+            "reason": reason,
+            "action": action,
+        }
+        keep.duplicate_sources.append(entry)
+
    for group in valid_groups:
        keep = by_id[group["keep_id"]]
        for remove_id in group["remove_ids"]:
            removed = by_id[remove_id]
-            keep.duplicate_sources.append(
-                {
-                    "id": removed.id,
-                    "source_group": removed.source_group,
-                    "source_label": removed.source_label,
-                    "url": removed.url,
-                    "reason": group["reason"],
-                }
-            )
+            append_supplement(keep, removed, group["reason"], "dedupe_remove")
            removed_ids.add(remove_id)

+    for group in valid_merge_groups:
+        keep = by_id[group["keep_id"]]
+        for merge_id in group["merge_ids"]:
+            if merge_id in removed_ids:
+                continue
+            append_supplement(keep, by_id[merge_id], group["reason"], "merge_supplement")
+
    deduped = [item for item in items if item.id not in removed_ids]
    report = {
        "input_count": len(items),
        "candidate_group_count": len(candidates),
        "removed_count": len(removed_ids),
        "duplicate_groups": valid_groups,
+        "merge_groups": valid_merge_groups,
        "uncertain": obj.get("uncertain", []) or [],
        "errors": errors,
        "skipped_for_deletion_ratio": False,