Improve AI daily report operations and dedupe observability

This commit is contained in:
Ubuntu
2026-06-10 21:55:29 +08:00
parent b46cef2c7b
commit 2159ee733b
23 changed files with 761 additions and 57 deletions

View File

@@ -25,6 +25,11 @@ def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> st
"task": "Identify only high-confidence semantic duplicates. Do not curate or remove by importance.",
"items": item_payload,
"candidates": candidates,
"dedupe_policy": [
"Use duplicate_groups only when items are substantially the same article/event and one can be removed.",
"Use merge_groups when items cover the same concrete event from different angles; keep the best item and attach the others as supplementary sources instead of dropping the event context.",
"Do not curate by importance. Do not merge unrelated follow-ups just because they mention the same company/model.",
],
"output_schema": {
"duplicate_groups": [
{
@@ -34,6 +39,14 @@ def _build_prompt(items: list[NewsItem], candidates: list[dict[str, Any]]) -> st
"reason": "same concrete event reason",
}
],
"merge_groups": [
{
"keep_id": "item id",
"merge_ids": ["item id"],
"confidence": "high|medium|low",
"reason": "same event, complementary angle/source",
}
],
"not_duplicates": [],
"uncertain": [],
},
@@ -75,6 +88,7 @@ def semantic_dedup_items(
"candidate_group_count": len(candidates),
"removed_count": 0,
"duplicate_groups": [],
"merge_groups": [],
"uncertain": [],
"errors": [],
"skipped_for_deletion_ratio": False,
@@ -89,6 +103,7 @@ def semantic_dedup_items(
"candidate_group_count": len(candidates),
"removed_count": 0,
"duplicate_groups": [],
"merge_groups": [],
"uncertain": [],
"errors": [f"{type(exc).__name__}: {exc}"],
"skipped_for_deletion_ratio": False,
@@ -101,19 +116,27 @@ def semantic_dedup_items(
}
candidate_removals: set[str] = set()
valid_groups: list[dict[str, Any]] = []
valid_merge_groups: list[dict[str, Any]] = []
def _validate_group_ids(group: dict[str, Any], member_key: str) -> tuple[list[str], list[NewsItem]] | None:
raw_ids = [group.get("keep_id")] + list(group.get(member_key) or [])
if any(not isinstance(item_id, str) or item_id not in by_id for item_id in raw_ids):
errors.append(f"invalid_ids_in_group: {group}")
return None
ids = [str(item_id) for item_id in raw_ids]
group_set = frozenset(ids)
if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
errors.append(f"group_outside_candidates: {group}")
return None
return ids, [by_id[item_id] for item_id in ids]
for group in obj.get("duplicate_groups", []) or []:
if group.get("confidence") != "high":
continue
ids = [group.get("keep_id")] + list(group.get("remove_ids") or [])
if any(not isinstance(item_id, str) or item_id not in by_id for item_id in ids):
errors.append(f"invalid_ids_in_group: {group}")
validated = _validate_group_ids(group, "remove_ids")
if validated is None:
continue
group_set = frozenset(ids)
if not any(group_set.issubset(candidate_set) for candidate_set in candidate_sets):
errors.append(f"group_outside_candidates: {group}")
continue
group_items = [by_id[item_id] for item_id in ids]
ids, group_items = validated
keep = _choose_keep(group_items, str(group.get("keep_id")))
remove_items = [item for item in group_items if item is not keep]
candidate_removals.update(item.id for item in remove_items)
@@ -126,6 +149,24 @@ def semantic_dedup_items(
}
)
for group in obj.get("merge_groups", []) or []:
if group.get("confidence") != "high":
continue
validated = _validate_group_ids(group, "merge_ids")
if validated is None:
continue
ids, group_items = validated
keep = _choose_keep(group_items, str(group.get("keep_id")))
merge_items = [item for item in group_items if item is not keep]
valid_merge_groups.append(
{
"keep_id": keep.id,
"merge_ids": [item.id for item in merge_items],
"confidence": "high",
"reason": str(group.get("reason") or "semantic_merge"),
}
)
deletion_ratio = len(candidate_removals) / len(items) if items else 0
if deletion_ratio > max_deletion_ratio:
return items, {
@@ -133,33 +174,49 @@ def semantic_dedup_items(
"candidate_group_count": len(candidates),
"removed_count": 0,
"duplicate_groups": valid_groups,
"merge_groups": valid_merge_groups,
"uncertain": obj.get("uncertain", []) or [],
"errors": errors,
"skipped_for_deletion_ratio": True,
}
removed_ids: set[str] = set()
def append_supplement(keep: NewsItem, source_item: NewsItem, reason: str, action: str) -> None:
    """Attach ``source_item`` to ``keep.duplicate_sources`` as one dict entry.

    The entry copies the source's id, source group/label and url, uses the
    title and summary with a fallback to their ``*_raw`` counterparts when
    the primary value is falsy, and stores ``reason`` and ``action`` as given.
    """
    record = keep.duplicate_sources.append
    entry = {"id": source_item.id}
    entry["source_group"] = source_item.source_group
    entry["source_label"] = source_item.source_label
    entry["url"] = source_item.url
    # Fall back to the raw fields when the primary title/summary is empty.
    entry["title"] = source_item.title or source_item.title_raw
    entry["summary"] = source_item.summary or source_item.summary_raw
    entry["reason"] = reason
    entry["action"] = action
    record(entry)
for group in valid_groups:
keep = by_id[group["keep_id"]]
for remove_id in group["remove_ids"]:
removed = by_id[remove_id]
keep.duplicate_sources.append(
{
"id": removed.id,
"source_group": removed.source_group,
"source_label": removed.source_label,
"url": removed.url,
"reason": group["reason"],
}
)
append_supplement(keep, removed, group["reason"], "dedupe_remove")
removed_ids.add(remove_id)
for group in valid_merge_groups:
keep = by_id[group["keep_id"]]
for merge_id in group["merge_ids"]:
if merge_id in removed_ids:
continue
append_supplement(keep, by_id[merge_id], group["reason"], "merge_supplement")
deduped = [item for item in items if item.id not in removed_ids]
report = {
"input_count": len(items),
"candidate_group_count": len(candidates),
"removed_count": len(removed_ids),
"duplicate_groups": valid_groups,
"merge_groups": valid_merge_groups,
"uncertain": obj.get("uncertain", []) or [],
"errors": errors,
"skipped_for_deletion_ratio": False,