fix: add cross-day dedupe
This commit is contained in:
@@ -5,9 +5,9 @@ from typing import Any
|
||||
from .assemble import assemble_markdown
|
||||
from .classify import classify_and_order_items
|
||||
from .collect import Fetcher, collect_sources
|
||||
from .dedupe import hard_dedup_items
|
||||
from .dedupe import cross_day_dedup_items, hard_dedup_items
|
||||
from .guide import GuideLlmCall, generate_guide
|
||||
from .models import SourceConfig
|
||||
from .models import PublishedUrls, SourceConfig
|
||||
from .normalize import normalize_items
|
||||
from .publish import BlogClient, publish_markdown
|
||||
from .rewrite import RewriteLlmCall, rewrite_items
|
||||
@@ -15,6 +15,7 @@ from .semantic_dedupe import SemanticLlmCall, semantic_dedup_items
|
||||
|
||||
|
||||
def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
|
||||
max_item_age_days = value.get("max_item_age_days")
|
||||
return SourceConfig(
|
||||
name=value["name"],
|
||||
type=value["type"],
|
||||
@@ -26,6 +27,7 @@ def _source_config_from_dict(value: dict[str, Any]) -> SourceConfig:
|
||||
retries=int(value.get("retries", 0)),
|
||||
min_items=int(value.get("min_items", 0)),
|
||||
url=value.get("url", ""),
|
||||
max_item_age_days=int(max_item_age_days) if max_item_age_days is not None else None,
|
||||
)
|
||||
|
||||
|
||||
@@ -58,6 +60,43 @@ def run_stage0_to_stage2(
|
||||
}
|
||||
|
||||
|
||||
def run_stage0_to_stage2_5(
    source_configs: list[dict[str, Any] | SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    published_urls: PublishedUrls | None = None,
    cross_day_dedup_enabled: bool = True,
    cross_day_dedup_max_age_days: int = 7,
) -> dict[str, Any]:
    """Run stages 0-2, then the stage 2.5 cross-day dedupe step.

    The stage 0-2 pipeline is always executed first.  When
    ``cross_day_dedup_enabled`` is true its items are passed through
    ``cross_day_dedup_items`` together with ``published_urls``; otherwise the
    items are forwarded untouched and a placeholder report with zero removals
    is produced instead.

    Args:
        source_configs: Source definitions, forwarded unchanged to
            ``run_stage0_to_stage2``.
        run_date: Run date string, forwarded to stage 0-2 and to the
            cross-day dedupe step.
        fetcher: Fetcher forwarded to ``run_stage0_to_stage2``.
        published_urls: Previously published URLs handed to
            ``cross_day_dedup_items``; may be ``None``.
        cross_day_dedup_enabled: Whether the cross-day dedupe step runs.
        cross_day_dedup_max_age_days: Passed to the dedupe step as
            ``max_age_days`` and recorded in the placeholder report.

    Returns:
        A dict with ``"source_results"`` (taken from the stage 0-2 result),
        ``"items"`` (the possibly filtered items) and ``"reports"`` (a shallow
        copy of the stage 0-2 reports plus a ``"stage2_5"`` entry).
    """
    upstream = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)

    if not cross_day_dedup_enabled:
        # Step disabled: pass the stage 2 items through and describe a no-op.
        items = upstream["items"]
        stage2_5_report = {
            "input_count": len(items),
            "output_count": len(items),
            "removed_count": 0,
            "removed": [],
            "enabled": False,
            "max_age_days": cross_day_dedup_max_age_days,
        }
    else:
        items, stage2_5_report = cross_day_dedup_items(
            upstream["items"],
            published_urls,
            run_date=run_date,
            max_age_days=cross_day_dedup_max_age_days,
        )

    # Copy so the stage 0-2 report mapping itself is not given a new key.
    merged_reports = dict(upstream["reports"])
    # Only fills in "enabled" when the report does not already carry it
    # (the placeholder report above always does).
    stage2_5_report.setdefault("enabled", cross_day_dedup_enabled)
    merged_reports["stage2_5"] = stage2_5_report

    return {
        "source_results": upstream["source_results"],
        "items": items,
        "reports": merged_reports,
    }
|
||||
|
||||
|
||||
def run_stage0_to_stage4(
|
||||
source_configs: list[dict[str, Any] | SourceConfig],
|
||||
run_date: str,
|
||||
@@ -65,10 +104,25 @@ def run_stage0_to_stage4(
|
||||
fetcher: Fetcher,
|
||||
semantic_llm_call: SemanticLlmCall,
|
||||
rewrite_llm_call: RewriteLlmCall,
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
) -> dict[str, Any]:
|
||||
stage2_result = run_stage0_to_stage2(source_configs, run_date, fetcher=fetcher)
|
||||
items = stage2_result["items"]
|
||||
candidates = stage2_result["reports"]["stage2"].get("possible_duplicates", [])
|
||||
stage2_5_result = run_stage0_to_stage2_5(
|
||||
source_configs,
|
||||
run_date,
|
||||
fetcher=fetcher,
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
)
|
||||
items = stage2_5_result["items"]
|
||||
remaining_ids = {item.id for item in items}
|
||||
candidates = [
|
||||
candidate
|
||||
for candidate in stage2_5_result["reports"]["stage2"].get("possible_duplicates", [])
|
||||
if set(candidate.get("item_ids", [])).issubset(remaining_ids)
|
||||
]
|
||||
semantic_items, stage3_report = semantic_dedup_items(
|
||||
items,
|
||||
candidates,
|
||||
@@ -78,11 +132,11 @@ def run_stage0_to_stage4(
|
||||
semantic_items,
|
||||
llm_call=rewrite_llm_call,
|
||||
)
|
||||
reports = dict(stage2_result["reports"])
|
||||
reports = dict(stage2_5_result["reports"])
|
||||
reports["stage3"] = stage3_report
|
||||
reports["stage4"] = stage4_report
|
||||
return {
|
||||
"source_results": stage2_result["source_results"],
|
||||
"source_results": stage2_5_result["source_results"],
|
||||
"items": rewritten_items,
|
||||
"reports": reports,
|
||||
}
|
||||
@@ -95,6 +149,9 @@ def run_stage0_to_stage5(
|
||||
fetcher: Fetcher,
|
||||
semantic_llm_call: SemanticLlmCall,
|
||||
rewrite_llm_call: RewriteLlmCall,
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
) -> dict[str, Any]:
|
||||
stage4_result = run_stage0_to_stage4(
|
||||
source_configs,
|
||||
@@ -102,6 +159,9 @@ def run_stage0_to_stage5(
|
||||
fetcher=fetcher,
|
||||
semantic_llm_call=semantic_llm_call,
|
||||
rewrite_llm_call=rewrite_llm_call,
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
)
|
||||
classified_items, stage5_report = classify_and_order_items(stage4_result["items"])
|
||||
reports = dict(stage4_result["reports"])
|
||||
@@ -121,6 +181,9 @@ def run_stage0_to_stage6(
|
||||
semantic_llm_call: SemanticLlmCall,
|
||||
rewrite_llm_call: RewriteLlmCall,
|
||||
guide_llm_call: GuideLlmCall,
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
) -> dict[str, Any]:
|
||||
stage5_result = run_stage0_to_stage5(
|
||||
source_configs,
|
||||
@@ -128,6 +191,9 @@ def run_stage0_to_stage6(
|
||||
fetcher=fetcher,
|
||||
semantic_llm_call=semantic_llm_call,
|
||||
rewrite_llm_call=rewrite_llm_call,
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
)
|
||||
guide, stage6_report = generate_guide(stage5_result["items"], llm_call=guide_llm_call)
|
||||
reports = dict(stage5_result["reports"])
|
||||
@@ -148,6 +214,9 @@ def run_stage0_to_stage7(
|
||||
semantic_llm_call: SemanticLlmCall,
|
||||
rewrite_llm_call: RewriteLlmCall,
|
||||
guide_llm_call: GuideLlmCall,
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
) -> dict[str, Any]:
|
||||
stage6_result = run_stage0_to_stage6(
|
||||
source_configs,
|
||||
@@ -156,6 +225,9 @@ def run_stage0_to_stage7(
|
||||
semantic_llm_call=semantic_llm_call,
|
||||
rewrite_llm_call=rewrite_llm_call,
|
||||
guide_llm_call=guide_llm_call,
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
)
|
||||
markdown, stage7_report = assemble_markdown(stage6_result["items"], stage6_result["guide"])
|
||||
upstream_blocking_errors: list[str] = []
|
||||
@@ -187,6 +259,9 @@ def run_stage0_to_stage8(
|
||||
mode: str,
|
||||
base_url: str,
|
||||
client: BlogClient | None,
|
||||
published_urls: PublishedUrls | None = None,
|
||||
cross_day_dedup_enabled: bool = True,
|
||||
cross_day_dedup_max_age_days: int = 7,
|
||||
) -> dict[str, Any]:
|
||||
stage7_result = run_stage0_to_stage7(
|
||||
source_configs,
|
||||
@@ -195,6 +270,9 @@ def run_stage0_to_stage8(
|
||||
semantic_llm_call=semantic_llm_call,
|
||||
rewrite_llm_call=rewrite_llm_call,
|
||||
guide_llm_call=guide_llm_call,
|
||||
published_urls=published_urls,
|
||||
cross_day_dedup_enabled=cross_day_dedup_enabled,
|
||||
cross_day_dedup_max_age_days=cross_day_dedup_max_age_days,
|
||||
)
|
||||
slug = f"ai-{run_date}"
|
||||
publish_result = publish_markdown(
|
||||
|
||||
Reference in New Issue
Block a user