fix: add cross-day dedupe
This commit is contained in:
@@ -1,8 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Protocol
|
||||
|
||||
from .models import NewsItem, PublishedUrlEntry, PublishedUrls
|
||||
|
||||
|
||||
@dataclass
|
||||
class PublishResult:
|
||||
@@ -22,6 +27,122 @@ class BlogClient(Protocol):
|
||||
...
|
||||
|
||||
|
||||
def _parse_date(value: str | None) -> date | None:
    """Extract a calendar date from an ISO-formatted date or datetime string.

    The first ten characters of the stripped text are tried as an ISO date
    first; if that fails, the whole stripped text is parsed as an ISO
    datetime and its date component is used.

    Args:
        value: Text to parse; may be ``None`` or empty.

    Returns:
        The parsed date, or ``None`` when *value* is empty or cannot be
        parsed by either strategy.
    """
    if not value:
        return None
    cleaned = value.strip()

    # Common case: a plain date, or a datetime whose first ten characters
    # already form a valid date.
    try:
        return date.fromisoformat(cleaned[:10])
    except ValueError:
        pass

    # Fallback: let the datetime parser handle the full string.
    try:
        return datetime.fromisoformat(cleaned).date()
    except ValueError:
        return None
|
||||
|
||||
|
||||
def _published_entry_from_dict(value: Any) -> PublishedUrlEntry | None:
    """Build a ``PublishedUrlEntry`` from one decoded JSON history record.

    The record is read defensively because it comes from a file on disk:
    a missing ``last_published`` falls back to ``first_seen`` (and vice
    versa), and a malformed ``titles`` field is tolerated rather than
    raising.

    Args:
        value: The raw record; a dict with ``first_seen``,
            ``last_published`` and ``titles`` keys is expected, but any
            value is accepted.

    Returns:
        The entry, or ``None`` when *value* is not a dict or carries
        neither date field.
    """
    if not isinstance(value, dict):
        return None
    first_seen = str(value.get("first_seen") or "")
    last_published = str(value.get("last_published") or first_seen)
    # Reject undated records before touching ``titles`` so that a record we
    # are going to drop anyway can never fail on a malformed titles field.
    if not first_seen and not last_published:
        return None

    raw_titles = value.get("titles")
    if isinstance(raw_titles, str):
        # A bare string would otherwise be iterated character by character.
        raw_titles = [raw_titles]
    elif not isinstance(raw_titles, (list, tuple)):
        # Missing, null, or any other non-sequence value: treat as no titles.
        raw_titles = []
    # Skip nulls explicitly; str(None) would yield the literal "None".
    titles = [
        str(title)
        for title in raw_titles
        if title is not None and str(title)
    ]
    return PublishedUrlEntry(
        first_seen=first_seen or last_published,
        last_published=last_published or first_seen,
        titles=titles,
    )
|
||||
|
||||
|
||||
def load_published_urls(path: Path) -> PublishedUrls:
    """Load the published-URL history from a JSON file, best-effort.

    A missing, unreadable, or structurally unexpected file never raises:
    the function falls back to an empty history (or to defaults for the
    individual malformed field) so a damaged history file cannot abort
    the caller.

    Args:
        path: Location of the history JSON file.

    Returns:
        The parsed history. An empty ``PublishedUrls`` is returned when the
        file does not exist, cannot be read or decoded, or does not hold a
        JSON object at the top level.
    """
    if not path.exists():
        return PublishedUrls()
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        # Deliberately broad: any read or decode failure means "no usable
        # history", not a fatal error.
        return PublishedUrls()
    if not isinstance(raw, dict):
        return PublishedUrls()

    # ``urls`` must be a JSON object; any other shape (list, string, number)
    # is treated as empty instead of failing on ``.items()``.
    raw_urls = raw.get("urls")
    if not isinstance(raw_urls, dict):
        raw_urls = {}

    urls: dict[str, PublishedUrlEntry] = {}
    for canonical_url, value in raw_urls.items():
        if not canonical_url:
            continue
        entry = _published_entry_from_dict(value)
        if entry is not None:
            urls[str(canonical_url)] = entry

    # A non-numeric version falls back to 1, the same default used when the
    # field is absent.
    try:
        version = int(raw.get("version") or 1)
    except (TypeError, ValueError, OverflowError):
        version = 1

    return PublishedUrls(
        version=version,
        urls=urls,
        updated_at=str(raw.get("updated_at") or ""),
    )
|
||||
|
||||
|
||||
def _entry_within_window(entry: PublishedUrlEntry, *, run_date: str, max_age_days: int) -> bool:
    """Tell whether a history entry is recent enough to be retained.

    The entry's age is the number of days between *run_date* and its
    ``last_published`` date (falling back to ``first_seen`` when the former
    cannot be parsed).

    Args:
        entry: The history entry to check.
        run_date: Date of the current run, as an ISO string.
        max_age_days: Maximum allowed age in days; a negative value means
            every entry is considered within the window.

    Returns:
        ``True`` when the entry is at most *max_age_days* old, and also
        whenever either date cannot be parsed.
    """
    if max_age_days >= 0:
        today = _parse_date(run_date)
        last_seen = _parse_date(entry.last_published) or _parse_date(entry.first_seen)
        if today is not None and last_seen is not None:
            age = today - last_seen
            return age.days <= max_age_days
    # Negative limit or unparseable dates: keep the entry.
    return True
|
||||
|
||||
|
||||
def _published_urls_to_dict(history: PublishedUrls) -> dict[str, Any]:
    """Convert a history object into a plain, JSON-serializable dict.

    Args:
        history: The history to serialize.

    Returns:
        A dict with ``version``, ``urls`` and ``updated_at`` keys, where
        ``urls`` maps each canonical URL (in sorted order) to its
        ``first_seen``, ``last_published`` and ``titles`` values.
    """
    serialized_urls: dict[str, Any] = {}
    # Sorted keys keep the output ordering stable from run to run.
    for url in sorted(history.urls):
        record = history.urls[url]
        serialized_urls[url] = {
            "first_seen": record.first_seen,
            "last_published": record.last_published,
            "titles": record.titles,
        }

    payload: dict[str, Any] = {"version": history.version}
    payload["urls"] = serialized_urls
    payload["updated_at"] = history.updated_at
    return payload
|
||||
|
||||
|
||||
def update_published_urls(
    path: Path,
    items: list[NewsItem],
    *,
    run_date: str,
    max_age_days: int = 7,
) -> PublishedUrls:
    """Record the given items in the history file and prune stale entries.

    The existing history is loaded, entries older than *max_age_days*
    relative to *run_date* are dropped, each item with a canonical URL is
    added or refreshed, and the result is written back to *path*.

    Args:
        path: Location of the history JSON file; parent directories are
            created if needed.
        items: Items published in this run. Items without a
            ``canonical_url`` are ignored.
        run_date: Date of the current run, stored as ``first_seen`` for new
            entries and ``last_published`` for every touched entry.
        max_age_days: Retention window in days used when pruning existing
            entries.

    Returns:
        The updated history as written to disk.
    """
    history = load_published_urls(path)
    history.urls = {
        canonical_url: entry
        for canonical_url, entry in history.urls.items()
        if _entry_within_window(entry, run_date=run_date, max_age_days=max_age_days)
    }

    for item in items:
        if not item.canonical_url:
            continue
        title = item.title or item.title_raw
        entry = history.urls.get(item.canonical_url)
        if entry is None:
            entry = PublishedUrlEntry(
                first_seen=run_date,
                last_published=run_date,
                titles=[],
            )
            history.urls[item.canonical_url] = entry
        entry.last_published = run_date
        # Keep every distinct title seen for this URL, in first-seen order.
        if title and title not in entry.titles:
            entry.titles.append(title)

    history.updated_at = datetime.now(timezone.utc).isoformat()
    payload = json.dumps(_published_urls_to_dict(history), ensure_ascii=False, indent=2)

    path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it over the target, so an
    # interrupted write can never leave a truncated history file behind
    # (a corrupt file is read back as an empty history, losing all state).
    tmp_path = path.with_name(f"{path.name}.tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    tmp_path.replace(path)
    return history
|
||||
|
||||
|
||||
def dry_run_publish(slug: str, base_url: str) -> PublishResult:
|
||||
return PublishResult(
|
||||
mode="dry-run",
|
||||
|
||||
Reference in New Issue
Block a user