115 lines
3.9 KiB
Python
115 lines
3.9 KiB
Python
from __future__ import annotations
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import datetime, timezone
|
|
from time import perf_counter
|
|
from typing import Callable, Iterable, Any
|
|
|
|
from .clients import FetchTextError
|
|
from .models import SourceConfig, SourceResult
|
|
|
|
|
|
# Signature of a source fetcher: it is called as ``fetcher(config, run_date)``
# and is expected to return that source's raw items as a list of dicts.
Fetcher = Callable[[SourceConfig, str], list[dict[str, Any]]]
|
|
|
|
|
|
def _status_from_exception(exc: Exception) -> str:
    """Translate a fetch failure into the status string stored on a result.

    A ``FetchTextError`` reports its own ``error_type``. Any other
    ``TimeoutError`` becomes ``"timeout"``, and everything else ``"error"``.
    """
    # FetchTextError is checked first so its own error_type always wins,
    # whatever else the exception may also be an instance of.
    if not isinstance(exc, FetchTextError):
        return "timeout" if isinstance(exc, TimeoutError) else "error"
    return exc.error_type
|
|
|
|
|
|
def _retry_count_from_exception(exc: Exception) -> int:
    """Return how many retries preceded *exc*.

    For a ``FetchTextError`` this is ``attempts - 1``, floored at zero.
    Other exception types carry no attempt count, so the answer is 0.
    """
    if not isinstance(exc, FetchTextError):
        return 0
    retries = exc.attempts - 1
    return retries if retries > 0 else 0
|
|
|
|
|
|
def _collect_one(config: SourceConfig, run_date: str, fetcher: Fetcher) -> SourceResult:
    """Run *fetcher* for a single source and wrap the outcome in a ``SourceResult``.

    Outcomes, by ``status``:

    * ``"disabled"`` - ``config.enabled`` is falsy; the fetcher is not called.
    * ``"empty"`` - the fetcher returned a falsy value.
    * ``"below_min_items"`` - ``config.min_items`` is truthy and fewer items
      than that were returned.
    * ``"ok"`` - anything else that returned without raising.
    * otherwise the status derived from the raised exception.

    Only ``"ok"`` sets ``ok=True``. Any ``Exception`` raised by the fetcher, or
    while the success result is being assembled, is converted into a failed
    result instead of propagating.
    """
    stamp = datetime.now(timezone.utc).isoformat()

    if not config.enabled:
        disabled_note = f"failure_policy={config.failure_policy}; min_items={config.min_items}"
        return SourceResult(
            source=config.name,
            role=config.role,
            ok=False,
            status="disabled",
            fetched_at=stamp,
            error=disabled_note,
        )

    tick = perf_counter()
    try:
        fetched = fetcher(config, run_date)
        took_ms = int((perf_counter() - tick) * 1000)

        # Classify the fetch; the minimum-items rule only applies to non-empty results.
        if not fetched:
            outcome = "empty"
        elif config.min_items and len(fetched) < config.min_items:
            outcome = "below_min_items"
        else:
            outcome = "ok"
        succeeded = outcome == "ok"

        if succeeded:
            detail = None
        else:
            detail = f"items={len(fetched)}; min_items={config.min_items}; failure_policy={config.failure_policy}"

        return SourceResult(
            source=config.name,
            role=config.role,
            ok=succeeded,
            status=outcome,
            items=fetched,
            error=detail,
            elapsed_ms=took_ms,
            fetched_at=stamp,
        )
    except Exception as exc:
        # Elapsed time is re-measured here so it covers the work up to the failure.
        took_ms = int((perf_counter() - tick) * 1000)
        failure_note = f"{type(exc).__name__}: {exc}; failure_policy={config.failure_policy}; min_items={config.min_items}"
        return SourceResult(
            source=config.name,
            role=config.role,
            ok=False,
            status=_status_from_exception(exc),
            error=failure_note,
            elapsed_ms=took_ms,
            retry_count=_retry_count_from_exception(exc),
            fetched_at=stamp,
        )
|
|
|
|
|
|
def collect_sources(
    configs: Iterable[SourceConfig],
    run_date: str,
    *,
    fetcher: Fetcher,
    max_workers: int | None = None,
) -> tuple[list[SourceResult], dict[str, Any]]:
    """Collect every configured source concurrently and summarise the run.

    Each config is handed to ``_collect_one`` on a thread pool.

    Args:
        configs: Source configurations; consumed once and processed in order.
        run_date: Passed through unchanged to the fetcher for every source.
        fetcher: Callable invoked as ``fetcher(config, run_date)``.
        max_workers: Thread-pool size. ``None`` or ``0`` selects the default
            of ``min(8, number of configs)``.

    Returns:
        ``(results, report)``. ``results`` holds one ``SourceResult`` per
        config, in the same order as ``configs``. ``report`` always contains
        the keys ``input_source_count``, ``ok_source_count``,
        ``failed_source_count``, ``raw_item_count``, ``source_counts``,
        ``statuses`` and ``error_types`` - also when ``configs`` is empty.
        ``failed_source_count`` counts every result whose ``ok`` is false;
        ``error_types`` lists those results except the ones with status
        ``"disabled"``.
    """
    ordered_configs = list(configs)
    results: list[SourceResult] = []

    if ordered_configs:
        workers = max_workers or min(8, len(ordered_configs))
        with ThreadPoolExecutor(max_workers=workers) as executor:
            # Futures are kept in submission order, so each config gets its own
            # result even when two configs share a name (a name-keyed lookup
            # would hand both of them the same result).
            futures = [
                executor.submit(_collect_one, config, run_date, fetcher)
                for config in ordered_configs
            ]
            results = [future.result() for future in futures]

    # One report shape for every run, including the empty one.
    # NOTE: the per-source dicts are keyed by source name, so configs that
    # share a name still collapse to a single entry there.
    report = {
        "input_source_count": len(results),
        "ok_source_count": sum(1 for result in results if result.ok),
        "failed_source_count": sum(1 for result in results if not result.ok),
        "raw_item_count": sum(len(result.items) for result in results),
        "source_counts": {result.source: len(result.items) for result in results},
        "statuses": {result.source: result.status for result in results},
        "error_types": {
            result.source: result.status
            for result in results
            if not result.ok and result.status != "disabled"
        },
    }
    return results, report
|