#!/usr/bin/env python3
"""Diagnose why Full CGK shows a 20% contamination rate.

Builds a ContextGatekeeper whose history interleaves four topics (Redis,
asyncio, PostgreSQL, Git), runs a handful of queries through it, and prints
which turns were selected and whether any of them mention a topic other than
the one the query is about.
"""
import os
import sys

# abspath() keeps the project root correct even when __file__ is a bare
# relative name (dirname(dirname("script.py")) would otherwise be "").
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.gatekeeper import ContextGatekeeper  # noqa: E402

# Topic keywords searched for (case-insensitively) in selected turns.
# NOTE: matching is plain substring search, so a turn that never names its
# topic (e.g. the "EXPLAIN ANALYZE" question below) is not attributed to any.
TOPICS = ('Redis', 'asyncio', 'PostgreSQL', 'Git')

# (user question, assistant answer) pairs, five per topic.
redis_qa = [
    ("Redis 分布式锁和 RedLock 算法有什么区别?", "RedLock是..."),
    ("Redis 集群环境下怎么做分布式锁?", "集群下..."),
    ("Redis 惰性删除和定期删除有什么区别?", "惰性删除..."),
    ("Redis 的过期 key 对 RDB 快照有什么影响?", "过期key..."),
    ("Redis 主从复制断线后如何增量同步?", "PSYNC..."),
]

asyncio_qa = [
    ("asyncio.Task 的 cancel 方法怎么工作的?", "cancel..."),
    ("asyncio.gather 和 asyncio.wait 的返回结果有什么区别?", "gather..."),
    ("asyncio 的事件循环怎么启动和停止?", "事件循环..."),
    ("asyncio.sleep 和 time.sleep 的区别是什么?", "sleep..."),
    ("asyncio 的 Future 对象怎么获取结果?", "Future..."),
]

pg_qa = [
    ("PostgreSQL 的 MVCC 机制是怎么保证读不阻塞写的?", "MVCC..."),
    ("PostgreSQL 的 VACUUM 为什么要定期运行?", "VACUUM..."),
    ("EXPLAIN ANALYZE 怎么看执行计划?", "EXPLAIN..."),
    ("PostgreSQL B-tree 索引和 Hash 索引的区别是什么?", "B-tree..."),
    ("PostgreSQL 的 TOAST 机制是什么?", "TOAST..."),
]

git_qa = [
    ("Git 的 rebase 和 merge 的区别是什么?", "rebase..."),
    ("Git reset 的 --soft、--mixed、--hard 有什么区别?", "reset..."),
    ("Git stash 暂存区和工作目录的区别是什么?", "stash..."),
    ("Git 的 bisect 怎么用来快速定位 bug?", "bisect..."),
    ("Git 的 reflog 怎么用来恢复误删的提交?", "reflog..."),
]


def build_gate() -> ContextGatekeeper:
    """Return a gatekeeper preloaded with 20 turns, interleaved by topic.

    Turns are added round-robin (Redis, asyncio, PostgreSQL, Git) so that
    neighbouring turns in the history always belong to different topics.
    """
    g = ContextGatekeeper(token_budget=4000)
    for round_turns in zip(redis_qa, asyncio_qa, pg_qa, git_qa):
        for question, answer in round_turns:
            g.add_turn(question, answer)
    return g


def _topics_in(text: str) -> list:
    """Return the TOPICS whose name occurs in *text*, ignoring case."""
    lowered = text.lower()
    return [t for t in TOPICS if t.lower() in lowered]


def diagnose(query: str, target_topic: str) -> None:
    """Run *query* against a fresh gatekeeper and print a contamination report.

    Args:
        query: The user question to send through the gatekeeper.
        target_topic: The topic name (one of TOPICS) the query is about; any
            other topic found in the selected turns counts as contamination.
    """
    gate = build_gate()
    print(f"\n{'='*60}")
    print(f"Query: {query}")
    print(f"Target: {target_topic}")
    print("=" * 60)

    # Extract the query anchors.
    q_anchors, has_deictic = gate.anchor_extractor.extract_with_deictic(query)
    print(f"Query anchors: {q_anchors}")
    print(f"Has deictic: {has_deictic}")

    # Topic-switch detection.
    switched = gate.topic_gate.is_topic_switch(query, gate._active_topic)
    print(f"Topic switched: {switched}")

    # Recalled blocks.
    # presumably each item is a dict with 'user', 'assistant' and 'turn_id'
    # keys -- verify against ContextGatekeeper.select
    sel = gate.select(query)
    print(f"Selected blocks: {len(sel)}")
    for item in sel:
        content = item['user'] + item['assistant']
        found_topics = _topics_in(content)
        print(f" turn {item['turn_id']}: {found_topics} -> {content[:60]}")

    # Check for contamination: any topic other than the target in the context.
    all_text = ' '.join(item['user'] + item['assistant'] for item in sel)
    other = [t for t in _topics_in(all_text)
             if t.lower() != target_topic.lower()]
    print(f"Other topics in context: {other}")
    print(f"IS CONTAMINATED: {len(other) > 0}")


def main() -> None:
    """Run the diagnostic queries."""
    # Diagnose the two contaminated cases.
    diagnose("Git 的 rebase 和 merge 有什么区别?", "Git")
    diagnose("asyncio.Task 的 cancel 方法怎么工作的?", "asyncio")

    # Contrast: a clean example.
    diagnose("Redis 惰性删除和定期删除有什么区别?", "Redis")

    # Ask about Git again, with a question that is not in the history.
    # NOTE(review): this call used to pass ("再问Git", <question>), i.e. a
    # label as the query and the question as the target topic, so no topic
    # could ever match the target and "Git" itself was reported as
    # contamination. Confirm this is the intended query.
    diagnose("Git reset 和 revert 的应用场景有什么区别?", "Git")


if __name__ == "__main__":
    main()