Files
context-gatekeeper/experiments/diagnose_contamination.py
Elaina 9e44748f91 fix: anchor stopwords - remove generic question patterns causing cross-topic contamination
- Add ANCHOR_STOPWORDS set in anchor.py (真正通用的疑问pattern)
- Filter Chinese n-grams against stopwords in extract()
- Update sparse.py content_words extraction to use stopword-filtered query
- Diagnosis: 'Git rebase vs merge' query now correctly excludes Redis/asyncio blocks
- Phase1 results: Full CGK 42.6 tokens avg, 0% contamination (vs Last-5 67.6 tokens, 100%)
- Phase2 ablation: Gate-only accounts for most of the benefit
- Phase3 sensitivity: OVERLAP/NEW_RATIO thresholds insensitive on clean data;
  RECENT_WINDOW is the primary token budget control

Known honest limitations:
- Test set is clean 4-topic synthetic data (no real dirty dialogue)
- No strong baselines (BM25 ablation incomplete)
- No answer-level evaluation (only retrieval blocks measured)
- No parameter sensitivity on noisy real-world data
- Zero contamination on 5 queries is not generalizable
2026-04-22 22:30:18 +08:00

88 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Diagnosis: why does Full CGK show a 20% contamination rate?"""
import sys, os
# Put the parent of this file's directory at the front of sys.path,
# presumably so the `src` package imported on the next line resolves when
# the script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from src.gatekeeper import ContextGatekeeper
# Synthetic dialogue fixtures: one list per topic, five entries each.
# Every entry is a 2-tuple whose two elements are passed, in order, to
# ContextGatekeeper.add_turn() by build_gate(). The second element is a
# truncated placeholder answer ending in "...".
# NOTE(review): presumably (user question, assistant answer) -- confirm
# against add_turn's signature.

# Redis topic.
redis_qa = [
("Redis 分布式锁和 RedLock 算法有什么区别?", "RedLock是..."),
("Redis 集群环境下怎么做分布式锁?", "集群下..."),
("Redis 惰性删除和定期删除有什么区别?", "惰性删除..."),
("Redis 的过期 key 对 RDB 快照有什么影响?", "过期key..."),
("Redis 主从复制断线后如何增量同步?", "PSYNC..."),
]
# asyncio topic.
asyncio_qa = [
("asyncio.Task 的 cancel 方法怎么工作的?", "cancel..."),
("asyncio.gather 和 asyncio.wait 的返回结果有什么区别?", "gather..."),
("asyncio 的事件循环怎么启动和停止?", "事件循环..."),
("asyncio.sleep 和 time.sleep 的区别是什么?", "sleep..."),
("asyncio 的 Future 对象怎么获取结果?", "Future..."),
]
# PostgreSQL topic. Note that the third question does not contain the word
# "PostgreSQL", so the substring-based topic check in diagnose() only
# attributes that turn to PostgreSQL if the answer text mentions it.
pg_qa = [
("PostgreSQL 的 MVCC 机制是怎么保证读不阻塞写的?", "MVCC..."),
("PostgreSQL 的 VACUUM 为什么要定期运行?", "VACUUM..."),
("EXPLAIN ANALYZE 怎么看执行计划?", "EXPLAIN..."),
("PostgreSQL B-tree 索引和 Hash 索引的区别是什么?", "B-tree..."),
("PostgreSQL 的 TOAST 机制是什么?", "TOAST..."),
]
# Git topic.
git_qa = [
("Git 的 rebase 和 merge 的区别是什么?", "rebase..."),
("Git reset 的 --soft、--mixed、--hard 有什么区别?", "reset..."),
("Git stash 暂存区和工作目录的区别是什么?", "stash..."),
("Git 的 bisect 怎么用来快速定位 bug", "bisect..."),
("Git 的 reflog 怎么用来恢复误删的提交?", "reflog..."),
]
def build_gate():
    """Build a gatekeeper pre-loaded with the four synthetic topic histories.

    Turns are added round-robin (Redis, asyncio, PostgreSQL, Git, then the
    next entry of each list), so every topic is spread across the history
    instead of being stored as one contiguous run.

    Returns:
        A ``ContextGatekeeper`` created with ``token_budget=4000`` that has
        received one ``add_turn`` call per fixture entry.
    """
    g = ContextGatekeeper(token_budget=4000)
    # zip() keeps the insertion order of the former `range(5)` index loop
    # but follows the fixture lists instead of a hard-coded count. It stops
    # at the shortest list; all four currently hold five entries.
    for round_turns in zip(redis_qa, asyncio_qa, pg_qa, git_qa):
        for question, answer in round_turns:
            g.add_turn(question, answer)
    return g
def diagnose(query, target_topic):
    """Print a contamination diagnosis for one query against a fresh gate.

    A new gatekeeper is built with ``build_gate()``, then the function
    prints the query's anchors, the topic-switch decision, each selected
    block with the topics it mentions, and whether any topic other than
    ``target_topic`` appears in the selected context.

    Topic detection is a case-insensitive substring match of the names
    'Redis', 'asyncio', 'PostgreSQL' and 'Git' against the concatenated
    user and assistant text of each selected block.

    Args:
        query: The query text handed to the anchor extractor, the topic
            gate and ``gate.select``.
        target_topic: Name of the topic the query belongs to. It is compared
            case-insensitively with the four known topic names; anything
            else makes every detected topic count as contamination, so a
            warning is printed in that case.

    Returns:
        None. All results are written to stdout.
    """
    # Single source of truth for the topic names (formerly written out twice).
    topics = ('Redis', 'asyncio', 'PostgreSQL', 'Git')

    def topics_in(text):
        """Return the known topics whose name occurs in *text*, ignoring case."""
        lowered = text.lower()
        return [t for t in topics if t.lower() in lowered]

    gate = build_gate()
    print(f"\n{'='*60}")
    print(f"Query: {query}")
    print(f"Target: {target_topic}")
    print("=" * 60)

    target_lower = target_topic.lower()
    if target_lower not in {t.lower() for t in topics}:
        # Without this hint a mistyped or swapped argument silently yields
        # "IS CONTAMINATED: True" whenever any topic is found.
        print(f"WARNING: target {target_topic!r} is not a known topic; "
              "every topic found below will be reported as contamination")

    # Extract the query anchors.
    q_anchors, has_deictic = gate.anchor_extractor.extract_with_deictic(query)
    print(f"Query anchors: {q_anchors}")
    print(f"Has deictic: {has_deictic}")

    # Topic-switch detection (reads the gate's private `_active_topic`).
    switched = gate.topic_gate.is_topic_switch(query, gate._active_topic)
    print(f"Topic switched: {switched}")

    # Recalled blocks: each item is indexed by 'user', 'assistant', 'turn_id'.
    sel = gate.select(query)
    print(f"Selected blocks: {len(sel)}")
    contents = []
    for item in sel:
        content = item['user'] + item['assistant']
        contents.append(content)
        print(f" turn {item['turn_id']}: {topics_in(content)} -> {content[:60]}")

    # Contamination check: any known topic other than the target that shows
    # up anywhere in the selected context.
    all_text = ' '.join(contents)
    other = [t for t in topics_in(all_text) if t.lower() != target_lower]
    print(f"Other topics in context: {other}")
    print(f"IS CONTAMINATED: {len(other) > 0}")
# Diagnose the two contaminated cases.
diagnose("Git 的 rebase 和 merge 有什么区别?", "Git")
diagnose("asyncio.Task 的 cancel 方法怎么工作的?", "asyncio")
# For comparison: clean examples.
diagnose("Redis 惰性删除和定期删除有什么区别?", "Redis")
# Ask about Git again. diagnose() takes (query, target_topic); this call used
# to pass the label "再问Git" as the query and the question as the target
# topic, so the target never matched a topic name and Git itself was always
# reported as contamination.
diagnose("Git reset 和 revert 的应用场景有什么区别?", "Git")