fix: anchor stopwords - remove generic question patterns causing cross-topic contamination
- Add ANCHOR_STOPWORDS set in anchor.py (真正通用的疑问 pattern, i.e. truly generic question patterns)
- Filter Chinese n-grams against stopwords in extract()
- Update sparse.py content_words extraction to use the stopword-filtered query
- Diagnosis: 'Git rebase vs merge' query now correctly excludes Redis/asyncio blocks
- Phase1 results: Full CGK 42.6 tokens avg, 0% contamination (vs Last-5 67.6 tokens, 100%)
- Phase2 ablation: Gate-only accounts for most of the benefit
- Phase3 sensitivity: OVERLAP/NEW_RATIO thresholds are insensitive on clean data; RECENT_WINDOW is the primary token budget control

Known honest limitations:
- Test set is clean 4-topic synthetic data (no real dirty dialogue)
- No strong baselines (BM25 ablation incomplete)
- No answer-level evaluation (only retrieval blocks measured)
- No parameter sensitivity on noisy real-world data
- Zero contamination on 5 queries is not generalizable
This commit is contained in:
88
experiments/diagnose_contamination.py
Normal file
88
experiments/diagnose_contamination.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
"""诊断:为什么 Full CGK 有 20% 污染率"""
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
from src.gatekeeper import ContextGatekeeper
|
||||
|
||||
# Synthetic Redis conversation history: (user question, truncated assistant answer).
redis_qa = [
    ("Redis 分布式锁和 RedLock 算法有什么区别?", "RedLock是..."),
    ("Redis 集群环境下怎么做分布式锁?", "集群下..."),
    ("Redis 惰性删除和定期删除有什么区别?", "惰性删除..."),
    ("Redis 的过期 key 对 RDB 快照有什么影响?", "过期key..."),
    ("Redis 主从复制断线后如何增量同步?", "PSYNC..."),
]
|
||||
# Synthetic asyncio conversation history: (user question, truncated assistant answer).
asyncio_qa = [
    ("asyncio.Task 的 cancel 方法怎么工作的?", "cancel..."),
    ("asyncio.gather 和 asyncio.wait 的返回结果有什么区别?", "gather..."),
    ("asyncio 的事件循环怎么启动和停止?", "事件循环..."),
    ("asyncio.sleep 和 time.sleep 的区别是什么?", "sleep..."),
    ("asyncio 的 Future 对象怎么获取结果?", "Future..."),
]
|
||||
# Synthetic PostgreSQL conversation history: (user question, truncated assistant answer).
pg_qa = [
    ("PostgreSQL 的 MVCC 机制是怎么保证读不阻塞写的?", "MVCC..."),
    ("PostgreSQL 的 VACUUM 为什么要定期运行?", "VACUUM..."),
    ("EXPLAIN ANALYZE 怎么看执行计划?", "EXPLAIN..."),
    ("PostgreSQL B-tree 索引和 Hash 索引的区别是什么?", "B-tree..."),
    ("PostgreSQL 的 TOAST 机制是什么?", "TOAST..."),
]
|
||||
# Synthetic Git conversation history: (user question, truncated assistant answer).
git_qa = [
    ("Git 的 rebase 和 merge 的区别是什么?", "rebase..."),
    ("Git reset 的 --soft、--mixed、--hard 有什么区别?", "reset..."),
    ("Git stash 暂存区和工作目录的区别是什么?", "stash..."),
    ("Git 的 bisect 怎么用来快速定位 bug?", "bisect..."),
    ("Git 的 reflog 怎么用来恢复误删的提交?", "reflog..."),
]
|
||||
|
||||
def build_gate():
    """Build a ContextGatekeeper pre-loaded with the interleaved fixture turns.

    The four topic fixtures are fed round-robin, so each round adds one
    Redis, one asyncio, one PostgreSQL and one Git turn, in that order.

    Returns:
        The populated ``ContextGatekeeper`` (created with ``token_budget=4000``).
    """
    g = ContextGatekeeper(token_budget=4000)
    # zip() walks the fixtures in lockstep: one turn per topic per round.
    for round_turns in zip(redis_qa, asyncio_qa, pg_qa, git_qa):
        for user_msg, assistant_msg in round_turns:
            g.add_turn(user_msg, assistant_msg)
    return g
|
||||
|
||||
def diagnose(query, target_topic, topics=("Redis", "asyncio", "PostgreSQL", "Git")):
    """Print a retrieval trace for *query* and flag cross-topic contamination.

    A fresh gate is built via ``build_gate()``; the function then prints the
    query anchors, the topic-switch decision, every selected block with the
    topics found in it, and finally whether any topic other than
    *target_topic* appears in the selected context.

    Args:
        query: The user query handed to the gatekeeper.
        target_topic: Name of the topic the query belongs to; compared
            case-insensitively against the entries of *topics*.
        topics: Topic names looked for in the selected blocks using a
            case-insensitive substring match.

    Returns:
        None. All results are written to stdout.
    """
    gate = build_gate()

    print(f"\n{'='*60}")
    print(f"Query: {query}")
    print(f"Target: {target_topic}")
    print("=" * 60)

    # Anchors extracted from the query.
    q_anchors, has_deictic = gate.anchor_extractor.extract_with_deictic(query)
    print(f"Query anchors: {q_anchors}")
    print(f"Has deictic: {has_deictic}")

    # Topic-switch detection against the gate's current active topic.
    switched = gate.topic_gate.is_topic_switch(query, gate._active_topic)
    print(f"Topic switched: {switched}")

    # Blocks recalled for the query.
    sel = gate.select(query)
    print(f"Selected blocks: {len(sel)}")

    contents = []
    for item in sel:
        content = item['user'] + item['assistant']
        contents.append(content)
        lowered = content.lower()  # lower-case once, not once per topic
        found_topics = [t for t in topics if t.lower() in lowered]
        print(f" turn {item['turn_id']}: {found_topics} -> {content[:60]}")

    # Contamination check: any non-target topic mentioned in the selection.
    all_text = ' '.join(contents).lower()
    target = target_topic.lower()
    other = [t for t in topics if t.lower() in all_text and t.lower() != target]
    print(f"Other topics in context: {other}")
    print(f"IS CONTAMINATED: {len(other) > 0}")
|
||||
|
||||
# Diagnose the two contaminated cases.
diagnose("Git 的 rebase 和 merge 有什么区别?", "Git")
diagnose("asyncio.Task 的 cancel 方法怎么工作的?", "asyncio")

# For comparison: clean examples.
diagnose("Redis 惰性删除和定期删除有什么区别?", "Redis")
# Follow-up Git question ("ask about Git again"). The arguments were
# previously swapped, which put the question into target_topic so that
# "Git" itself was always reported as a contaminating topic.
diagnose("Git reset 和 revert 的应用场景有什么区别?", "Git")
|
||||
Reference in New Issue
Block a user