""" 锚点提取模块 - 用规则从文本中提取有代表性的锚点 """ import re from collections import Counter from typing import List, Tuple class AnchorExtractor: """提取文本中的锚点,用于话题判断和检索""" def __init__(self): # IDF 字典,用于权重计算(简单起见用内部默认值,可扩展) self._idf_cache: dict[str, float] = {} self._doc_count = 0 def extract(self, text: str) -> List[str]: """从文本中提取锚点列表""" anchors = [] # 中文 2-gram / 3-gram(转小写以便匹配) chinese_chars = re.findall(r'[\u4e00-\u9fff]+', text) for chunk in chinese_chars: if len(chunk) >= 2: for i in range(len(chunk) - 1): anchors.append(chunk[i:i+2].lower()) if len(chunk) >= 3: for i in range(len(chunk) - 2): anchors.append(chunk[i:i+3].lower()) # 英文单词 english_words = re.findall(r'[a-zA-Z_][a-zA-Z0-9_-]*', text) anchors.extend([w.lower() for w in english_words if len(w) >= 2]) # 代码标识符(反引号内) code_segments = re.findall(r'`([^`]+)`', text) for seg in code_segments: anchors.extend(re.findall(r'[a-zA-Z_][a-zA-Z0-9_-]*', seg)) # 引号内的短语 quoted_phrases = re.findall(r'["\u201c\u201d]([^"\u201c\u201d]+)["\u201c\u201d]', text) anchors.extend(quoted_phrases) # 数字、版本号 versions = re.findall(r'v?\d+(\.\d+)*', text) anchors.extend(versions) return anchors def extract_with_deictic(self, text: str) -> Tuple[List[str], bool]: """提取锚点,并检测是否有指代词""" anchors = self.extract(text) deictic_patterns = [ '这个', '那个', '它', '上面', '刚才', '继续', '展开', '为什么会这样', '怎么改', '然后', '还有', '另外' ] has_deictic = any(p in text for p in deictic_patterns) return anchors, has_deictic def compute_idf(self, all_doc_anchors: List[List[str]]) -> None: """根据一批文档计算 IDF 值""" self._doc_count = len(all_doc_anchors) doc_freq = Counter() for anchors in all_doc_anchors: unique_anchors = set(anchors) for a in unique_anchors: doc_freq[a] += 1 for anchor, df in doc_freq.items(): self._idf_cache[anchor] = self._doc_count / df def idf(self, anchor: str) -> float: """获取锚点的 IDF 值,默认 1.0""" return self._idf_cache.get(anchor, 1.0)