"""Task feature extraction and matching.

Extracts structured features (file formats, paths, operations, ...) from a
free-form task description and compares two descriptions so that similar
tasks can be identified more precisely.
"""

import re
from typing import Dict, List, Set, Optional, Tuple
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TaskFeatures:
    """Structured features extracted from one task description."""

    # Basic information
    raw_input: str
    keywords: Set[str]

    # Key parameters
    file_formats: Set[str]  # file extensions, e.g. ".txt", ".csv", ".json"
    directory_paths: Set[str]  # directory paths found in the text
    file_names: Set[str]  # file names found in the text
    naming_patterns: List[str]  # naming rules, e.g. "按日期", "按序号"

    # Operation types, i.e. keys of TaskFeatureExtractor.OPERATION_KEYWORDS
    operations: Set[str]

    # Quantity / range expressions, e.g. "100个", "所有"
    quantities: List[str]

    # Other constraint clauses
    constraints: List[str]


@dataclass
class TaskDifference:
    """Description of one difference between two tasks."""

    category: str  # human-readable difference category
    field: str  # name of the TaskFeatures field that differs
    current_value: str  # value(s) of the current task
    history_value: str  # value(s) of the historical task
    importance: str  # importance: critical/high/medium/low


class TaskFeatureExtractor:
    """Extracts :class:`TaskFeatures` from raw user input using regexes and
    keyword tables."""

    # File extension pattern. The trailing negative lookahead keeps a known
    # extension from matching as a prefix of a longer one (".pyc" is not
    # ".py"). ``\b`` cannot be used here because CJK characters count as
    # ``\w``, which would stop ".txt文件" from matching.
    FILE_FORMAT_PATTERN = (
        r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov'
        r'|zip|rar|7z|py|js|java|cpp|html?|css)(?![a-zA-Z0-9])'
    )

    # Directory path pattern (Windows and Unix)
    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'

    # File name pattern
    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'

    # Quantity pattern
    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'

    # Operation name -> keywords that indicate it
    OPERATION_KEYWORDS = {
        '重命名': ['重命名', '改名', '命名', '更名'],
        '转换': ['转换', '转为', '转成', '变成', '改成'],
        '批量处理': ['批量', '批处理', '一次性'],
        '复制': ['复制', '拷贝', 'copy'],
        '移动': ['移动', '转移', 'move'],
        '删除': ['删除', '清理', '移除'],
        '合并': ['合并', '整合', '汇总'],
        '分割': ['分割', '拆分', '切分'],
        '压缩': ['压缩', '打包'],
        '解压': ['解压', '解包', '提取'],
        '排序': ['排序', '排列'],
        '筛选': ['筛选', '过滤', '查找'],
        '统计': ['统计', '计数', '汇总'],
        '生成': ['生成', '创建', '制作'],
    }

    # Naming rule -> keywords that indicate it
    NAMING_PATTERNS = {
        '按日期': ['日期', '时间', 'date', 'time'],
        '按序号': ['序号', '编号', '数字', '顺序'],
        '按前缀': ['前缀', '开头'],
        '按后缀': ['后缀', '结尾'],
        '按内容': ['内容', '根据'],
    }

    def extract(self, user_input: str) -> TaskFeatures:
        """Extract structured features from user input.

        Args:
            user_input: Raw task description.

        Returns:
            TaskFeatures: One field per feature dimension.
        """
        return TaskFeatures(
            raw_input=user_input,
            keywords=self._extract_keywords(user_input),
            file_formats=self._extract_file_formats(user_input),
            directory_paths=self._extract_directory_paths(user_input),
            file_names=self._extract_file_names(user_input),
            naming_patterns=self._extract_naming_patterns(user_input),
            operations=self._extract_operations(user_input),
            quantities=self._extract_quantities(user_input),
            constraints=self._extract_constraints(user_input),
        )

    def _extract_keywords(self, text: str) -> Set[str]:
        """Return lowercase CJK / ASCII-letter runs of length >= 2.

        This is a basic tokenisation: a contiguous CJK run is one keyword.
        """
        words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
        return {w for w in words if len(w) >= 2}

    def _extract_file_formats(self, text: str) -> Set[str]:
        """Return the known file extensions in ``text``, lowercased and
        dot-prefixed."""
        matches = re.findall(self.FILE_FORMAT_PATTERN, text.lower())
        return {f'.{m}' for m in matches}

    def _extract_directory_paths(self, text: str) -> Set[str]:
        """Return directory-like paths found in ``text``, normalised with
        ``Path.resolve()``.

        Relative matches are therefore resolved against the current working
        directory. When resolution fails the raw match is kept.
        """
        normalized = set()
        for path in re.findall(self.DIR_PATH_PATTERN, text):
            try:
                normalized.add(str(Path(path).resolve()))
            except (OSError, RuntimeError, ValueError):
                # Best effort: an unresolvable path (e.g. invalid characters
                # or a symlink loop) is kept as written.
                normalized.add(path)
        return normalized

    def _extract_file_names(self, text: str) -> Set[str]:
        """Return substrings that look like ``name.ext`` file names."""
        return set(re.findall(self.FILE_NAME_PATTERN, text))

    def _extract_naming_patterns(self, text: str) -> List[str]:
        """Return the naming rules whose keywords occur in ``text``.

        Matching is case-insensitive so the English keywords ("date",
        "time") are found regardless of capitalisation. Results follow the
        order of ``NAMING_PATTERNS``.
        """
        lowered = text.lower()
        return [
            pattern_name
            for pattern_name, keywords in self.NAMING_PATTERNS.items()
            if any(kw in lowered for kw in keywords)
        ]

    def _extract_operations(self, text: str) -> Set[str]:
        """Return the operation types whose keywords occur in ``text``.

        Matching is case-insensitive so the English keywords ("copy",
        "move") are found regardless of capitalisation.
        """
        lowered = text.lower()
        return {
            op_name
            for op_name, keywords in self.OPERATION_KEYWORDS.items()
            if any(kw in lowered for kw in keywords)
        }

    def _extract_quantities(self, text: str) -> List[str]:
        """Return quantity expressions in order of appearance."""
        return re.findall(self.QUANTITY_PATTERN, text)

    def _extract_constraints(self, text: str) -> List[str]:
        """Return the clauses of ``text`` that contain a condition keyword.

        A clause is a run of characters between sentence delimiters. Each
        clause is returned once, in first-seen order, even if it contains
        several condition keywords.
        """
        condition_keywords = [
            '如果', '当', '满足', '符合', '包含', '不包含', '大于', '小于', '等于',
        ]
        # Clause delimiters: ideographic full stop plus ASCII and full-width
        # comma / semicolon (Chinese text normally uses the full-width ones).
        delimiters = '。,;\uff0c\uff1b'

        constraints: List[str] = []
        for keyword in condition_keywords:
            if keyword not in text:
                continue
            pattern = f'[^{delimiters}]*{re.escape(keyword)}[^{delimiters}]*'
            constraints.extend(re.findall(pattern, text))

        # Overlapping keywords (e.g. '包含' / '不包含') match the same clause;
        # dict.fromkeys removes the repeats while preserving order.
        return list(dict.fromkeys(constraints))


class TaskMatcher:
    """Scores the similarity of two task descriptions and lists how they
    differ."""

    def __init__(self):
        self.extractor = TaskFeatureExtractor()

    def calculate_similarity(
        self,
        current_input: str,
        history_input: str
    ) -> Tuple[float, List[TaskDifference]]:
        """Compute the similarity of two tasks and list their differences.

        The score is a weighted sum of per-dimension Jaccard similarities;
        the weights add up to 1.0.

        Args:
            current_input: Description of the current task.
            history_input: Description of the historical task.

        Returns:
            ``(score, differences)`` where ``score`` is in the range 0-1 and
            ``differences`` has at most one entry per differing dimension
            (keywords never produce an entry).
        """
        current_features = self.extractor.extract(current_input)
        history_features = self.extractor.extract(history_input)

        differences: List[TaskDifference] = []
        scores = []

        # 1. Keywords (weight 0.2): contributes to the score only.
        keyword_sim = self._jaccard_similarity(
            current_features.keywords,
            history_features.keywords
        )
        scores.append(('keywords', keyword_sim, 0.2))

        # 2. File formats (weight 0.15)
        format_sim, format_diffs = self._compare_sets(
            current_features.file_formats,
            history_features.file_formats,
            'file_formats', '文件格式', 'high'
        )
        scores.append(('file_formats', format_sim, 0.15))
        differences.extend(format_diffs)

        # 3. Directory paths (weight 0.15)
        dir_sim, dir_diffs = self._compare_sets(
            current_features.directory_paths,
            history_features.directory_paths,
            'directory_paths', '目录路径', 'critical'
        )
        scores.append(('directory_paths', dir_sim, 0.15))
        differences.extend(dir_diffs)

        # 4. Naming rules (weight 0.15)
        naming_sim, naming_diffs = self._compare_lists(
            current_features.naming_patterns,
            history_features.naming_patterns,
            'naming_patterns', '命名规则', 'high'
        )
        scores.append(('naming_patterns', naming_sim, 0.15))
        differences.extend(naming_diffs)

        # 5. Operation types (weight 0.2)
        op_sim, op_diffs = self._compare_sets(
            current_features.operations,
            history_features.operations,
            'operations', '操作类型', 'critical'
        )
        scores.append(('operations', op_sim, 0.2))
        differences.extend(op_diffs)

        # 6. Quantities (weight 0.1)
        qty_sim, qty_diffs = self._compare_lists(
            current_features.quantities,
            history_features.quantities,
            'quantities', '数量', 'medium'
        )
        scores.append(('quantities', qty_sim, 0.1))
        differences.extend(qty_diffs)

        # 7. Constraints (weight 0.05)
        constraint_sim, constraint_diffs = self._compare_lists(
            current_features.constraints,
            history_features.constraints,
            'constraints', '约束条件', 'medium'
        )
        scores.append(('constraints', constraint_sim, 0.05))
        differences.extend(constraint_diffs)

        total_score = sum(score * weight for _, score, weight in scores)
        return total_score, differences

    def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
        """Return ``|A & B| / |A | B|``.

        Two empty sets are treated as identical (1.0); exactly one empty
        set gives 0.0.
        """
        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0
        return len(set1 & set2) / len(set1 | set2)

    def _compare_sets(
        self,
        current: Set[str],
        history: Set[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List[TaskDifference]]:
        """Compare two sets.

        Args:
            current: Values of the current task.
            history: Values of the historical task.
            field: Field name recorded on the difference.
            display_name: Category recorded on the difference.
            importance: Importance recorded on the difference.

        Returns:
            ``(similarity, differences)``. ``differences`` holds one entry
            listing the elements unique to each side, or is empty when the
            sets are equal.
        """
        similarity = self._jaccard_similarity(current, history)
        differences = []

        only_current = current - history
        only_history = history - current

        if only_current or only_history:
            differences.append(TaskDifference(
                category=display_name,
                field=field,
                current_value=', '.join(sorted(only_current)) if only_current else '(无)',
                history_value=', '.join(sorted(only_history)) if only_history else '(无)',
                importance=importance
            ))

        return similarity, differences

    def _compare_lists(
        self,
        current: List[str],
        history: List[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List[TaskDifference]]:
        """Compare two lists as unordered collections.

        Both the similarity and the decision to report a difference are
        based on the sets of elements, so a pure reordering or repetition
        never yields a difference alongside a similarity of 1.0.

        Args:
            current: Values of the current task.
            history: Values of the historical task.
            field: Field name recorded on the difference.
            display_name: Category recorded on the difference.
            importance: Importance recorded on the difference.

        Returns:
            ``(similarity, differences)``. ``differences`` holds one entry
            showing both full lists, or is empty when the element sets are
            equal.
        """
        current_set = set(current)
        history_set = set(history)

        similarity = self._jaccard_similarity(current_set, history_set)
        differences = []

        if current_set != history_set:
            differences.append(TaskDifference(
                category=display_name,
                field=field,
                current_value=', '.join(current) if current else '(无)',
                history_value=', '.join(history) if history else '(无)',
                importance=importance
            ))

        return similarity, differences


# Module-level singleton
_matcher: Optional[TaskMatcher] = None


def get_task_matcher() -> TaskMatcher:
    """Return the shared TaskMatcher, creating it on first use."""
    global _matcher
    if _matcher is None:
        _matcher = TaskMatcher()
    return _matcher