Files
LocalAgent/history/task_features.py
Mimikko-zeus 8a538bb950 feat: refactor API key configuration and enhance application initialization
- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking behavior of the API key check during application startup, allowing the app to run while providing a prompt for configuration.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics.
- Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
2026-02-27 14:32:30 +08:00

381 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
任务特征提取与匹配模块
用于更精确的相似任务识别
"""
import re
from typing import Dict, List, Set, Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
@dataclass
class TaskFeatures:
    """Structured features describing a single task request."""

    # --- Basic information ---
    raw_input: str              # the task text the features were taken from
    keywords: Set[str]          # keyword tokens

    # --- Key parameters ---
    file_formats: Set[str]      # file formats, e.g. ".txt", ".csv", ".json"
    directory_paths: Set[str]   # directory paths
    file_names: Set[str]        # file names
    naming_patterns: List[str]  # naming rules, e.g. "按日期", "按序号"

    # --- Operation types ---
    operations: Set[str]        # e.g. "批量重命名", "文件转换", "数据处理"

    # --- Quantity / range parameters ---
    quantities: List[str]       # quantity phrases, e.g. "100个", "所有"

    # --- Other constraints ---
    constraints: List[str]      # any further constraint clauses
@dataclass
class TaskDifference:
    """One difference between the current task and a historical task."""

    category: str       # difference category (display name)
    field: str          # name of the feature field that differs
    current_value: str  # value on the current-task side
    history_value: str  # value on the historical-task side
    importance: str     # one of: critical / high / medium / low
class TaskFeatureExtractor:
    """Extract structured features from a free-text task description.

    Extraction is purely regex- and keyword-based; the pattern and keyword
    tables below are class attributes so they can be inspected or overridden.
    """

    # Known file extensions. The trailing negative lookahead prevents a known
    # extension from matching as a prefix of a longer one (".jsx" must not be
    # reported as ".js"). `\b` is deliberately not used: Chinese characters
    # are word characters in Python's Unicode regexes, so ".txt文件" would
    # have no word boundary after "txt".
    FILE_FORMAT_PATTERN = r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov|zip|rar|7z|py|js|java|cpp|html?|css)(?![a-zA-Z0-9])'
    # Directory paths (Windows drive paths and Unix-style paths).
    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'
    # File names: a stem followed by a dot and an alphanumeric extension.
    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'
    # Quantities: a number with a measure word, or an "all"/"batch" word.
    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'

    # Operation name -> trigger keywords (all lowercase).
    OPERATION_KEYWORDS = {
        '重命名': ['重命名', '改名', '命名', '更名'],
        '转换': ['转换', '转为', '转成', '变成', '改成'],
        '批量处理': ['批量', '批处理', '一次性'],
        '复制': ['复制', '拷贝', 'copy'],
        '移动': ['移动', '转移', 'move'],
        '删除': ['删除', '清理', '移除'],
        '合并': ['合并', '整合', '汇总'],
        '分割': ['分割', '拆分', '切分'],
        '压缩': ['压缩', '打包'],
        '解压': ['解压', '解包', '提取'],
        '排序': ['排序', '排列'],
        '筛选': ['筛选', '过滤', '查找'],
        '统计': ['统计', '计数', '汇总'],
        '生成': ['生成', '创建', '制作'],
    }

    # Naming-rule name -> trigger keywords (all lowercase).
    NAMING_PATTERNS = {
        '按日期': ['日期', '时间', 'date', 'time'],
        '按序号': ['序号', '编号', '数字', '顺序'],
        '按前缀': ['前缀', '开头'],
        '按后缀': ['后缀', '结尾'],
        '按内容': ['内容', '根据'],
    }

    # Words that mark a clause as a constraint. An empty string must never
    # appear here: it is contained in every text and would turn every clause
    # into a "constraint".
    CONSTRAINT_KEYWORDS = ('如果', '满足', '符合', '包含', '不包含', '大于', '小于', '等于')
    # Clause delimiters: the Chinese full stop plus ASCII and full-width
    # comma / semicolon.
    _CLAUSE_SPLIT_PATTERN = r'[。,;,;]'

    def extract(self, user_input: str) -> "TaskFeatures":
        """Extract all structured features from ``user_input``.

        Args:
            user_input: The raw task text entered by the user.

        Returns:
            A ``TaskFeatures`` instance holding ``user_input`` and the result
            of each individual extractor.
        """
        return TaskFeatures(
            raw_input=user_input,
            keywords=self._extract_keywords(user_input),
            file_formats=self._extract_file_formats(user_input),
            directory_paths=self._extract_directory_paths(user_input),
            file_names=self._extract_file_names(user_input),
            naming_patterns=self._extract_naming_patterns(user_input),
            operations=self._extract_operations(user_input),
            quantities=self._extract_quantities(user_input),
            constraints=self._extract_constraints(user_input),
        )

    def _extract_keywords(self, text: str) -> Set[str]:
        """Return lowercase keyword tokens of length >= 2.

        A token is a maximal run of Chinese characters or of ASCII letters;
        Chinese runs are not segmented any further.
        """
        tokens = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
        return {token for token in tokens if len(token) >= 2}

    def _extract_file_formats(self, text: str) -> Set[str]:
        """Return the known file extensions in ``text``, lowercase with a leading dot."""
        return {f'.{ext}' for ext in re.findall(self.FILE_FORMAT_PATTERN, text.lower())}

    def _extract_directory_paths(self, text: str) -> Set[str]:
        """Return directory paths found in ``text``, normalized with ``Path.resolve()``.

        Relative paths are therefore resolved against the current working
        directory. If resolution fails, the matched text is kept as-is.
        """
        normalized: Set[str] = set()
        for raw in re.findall(self.DIR_PATH_PATTERN, text):
            # The pattern allows whitespace inside a path, so a match can end
            # with the blank that separated the path from the following word.
            candidate = raw.strip()
            if not candidate:
                continue
            try:
                normalized.add(str(Path(candidate).resolve()))
            except (OSError, RuntimeError, ValueError):
                # Best effort: keep the unresolved text rather than drop it.
                normalized.add(candidate)
        return normalized

    def _extract_file_names(self, text: str) -> Set[str]:
        """Return every substring of ``text`` that looks like ``name.ext``."""
        return set(re.findall(self.FILE_NAME_PATTERN, text))

    def _extract_naming_patterns(self, text: str) -> List[str]:
        """Return the naming rules whose keywords occur in ``text``.

        Matching is case-insensitive for the English keywords. The result
        follows the declaration order of ``NAMING_PATTERNS``.
        """
        lowered = text.lower()
        return [
            rule
            for rule, keywords in self.NAMING_PATTERNS.items()
            if any(keyword in lowered for keyword in keywords)
        ]

    def _extract_operations(self, text: str) -> Set[str]:
        """Return the operation types whose keywords occur in ``text``.

        Matching is case-insensitive for the English keywords.
        """
        lowered = text.lower()
        return {
            operation
            for operation, keywords in self.OPERATION_KEYWORDS.items()
            if any(keyword in lowered for keyword in keywords)
        }

    def _extract_quantities(self, text: str) -> List[str]:
        """Return quantity phrases (e.g. "100个", "所有") in order of appearance."""
        return re.findall(self.QUANTITY_PATTERN, text)

    def _extract_constraints(self, text: str) -> List[str]:
        """Return the clauses of ``text`` that contain a constraint keyword.

        The text is split into clauses at the delimiters in
        ``_CLAUSE_SPLIT_PATTERN``. Each matching clause is returned once,
        stripped of surrounding whitespace, in order of first appearance.
        """
        clauses = (clause.strip() for clause in re.split(self._CLAUSE_SPLIT_PATTERN, text))
        matched = [
            clause
            for clause in clauses
            if any(keyword in clause for keyword in self.CONSTRAINT_KEYWORDS)
        ]
        # dict.fromkeys de-duplicates while preserving order.
        return list(dict.fromkeys(matched))
class TaskMatcher:
    """Score how similar two task descriptions are and list their differences."""

    def __init__(self) -> None:
        self.extractor = TaskFeatureExtractor()

    def calculate_similarity(
        self,
        current_input: str,
        history_input: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compute a weighted similarity between two tasks.

        Each feature dimension is scored with Jaccard similarity and the
        scores are combined with fixed weights that sum to 1.0.

        NOTE: a dimension that is empty on both sides scores 1.0, so two
        inputs from which few features are extracted still receive the
        weight of every empty dimension.

        Args:
            current_input: Text of the current task.
            history_input: Text of the historical task.

        Returns:
            ``(score, differences)``: the weighted score, and one
            ``TaskDifference`` per non-keyword dimension whose values differ.
        """
        current_features = self.extractor.extract(current_input)
        history_features = self.extractor.extract(history_input)

        # Keywords contribute to the score only; they produce no difference entry.
        keyword_sim = self._jaccard_similarity(
            current_features.keywords,
            history_features.keywords
        )
        weighted_scores = [keyword_sim * 0.2]
        differences: List["TaskDifference"] = []

        # (field, display name, importance, weight, comparator)
        dimensions = (
            ('file_formats', '文件格式', 'high', 0.15, self._compare_sets),
            ('directory_paths', '目录路径', 'critical', 0.15, self._compare_sets),
            ('naming_patterns', '命名规则', 'high', 0.15, self._compare_lists),
            ('operations', '操作类型', 'critical', 0.2, self._compare_sets),
            ('quantities', '数量', 'medium', 0.1, self._compare_lists),
            ('constraints', '约束条件', 'medium', 0.05, self._compare_lists),
        )
        for field, display_name, importance, weight, compare in dimensions:
            similarity, field_diffs = compare(
                getattr(current_features, field),
                getattr(history_features, field),
                field,
                display_name,
                importance
            )
            weighted_scores.append(similarity * weight)
            differences.extend(field_diffs)

        return sum(weighted_scores), differences

    def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
        """Return ``|set1 & set2| / |set1 | set2|``.

        Two empty sets are treated as identical (1.0); exactly one empty set
        gives 0.0.
        """
        if not set1 and not set2:
            return 1.0
        if not set1 or not set2:
            return 0.0
        # Both sets are non-empty here, so the union cannot be empty.
        return len(set1 & set2) / len(set1 | set2)

    def _compare_sets(
        self,
        current: Set[str],
        history: Set[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compare two sets.

        Returns:
            ``(similarity, differences)``. ``differences`` holds at most one
            entry, listing the items unique to each side (sorted), or '(无)'
            for a side with no unique items.
        """
        similarity = self._jaccard_similarity(current, history)
        only_current = current - history
        only_history = history - current

        if not (only_current or only_history):
            return similarity, []

        difference = TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(sorted(only_current)) if only_current else '(无)',
            history_value=', '.join(sorted(only_history)) if only_history else '(无)',
            importance=importance
        )
        return similarity, [difference]

    def _compare_lists(
        self,
        current: List[str],
        history: List[str],
        field: str,
        display_name: str,
        importance: str
    ) -> Tuple[float, List["TaskDifference"]]:
        """Compare two lists by their distinct items.

        Both the similarity and the difference check ignore order and
        duplicates, so a difference is never reported alongside a
        similarity of 1.0.

        Returns:
            ``(similarity, differences)``. ``differences`` holds at most one
            entry showing each side's full list, or '(无)' for an empty side.
        """
        current_set = set(current)
        history_set = set(history)
        similarity = self._jaccard_similarity(current_set, history_set)

        if current_set == history_set:
            return similarity, []

        difference = TaskDifference(
            category=display_name,
            field=field,
            current_value=', '.join(current) if current else '(无)',
            history_value=', '.join(history) if history else '(无)',
            importance=importance
        )
        return similarity, [difference]
# Module-level singleton, created lazily on first request.
_matcher: Optional[TaskMatcher] = None


def get_task_matcher() -> TaskMatcher:
    """Return the shared ``TaskMatcher``, constructing it on the first call."""
    global _matcher
    if _matcher is not None:
        return _matcher
    _matcher = TaskMatcher()
    return _matcher