feat: refactor API key configuration and enhance application initialization

- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking API key check from application startup, so the app now launches and prompts the user to configure a key instead.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics.
- Integrated the data-governance policy into the `HistoryManager`, so stored history is classified, sanitized, and expired automatically.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
Author: Mimikko-zeus
Date: 2026-02-27 14:32:30 +08:00
Parent: ab5bbff6f7
Commit: 8a538bb950
58 changed files with 13457 additions and 350 deletions

history/data_governance.py (new file)

@@ -0,0 +1,410 @@
"""
数据治理策略模块
实现数据分级保存、保留期管理、归档和清理策略
"""
import json
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Set
from dataclasses import dataclass, asdict
from enum import Enum
from history.data_sanitizer import get_sanitizer, SensitiveType
class DataLevel(Enum):
    """Data retention level"""
    FULL = "full"            # Stored in full (no sanitization)
    SANITIZED = "sanitized"  # Stored with sensitive fields masked
    MINIMAL = "minimal"      # Minimal storage (metadata only)
    ARCHIVED = "archived"    # Archived

class RetentionPolicy(Enum):
    """Data retention policy"""
    SHORT = 7       # 7 days
    MEDIUM = 30     # 30 days
    LONG = 90       # 90 days
    PERMANENT = -1  # Keep forever
@dataclass
class DataClassification:
"""数据分类结果"""
level: DataLevel
retention_days: int
sensitivity_score: float
sensitive_fields: Set[str]
reason: str
@dataclass
class GovernanceMetrics:
"""治理度量指标"""
total_records: int
full_records: int
sanitized_records: int
minimal_records: int
archived_records: int
total_size_bytes: int
sensitive_field_hits: Dict[str, int]
expired_records: int
last_cleanup_time: str
class DataGovernancePolicy:
    """
    Data governance policy.
    Automatically tiers storage by sensitivity and manages the data lifecycle.
    """
    # Per-field sensitivity weights
    FIELD_SENSITIVITY = {
        'user_input': 0.5,      # User input may contain sensitive information
        'code': 0.7,            # Code may contain paths or secrets
        'stdout': 0.6,          # Output may contain sensitive data
        'stderr': 0.6,          # Error messages may contain paths
        'execution_plan': 0.3,  # Execution plans are relatively safe
        'log_path': 0.4,        # Log file path
    }
    # Tier thresholds
    LEVEL_THRESHOLDS = {
        DataLevel.FULL: 0.0,       # sensitivity < 0.3: store in full
        DataLevel.SANITIZED: 0.3,  # 0.3 <= sensitivity < 0.7: store sanitized
        DataLevel.MINIMAL: 0.7,    # sensitivity >= 0.7: store minimal only
    }
    # Retention periods by data level
    RETENTION_CONFIG = {
        DataLevel.FULL: RetentionPolicy.LONG.value,         # Full data kept 90 days
        DataLevel.SANITIZED: RetentionPolicy.MEDIUM.value,  # Sanitized data kept 30 days
        DataLevel.MINIMAL: RetentionPolicy.SHORT.value,     # Minimal data kept 7 days
    }
def __init__(self, workspace_path: Path):
self.workspace = workspace_path
self.sanitizer = get_sanitizer()
self.metrics_file = workspace_path / "governance_metrics.json"
self.archive_dir = workspace_path / "archive"
self.archive_dir.mkdir(exist_ok=True)
    def classify_record(self, record_data: Dict) -> DataClassification:
        """
        Classify a record.
        Args:
            record_data: record data dictionary
        Returns:
            The classification result
        """
        sensitive_fields = set()
        total_sensitivity = 0.0
        field_count = 0
        # Score each field's sensitivity
        for field, weight in self.FIELD_SENSITIVITY.items():
            if field in record_data and record_data[field]:
                content = str(record_data[field])
                field_score = self.sanitizer.get_sensitivity_score(content)
                if field_score > 0.3:  # Sensitive content found
                    sensitive_fields.add(field)
                total_sensitivity += field_score * weight
                field_count += 1
        # Aggregate sensitivity
        avg_sensitivity = total_sensitivity / field_count if field_count > 0 else 0.0
        # Determine the data level
        if avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.MINIMAL]:
            level = DataLevel.MINIMAL
            reason = f"High sensitivity ({avg_sensitivity:.2f}); keeping metadata only"
        elif avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.SANITIZED]:
            level = DataLevel.SANITIZED
            reason = f"Medium sensitivity ({avg_sensitivity:.2f}); storing sanitized"
        else:
            level = DataLevel.FULL
            reason = f"Low sensitivity ({avg_sensitivity:.2f}); storing in full"
        # Determine the retention period
        retention_days = self.RETENTION_CONFIG[level]
        return DataClassification(
            level=level,
            retention_days=retention_days,
            sensitivity_score=avg_sensitivity,
            sensitive_fields=sensitive_fields,
            reason=reason
        )
    def apply_policy(self, record_data: Dict) -> Dict:
        """
        Apply the governance policy and return the processed data.
        Args:
            record_data: raw record data
        Returns:
            The processed record data
        """
        classification = self.classify_record(record_data)
        # Attach governance metadata
        result = record_data.copy()
        result['_governance'] = {
            'level': classification.level.value,
            'retention_days': classification.retention_days,
            'sensitivity_score': classification.sensitivity_score,
            'sensitive_fields': list(classification.sensitive_fields),
            'classified_at': datetime.now().isoformat(),
            'expires_at': (datetime.now() + timedelta(days=classification.retention_days)).isoformat()
        }
        # Process the data according to its level
        if classification.level == DataLevel.MINIMAL:
            # Minimal: keep metadata only
            result = self._minimize_record(result)
        elif classification.level == DataLevel.SANITIZED:
            # Mask the sensitive fields
            result = self._sanitize_record(result, classification.sensitive_fields)
        # FULL level is left untouched
        return result
    def _minimize_record(self, record: Dict) -> Dict:
        """
        Minimize a record (keep metadata only).
        Args:
            record: the original record
        Returns:
            The minimized record
        """
        # Fields to keep
        keep_fields = {
            'task_id', 'timestamp', 'intent_label', 'intent_confidence',
            'success', 'duration_ms', 'task_summary', '_governance'
        }
        minimal = {k: v for k, v in record.items() if k in keep_fields}
        # Replace the dropped fields with placeholders
        minimal['user_input'] = '[removed: high sensitivity]'
        minimal['code'] = '[removed: high sensitivity]'
        minimal['stdout'] = '[removed: high sensitivity]'
        minimal['stderr'] = '[removed: high sensitivity]'
        minimal['execution_plan'] = record.get('execution_plan', '')[:100] + '...'
        return minimal
    def _sanitize_record(self, record: Dict, sensitive_fields: Set[str]) -> Dict:
        """
        Sanitize a record.
        Args:
            record: the original record
            sensitive_fields: fields that need sanitizing
        Returns:
            The sanitized record
        """
        result = record.copy()
        for field in sensitive_fields:
            if field in result and result[field]:
                content = str(result[field])
                sanitized, matches = self.sanitizer.sanitize(content)
                result[field] = sanitized
                # Record what was masked
                if '_sanitization' not in result:
                    result['_sanitization'] = {}
                result['_sanitization'][field] = {
                    'masked_count': len(matches),
                    'types': list(set(m.type.value for m in matches))
                }
        return result
    def check_expiration(self, record: Dict) -> bool:
        """
        Check whether a record has expired.
        Args:
            record: record data
        Returns:
            True if the record has expired
        """
        if '_governance' not in record or record['_governance'] is None:
            return False
        expires_at = record['_governance'].get('expires_at')
        if not expires_at:
            return False
        try:
            expire_time = datetime.fromisoformat(expires_at)
            return datetime.now() > expire_time
        except (ValueError, TypeError):
            return False
    def archive_record(self, record: Dict) -> Path:
        """
        Archive a record.
        Args:
            record: record data
        Returns:
            Path of the archive file
        """
        task_id = record.get('task_id', 'unknown')
        timestamp = record.get('timestamp', datetime.now().strftime('%Y%m%d_%H%M%S'))
        # Build the archive file name
        archive_file = self.archive_dir / f"{task_id}_{timestamp}.json"
        # Mark as archived (create the governance block if it is missing)
        if not record.get('_governance'):
            record['_governance'] = {}
        record['_governance']['level'] = DataLevel.ARCHIVED.value
        record['_governance']['archived_at'] = datetime.now().isoformat()
        # Write to the archive directory
        with open(archive_file, 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)
        return archive_file
    def cleanup_expired(self, records: List[Dict]) -> tuple[List[Dict], int, int]:
        """
        Clean up expired records.
        Args:
            records: list of records
        Returns:
            (records kept, number archived, number deleted)
        """
        kept_records = []
        archived_count = 0
        deleted_count = 0
        for record in records:
            if not self.check_expiration(record):
                kept_records.append(record)
                continue
            # The record has expired; handle it by level
            level = (record.get('_governance') or {}).get('level')
            if level == DataLevel.FULL.value:
                # Full data: downgrade to sanitized
                record['_governance']['level'] = DataLevel.SANITIZED.value
                record['_governance']['retention_days'] = RetentionPolicy.MEDIUM.value
                record['_governance']['expires_at'] = (
                    datetime.now() + timedelta(days=RetentionPolicy.MEDIUM.value)
                ).isoformat()
                # Apply sanitization
                sensitive_fields = set(record['_governance'].get('sensitive_fields', []))
                record = self._sanitize_record(record, sensitive_fields)
                kept_records.append(record)
            elif level == DataLevel.SANITIZED.value:
                # Sanitized data: archive it
                self.archive_record(record)
                archived_count += 1
            else:
                # Minimal data: delete outright
                deleted_count += 1
        return kept_records, archived_count, deleted_count
    def collect_metrics(self, records: List[Dict]) -> GovernanceMetrics:
        """
        Collect governance metrics.
        Args:
            records: list of records
        Returns:
            The metrics
        """
        metrics = GovernanceMetrics(
            total_records=len(records),
            full_records=0,
            sanitized_records=0,
            minimal_records=0,
            archived_records=0,
            total_size_bytes=0,
            sensitive_field_hits={},
            expired_records=0,
            last_cleanup_time=datetime.now().isoformat()
        )
        for record in records:
            # Count records per data level ('or {}' guards legacy records with _governance=None)
            level = (record.get('_governance') or {}).get('level')
            if level == DataLevel.FULL.value:
                metrics.full_records += 1
            elif level == DataLevel.SANITIZED.value:
                metrics.sanitized_records += 1
            elif level == DataLevel.MINIMAL.value:
                metrics.minimal_records += 1
            elif level == DataLevel.ARCHIVED.value:
                metrics.archived_records += 1
            # Count sensitive-field hits
            sensitive_fields = (record.get('_governance') or {}).get('sensitive_fields', [])
            for field in sensitive_fields:
                metrics.sensitive_field_hits[field] = metrics.sensitive_field_hits.get(field, 0) + 1
            # Count expired records
            if self.check_expiration(record):
                metrics.expired_records += 1
            # Estimate the size
            metrics.total_size_bytes += len(json.dumps(record, ensure_ascii=False))
        return metrics
    def save_metrics(self, metrics: GovernanceMetrics):
        """Persist the metrics to disk"""
        with open(self.metrics_file, 'w', encoding='utf-8') as f:
            data = asdict(metrics)
            json.dump(data, f, ensure_ascii=False, indent=2)

    def load_metrics(self) -> Optional[GovernanceMetrics]:
        """Load the metrics from disk"""
        if not self.metrics_file.exists():
            return None
        try:
            with open(self.metrics_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return GovernanceMetrics(**data)
        except Exception as e:
            print(f"[WARN] Failed to load governance metrics: {e}")
            return None
# Module-level singleton
_policy: Optional[DataGovernancePolicy] = None

def get_governance_policy(workspace_path: Path) -> DataGovernancePolicy:
    """Return the shared DataGovernancePolicy instance"""
    global _policy
    if _policy is None:
        _policy = DataGovernancePolicy(workspace_path)
    return _policy
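
To make the classification flow concrete, here is a minimal usage sketch. It is not part of the commit; the workspace path and record fields are illustrative, and only fields listed in FIELD_SENSITIVITY influence the score:

    from pathlib import Path
    from history.data_governance import get_governance_policy

    policy = get_governance_policy(Path("workspace"))
    record = {
        "task_id": "t-001",  # illustrative ID
        "user_input": "rename the files under C:\\Users\\me\\docs",
        "code": "print('hello')",
        "stdout": "",
        "stderr": "",
        "success": True,
    }
    governed = policy.apply_policy(record)
    print(governed["_governance"]["level"])       # "full", "sanitized", or "minimal"
    print(governed["_governance"]["expires_at"])  # retention deadline derived from the level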

history/data_sanitizer.py (new file)

@@ -0,0 +1,311 @@
"""
数据脱敏模块
对历史记录中的敏感信息进行识别和脱敏处理
"""
import re
from typing import Dict, List, Optional, Set, Tuple
from dataclasses import dataclass
from enum import Enum
class SensitiveType(Enum):
    """Types of sensitive information"""
    FILE_PATH = "file_path"        # File path
    IP_ADDRESS = "ip_address"      # IP address
    EMAIL = "email"                # Email address
    PHONE = "phone"                # Phone number
    API_KEY = "api_key"            # API key
    PASSWORD = "password"          # Password
    TOKEN = "token"                # Token
    DATABASE_URI = "database_uri"  # Database connection string
    CREDIT_CARD = "credit_card"    # Credit card number
    ID_CARD = "id_card"            # National ID number
@dataclass
class SensitiveMatch:
"""敏感信息匹配结果"""
type: SensitiveType
value: str
start: int
end: int
masked_value: str
class DataSanitizer:
    """
    Data sanitizer.
    Detects and masks sensitive information across multiple data types.
    """
    # Regex patterns per sensitive type
    PATTERNS = {
        SensitiveType.FILE_PATH: [
            r'[A-Za-z]:\\(?:[^\\/:*?"<>|\r\n]+\\)*[^\\/:*?"<>|\r\n]*',  # Windows path
            r'/(?:[^/\0]+/)*[^/\0]*',  # Unix path; needs extra validation
        ],
        SensitiveType.IP_ADDRESS: [
            r'\b(?:\d{1,3}\.){3}\d{1,3}\b',  # IPv4
            r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b',  # IPv6
        ],
        SensitiveType.EMAIL: [
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        ],
        SensitiveType.PHONE: [
            r'\b1[3-9]\d{9}\b',  # Chinese mobile number
            r'\b\d{3}-\d{4}-\d{4}\b',  # US phone number
        ],
        SensitiveType.API_KEY: [
            r'\b[A-Za-z0-9_-]{32,}\b',  # Generic API key
            r'sk-[A-Za-z0-9]{48}',  # OpenAI-style key
            r'AIza[0-9A-Za-z_-]{35}',  # Google API key
        ],
        SensitiveType.PASSWORD: [
            r'(?i)password\s*[:=]\s*["\']?([^"\'\s]+)["\']?',
            r'(?i)pwd\s*[:=]\s*["\']?([^"\'\s]+)["\']?',
        ],
        SensitiveType.TOKEN: [
            r'(?i)token\s*[:=]\s*["\']?([A-Za-z0-9_.-]+)["\']?',
            r'(?i)bearer\s+([A-Za-z0-9_.-]+)',
        ],
        SensitiveType.DATABASE_URI: [
            r'(?i)(mysql|postgresql|mongodb|redis)://[^\s]+',
        ],
        SensitiveType.CREDIT_CARD: [
            r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b',
        ],
        SensitiveType.ID_CARD: [
            r'\b\d{17}[\dXx]\b',  # Chinese national ID
        ],
    }
    # Types that need extra validation (to avoid false positives)
    SPECIAL_VALIDATION = {
        SensitiveType.FILE_PATH: '_validate_file_path',
        SensitiveType.API_KEY: '_validate_api_key',
    }
    def __init__(self, enabled_types: Optional[Set[SensitiveType]] = None):
        """
        Initialize the sanitizer.
        Args:
            enabled_types: sensitive types to enable; None enables all of them
        """
        self.enabled_types = enabled_types or set(SensitiveType)
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the regex patterns"""
        self.compiled_patterns: Dict[SensitiveType, List[re.Pattern]] = {}
        for sens_type in self.enabled_types:
            if sens_type in self.PATTERNS:
                self.compiled_patterns[sens_type] = [
                    re.compile(pattern) for pattern in self.PATTERNS[sens_type]
                ]
    def _validate_file_path(self, text: str) -> bool:
        """Check that a match looks like a real file path (reduces false positives)"""
        # Reject short strings and common false positives
        if len(text) < 5:
            return False
        # Must contain a typical path feature
        path_indicators = ['\\', '/', '.py', '.txt', '.json', '.log', 'Users', 'Program']
        return any(indicator in text for indicator in path_indicators)

    def _validate_api_key(self, text: str) -> bool:
        """Check that a match looks like a real API key (reduces false positives)"""
        # Reject all-digit or all-letter strings
        has_digit = any(c.isdigit() for c in text)
        has_alpha = any(c.isalpha() for c in text)
        has_special = any(c in '-_' for c in text)  # (currently unused)
        # Length requirement
        return has_digit and has_alpha and len(text) >= 20
    def find_sensitive_data(self, text: str) -> List[SensitiveMatch]:
        """
        Find sensitive data in a text.
        Args:
            text: the text to scan
        Returns:
            List of sensitive-data matches
        """
        matches = []
        for sens_type, patterns in self.compiled_patterns.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    value = match.group(0)
                    # Run the extra validator, if any
                    if sens_type in self.SPECIAL_VALIDATION:
                        validator = getattr(self, self.SPECIAL_VALIDATION[sens_type])
                        if not validator(value):
                            continue
                    # Build the masked value
                    masked = self._mask_value(value, sens_type)
                    matches.append(SensitiveMatch(
                        type=sens_type,
                        value=value,
                        start=match.start(),
                        end=match.end(),
                        masked_value=masked
                    ))
        # Sort by position and drop overlaps
        matches.sort(key=lambda m: m.start)
        return self._remove_overlaps(matches)
    def _remove_overlaps(self, matches: List[SensitiveMatch]) -> List[SensitiveMatch]:
        """Drop overlapping matches (earlier match wins; ties go to the higher-priority type)"""
        if not matches:
            return []
        # Priority order (lower value wins)
        priority = {
            SensitiveType.PASSWORD: 1,
            SensitiveType.API_KEY: 2,
            SensitiveType.TOKEN: 3,
            SensitiveType.DATABASE_URI: 4,
            SensitiveType.CREDIT_CARD: 5,
            SensitiveType.ID_CARD: 6,
            SensitiveType.EMAIL: 7,
            SensitiveType.PHONE: 8,
            SensitiveType.IP_ADDRESS: 9,
            SensitiveType.FILE_PATH: 10,
        }
        result = []
        last_end = -1
        for match in sorted(matches, key=lambda m: (m.start, priority.get(m.type, 99))):
            if match.start >= last_end:
                result.append(match)
                last_end = match.end
        return result
    def _mask_value(self, value: str, sens_type: SensitiveType) -> str:
        """
        Build the masked replacement for a value.
        Args:
            value: the original value
            sens_type: its sensitive type
        Returns:
            The masked value
        """
        if sens_type == SensitiveType.FILE_PATH:
            # Keep the file name, hide the directory
            parts = value.replace('\\', '/').split('/')
            if len(parts) > 1:
                return f"***/{parts[-1]}"
            return "***"
        elif sens_type == SensitiveType.EMAIL:
            # Keep the first and last characters of the local part
            parts = value.split('@')
            if len(parts) == 2:
                name = parts[0]
                domain = parts[1]
                masked_name = name[0] + '***' + name[-1] if len(name) > 2 else '***'
                return f"{masked_name}@{domain}"
        elif sens_type == SensitiveType.PHONE:
            # Keep the first 3 and last 4 digits
            if len(value) >= 11:
                return value[:3] + '****' + value[-4:]
        elif sens_type == SensitiveType.IP_ADDRESS:
            # Keep the first two octets
            parts = value.split('.')
            if len(parts) == 4:
                return f"{parts[0]}.{parts[1]}.*.*"
        elif sens_type == SensitiveType.CREDIT_CARD:
            # Keep only the last 4 digits
            digits = re.sub(r'[\s-]', '', value)
            return '**** **** **** ' + digits[-4:]
        elif sens_type == SensitiveType.ID_CARD:
            # Keep the first 6 and last 4 digits
            return value[:6] + '********' + value[-4:]
        # Default: hide completely
        return f"[{sens_type.value.upper()}_MASKED]"
    def sanitize(self, text: str) -> Tuple[str, List[SensitiveMatch]]:
        """
        Sanitize a text.
        Args:
            text: the original text
        Returns:
            (sanitized text, list of matches)
        """
        matches = self.find_sensitive_data(text)
        if not matches:
            return text, []
        # Replace back to front so offsets stay valid
        result = text
        for match in reversed(matches):
            result = result[:match.start] + match.masked_value + result[match.end:]
        return result, matches
    def get_sensitivity_score(self, text: str) -> float:
        """
        Compute a sensitivity score (0-1) for a text.
        Args:
            text: the text to score
        Returns:
            The sensitivity score
        """
        matches = self.find_sensitive_data(text)
        if not matches:
            return 0.0
        # Weight by sensitive type
        weights = {
            SensitiveType.PASSWORD: 1.0,
            SensitiveType.API_KEY: 1.0,
            SensitiveType.TOKEN: 0.9,
            SensitiveType.DATABASE_URI: 0.9,
            SensitiveType.CREDIT_CARD: 1.0,
            SensitiveType.ID_CARD: 1.0,
            SensitiveType.EMAIL: 0.6,
            SensitiveType.PHONE: 0.6,
            SensitiveType.IP_ADDRESS: 0.5,
            SensitiveType.FILE_PATH: 0.3,
        }
        total_weight = sum(weights.get(m.type, 0.5) for m in matches)
        # Normalize to 0-1
        return min(1.0, total_weight / 3.0)
# Module-level singleton
_sanitizer: Optional[DataSanitizer] = None

def get_sanitizer() -> DataSanitizer:
    """Return the shared DataSanitizer instance"""
    global _sanitizer
    if _sanitizer is None:
        _sanitizer = DataSanitizer()
    return _sanitizer
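
As a quick reference, a sketch of how the sanitizer is meant to be called; the input is illustrative, and the masks follow the _mask_value rules above:

    from history.data_sanitizer import get_sanitizer

    sanitizer = get_sanitizer()
    masked, matches = sanitizer.sanitize("contact admin@example.com from 192.168.1.10")
    print(masked)                           # contact a***n@example.com from 192.168.*.*
    print([m.type.value for m in matches])  # ['email', 'ip_address']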


@@ -1,6 +1,6 @@
"""
任务历史记录管理器
保存和加载任务执行历史
保存和加载任务执行历史,集成数据治理策略
"""
import json
@@ -9,6 +9,8 @@ from pathlib import Path
from typing import Optional, List
from dataclasses import dataclass, asdict
from history.data_governance import get_governance_policy, GovernanceMetrics
@dataclass
class TaskRecord:
@@ -26,16 +28,19 @@ class TaskRecord:
stderr: str
log_path: str
    task_summary: str = ""      # Task summary (generated by a small model)
    _governance: dict = None    # Governance metadata
    _sanitization: dict = None  # Sanitization info
class HistoryManager:
    """
    History record manager
    Persists task history as a JSON file
    Persists task history as a JSON file, integrated with the data-governance policy
    """
    MAX_HISTORY_SIZE = 100       # Keep at most 100 records
    AUTO_CLEANUP_ENABLED = True  # Automatically clean up expired data
def __init__(self, workspace_path: Optional[Path] = None):
if workspace_path:
@@ -45,7 +50,15 @@ class HistoryManager:
self.history_file = self.workspace / "history.json"
self._history: List[TaskRecord] = []
        # Initialize the data-governance policy
        self.governance = get_governance_policy(self.workspace)
        self._load()
        # Clean up expired data automatically on startup
        if self.AUTO_CLEANUP_ENABLED:
            self._auto_cleanup()
def _load(self):
"""从文件加载历史记录"""
@@ -53,7 +66,14 @@ class HistoryManager:
try:
with open(self.history_file, 'r', encoding='utf-8') as f:
data = json.load(f)
self._history = [TaskRecord(**record) for record in data]
                self._history = []
                for record in data:
                    # Backward compatibility with old data (no governance fields)
                    if '_governance' not in record:
                        record['_governance'] = None
                    if '_sanitization' not in record:
                        record['_sanitization'] = None
                    self._history.append(TaskRecord(**record))
except (json.JSONDecodeError, TypeError, KeyError) as e:
print(f"[警告] 加载历史记录失败: {e}")
self._history = []
@@ -61,14 +81,29 @@ class HistoryManager:
self._history = []
    def _save(self):
        """Save history records to disk"""
        """Save history records to disk (applying the data-governance policy)"""
        try:
            # Make sure the directory exists
            self.history_file.parent.mkdir(parents=True, exist_ok=True)
            # Apply the data-governance policy
            governed_data = []
            for record in self._history:
                record_dict = asdict(record)
                # Apply the policy if the record has no governance metadata yet
                if not record_dict.get('_governance'):
                    record_dict = self.governance.apply_policy(record_dict)
                governed_data.append(record_dict)
with open(self.history_file, 'w', encoding='utf-8') as f:
data = [asdict(record) for record in self._history]
json.dump(data, f, ensure_ascii=False, indent=2)
json.dump(governed_data, f, ensure_ascii=False, indent=2)
            # Collect and persist the governance metrics
            metrics = self.governance.collect_metrics(governed_data)
            self.governance.save_metrics(metrics)
except Exception as e:
print(f"[警告] 保存历史记录失败: {e}")
@@ -216,56 +251,136 @@ class HistoryManager:
'avg_duration_ms': int(avg_duration)
}
    def find_similar_success(self, user_input: str, threshold: float = 0.6) -> Optional[TaskRecord]:
    def find_similar_success(
        self,
        user_input: str,
        threshold: float = 0.6,
        return_details: bool = False
    ) -> Optional[TaskRecord] | tuple:
        """
        Find a similar successful task
        Uses simple keyword matching to judge similarity
        Find a similar successful task (enhanced: structured feature matching)
        Args:
            user_input: the user input
            threshold: similarity threshold
            return_details: whether to also return the similarity score and difference list
        Returns:
            The most similar successful task record, or None if there is none
            If return_details=False: the most similar successful task record, or None
            If return_details=True: (TaskRecord, similarity, differences) or None
        """
        # Extract keywords
        def extract_keywords(text: str) -> set:
            # Naive tokenization: split on spaces and punctuation
            import re
            words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
            # Drop words that are too short
            return set(w for w in words if len(w) >= 2)
from history.task_features import get_task_matcher
input_keywords = extract_keywords(user_input)
if not input_keywords:
return None
matcher = get_task_matcher()
best_match = None
best_score = 0.0
best_differences = []
for record in self._history:
if not record.success:
continue
record_keywords = extract_keywords(record.user_input)
if not record_keywords:
continue
            # Compute the Jaccard similarity
            intersection = len(input_keywords & record_keywords)
            union = len(input_keywords | record_keywords)
            score = intersection / union if union > 0 else 0
            # Use the enhanced feature matching
            score, differences = matcher.calculate_similarity(
                user_input,
                record.user_input
            )
if score > best_score and score >= threshold:
best_score = score
best_match = record
best_differences = differences
return best_match
if best_match is None:
return None
if return_details:
return (best_match, best_score, best_differences)
else:
return best_match
def get_successful_records(self) -> List[TaskRecord]:
"""获取所有成功的任务记录"""
return [r for r in self._history if r.success]
    def _auto_cleanup(self):
        """Automatically clean up expired data"""
        try:
            records_data = [asdict(r) for r in self._history]
            kept_records, archived, deleted = self.governance.cleanup_expired(records_data)
            if archived > 0 or deleted > 0:
                # Update the in-memory history
                self._history = []
                for record_dict in kept_records:
                    if '_governance' not in record_dict:
                        record_dict['_governance'] = None
                    if '_sanitization' not in record_dict:
                        record_dict['_sanitization'] = None
                    self._history.append(TaskRecord(**record_dict))
                self._save()
                print(f"[Governance] Auto-cleanup finished: {archived} archived, {deleted} deleted")
        except Exception as e:
            print(f"[WARN] Auto-cleanup failed: {e}")
    def manual_cleanup(self) -> dict:
        """
        Trigger data cleanup manually.
        Returns:
            Cleanup statistics
        """
        records_data = [asdict(r) for r in self._history]
        kept_records, archived, deleted = self.governance.cleanup_expired(records_data)
        # Update the in-memory history
self._history = []
for record_dict in kept_records:
if '_governance' not in record_dict:
record_dict['_governance'] = None
if '_sanitization' not in record_dict:
record_dict['_sanitization'] = None
self._history.append(TaskRecord(**record_dict))
self._save()
return {
'archived': archived,
'deleted': deleted,
'remaining': len(self._history)
}
def get_governance_metrics(self) -> Optional[GovernanceMetrics]:
"""获取数据治理度量指标"""
return self.governance.load_metrics()
    def export_sanitized(self, output_path: Path) -> int:
        """
        Export sanitized history records.
        Args:
            output_path: path of the export file
        Returns:
            Number of records exported
        """
        sanitized_data = []
        for record in self._history:
            record_dict = asdict(record)
            # Make sure the governance policy has been applied
if not record_dict.get('_governance'):
record_dict = self.governance.apply_policy(record_dict)
sanitized_data.append(record_dict)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(sanitized_data, f, ensure_ascii=False, indent=2)
return len(sanitized_data)
# Module-level singleton
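
A sketch of how a caller might consume the new return_details path. The module name is not shown in this view, so the construction below assumes a directly importable HistoryManager, and it assumes TaskRecord exposes task_id as elsewhere in the diff; the input and threshold are illustrative:

    manager = HistoryManager()
    hit = manager.find_similar_success("批量重命名 ./photos 下的 .png 文件",
                                       threshold=0.6, return_details=True)
    if hit is not None:
        record, score, differences = hit
        print(f"similar task {record.task_id} (score {score:.2f})")
        for d in differences:
            print(f"  [{d.importance}] {d.category}: {d.current_value} vs {d.history_value}")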

history/reuse_metrics.py (new file)

@@ -0,0 +1,252 @@
"""
任务复用度量指标收集模块
"""
import json
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, List
from dataclasses import dataclass, asdict
@dataclass
class ReuseEvent:
    """A single reuse event"""
    timestamp: str
    original_task_id: str              # ID of the task being reused
    new_task_id: Optional[str]         # New task ID (if it was executed)
    similarity_score: float            # Similarity score
    user_action: str                   # User action: offered/accepted/rejected/executed/rollback
    differences_count: int             # Number of differences
    critical_differences: int          # Number of critical differences
    execution_success: Optional[bool]  # Whether execution succeeded (if executed)
class ReuseMetrics:
"""复用指标管理器"""
def __init__(self, workspace_path: Path):
self.workspace = workspace_path
self.metrics_file = workspace_path / "reuse_metrics.json"
self._events: List[ReuseEvent] = []
self._load()
def _load(self):
"""加载指标数据"""
if self.metrics_file.exists():
try:
with open(self.metrics_file, 'r', encoding='utf-8') as f:
data = json.load(f)
self._events = [ReuseEvent(**event) for event in data]
except Exception as e:
print(f"[警告] 加载复用指标失败: {e}")
self._events = []
def _save(self):
"""保存指标数据"""
try:
self.metrics_file.parent.mkdir(parents=True, exist_ok=True)
with open(self.metrics_file, 'w', encoding='utf-8') as f:
data = [asdict(event) for event in self._events]
json.dump(data, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"[警告] 保存复用指标失败: {e}")
def record_reuse_offered(
self,
original_task_id: str,
similarity_score: float,
differences_count: int,
critical_differences: int
):
"""记录复用建议被提供"""
event = ReuseEvent(
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
original_task_id=original_task_id,
new_task_id=None,
similarity_score=similarity_score,
user_action='offered',
differences_count=differences_count,
critical_differences=critical_differences,
execution_success=None
)
self._events.append(event)
self._save()
return event
def record_reuse_accepted(
self,
original_task_id: str,
similarity_score: float,
differences_count: int,
critical_differences: int
):
"""记录用户接受复用"""
event = ReuseEvent(
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
original_task_id=original_task_id,
new_task_id=None,
similarity_score=similarity_score,
user_action='accepted',
differences_count=differences_count,
critical_differences=critical_differences,
execution_success=None
)
self._events.append(event)
self._save()
return event
def record_reuse_rejected(
self,
original_task_id: str,
similarity_score: float,
differences_count: int,
critical_differences: int
):
"""记录用户拒绝复用"""
event = ReuseEvent(
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
original_task_id=original_task_id,
new_task_id=None,
similarity_score=similarity_score,
user_action='rejected',
differences_count=differences_count,
critical_differences=critical_differences,
execution_success=None
)
self._events.append(event)
self._save()
return event
def record_reuse_execution(
self,
original_task_id: str,
new_task_id: str,
success: bool
):
"""记录复用后的执行结果"""
# 查找最近的 accepted 事件并更新
for event in reversed(self._events):
if (event.original_task_id == original_task_id and
event.user_action == 'accepted' and
event.new_task_id is None):
event.new_task_id = new_task_id
event.execution_success = success
self._save()
return event
        # No matching event found; create a new one
event = ReuseEvent(
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
original_task_id=original_task_id,
new_task_id=new_task_id,
similarity_score=0.0,
user_action='executed',
differences_count=0,
critical_differences=0,
execution_success=success
)
self._events.append(event)
self._save()
return event
def record_reuse_rollback(
self,
original_task_id: str,
new_task_id: str
):
"""记录复用后回滚(用户撤销/重做)"""
event = ReuseEvent(
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
original_task_id=original_task_id,
new_task_id=new_task_id,
similarity_score=0.0,
user_action='rollback',
differences_count=0,
critical_differences=0,
execution_success=False
)
self._events.append(event)
self._save()
return event
def get_statistics(self) -> Dict:
"""获取统计数据"""
if not self._events:
return {
'total_offered': 0,
'total_accepted': 0,
'total_rejected': 0,
'total_executed': 0,
'total_rollback': 0,
'acceptance_rate': 0.0,
'rejection_rate': 0.0,
'success_rate': 0.0,
'failure_rate': 0.0,
'rollback_rate': 0.0,
'avg_similarity': 0.0,
'avg_differences': 0.0,
'avg_critical_differences': 0.0
}
offered = [e for e in self._events if e.user_action == 'offered']
accepted = [e for e in self._events if e.user_action == 'accepted']
rejected = [e for e in self._events if e.user_action == 'rejected']
executed = [e for e in self._events if e.execution_success is not None]
rollback = [e for e in self._events if e.user_action == 'rollback']
total_offered = len(offered)
total_accepted = len(accepted)
total_rejected = len(rejected)
total_executed = len(executed)
total_rollback = len(rollback)
        # Split executed events into successes and failures
successful = [e for e in executed if e.execution_success]
failed = [e for e in executed if not e.execution_success]
        # Rates
acceptance_rate = total_accepted / total_offered if total_offered > 0 else 0.0
rejection_rate = total_rejected / total_offered if total_offered > 0 else 0.0
success_rate = len(successful) / total_executed if total_executed > 0 else 0.0
failure_rate = len(failed) / total_executed if total_executed > 0 else 0.0
rollback_rate = total_rollback / total_executed if total_executed > 0 else 0.0
        # Averages
all_events = offered + accepted + rejected
avg_similarity = sum(e.similarity_score for e in all_events) / len(all_events) if all_events else 0.0
avg_differences = sum(e.differences_count for e in all_events) / len(all_events) if all_events else 0.0
avg_critical_differences = sum(e.critical_differences for e in all_events) / len(all_events) if all_events else 0.0
return {
'total_offered': total_offered,
'total_accepted': total_accepted,
'total_rejected': total_rejected,
'total_executed': total_executed,
'total_rollback': total_rollback,
'acceptance_rate': acceptance_rate,
'rejection_rate': rejection_rate,
'success_rate': success_rate,
'failure_rate': failure_rate,
'rollback_rate': rollback_rate,
'avg_similarity': avg_similarity,
'avg_differences': avg_differences,
'avg_critical_differences': avg_critical_differences
}
def get_recent_events(self, count: int = 20) -> List[ReuseEvent]:
"""获取最近的事件"""
return self._events[-count:] if self._events else []
# Module-level singleton
_metrics: Optional[ReuseMetrics] = None
def get_reuse_metrics(workspace_path: Path) -> ReuseMetrics:
"""获取复用指标管理器单例"""
global _metrics
if _metrics is None:
_metrics = ReuseMetrics(workspace_path)
return _metrics
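
A sketch of the intended event lifecycle (task IDs and scores are illustrative):

    from pathlib import Path
    from history.reuse_metrics import get_reuse_metrics

    metrics = get_reuse_metrics(Path("workspace"))
    metrics.record_reuse_offered("t-001", similarity_score=0.82,
                                 differences_count=2, critical_differences=0)
    metrics.record_reuse_accepted("t-001", 0.82, 2, 0)
    metrics.record_reuse_execution("t-001", new_task_id="t-002", success=True)
    stats = metrics.get_statistics()
    print(stats["acceptance_rate"], stats["success_rate"])  # 1.0 1.0 for this sequence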

history/task_features.py (new file)

@@ -0,0 +1,380 @@
"""
任务特征提取与匹配模块
用于更精确的相似任务识别
"""
import re
from typing import Dict, List, Set, Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
@dataclass
class TaskFeatures:
    """Structured task features"""
    # Basic information
    raw_input: str
    keywords: Set[str]
    # Key parameters
    file_formats: Set[str]      # File formats (e.g. .txt, .csv, .json)
    directory_paths: Set[str]   # Directory paths
    file_names: Set[str]        # File names
    naming_patterns: List[str]  # Naming rules (e.g. "by date", "by sequence")
    # Operation types
    operations: Set[str]        # Operation types (e.g. "rename", "convert", "batch")
    # Quantity/range parameters
    quantities: List[str]       # Quantity phrases (e.g. "100 files", "all")
    # Other constraints
    constraints: List[str]      # Other constraint conditions
@dataclass
class TaskDifference:
    """Describes one difference between two tasks"""
    category: str       # Difference category
    field: str          # Field name
    current_value: str  # Value in the current task
    history_value: str  # Value in the historical task
    importance: str     # Importance: critical/high/medium/low
class TaskFeatureExtractor:
    """Task feature extractor"""
    # File-format pattern
    FILE_FORMAT_PATTERN = r'\.(txt|csv|json|xml|xlsx?|docx?|pdf|png|jpe?g|gif|mp[34]|avi|mov|zip|rar|7z|py|js|java|cpp|html?|css)'
    # Directory-path patterns (Windows and Unix; \u4e00-\u9fa5 covers Chinese characters)
    DIR_PATH_PATTERN = r'(?:[a-zA-Z]:\\[\w\\\s\u4e00-\u9fa5.-]+|/[\w/\s\u4e00-\u9fa5.-]+|[./][\w/\\\s\u4e00-\u9fa5.-]+)'
    # File-name pattern
    FILE_NAME_PATTERN = r'[\w\u4e00-\u9fa5.-]+\.[a-zA-Z0-9]+'
    # Quantity pattern (matches Chinese quantity phrases such as "100 个" or "所有")
    QUANTITY_PATTERN = r'(\d+\s*[个张份条篇页行列]|所有|全部|批量)'
    # Operation keyword map (the values are Chinese keywords matched against the user input)
    OPERATION_KEYWORDS = {
        'rename': ['重命名', '改名', '命名', '更名'],
        'convert': ['转换', '转为', '转成', '变成', '改成'],
        'batch': ['批量', '批处理', '一次性'],
        'copy': ['复制', '拷贝', 'copy'],
        'move': ['移动', '转移', 'move'],
        'delete': ['删除', '清理', '移除'],
        'merge': ['合并', '整合', '汇总'],
        'split': ['分割', '拆分', '切分'],
        'compress': ['压缩', '打包'],
        'extract': ['解压', '解包', '提取'],
        'sort': ['排序', '排列'],
        'filter': ['筛选', '过滤', '查找'],
        'count': ['统计', '计数', '汇总'],
        'generate': ['生成', '创建', '制作'],
    }
    # Naming-rule keywords (the values are matched against the user input)
    NAMING_PATTERNS = {
        'by date': ['日期', '时间', 'date', 'time'],
        'by sequence': ['序号', '编号', '数字', '顺序'],
        'by prefix': ['前缀', '开头'],
        'by suffix': ['后缀', '结尾'],
        'by content': ['内容', '根据'],
    }
    def extract(self, user_input: str) -> TaskFeatures:
        """
        Extract structured features from the user input.
        Args:
            user_input: the user input text
        Returns:
            TaskFeatures: the extracted features
        """
        # Keywords
        keywords = self._extract_keywords(user_input)
        # File formats
        file_formats = self._extract_file_formats(user_input)
        # Directory paths
        directory_paths = self._extract_directory_paths(user_input)
        # File names
        file_names = self._extract_file_names(user_input)
        # Naming rules
        naming_patterns = self._extract_naming_patterns(user_input)
        # Operation types
        operations = self._extract_operations(user_input)
        # Quantity phrases
        quantities = self._extract_quantities(user_input)
        # Other constraints
        constraints = self._extract_constraints(user_input)
return TaskFeatures(
raw_input=user_input,
keywords=keywords,
file_formats=file_formats,
directory_paths=directory_paths,
file_names=file_names,
naming_patterns=naming_patterns,
operations=operations,
quantities=quantities,
constraints=constraints
)
def _extract_keywords(self, text: str) -> Set[str]:
"""提取关键词(基础分词)"""
words = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', text.lower())
return set(w for w in words if len(w) >= 2)
def _extract_file_formats(self, text: str) -> Set[str]:
"""提取文件格式"""
matches = re.findall(self.FILE_FORMAT_PATTERN, text.lower())
return set(f'.{m}' for m in matches)
    def _extract_directory_paths(self, text: str) -> Set[str]:
        """Extract directory paths"""
        matches = re.findall(self.DIR_PATH_PATTERN, text)
        # Normalize the paths
        normalized = set()
        for path in matches:
            try:
                p = Path(path)
                normalized.add(str(p.resolve()))
            except (OSError, ValueError):
                # Keep the raw string if it cannot be resolved as a path
                normalized.add(path)
        return normalized
def _extract_file_names(self, text: str) -> Set[str]:
"""提取文件名"""
matches = re.findall(self.FILE_NAME_PATTERN, text)
return set(matches)
def _extract_naming_patterns(self, text: str) -> List[str]:
"""提取命名规则"""
patterns = []
for pattern_name, keywords in self.NAMING_PATTERNS.items():
if any(kw in text for kw in keywords):
patterns.append(pattern_name)
return patterns
def _extract_operations(self, text: str) -> Set[str]:
"""提取操作类型"""
operations = set()
for op_name, keywords in self.OPERATION_KEYWORDS.items():
if any(kw in text for kw in keywords):
operations.add(op_name)
return operations
def _extract_quantities(self, text: str) -> List[str]:
"""提取数量信息"""
matches = re.findall(self.QUANTITY_PATTERN, text)
return matches
    def _extract_constraints(self, text: str) -> List[str]:
        """Extract other constraint conditions"""
        constraints = []
        # Conditional keywords (Chinese, matched against the user input)
        condition_keywords = ['如果', '满足', '符合', '包含', '不包含', '大于', '小于', '等于']
        for keyword in condition_keywords:
            if keyword in text:
                # Grab the clause fragment that contains the keyword
                pattern = f'[^。,;]*{keyword}[^。,;]*'
                matches = re.findall(pattern, text)
                constraints.extend(matches)
        return constraints
class TaskMatcher:
"""任务匹配器"""
def __init__(self):
self.extractor = TaskFeatureExtractor()
def calculate_similarity(
self,
current_input: str,
history_input: str
) -> Tuple[float, List[TaskDifference]]:
"""
计算两个任务的相似度,并返回差异列表
Args:
current_input: 当前任务输入
history_input: 历史任务输入
Returns:
(相似度分数 0-1, 差异列表)
"""
        # Extract features
current_features = self.extractor.extract(current_input)
history_features = self.extractor.extract(history_input)
        # Compute per-dimension similarities and differences
differences = []
scores = []
        # 1. Keyword similarity (base weight 0.2)
keyword_sim = self._jaccard_similarity(
current_features.keywords,
history_features.keywords
)
scores.append(('keywords', keyword_sim, 0.2))
        # 2. File-format similarity (weight 0.15)
format_sim, format_diffs = self._compare_sets(
current_features.file_formats,
history_features.file_formats,
'file_formats',
            'file format',
'high'
)
scores.append(('file_formats', format_sim, 0.15))
differences.extend(format_diffs)
        # 3. Directory-path similarity (weight 0.15)
dir_sim, dir_diffs = self._compare_sets(
current_features.directory_paths,
history_features.directory_paths,
'directory_paths',
            'directory path',
'critical'
)
scores.append(('directory_paths', dir_sim, 0.15))
differences.extend(dir_diffs)
        # 4. Naming-rule similarity (weight 0.15)
naming_sim, naming_diffs = self._compare_lists(
current_features.naming_patterns,
history_features.naming_patterns,
'naming_patterns',
            'naming pattern',
'high'
)
scores.append(('naming_patterns', naming_sim, 0.15))
differences.extend(naming_diffs)
        # 5. Operation-type similarity (weight 0.2)
op_sim, op_diffs = self._compare_sets(
current_features.operations,
history_features.operations,
'operations',
            'operation type',
'critical'
)
scores.append(('operations', op_sim, 0.2))
differences.extend(op_diffs)
        # 6. Quantity similarity (weight 0.1)
qty_sim, qty_diffs = self._compare_lists(
current_features.quantities,
history_features.quantities,
'quantities',
            'quantity',
'medium'
)
scores.append(('quantities', qty_sim, 0.1))
differences.extend(qty_diffs)
        # 7. Constraint similarity (weight 0.05)
constraint_sim, constraint_diffs = self._compare_lists(
current_features.constraints,
history_features.constraints,
'constraints',
            'constraints',
'medium'
)
scores.append(('constraints', constraint_sim, 0.05))
differences.extend(constraint_diffs)
        # Weighted total score
total_score = sum(score * weight for _, score, weight in scores)
return total_score, differences
def _jaccard_similarity(self, set1: Set, set2: Set) -> float:
"""计算 Jaccard 相似度"""
if not set1 and not set2:
return 1.0
if not set1 or not set2:
return 0.0
intersection = len(set1 & set2)
union = len(set1 | set2)
return intersection / union if union > 0 else 0.0
def _compare_sets(
self,
current: Set[str],
history: Set[str],
field: str,
display_name: str,
importance: str
) -> Tuple[float, List[TaskDifference]]:
"""比较两个集合,返回相似度和差异"""
similarity = self._jaccard_similarity(current, history)
differences = []
        # Identify the differences
only_current = current - history
only_history = history - current
if only_current or only_history:
differences.append(TaskDifference(
category=display_name,
field=field,
                current_value=', '.join(sorted(only_current)) if only_current else '(none)',
                history_value=', '.join(sorted(only_history)) if only_history else '(none)',
importance=importance
))
return similarity, differences
def _compare_lists(
self,
current: List[str],
history: List[str],
field: str,
display_name: str,
importance: str
) -> Tuple[float, List[TaskDifference]]:
"""比较两个列表,返回相似度和差异"""
# 转为集合计算相似度
current_set = set(current)
history_set = set(history)
similarity = self._jaccard_similarity(current_set, history_set)
differences = []
if current != history:
differences.append(TaskDifference(
category=display_name,
field=field,
                current_value=', '.join(current) if current else '(none)',
                history_value=', '.join(history) if history else '(none)',
importance=importance
))
return similarity, differences
# Module-level singleton
_matcher: Optional[TaskMatcher] = None
def get_task_matcher() -> TaskMatcher:
"""获取任务匹配器单例"""
global _matcher
if _matcher is None:
_matcher = TaskMatcher()
return _matcher
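
Finally, a sketch of the matcher on two near-identical requests. The inputs are Chinese because the keyword tables match Chinese text, and the expected output is hedged since the greedy path regex can flag extra differences:

    from history.task_features import get_task_matcher

    matcher = get_task_matcher()
    score, diffs = matcher.calculate_similarity(
        "把所有 .txt 文件按日期重命名",
        "把所有 .csv 文件按日期重命名",
    )
    print(f"overall similarity: {score:.2f}")
    for d in diffs:
        print(f"[{d.importance}] {d.field}: {d.current_value} vs {d.history_value}")
    # Expect at least a high-importance 'file_formats' difference (.txt vs .csv);
    # the loose DIR_PATH_PATTERN may also report a 'directory_paths' difference.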