- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic. - Removed the blocking behavior of the API key check during application startup, allowing the app to run while providing a prompt for configuration. - Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup. - Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics. - Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management. - Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
411 lines
13 KiB
Python
411 lines
13 KiB
Python
"""
|
|
数据治理策略模块
|
|
实现数据分级保存、保留期管理、归档和清理策略
|
|
"""
|
|
|
|
import json
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set
|
|
from dataclasses import dataclass, asdict
|
|
from enum import Enum
|
|
|
|
from history.data_sanitizer import get_sanitizer, SensitiveType
|
|
|
|
|
|
class DataLevel(Enum):
    """Storage level assigned to a record based on its sensitivity."""
    FULL = "full"  # stored verbatim (no masking)
    SANITIZED = "sanitized"  # stored with sensitive content masked
    MINIMAL = "minimal"  # metadata only
    ARCHIVED = "archived"  # moved to the archive directory
|
|
|
|
|
|
class RetentionPolicy(Enum):
    """Retention-period presets, expressed in days."""
    SHORT = 7  # 7 days
    MEDIUM = 30  # 30 days
    LONG = 90  # 90 days
    PERMANENT = -1  # keep forever (never expires)
|
|
|
|
|
|
@dataclass
class DataClassification:
    """Outcome of classifying a single record."""
    level: DataLevel  # storage level chosen for the record
    retention_days: int  # retention period in days (-1 = forever)
    sensitivity_score: float  # weighted-average sensitivity score
    sensitive_fields: Set[str]  # names of fields where sensitive content was found
    reason: str  # human-readable explanation of the decision
|
|
|
|
|
|
@dataclass
class GovernanceMetrics:
    """Aggregate governance statistics over a set of records."""
    total_records: int  # number of records examined
    full_records: int  # records stored at FULL level
    sanitized_records: int  # records stored at SANITIZED level
    minimal_records: int  # records stored at MINIMAL level
    archived_records: int  # records marked ARCHIVED
    total_size_bytes: int  # estimated total size (JSON-serialized length)
    sensitive_field_hits: Dict[str, int]  # field name -> hit count
    expired_records: int  # records past their expires_at timestamp
    last_cleanup_time: str  # ISO timestamp of the metrics collection
|
|
|
|
|
|
class DataGovernancePolicy:
    """
    Data-governance policy.

    Classifies records by sensitivity, stores them at an appropriate
    level (full / sanitized / minimal) and manages their lifecycle:
    expiration, downgrade, archiving and deletion.
    """

    # Per-field sensitivity weight: how much a hit in this field
    # contributes to the record's overall score.
    FIELD_SENSITIVITY = {
        'user_input': 0.5,       # free-form user text may contain secrets
        'code': 0.7,             # code may embed paths or credentials
        'stdout': 0.6,           # program output may leak data
        'stderr': 0.6,           # error output often contains paths
        'execution_plan': 0.3,   # plans are comparatively safe
        'log_path': 0.4,         # filesystem paths
    }

    # Score thresholds mapping a sensitivity score to a storage level
    # (see classify_record for the comparison logic).
    LEVEL_THRESHOLDS = {
        DataLevel.FULL: 0.0,       # score < 0.3        -> store verbatim
        DataLevel.SANITIZED: 0.3,  # 0.3 <= score < 0.7 -> mask and store
        DataLevel.MINIMAL: 0.7,    # score >= 0.7       -> metadata only
    }

    # Retention period (days) per storage level.
    RETENTION_CONFIG = {
        DataLevel.FULL: RetentionPolicy.LONG.value,        # 90 days
        DataLevel.SANITIZED: RetentionPolicy.MEDIUM.value,  # 30 days
        DataLevel.MINIMAL: RetentionPolicy.SHORT.value,     # 7 days
    }

    def __init__(self, workspace_path: Path):
        """
        Args:
            workspace_path: Directory where metrics and archives live.
                An ``archive`` subdirectory is created if missing.
        """
        self.workspace = workspace_path
        self.sanitizer = get_sanitizer()
        self.metrics_file = workspace_path / "governance_metrics.json"
        self.archive_dir = workspace_path / "archive"
        self.archive_dir.mkdir(exist_ok=True)

    def classify_record(self, record_data: Dict) -> DataClassification:
        """
        Classify a record by scanning its fields for sensitive content.

        Args:
            record_data: Record as a plain dict.

        Returns:
            DataClassification with the chosen level, retention period,
            overall score and the set of sensitive fields.
        """
        sensitive_fields = set()
        total_sensitivity = 0.0
        field_count = 0

        # Score every configured field that is present and non-empty.
        for field, weight in self.FIELD_SENSITIVITY.items():
            if field in record_data and record_data[field]:
                content = str(record_data[field])
                field_score = self.sanitizer.get_sensitivity_score(content)

                if field_score > 0.3:  # sensitive content detected in this field
                    sensitive_fields.add(field)

                total_sensitivity += field_score * weight
                field_count += 1

        # NOTE(review): weighted scores are averaged over the field COUNT,
        # not the sum of weights — kept as-is to preserve the calibrated
        # thresholds below; confirm before changing.
        avg_sensitivity = total_sensitivity / field_count if field_count > 0 else 0.0

        # Map the score onto a storage level.
        if avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.MINIMAL]:
            level = DataLevel.MINIMAL
            reason = f"高敏感度({avg_sensitivity:.2f}),仅保留元数据"
        elif avg_sensitivity >= self.LEVEL_THRESHOLDS[DataLevel.SANITIZED]:
            level = DataLevel.SANITIZED
            reason = f"中等敏感度({avg_sensitivity:.2f}),脱敏保存"
        else:
            level = DataLevel.FULL
            reason = f"低敏感度({avg_sensitivity:.2f}),完整保存"

        # Retention period follows directly from the level.
        retention_days = self.RETENTION_CONFIG[level]

        return DataClassification(
            level=level,
            retention_days=retention_days,
            sensitivity_score=avg_sensitivity,
            sensitive_fields=sensitive_fields,
            reason=reason
        )

    def apply_policy(self, record_data: Dict) -> Dict:
        """
        Apply the governance policy and return the processed record.

        Args:
            record_data: Original record.

        Returns:
            Copy of the record annotated with ``_governance`` metadata and
            minimized / sanitized according to its classification.
        """
        classification = self.classify_record(record_data)

        # Attach governance metadata (drives later expiration handling).
        result = record_data.copy()
        result['_governance'] = {
            'level': classification.level.value,
            'retention_days': classification.retention_days,
            'sensitivity_score': classification.sensitivity_score,
            'sensitive_fields': list(classification.sensitive_fields),
            'classified_at': datetime.now().isoformat(),
            'expires_at': (datetime.now() + timedelta(days=classification.retention_days)).isoformat()
        }

        # Transform the payload according to the chosen level.
        if classification.level == DataLevel.MINIMAL:
            # Minimal: keep metadata only.
            result = self._minimize_record(result)

        elif classification.level == DataLevel.SANITIZED:
            # Sanitized: mask the fields flagged during classification.
            result = self._sanitize_record(result, classification.sensitive_fields)

        # FULL level is stored verbatim.

        return result

    def _minimize_record(self, record: Dict) -> Dict:
        """
        Reduce a record to metadata only.

        Args:
            record: Original record.

        Returns:
            Minimized record: metadata fields plus redaction placeholders.
        """
        # Fields that survive minimization.
        keep_fields = {
            'task_id', 'timestamp', 'intent_label', 'intent_confidence',
            'success', 'duration_ms', 'task_summary', '_governance'
        }

        minimal = {k: v for k, v in record.items() if k in keep_fields}

        # Replace content fields with redaction markers.
        minimal['user_input'] = '[已删除-高敏感]'
        minimal['code'] = '[已删除-高敏感]'
        minimal['stdout'] = '[已删除-高敏感]'
        minimal['stderr'] = '[已删除-高敏感]'
        # Bug fix: a None execution_plan used to raise TypeError and a
        # missing one produced a bare '...'; the ellipsis now appears only
        # when the plan was actually truncated.
        plan = str(record.get('execution_plan') or '')
        minimal['execution_plan'] = plan[:100] + ('...' if len(plan) > 100 else '')

        return minimal

    def _sanitize_record(self, record: Dict, sensitive_fields: Set[str]) -> Dict:
        """
        Mask sensitive content in the given fields of a record.

        Args:
            record: Original record.
            sensitive_fields: Names of the fields to sanitize.

        Returns:
            Sanitized copy, with per-field masking stats under
            ``_sanitization``.
        """
        result = record.copy()

        for field in sensitive_fields:
            if field in result and result[field]:
                content = str(result[field])
                sanitized, matches = self.sanitizer.sanitize(content)
                result[field] = sanitized

                # Record what was masked for auditability.
                if '_sanitization' not in result:
                    result['_sanitization'] = {}
                result['_sanitization'][field] = {
                    'masked_count': len(matches),
                    'types': list(set(m.type.value for m in matches))
                }

        return result

    def check_expiration(self, record: Dict) -> bool:
        """
        Check whether a record has passed its ``expires_at`` timestamp.

        Args:
            record: Record data.

        Returns:
            True if expired; False if not expired, ungoverned, or the
            timestamp is missing/unparseable (fail-open: keep the record).
        """
        if '_governance' not in record or record['_governance'] is None:
            return False

        expires_at = record['_governance'].get('expires_at')
        if not expires_at:
            return False

        try:
            expire_time = datetime.fromisoformat(expires_at)
            return datetime.now() > expire_time
        except (ValueError, TypeError):
            return False

    def archive_record(self, record: Dict) -> Path:
        """
        Write a record to the archive directory and mark it ARCHIVED.

        Args:
            record: Record data (mutated in place: governance level and
                ``archived_at`` are updated).

        Returns:
            Path of the archive file that was written.
        """
        task_id = record.get('task_id', 'unknown')
        timestamp = record.get('timestamp', datetime.now().strftime('%Y%m%d_%H%M%S'))

        # Bug fix: record timestamps may be ISO-formatted and contain ':',
        # which is invalid in Windows filenames — replace before joining.
        safe_ts = str(timestamp).replace(':', '-')
        archive_file = self.archive_dir / f"{task_id}_{safe_ts}.json"

        # Bug fix: ungoverned records used to raise KeyError here;
        # setdefault makes archiving work for any record.
        governance = record.setdefault('_governance', {})
        governance['level'] = DataLevel.ARCHIVED.value
        governance['archived_at'] = datetime.now().isoformat()

        # Persist to the archive directory.
        with open(archive_file, 'w', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False, indent=2)

        return archive_file

    def cleanup_expired(self, records: List[Dict]) -> tuple[List[Dict], int, int]:
        """
        Process expired records.

        Expired FULL records are downgraded to SANITIZED with a fresh
        retention window; expired SANITIZED records are archived; expired
        MINIMAL (or unknown-level) records are dropped.

        Args:
            records: Records to examine.

        Returns:
            Tuple of (records to keep, archived count, deleted count).
        """
        kept_records = []
        archived_count = 0
        deleted_count = 0

        for record in records:
            if not self.check_expiration(record):
                kept_records.append(record)
                continue

            # Expired: handle according to the record's current level.
            # (check_expiration returning True guarantees '_governance' exists.)
            level = record.get('_governance', {}).get('level')

            if level == DataLevel.FULL.value:
                # Full data: downgrade to sanitized and extend retention.
                record['_governance']['level'] = DataLevel.SANITIZED.value
                record['_governance']['retention_days'] = RetentionPolicy.MEDIUM.value
                record['_governance']['expires_at'] = (
                    datetime.now() + timedelta(days=RetentionPolicy.MEDIUM.value)
                ).isoformat()

                # Mask the fields flagged at classification time.
                sensitive_fields = set(record['_governance'].get('sensitive_fields', []))
                record = self._sanitize_record(record, sensitive_fields)
                kept_records.append(record)

            elif level == DataLevel.SANITIZED.value:
                # Sanitized data: move to the archive.
                self.archive_record(record)
                archived_count += 1

            else:
                # Minimal (or unclassified) data: delete outright.
                deleted_count += 1

        return kept_records, archived_count, deleted_count

    def collect_metrics(self, records: List[Dict]) -> GovernanceMetrics:
        """
        Collect governance metrics over a set of records.

        Args:
            records: Records to measure.

        Returns:
            Populated GovernanceMetrics.
        """
        metrics = GovernanceMetrics(
            total_records=len(records),
            full_records=0,
            sanitized_records=0,
            minimal_records=0,
            archived_records=0,
            total_size_bytes=0,
            sensitive_field_hits={},
            expired_records=0,
            last_cleanup_time=datetime.now().isoformat()
        )

        for record in records:
            # Count records per storage level.
            level = record.get('_governance', {}).get('level')
            if level == DataLevel.FULL.value:
                metrics.full_records += 1
            elif level == DataLevel.SANITIZED.value:
                metrics.sanitized_records += 1
            elif level == DataLevel.MINIMAL.value:
                metrics.minimal_records += 1
            elif level == DataLevel.ARCHIVED.value:
                metrics.archived_records += 1

            # Tally sensitive-field hits.
            sensitive_fields = record.get('_governance', {}).get('sensitive_fields', [])
            for field in sensitive_fields:
                metrics.sensitive_field_hits[field] = metrics.sensitive_field_hits.get(field, 0) + 1

            # Count expired records.
            if self.check_expiration(record):
                metrics.expired_records += 1

            # Estimate size via serialized length.
            metrics.total_size_bytes += len(json.dumps(record, ensure_ascii=False))

        return metrics

    def save_metrics(self, metrics: GovernanceMetrics):
        """Persist metrics to the workspace metrics file (JSON)."""
        with open(self.metrics_file, 'w', encoding='utf-8') as f:
            data = asdict(metrics)
            json.dump(data, f, ensure_ascii=False, indent=2)

    def load_metrics(self) -> Optional[GovernanceMetrics]:
        """Load previously saved metrics; returns None if absent or invalid."""
        if not self.metrics_file.exists():
            return None

        try:
            with open(self.metrics_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return GovernanceMetrics(**data)
        except Exception as e:
            # Best-effort loader: a corrupt file warns and yields None
            # rather than crashing the caller.
            print(f"[警告] 加载度量指标失败: {e}")
            return None
|
|
|
|
|
|
# Module-level singleton instance
_policy: Optional[DataGovernancePolicy] = None


def get_governance_policy(workspace_path: Path) -> DataGovernancePolicy:
    """Return the process-wide DataGovernancePolicy singleton.

    NOTE(review): the workspace_path of the FIRST call wins — later calls
    with a different path silently return the original instance; confirm
    callers never pass differing workspaces.
    """
    global _policy
    if _policy is None:
        _policy = DataGovernancePolicy(workspace_path)
    return _policy
|
|
|