feat: refactor API key configuration and enhance application initialization

- Renamed `check_environment` to `check_api_key_configured` for clarity, simplifying the API key validation logic.
- Removed the blocking behavior of the API key check during application startup, allowing the app to run while prompting the user to configure a key.
- Updated `LocalAgentApp` to accept an `api_configured` parameter, enabling conditional messaging for API key setup.
- Enhanced the `SandboxRunner` to support backup management and improved execution result handling with detailed metrics.
- Integrated data governance strategies into the `HistoryManager`, ensuring compliance and improved data management.
- Added privacy settings and metrics tracking across various components to enhance user experience and application safety.
2026-02-27 14:32:30 +08:00
parent ab5bbff6f7
commit 8a538bb950
58 changed files with 13457 additions and 350 deletions
--- a/history/data_governance.py
+++ b/history/data_governance.py
@@ -0,0 +1,410 @@
+"""
+数据治理策略模块
+实现数据分级保存、保留期管理、归档和清理策略
+"""
+
+import json
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+from dataclasses import dataclass, asdict
+from enum import Enum
+
+from history.data_sanitizer import get_sanitizer, SensitiveType
+
+
+class DataLevel(Enum):
+    """Storage level assigned to a record."""
+    FULL = "full"           # stored in full (no sanitization)
+    SANITIZED = "sanitized" # stored after sanitization
+    MINIMAL = "minimal"     # minimal storage (metadata only)
+    ARCHIVED = "archived"   # already archived
+
+
+class RetentionPolicy(Enum):
+    """Data retention periods; each value is a number of days."""
+    SHORT = 7      # 7 days
+    MEDIUM = 30    # 30 days
+    LONG = 90      # 90 days
+    PERMANENT = -1 # keep permanently
+
+
+@dataclass
+class DataClassification:
+    """Result of classifying one record."""
+    # Storage level chosen for the record.
+    level: DataLevel
+    # Retention period in days, looked up from the level.
+    retention_days: int
+    # Combined sensitivity score computed over the inspected fields.
+    sensitivity_score: float
+    # Names of the fields whose individual score exceeded the detection cut-off.
+    sensitive_fields: Set[str]
+    # Human-readable explanation of why this level was chosen.
+    reason: str
+
+
+@dataclass
+class GovernanceMetrics:
+    """Governance metrics gathered over a list of records."""
+    # Number of records inspected.
+    total_records: int
+    # Per-level record counts.
+    full_records: int
+    sanitized_records: int
+    minimal_records: int
+    archived_records: int
+    # Estimated total size, summed over the JSON serialization of each record.
+    total_size_bytes: int
+    # Field name -> number of records that list it as a sensitive field.
+    sensitive_field_hits: Dict[str, int]
+    # Number of records that are past their expiry time.
+    expired_records: int
+    # ISO-8601 timestamp string.
+    last_cleanup_time: str
+
+
+class DataGovernancePolicy:
+    """
+    Data governance policy.
+
+    Assigns a storage level to each record based on its sensitivity and
+    manages the data lifecycle (expiry, archiving, clean-up).
+    """
+
+    # Per-field sensitivity weights; classify_record() multiplies each
+    # field's detected score by its weight.
+    FIELD_SENSITIVITY = {
+        'user_input': 0.5,      # user input may contain sensitive information
+        'code': 0.7,            # code may contain paths or secrets
+        'stdout': 0.6,          # output may contain sensitive data
+        'stderr': 0.6,          # error messages may contain paths
+        'execution_plan': 0.3,  # execution plans are comparatively safe
+        'log_path': 0.4,        # log file path
+    }
+
+    # Level thresholds (lower bound of the combined sensitivity score)
+    LEVEL_THRESHOLDS = {
+        DataLevel.FULL: 0.0,        # sensitivity < 0.3: stored in full
+        DataLevel.SANITIZED: 0.3,   # 0.3 <= sensitivity < 0.7: stored sanitized
+        DataLevel.MINIMAL: 0.7,     # sensitivity >= 0.7: minimal storage
+    }
+
+    # Retention period in days for each data level
+    RETENTION_CONFIG = {
+        DataLevel.FULL: RetentionPolicy.LONG.value,       # full data kept for 90 days
+        DataLevel.SANITIZED: RetentionPolicy.MEDIUM.value, # sanitized data kept for 30 days
+        DataLevel.MINIMAL: RetentionPolicy.SHORT.value,    # minimal data kept for 7 days
+    }
+    
+    def __init__(self, workspace_path: Path):
+        """
+        Bind the policy to a workspace directory.
+
+        Args:
+            workspace_path: Directory that holds ``governance_metrics.json``
+                and the ``archive`` sub-directory. A plain string is
+                accepted as well as a ``Path``.
+        """
+        # Coerce so that a str argument does not break the "/" joins below.
+        workspace_path = Path(workspace_path)
+        self.workspace = workspace_path
+        self.sanitizer = get_sanitizer()
+        self.metrics_file = workspace_path / "governance_metrics.json"
+        self.archive_dir = workspace_path / "archive"
+        # parents=True: a workspace that has not been created yet must not
+        # make construction fail with FileNotFoundError.
+        self.archive_dir.mkdir(parents=True, exist_ok=True)
+    
+    def classify_record(self, record_data: Dict) -> DataClassification:
+        """
+        Classify a record by the sensitivity of its content.
+
+        Every field listed in FIELD_SENSITIVITY that is present and truthy
+        is scored by the sanitizer; the score is multiplied by the field's
+        weight and the products are averaged over the number of scored
+        fields.
+
+        Args:
+            record_data: Record dictionary.
+
+        Returns:
+            The classification (level, retention, score, flagged fields,
+            reason).
+        """
+        flagged = set()
+        weighted_sum = 0.0
+        scored = 0
+
+        for name, weight in self.FIELD_SENSITIVITY.items():
+            value = record_data.get(name)
+            if not value:
+                # Missing or empty fields do not take part in the average.
+                continue
+
+            score = self.sanitizer.get_sensitivity_score(str(value))
+            if score > 0.3:
+                # Sensitive content detected in this field.
+                flagged.add(name)
+
+            weighted_sum += score * weight
+            scored += 1
+
+        # NOTE(review): the sum is divided by the field count, not by the
+        # sum of weights, so the result is capped by the weights themselves
+        # - confirm this is the intended scale for LEVEL_THRESHOLDS.
+        combined = weighted_sum / scored if scored > 0 else 0.0
+
+        # Pick the level, checking the strictest threshold first.
+        if combined >= self.LEVEL_THRESHOLDS[DataLevel.MINIMAL]:
+            chosen = DataLevel.MINIMAL
+            why = f"高敏感度({combined:.2f})，仅保留元数据"
+        elif combined >= self.LEVEL_THRESHOLDS[DataLevel.SANITIZED]:
+            chosen = DataLevel.SANITIZED
+            why = f"中等敏感度({combined:.2f})，脱敏保存"
+        else:
+            chosen = DataLevel.FULL
+            why = f"低敏感度({combined:.2f})，完整保存"
+
+        return DataClassification(
+            level=chosen,
+            retention_days=self.RETENTION_CONFIG[chosen],
+            sensitivity_score=combined,
+            sensitive_fields=flagged,
+            reason=why,
+        )
+    
+    def apply_policy(self, record_data: Dict) -> Dict:
+        """
+        Apply the governance policy and return the processed record.
+
+        The input dictionary is not modified; a shallow copy is annotated
+        with a ``_governance`` block and then minimized or sanitized
+        according to its classification.
+
+        Args:
+            record_data: Original record data.
+
+        Returns:
+            The processed record.
+        """
+        classification = self.classify_record(record_data)
+
+        # Take one timestamp so that expires_at is exactly
+        # classified_at + retention_days.
+        now = datetime.now()
+        retention_days = classification.retention_days
+        if retention_days < 0:
+            # Negative retention (RetentionPolicy.PERMANENT) means "never
+            # expires"; check_expiration() treats a missing expiry that way.
+            expires_at = None
+        else:
+            expires_at = (now + timedelta(days=retention_days)).isoformat()
+
+        # Attach governance metadata.
+        result = record_data.copy()
+        result['_governance'] = {
+            'level': classification.level.value,
+            'retention_days': retention_days,
+            'sensitivity_score': classification.sensitivity_score,
+            'sensitive_fields': list(classification.sensitive_fields),
+            'classified_at': now.isoformat(),
+            'expires_at': expires_at,
+        }
+
+        # Transform the payload according to the level.
+        if classification.level == DataLevel.MINIMAL:
+            # Minimal: keep metadata only.
+            result = self._minimize_record(result)
+        elif classification.level == DataLevel.SANITIZED:
+            # Sanitized: mask the fields that were flagged as sensitive.
+            result = self._sanitize_record(result, classification.sensitive_fields)
+
+        # FULL level: stored as-is.
+        return result
+    
+    def _minimize_record(self, record: Dict) -> Dict:
+        """
+        Minimize a record so that only metadata is kept.
+
+        Args:
+            record: Original record.
+
+        Returns:
+            A new dictionary holding the whitelisted metadata fields,
+            placeholders for the removed content fields, and a preview of
+            the execution plan limited to 100 characters.
+        """
+        # Fields that survive minimization.
+        keep_fields = {
+            'task_id', 'timestamp', 'intent_label', 'intent_confidence',
+            'success', 'duration_ms', 'task_summary', '_governance'
+        }
+
+        minimal = {k: v for k, v in record.items() if k in keep_fields}
+
+        # Replace the content fields with a fixed placeholder.
+        for name in ('user_input', 'code', 'stdout', 'stderr'):
+            minimal[name] = '[已删除-高敏感]'
+
+        # The plan may be absent, None, or a non-string structure; coerce it
+        # so that slicing and concatenation cannot raise TypeError.
+        plan = record.get('execution_plan')
+        plan_text = '' if plan is None else str(plan)
+        # NOTE(review): the preview is not passed through the sanitizer -
+        # confirm that is acceptable for high-sensitivity records.
+        minimal['execution_plan'] = plan_text[:100] + '...'
+
+        return minimal
+    
+    def _sanitize_record(self, record: Dict, sensitive_fields: Set[str]) -> Dict:
+        """
+        Sanitize the given fields of a record.
+
+        Args:
+            record: Original record; it is left unmodified, including any
+                ``_sanitization`` block it already carries.
+            sensitive_fields: Names of the fields to sanitize.
+
+        Returns:
+            A copy of the record with the fields masked and a
+            ``_sanitization`` entry describing what was masked per field.
+        """
+        result = record.copy()
+
+        # Work on a private copy: record.copy() is shallow, so writing into
+        # the existing nested dict would leak into the caller's record.
+        # ``or {}`` also covers a stored ``_sanitization: None``.
+        sanitization = dict(result.get('_sanitization') or {})
+        touched = False
+
+        for field in sensitive_fields:
+            value = result.get(field)
+            if not value:
+                continue
+
+            sanitized, matches = self.sanitizer.sanitize(str(value))
+            result[field] = sanitized
+
+            # Record what was masked; sorted for a reproducible output.
+            sanitization[field] = {
+                'masked_count': len(matches),
+                'types': sorted({m.type.value for m in matches}, key=str),
+            }
+            touched = True
+
+        # Only attach the block when at least one field was processed.
+        if touched:
+            result['_sanitization'] = sanitization
+
+        return result
+    
+    def check_expiration(self, record: Dict) -> bool:
+        """
+        Check whether a record has expired.
+
+        Args:
+            record: Record data.
+
+        Returns:
+            True when the record carries a parsable ``expires_at`` that lies
+            in the past; False when governance metadata or the expiry is
+            missing, empty, or unparsable.
+        """
+        governance = record.get('_governance')
+        if not isinstance(governance, dict):
+            return False
+
+        expires_at = governance.get('expires_at')
+        if not expires_at:
+            return False
+
+        try:
+            expire_time = datetime.fromisoformat(expires_at)
+        except (ValueError, TypeError):
+            return False
+
+        # Compare like with like: an aware expiry needs an aware "now".
+        # Mixing naive and aware datetimes raises TypeError, which would
+        # otherwise make such records look as if they never expire.
+        return datetime.now(expire_time.tzinfo) > expire_time
+    
+    def archive_record(self, record: Dict) -> Path:
+        """
+        Archive a record as a JSON file in the archive directory.
+
+        The record's ``_governance`` block is updated in place (level set
+        to archived, ``archived_at`` added) before it is written.
+
+        Args:
+            record: Record data.
+
+        Returns:
+            Path of the archive file.
+        """
+        import re  # local import: only this method needs it
+
+        task_id = record.get('task_id', 'unknown')
+        timestamp = record.get('timestamp', datetime.now().strftime('%Y%m%d_%H%M%S'))
+
+        # Build a single safe file-name component: path separators would let
+        # the file escape archive_dir, and characters such as ':' (common in
+        # ISO timestamps) are invalid in Windows file names.
+        safe_name = re.sub(r'[^\w.-]', '_', f"{task_id}_{timestamp}")
+        archive_file = self.archive_dir / f"{safe_name}.json"
+
+        # Mark as archived; create the block if the record has none.
+        governance = record.get('_governance')
+        if not isinstance(governance, dict):
+            governance = record['_governance'] = {}
+        governance['level'] = DataLevel.ARCHIVED.value
+        governance['archived_at'] = datetime.now().isoformat()
+
+        # The directory may have been removed since construction.
+        self.archive_dir.mkdir(parents=True, exist_ok=True)
+        with open(archive_file, 'w', encoding='utf-8') as f:
+            json.dump(record, f, ensure_ascii=False, indent=2)
+
+        return archive_file
+    
+    def cleanup_expired(self, records: List[Dict]) -> tuple[List[Dict], int, int]:
+        """
+        Process expired records: downgrade, archive, or drop them.
+
+        Unexpired records are kept unchanged. An expired record is handled
+        by its current level: full records are downgraded to sanitized and
+        given a fresh 30-day expiry, sanitized records are archived, and
+        everything else is dropped.
+
+        Args:
+            records: List of records.
+
+        Returns:
+            (records to keep, number archived, number deleted)
+        """
+        survivors = []
+        n_archived = 0
+        n_deleted = 0
+
+        for rec in records:
+            if not self.check_expiration(rec):
+                survivors.append(rec)
+                continue
+
+            governance = rec.get('_governance', {})
+            current_level = governance.get('level')
+
+            if current_level == DataLevel.SANITIZED.value:
+                # Sanitized data: move to the archive.
+                self.archive_record(rec)
+                n_archived += 1
+            elif current_level == DataLevel.FULL.value:
+                # Full data: downgrade to sanitized and restart the clock.
+                new_retention = RetentionPolicy.MEDIUM.value
+                governance['level'] = DataLevel.SANITIZED.value
+                governance['retention_days'] = new_retention
+                governance['expires_at'] = (
+                    datetime.now() + timedelta(days=new_retention)
+                ).isoformat()
+
+                # Mask the fields that were flagged at classification time.
+                flagged = set(governance.get('sensitive_fields', []))
+                survivors.append(self._sanitize_record(rec, flagged))
+            else:
+                # Minimal data (and any other level): dropped.
+                # NOTE(review): records with a missing or unknown level also
+                # land here - confirm that deleting them is intended.
+                n_deleted += 1
+
+        return survivors, n_archived, n_deleted
+    
+    def collect_metrics(self, records: List[Dict]) -> GovernanceMetrics:
+        """
+        Collect governance metrics over a list of records.
+
+        Args:
+            records: List of records.
+
+        Returns:
+            Metrics with per-level counts, sensitive-field hit counts, the
+            number of expired records and the total serialized size.
+        """
+        metrics = GovernanceMetrics(
+            total_records=len(records),
+            full_records=0,
+            sanitized_records=0,
+            minimal_records=0,
+            archived_records=0,
+            total_size_bytes=0,
+            sensitive_field_hits={},
+            expired_records=0,
+            last_cleanup_time=datetime.now().isoformat()
+        )
+
+        for record in records:
+            # ``or {}`` covers records that store ``_governance: None``,
+            # which check_expiration() also tolerates.
+            governance = record.get('_governance') or {}
+
+            # Count by data level.
+            level = governance.get('level')
+            if level == DataLevel.FULL.value:
+                metrics.full_records += 1
+            elif level == DataLevel.SANITIZED.value:
+                metrics.sanitized_records += 1
+            elif level == DataLevel.MINIMAL.value:
+                metrics.minimal_records += 1
+            elif level == DataLevel.ARCHIVED.value:
+                metrics.archived_records += 1
+
+            # Count sensitive-field hits.
+            for field in governance.get('sensitive_fields') or []:
+                metrics.sensitive_field_hits[field] = metrics.sensitive_field_hits.get(field, 0) + 1
+
+            # Count expired records.
+            if self.check_expiration(record):
+                metrics.expired_records += 1
+
+            # Estimate the size in bytes: measure the UTF-8 encoding, since
+            # len() of the string counts characters and under-reports
+            # non-ASCII text.
+            serialized = json.dumps(record, ensure_ascii=False)
+            metrics.total_size_bytes += len(serialized.encode('utf-8'))
+
+        return metrics
+    
+    def save_metrics(self, metrics: GovernanceMetrics):
+        """
+        Save the metrics to the metrics file as JSON.
+
+        Args:
+            metrics: Metrics to persist.
+        """
+        # Serialize before opening the file: opening with 'w' truncates it,
+        # so a serialization error must not leave an empty or half-written
+        # metrics file behind.
+        payload = json.dumps(asdict(metrics), ensure_ascii=False, indent=2)
+        with open(self.metrics_file, 'w', encoding='utf-8') as f:
+            f.write(payload)
+    
+    def load_metrics(self) -> Optional[GovernanceMetrics]:
+        """
+        Load previously saved metrics.
+
+        Returns:
+            The stored metrics, or None when the metrics file does not
+            exist or cannot be read (a warning is printed in that case).
+        """
+        metrics_path = self.metrics_file
+        if metrics_path.exists():
+            try:
+                with open(metrics_path, 'r', encoding='utf-8') as handle:
+                    return GovernanceMetrics(**json.load(handle))
+            except Exception as exc:
+                # Best effort: a broken metrics file is reported, not raised.
+                print(f"[警告] 加载度量指标失败: {exc}")
+        return None
+
+
+# Module-level singleton, created lazily by get_governance_policy().
+_policy: Optional[DataGovernancePolicy] = None
+
+
+def get_governance_policy(workspace_path: Path) -> DataGovernancePolicy:
+    """
+    Return the shared data governance policy for a workspace.
+
+    The instance is created on first use and reused while callers keep
+    asking for the same workspace. A request for a different workspace
+    replaces it, so a caller never receives a policy whose metrics file
+    and archive directory point at some other workspace.
+
+    Args:
+        workspace_path: Workspace directory the policy should be bound to.
+
+    Returns:
+        The policy bound to ``workspace_path``.
+    """
+    global _policy
+    # Path(...) on both sides so that str and Path arguments compare equal.
+    if _policy is None or Path(_policy.workspace) != Path(workspace_path):
+        _policy = DataGovernancePolicy(workspace_path)
+    return _policy
+