""" 数据迁移工具 - 从JSON迁移到Chroma向量数据库 """ import asyncio import json from pathlib import Path from datetime import datetime from src.ai.vector_store import ChromaVectorStore, JSONVectorStore, VectorMemory from src.utils.logger import setup_logger logger = setup_logger('DataMigration') async def migrate_json_to_chroma( json_path: Path, chroma_path: Path, dry_run: bool = False ): """ 从JSON迁移到Chroma Args: json_path: JSON文件路径 chroma_path: Chroma数据库路径 dry_run: 是否只是预览,不实际迁移 """ logger.info("=" * 60) logger.info("开始数据迁移:JSON -> Chroma") logger.info("=" * 60) # 1. 加载JSON数据 logger.info(f"📂 读取JSON文件: {json_path}") if not json_path.exists(): logger.error(f"❌ JSON文件不存在: {json_path}") return False json_store = JSONVectorStore(json_path) # 统计数据 total_users = len(json_store.memories) total_memories = sum(len(memories) for memories in json_store.memories.values()) logger.info(f"📊 找到 {total_users} 个用户,共 {total_memories} 条记忆") if total_memories == 0: logger.info("✅ 没有数据需要迁移") return True # 显示详细信息 for user_id, memories in json_store.memories.items(): logger.info(f" 用户 {user_id}: {len(memories)} 条记忆") if dry_run: logger.info("🔍 预览模式,不执行实际迁移") return True # 2. 初始化Chroma logger.info(f"📂 初始化Chroma数据库: {chroma_path}") try: chroma_store = ChromaVectorStore(chroma_path) except Exception as e: logger.error(f"❌ Chroma初始化失败: {e}") return False # 3. 迁移数据 logger.info("🚀 开始迁移数据...") success_count = 0 fail_count = 0 for user_id, memories in json_store.memories.items(): logger.info(f" 迁移用户 {user_id} 的数据...") for memory in memories: try: success = await chroma_store.add( id=memory.id, user_id=memory.user_id, content=memory.content, embedding=memory.embedding, importance=memory.importance, metadata=memory.metadata ) if success: success_count += 1 else: fail_count += 1 logger.warning(f" ⚠️ 迁移失败: {memory.id}") except Exception as e: fail_count += 1 logger.error(f" ❌ 迁移出错: {memory.id} - {e}") # 4. 关闭连接 await chroma_store.close() # 5. 显示结果 logger.info("=" * 60) logger.info("迁移完成!") logger.info(f"✅ 成功: {success_count} 条") if fail_count > 0: logger.info(f"❌ 失败: {fail_count} 条") logger.info("=" * 60) return fail_count == 0 async def backup_json(json_path: Path): """备份JSON文件""" if not json_path.exists(): logger.warning(f"⚠️ JSON文件不存在,无需备份: {json_path}") return None timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") backup_path = json_path.parent / f"{json_path.stem}_backup_{timestamp}.json" logger.info(f"💾 备份JSON文件: {backup_path}") try: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) with open(backup_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) logger.info(f"✅ 备份成功: {backup_path}") return backup_path except Exception as e: logger.error(f"❌ 备份失败: {e}") return None async def main(): """主函数""" import argparse parser = argparse.ArgumentParser(description='数据迁移工具:JSON -> Chroma') parser.add_argument( '--json-path', type=str, default='data/ai/long_term_memory.json', help='JSON文件路径' ) parser.add_argument( '--chroma-path', type=str, default='data/ai/chroma_db', help='Chroma数据库路径' ) parser.add_argument( '--dry-run', action='store_true', help='预览模式,不实际迁移' ) parser.add_argument( '--no-backup', action='store_true', help='不备份原JSON文件' ) args = parser.parse_args() json_path = Path(args.json_path) chroma_path = Path(args.chroma_path) print("\n" + "=" * 60) print("数据迁移工具 - JSON to Chroma") print("=" * 60) print(f"JSON路径: {json_path}") print(f"Chroma路径: {chroma_path}") print(f"预览模式: {'是' if args.dry_run else '否'}") print(f"备份: {'否' if args.no_backup else '是'}") print("=" * 60 + "\n") # 备份 if not args.no_backup and not args.dry_run: backup_path = await backup_json(json_path) if backup_path: print(f"\n✅ 已备份到: {backup_path}\n") # 迁移 success = await migrate_json_to_chroma( json_path=json_path, chroma_path=chroma_path, dry_run=args.dry_run ) if success: print("\n✅ 迁移成功!") if not args.dry_run: print(f"\n💡 提示:") print(f"1. 请在 .env 中设置 AI_USE_VECTOR_DB=true") print(f"2. 重启机器人即可使用Chroma向量数据库") print(f"3. 原JSON文件已备份,可以安全删除") else: print("\n❌ 迁移失败,请查看日志") print() if __name__ == "__main__": asyncio.run(main())