"""
Douyin data FastAPI service.

Exposes HTTP endpoints for Douyin video search, creative-guidance crawling,
AI analysis of video data, an AI agent, and keyword extraction.
"""
import asyncio
import json
import logging
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from playwright.async_api import async_playwright
from pydantic import BaseModel, Field

# Load environment variables before the project modules are imported.
load_dotenv()

# Make the douyin_data_soupce directory importable.
sys.path.append(str(Path(__file__).parent / "douyin_data_soupce"))

from douyin_data_soupce.douyin_search_crawler import DouyinSearchCrawler
from ai_analyzer import AIAnalyzer
from ai_agent import create_agent

logger = logging.getLogger(__name__)

app = FastAPI(title="抖音数据API", description="提供抖音视频搜索、创作指导、AI分析和智能代理功能")

# Characters that are unsafe inside a single file-name component
# (path separators, Windows-reserved characters, control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename_part(text: str) -> str:
    """Return *text* with file-system-unsafe characters replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub("_", text)


# Request body for POST /api/search.
class SearchRequest(BaseModel):
    keyword: str = Field(..., description="搜索关键词", min_length=1)
    max_scroll: int = Field(default=5, description="最大滚动次数", ge=1, le=20)
    headless: bool = Field(default=True, description="是否使用无头模式")
    cookie_file: str = Field(default="douyin_data_soupce/douyin_cookie.json", description="Cookie文件路径")


# One video entry in a search result.
class VideoInfo(BaseModel):
    url: str
    title: str
    author: str
    publishTime: str
    duration: str
    playCount: str
    tags: List[str]


# Response body for POST /api/search.
class SearchResponse(BaseModel):
    success: bool
    keyword: str
    total_count: int
    videos: List[VideoInfo]
    message: Optional[str] = None


# Request body for POST /api/creative-guidance.
class CreativeGuidanceRequest(BaseModel):
    category: str = Field(default="全部", description="分类标签(全部/美食/旅行/泛生活/汽车/科技/游戏/二次元)")
    headless: bool = Field(default=True, description="是否使用无头模式")
    cookie_file: str = Field(default="douyin_data_soupce/douyin_cookie.json", description="Cookie文件路径")
    output_dir: str = Field(default="douyin_data_soupce/douyin_data", description="输出目录")


# One video entry scraped from the creative-guidance page.
class CreativeVideoInfo(BaseModel):
    index: int
    author: str
    description: str
    authorLink: str
    duration: Optional[str] = None
    hot: Optional[str] = None
    plays: Optional[str] = None
    likes: Optional[str] = None
    comments: Optional[str] = None
    hotWords: List[str] = Field(default_factory=list)
    hashTags: List[str] = Field(default_factory=list)


# Response body for POST /api/creative-guidance.
class CreativeGuidanceResponse(BaseModel):
    success: bool
    category: str
    total_count: int
    videos: List[CreativeVideoInfo]
    page_url: str
    crawl_time: str
    message: Optional[str] = None


# Request body for POST /api/analyze (video data supplied inline).
class AnalyzeRequest(BaseModel):
    videos: List[dict] = Field(..., description="视频数据列表")
    prompt_file: str = Field(default="prompts/analyze_prompt.md", description="提示词文件路径")
    custom_instruction: Optional[str] = Field(None, description="自定义分析指令")
    model: str = Field(default="qwen-plus", description="使用的模型名称")
    api_key: Optional[str] = Field(None, description="阿里云百炼API Key(可选,默认从环境变量读取)")


# Request body for POST /api/analyze-file (video data read from a JSON file).
class AnalyzeFileRequest(BaseModel):
    json_file: str = Field(..., description="JSON数据文件路径")
    prompt_file: str = Field(default="prompts/analyze_prompt.md", description="提示词文件路径")
    custom_instruction: Optional[str] = Field(None, description="自定义分析指令")
    model: str = Field(default="qwen-plus", description="使用的模型名称")
    api_key: Optional[str] = Field(None, description="阿里云百炼API Key(可选,默认从环境变量读取)")


# Response body shared by both analyze endpoints.
class AnalyzeResponse(BaseModel):
    success: bool
    analysis: Optional[str] = None
    model: Optional[str] = None
    video_count: int
    usage: Optional[dict] = None
    error: Optional[str] = None


# Request body for POST /api/agent.
class AgentRequest(BaseModel):
    query: str = Field(..., description="用户查询", min_length=1)
    system_prompt_file: str = Field(default="prompts/agent_prompt.md", description="系统提示词文件路径")
    max_iterations: int = Field(default=10, description="最大迭代次数", ge=1, le=20)
    model: str = Field(default="qwen-plus", description="使用的模型名称")
    api_key: Optional[str] = Field(None, description="阿里云百炼API Key(可选)")


# Response body for POST /api/agent.
class AgentResponse(BaseModel):
    success: bool
    final_answer: Optional[str] = None
    iteration: int
    tool_calls: List[dict] = Field(default_factory=list)
    error: Optional[str] = None


# Request body for POST /api/extract-keywords.
class ExtractKeywordsRequest(BaseModel):
    query: str = Field(..., description="用户输入的文字", min_length=1)


# Response body for POST /api/extract-keywords.
class ExtractKeywordsResponse(BaseModel):
    success: bool
    categories: List[str] = Field(default_factory=list)
    primary_category: Optional[str] = None
    original_query: str
    method: Optional[str] = None
    error: Optional[str] = None


@app.get("/")
async def root():
    """Return a small index of the service: docs URL and available endpoints."""
    return {
        "message": "抖音数据API",
        "docs": "/docs",
        "endpoints": {
            "search": "/api/search",
            "creative_guidance": "/api/creative-guidance",
            "analyze": "/api/analyze",
            "analyze_file": "/api/analyze-file",
            "agent": "/api/agent",
            "extract_keywords": "/api/extract-keywords",
        },
    }


async def load_cookies_for_creative(cookie_file: str):
    """Load cookies from a JSON file.

    Args:
        cookie_file: Path of the JSON cookie file.

    Returns:
        The parsed JSON content, or ``None`` when the file cannot be read or
        is not valid JSON (best effort: the caller continues without cookies).
    """
    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        logger.warning("Failed to load cookie file %s: %s", cookie_file, exc)
        return None


async def extract_creative_video_data(page):
    """Extract video entries from the creative-guidance page.

    Runs a script in the page that, for every author link, walks up to the
    surrounding card and collects description, statistics, hot words,
    hashtags and duration.

    Args:
        page: A Playwright page that already shows the creative-guidance page.

    Returns:
        A dict with the keys ``total``, ``videos``, ``crawlTime``,
        ``pageTitle`` and ``pageUrl``.
    """
    data = await page.evaluate("""
        () => {
            const videos = [];
            const authorLinks = Array.from(document.querySelectorAll('a[href*="iesdouyin.com/share/user/"]'));

            authorLinks.forEach((authorLink, index) => {
                try {
                    const author = authorLink.textContent.trim();

                    let container = authorLink.parentElement;
                    let maxLevels = 10;
                    while (container && maxLevels > 0) {
                        if (container.querySelector('.contain-info-LpWGHS')) {
                            break;
                        }
                        container = container.parentElement;
                        maxLevels--;
                    }
                    if (!container) return;

                    const paragraphs = Array.from(container.querySelectorAll('p'));
                    let description = '';
                    for (let p of paragraphs) {
                        const text = p.textContent.trim();
                        if (text && text !== '|' && text.length > 5 && !text.includes('万') && !text.includes(':')) {
                            description = text;
                            break;
                        }
                    }

                    let hot = '', plays = '', likes = '', comments = '';
                    const infoContainer = container.querySelector('.contain-info-LpWGHS');
                    if (infoContainer) {
                        const infoItems = infoContainer.querySelectorAll('.each-info-TpmTI0');
                        infoItems.forEach(item => {
                            const img = item.querySelector('img');
                            const text = item.textContent.trim();
                            if (img && img.src) {
                                if (img.src.includes('hot_')) {
                                    hot = text;
                                } else if (img.src.includes('play')) {
                                    plays = text;
                                } else if (img.src.includes('digg')) {
                                    likes = text;
                                } else if (img.src.includes('comment')) {
                                    comments = text;
                                }
                            }
                        });
                    }

                    const hotWords = [];
                    const hotWordElements = container.querySelectorAll('.other-text-XeleRf');
                    hotWordElements.forEach((el) => {
                        const text = el.textContent.trim();
                        if (text && !text.includes('热词')) {
                            hotWords.push(text);
                        }
                    });

                    const hashTags = description.match(/#[^\\s#]+/g) || [];

                    let duration = '';
                    const timeElements = container.querySelectorAll('.time-text-mask-WmpK85 p');
                    if (timeElements.length > 0) {
                        duration = timeElements[0].textContent.trim();
                    }

                    if (author && description) {
                        videos.push({
                            index: index + 1,
                            author: author,
                            description: description,
                            authorLink: authorLink.href,
                            duration: duration,
                            hot: hot,
                            plays: plays,
                            likes: likes,
                            comments: comments,
                            hotWords: hotWords,
                            hashTags: hashTags
                        });
                    }
                } catch (e) {
                }
            });

            return {
                total: videos.length,
                videos: videos,
                crawlTime: new Date().toISOString(),
                pageTitle: document.title,
                pageUrl: window.location.href
            };
        }
    """)
    return data


async def crawl_creative_guidance_api(category: str, headless: bool, cookie_file: str, output_dir: str):
    """Crawl the Douyin creative-guidance page (API variant).

    Opens the page in Chromium, optionally selects a category tab, extracts
    the video entries and, when any were found, also writes them to a
    timestamped JSON file in ``output_dir``.

    Args:
        category: Category label to select; ``"全部"`` or an empty value keeps
            the default tab.
        headless: Whether to launch the browser headless.
        cookie_file: Path of the JSON cookie file; ignored when missing or
            unreadable.
        output_dir: Directory for the JSON dump (created if necessary).

    Returns:
        The dict produced by ``extract_creative_video_data``.
    """
    url = "https://creator.douyin.com/creator-micro/creative-guidance"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)

        context_options = {}
        if Path(cookie_file).exists():
            cookies = await load_cookies_for_creative(cookie_file)
            if cookies:
                context_options['storage_state'] = {'cookies': cookies}

        context = await browser.new_context(**context_options)
        page = await context.new_page()

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Fixed wait for the client-side rendering of the page.
            await asyncio.sleep(10)

            if category and category != "全部":
                # Expand all category groups first.
                await page.evaluate("""
                    () => {
                        const showButtons = document.querySelectorAll('.show-button-sDo51G');
                        showButtons.forEach(btn => {
                            const text = btn.textContent.trim();
                            // 如果按钮不是"收起",说明需要展开
                            if (!text.includes('收起')) {
                                btn.click();
                            }
                        });
                    }
                """)
                await asyncio.sleep(1)

                # Find and click the category tab. The category is passed as
                # an argument rather than interpolated into the script, so
                # quotes or script fragments in user input cannot alter it.
                category_clicked = await page.evaluate(
                    """
                    (category) => {
                        const categoryDivs = Array.from(document.querySelectorAll('.each-kind-MR__DN'));
                        const targetDiv = categoryDivs.find(div =>
                            div.textContent.trim() === category
                        );
                        if (targetDiv) {
                            targetDiv.click();
                            return true;
                        }
                        return false;
                    }
                    """,
                    category,
                )

                if category_clicked:
                    # Give the newly selected tab time to load its videos.
                    await asyncio.sleep(8)

            data = await extract_creative_video_data(page)

            if data['total'] > 0:
                # Save to file.
                # NOTE(review): output_dir comes straight from the request;
                # confirm the API is only reachable by trusted callers.
                output_path = Path(output_dir)
                output_path.mkdir(parents=True, exist_ok=True)

                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                # The category is user input: strip path separators and
                # reserved characters before using it in a file name.
                category_suffix = f"_{_safe_filename_part(category)}" if category else ""
                json_file = output_path / f"douyin_creative_guidance{category_suffix}_{timestamp}.json"

                result_data = {
                    'page_url': url,
                    'page_title': '抖音创作指导 - 热门视频',
                    'category': category or '全部',
                    'platform': 'douyin',
                    'crawl_time': datetime.now().isoformat(),
                    'total_videos': len(data['videos']),
                    'videos': data['videos'],
                }

                with open(json_file, "w", encoding="utf-8") as f:
                    json.dump(result_data, f, ensure_ascii=False, indent=2)

            return data
        finally:
            await browser.close()


@app.post("/api/creative-guidance", response_model=CreativeGuidanceResponse)
async def get_creative_guidance(request: CreativeGuidanceRequest):
    """
    Fetch Douyin creative-guidance data.

    - **category**: category label (全部/美食/旅行/泛生活/汽车/科技/游戏/二次元), default "全部"
    - **headless**: whether to run the browser headless, default True
    - **cookie_file**: path of the cookie file
    - **output_dir**: output directory
    """
    try:
        # Check the cookie file.
        if not Path(request.cookie_file).exists():
            raise HTTPException(
                status_code=400,
                detail=f"Cookie文件不存在: {request.cookie_file}"
            )

        # Crawl the data.
        data = await crawl_creative_guidance_api(
            category=request.category,
            headless=request.headless,
            cookie_file=request.cookie_file,
            output_dir=request.output_dir
        )

        if not data or data['total'] == 0:
            return CreativeGuidanceResponse(
                success=False,
                category=request.category,
                total_count=0,
                videos=[],
                page_url="https://creator.douyin.com/creator-micro/creative-guidance",
                crawl_time=datetime.now().isoformat(),
                message="未获取到任何视频数据,请确保已登录并且页面加载完成"
            )

        return CreativeGuidanceResponse(
            success=True,
            category=request.category,
            total_count=data['total'],
            videos=data['videos'],
            page_url=data['pageUrl'],
            crawl_time=data['crawlTime'],
            message=f"成功获取 {data['total']} 个视频"
        )
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) instead of
        # re-wrapping them as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取创作指导数据时出错: {str(e)}") from e


@app.post("/api/search", response_model=SearchResponse)
async def search_douyin(request: SearchRequest):
    """
    Search Douyin videos.

    - **keyword**: search keyword (required)
    - **max_scroll**: maximum number of scrolls, default 5
    - **headless**: whether to run the browser headless, default True
    - **cookie_file**: path of the cookie file
    """
    crawler = DouyinSearchCrawler(headless=request.headless)

    try:
        # Start the browser.
        await crawler.init_browser()

        # Load cookies.
        cookie_loaded = await crawler.load_cookies(request.cookie_file)
        if not cookie_loaded:
            raise HTTPException(
                status_code=400,
                detail=f"无法加载Cookie文件: {request.cookie_file}"
            )

        # Search for videos.
        videos = await crawler.search_videos(request.keyword, max_scroll=request.max_scroll)

        if not videos:
            return SearchResponse(
                success=False,
                keyword=request.keyword,
                total_count=0,
                videos=[],
                message="未获取到任何视频数据"
            )

        # Persist the results.
        await crawler.save_results(request.keyword, videos)

        return SearchResponse(
            success=True,
            keyword=request.keyword,
            total_count=len(videos),
            videos=videos,
            message="搜索成功"
        )
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) instead of
        # re-wrapping them as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"搜索过程中出错: {str(e)}") from e
    finally:
        await crawler.close()


@app.post("/api/analyze", response_model=AnalyzeResponse)
async def analyze_videos(request: AnalyzeRequest):
    """
    Analyze video data with AI.

    - **videos**: list of video data (required)
    - **prompt_file**: prompt file path, default "prompts/analyze_prompt.md"
    - **custom_instruction**: custom analysis instruction (optional)
    - **model**: model name, default "qwen-plus"
    - **api_key**: Alibaba Cloud Bailian API key (optional; read from the
      DASHSCOPE_API_KEY environment variable by default)
    """
    try:
        # Check the prompt file.
        if not Path(request.prompt_file).exists():
            raise HTTPException(
                status_code=400,
                detail=f"提示词文件不存在: {request.prompt_file}"
            )

        # Create the analyzer.
        try:
            analyzer = AIAnalyzer(api_key=request.api_key, model=request.model)
        except ValueError as e:
            raise HTTPException(
                status_code=400,
                detail=str(e)
            ) from e

        # Run the analysis.
        result = analyzer.analyze(
            videos=request.videos,
            prompt_file=request.prompt_file,
            custom_instruction=request.custom_instruction
        )

        return AnalyzeResponse(**result)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"分析过程中出错: {str(e)}") from e


@app.post("/api/analyze-file", response_model=AnalyzeResponse)
async def analyze_from_file(request: AnalyzeFileRequest):
    """
    Read video data from a JSON file and analyze it with AI.

    - **json_file**: JSON data file path (required)
    - **prompt_file**: prompt file path, default "prompts/analyze_prompt.md"
    - **custom_instruction**: custom analysis instruction (optional)
    - **model**: model name, default "qwen-plus"
    - **api_key**: Alibaba Cloud Bailian API key (optional; read from the
      DASHSCOPE_API_KEY environment variable by default)
    """
    try:
        # Check the input files.
        if not Path(request.json_file).exists():
            raise HTTPException(
                status_code=400,
                detail=f"JSON文件不存在: {request.json_file}"
            )

        if not Path(request.prompt_file).exists():
            raise HTTPException(
                status_code=400,
                detail=f"提示词文件不存在: {request.prompt_file}"
            )

        # Create the analyzer.
        try:
            analyzer = AIAnalyzer(api_key=request.api_key, model=request.model)
        except ValueError as e:
            raise HTTPException(
                status_code=400,
                detail=str(e)
            ) from e

        # Run the analysis.
        result = analyzer.analyze_from_file(
            json_file=request.json_file,
            prompt_file=request.prompt_file,
            custom_instruction=request.custom_instruction
        )

        return AnalyzeResponse(**result)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"分析过程中出错: {str(e)}") from e


@app.post("/api/agent", response_model=AgentResponse)
async def run_agent(request: AgentRequest):
    """
    Run the AI agent - creative inspiration generation.

    Based on the user's description the agent will:
    1. Understand the request and extract content categories
    2. Fetch related trending video data
    3. Generate 9 concrete, actionable creative ideas
    4. Provide more ideas if the user is not satisfied

    **Supported content categories**:
    - 美食、旅行、泛生活、汽车、科技、游戏、二次元
    - 娱乐、明星、体育、文化教育、校园、政务
    - 时尚、才艺、随拍、动植物、图文控
    - 剧情、亲子、三农、创意、户外、公益

    **Parameters**:
    - **query**: user description (required), for example:
      - "我想做一些美食相关的短视频,主要是家常菜的制作教程"
      - "我想拍一些关于大学生活的有趣视频,记录校园日常"
      - "我想做一些关于健身的短视频,分享简单的居家锻炼方法"
    - **system_prompt_file**: system prompt file, default "prompts/agent_prompt.md"
    - **max_iterations**: maximum number of iterations, default 10
      (15 is recommended to cover the full flow)
    - **model**: model name, default "qwen-plus"
    - **api_key**: Alibaba Cloud Bailian API key (optional)

    **Returns**:
    - The recognized content categories
    - 9 creative ideas (title, core idea, execution advice, trending tags)
    - Creation tips and suggestions

    **Example**:
    ```json
    {
        "query": "我想做一些美食相关的短视频,主要是家常菜的制作教程",
        "max_iterations": 15
    }
    ```
    """
    try:
        # Check the prompt file.
        if not Path(request.system_prompt_file).exists():
            raise HTTPException(
                status_code=400,
                detail=f"系统提示词文件不存在: {request.system_prompt_file}"
            )

        # Create the agent.
        try:
            agent = create_agent()
            if request.api_key:
                agent.api_key = request.api_key
                import dashscope
                # NOTE(review): this sets a module-level attribute, so the
                # key stays in effect for later requests in this process.
                dashscope.api_key = request.api_key
            agent.model = request.model
        except ValueError as e:
            raise HTTPException(
                status_code=400,
                detail=str(e)
            ) from e

        # Run the agent.
        result = await agent.run(
            user_input=request.query,
            system_prompt_file=request.system_prompt_file,
            max_iterations=request.max_iterations
        )

        return AgentResponse(
            success=result["success"],
            final_answer=result.get("final_answer"),
            iteration=result["iteration"],
            tool_calls=result.get("tool_calls", []),
            error=result.get("error")
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Agent执行过程中出错: {str(e)}") from e


@app.post("/api/extract-keywords", response_model=ExtractKeywordsResponse)
async def extract_keywords(request: ExtractKeywordsRequest):
    """
    Extract keywords - derive 1-3 content-category keywords from user input.

    This endpoint lets the client show the extracted keywords to the user for
    confirmation before calling /api/agent to generate ideas.

    **Workflow**:
    1. The user enters a description
    2. The client calls this endpoint to extract keywords
    3. The front end shows the keywords to the user
    4. After confirmation the front end calls /api/agent to generate ideas

    **Supported content categories**:
    - 美食、旅行、泛生活、汽车、科技、游戏、二次元
    - 娱乐、明星、体育、文化教育、校园、政务
    - 时尚、才艺、随拍、动植物、图文控
    - 剧情、亲子、三农、创意、户外、公益

    **Parameters**:
    - **query**: the user's input text (required)

    **Returns**:
    - **success**: whether extraction succeeded
    - **categories**: extracted keywords (1-3, possibly empty)
    - **primary_category**: main keyword (the first one, may be null)
    - **original_query**: the user's original input
    - **method**: extraction method
    - **error**: error message, if any

    **Example request**:
    ```json
    {
        "query": "我想做一些关于大学生活的有趣视频,记录校园日常"
    }
    ```

    **Example response**:
    ```json
    {
        "success": true,
        "categories": ["校园", "青春"],
        "primary_category": "校园",
        "original_query": "我想做一些关于大学生活的有趣视频,记录校园日常",
        "method": "ai_classification"
    }
    ```
    """
    try:
        from ai_agent import extract_search_keywords

        # Call the keyword-extraction function.
        result = extract_search_keywords(request.query)

        return ExtractKeywordsResponse(
            success=result.get("success", True),
            categories=result.get("categories", []),
            primary_category=result.get("primary_category"),
            original_query=result.get("original_query", request.query),
            method=result.get("method"),
            error=None
        )
    except Exception as e:
        # Failures are reported in the response body rather than as an HTTP
        # error, so the client always receives the same response shape.
        return ExtractKeywordsResponse(
            success=False,
            categories=[],
            primary_category=None,
            original_query=request.query,
            method="error",
            error=str(e)
        )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8002)