You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

623 lines
22 KiB

"""
抖音搜索 FastAPI 接口
提供搜索抖音视频的 HTTP API
"""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from typing import Optional, List
import asyncio
import sys
import os
from pathlib import Path
import json
from datetime import datetime
from playwright.async_api import async_playwright
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# 添加 douyin_data_soupce 到路径
sys.path.append(str(Path(__file__).parent / "douyin_data_soupce"))
from douyin_data_soupce.douyin_search_crawler import DouyinSearchCrawler
from ai_analyzer import AIAnalyzer
from ai_agent import create_agent
# FastAPI application instance; every route in this module is registered on it.
app = FastAPI(title="抖音数据API", description="提供抖音视频搜索、创作指导、AI分析和智能代理功能")
class SearchRequest(BaseModel):
    """Request body accepted by ``POST /api/search``."""

    # Search keyword; required, at least one character long.
    keyword: str = Field(
        ...,
        min_length=1,
        description="搜索关键词",
    )
    # Upper bound on scroll actions, validated to the range 1-20.
    max_scroll: int = Field(
        default=5,
        ge=1,
        le=20,
        description="最大滚动次数",
    )
    # Whether the browser is started in headless mode.
    headless: bool = Field(
        default=True,
        description="是否使用无头模式",
    )
    # Path of the cookie file handed to the crawler.
    cookie_file: str = Field(
        default="douyin_data_soupce/douyin_cookie.json",
        description="Cookie文件路径",
    )
class VideoInfo(BaseModel):
    """A single video entry inside a search response.

    Every scalar field is carried as a string; ``tags`` is a list of strings.
    """

    url: str
    title: str
    author: str
    publishTime: str
    duration: str
    playCount: str
    tags: List[str]
class SearchResponse(BaseModel):
    """Response body returned by ``POST /api/search``."""

    success: bool
    # Keyword the search was run with.
    keyword: str
    # Number of entries in ``videos``.
    total_count: int
    videos: List[VideoInfo]
    # Optional human-readable status text.
    message: Optional[str] = None
class CreativeGuidanceRequest(BaseModel):
    """Request body accepted by ``POST /api/creative-guidance``."""

    # Category label to select on the page; "全部" means no category filter.
    category: str = Field(
        default="全部",
        description="分类标签(全部/美食/旅行/泛生活/汽车/科技/游戏/二次元)",
    )
    # Whether the browser is started in headless mode.
    headless: bool = Field(
        default=True,
        description="是否使用无头模式",
    )
    # Path of the cookie file used to build the browser context.
    cookie_file: str = Field(
        default="douyin_data_soupce/douyin_cookie.json",
        description="Cookie文件路径",
    )
    # Directory the crawl result JSON file is written to.
    output_dir: str = Field(
        default="douyin_data_soupce/douyin_data",
        description="输出目录",
    )
class CreativeVideoInfo(BaseModel):
    """A single video card extracted from the creative-guidance page."""

    # Identity and text of the card.
    index: int
    author: str
    description: str
    authorLink: str

    # Optional card details, all carried as strings when present.
    duration: Optional[str] = None
    hot: Optional[str] = None
    plays: Optional[str] = None
    likes: Optional[str] = None
    comments: Optional[str] = None

    # Hot words shown on the card and hashtags found in the description.
    hotWords: List[str] = []
    hashTags: List[str] = []
class CreativeGuidanceResponse(BaseModel):
    """Response body returned by ``POST /api/creative-guidance``."""

    success: bool
    # Category the request asked for.
    category: str
    # Number of entries in ``videos``.
    total_count: int
    videos: List[CreativeVideoInfo]
    # URL of the crawled page and the time of the crawl, both as strings.
    page_url: str
    crawl_time: str
    # Optional human-readable status text.
    message: Optional[str] = None
class AnalyzeRequest(BaseModel):
    """Request body accepted by ``POST /api/analyze``."""

    # Video records to analyse; required.
    videos: List[dict] = Field(
        ...,
        description="视频数据列表",
    )
    # Path of the prompt file passed to the analyzer.
    prompt_file: str = Field(
        default="prompts/analyze_prompt.md",
        description="提示词文件路径",
    )
    # Optional extra instruction passed to the analyzer.
    custom_instruction: Optional[str] = Field(
        None,
        description="自定义分析指令",
    )
    # Name of the model to use.
    model: str = Field(
        default="qwen-plus",
        description="使用的模型名称",
    )
    # Optional API key passed to the analyzer.
    api_key: Optional[str] = Field(
        None,
        description="阿里云百炼API Key(可选,默认从环境变量读取)",
    )
class AnalyzeFileRequest(BaseModel):
    """Request body accepted by ``POST /api/analyze-file``."""

    # Path of the JSON data file to analyse; required.
    json_file: str = Field(
        ...,
        description="JSON数据文件路径",
    )
    # Path of the prompt file passed to the analyzer.
    prompt_file: str = Field(
        default="prompts/analyze_prompt.md",
        description="提示词文件路径",
    )
    # Optional extra instruction passed to the analyzer.
    custom_instruction: Optional[str] = Field(
        None,
        description="自定义分析指令",
    )
    # Name of the model to use.
    model: str = Field(
        default="qwen-plus",
        description="使用的模型名称",
    )
    # Optional API key passed to the analyzer.
    api_key: Optional[str] = Field(
        None,
        description="阿里云百炼API Key(可选,默认从环境变量读取)",
    )
class AnalyzeResponse(BaseModel):
    """Response body shared by the two analysis endpoints.

    It is built by unpacking the analyzer's result dict, so the field names
    here are the keys that dict may provide.
    """

    success: bool
    # Analysis text, when one was produced.
    analysis: Optional[str] = None
    model: Optional[str] = None
    # Required: number of videos the analysis covered.
    video_count: int
    usage: Optional[dict] = None
    # Error text, when the analysis failed.
    error: Optional[str] = None
class AgentRequest(BaseModel):
    """Request body accepted by ``POST /api/agent``."""

    # User query; required, at least one character long.
    query: str = Field(
        ...,
        min_length=1,
        description="用户查询",
    )
    # Path of the system prompt file handed to the agent.
    system_prompt_file: str = Field(
        default="prompts/agent_prompt.md",
        description="系统提示词文件路径",
    )
    # Iteration limit for the agent run, validated to the range 1-20.
    max_iterations: int = Field(
        default=10,
        ge=1,
        le=20,
        description="最大迭代次数",
    )
    # Name of the model to use.
    model: str = Field(
        default="qwen-plus",
        description="使用的模型名称",
    )
    # Optional API key overriding the agent's own.
    api_key: Optional[str] = Field(
        None,
        description="阿里云百炼API Key(可选)",
    )
class AgentResponse(BaseModel):
    """Response body returned by ``POST /api/agent``."""

    success: bool
    # Final answer of the agent, when one was produced.
    final_answer: Optional[str] = None
    # Iteration count reported by the agent run.
    iteration: int
    # Tool calls reported by the agent run.
    tool_calls: List[dict] = []
    # Error text, when the run failed.
    error: Optional[str] = None
@app.get("/")
async def root():
    """Return the service name, the docs path and the API route map."""
    # Route map keyed by feature name.
    endpoint_map = {
        "search": "/api/search",
        "creative_guidance": "/api/creative-guidance",
        "analyze": "/api/analyze",
        "analyze_file": "/api/analyze-file",
        "agent": "/api/agent",
    }
    payload = {"message": "抖音数据API", "docs": "/docs"}
    payload["endpoints"] = endpoint_map
    return payload
async def load_cookies_for_creative(cookie_file: str):
    """Load cookies from a JSON file.

    Args:
        cookie_file: Path of a UTF-8 encoded JSON file.

    Returns:
        The parsed JSON content, or ``None`` when the file cannot be opened
        or does not contain valid JSON. The failure is logged as a warning
        instead of being dropped silently.
    """
    import logging

    try:
        with open(cookie_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # OSError covers missing/unreadable files; json.JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        logging.getLogger(__name__).warning(
            "Failed to load cookie file %s: %s", cookie_file, exc
        )
        return None
async def extract_creative_video_data(page):
"""从创作指导页面中提取视频数据"""
data = await page.evaluate("""
() => {
const videos = [];
const authorLinks = Array.from(document.querySelectorAll('a[href*="iesdouyin.com/share/user/"]'));
authorLinks.forEach((authorLink, index) => {
try {
const author = authorLink.textContent.trim();
let container = authorLink.parentElement;
let maxLevels = 10;
while (container && maxLevels > 0) {
if (container.querySelector('.contain-info-LpWGHS')) {
break;
}
container = container.parentElement;
maxLevels--;
}
if (!container) return;
const paragraphs = Array.from(container.querySelectorAll('p'));
let description = '';
for (let p of paragraphs) {
const text = p.textContent.trim();
if (text && text !== '|' && text.length > 5 && !text.includes('') && !text.includes(':')) {
description = text;
break;
}
}
let hot = '', plays = '', likes = '', comments = '';
const infoContainer = container.querySelector('.contain-info-LpWGHS');
if (infoContainer) {
const infoItems = infoContainer.querySelectorAll('.each-info-TpmTI0');
infoItems.forEach(item => {
const img = item.querySelector('img');
const text = item.textContent.trim();
if (img && img.src) {
if (img.src.includes('hot_')) {
hot = text;
} else if (img.src.includes('play')) {
plays = text;
} else if (img.src.includes('digg')) {
likes = text;
} else if (img.src.includes('comment')) {
comments = text;
}
}
});
}
const hotWords = [];
const hotWordElements = container.querySelectorAll('.other-text-XeleRf');
hotWordElements.forEach((el, i) => {
const text = el.textContent.trim();
if (i === 0 && text.includes('热词')) {
} else if (text && !text.includes('热词')) {
hotWords.push(text);
}
});
const hashTags = description.match(/#[^\\s#]+/g) || [];
let duration = '';
const timeElements = container.querySelectorAll('.time-text-mask-WmpK85 p');
if (timeElements.length > 0) {
duration = timeElements[0].textContent.trim();
}
if (author && description) {
videos.push({
index: index + 1,
author: author,
description: description,
authorLink: authorLink.href,
duration: duration,
hot: hot,
plays: plays,
likes: likes,
comments: comments,
hotWords: hotWords,
hashTags: hashTags
});
}
} catch (e) {
}
});
return {
total: videos.length,
videos: videos,
crawlTime: new Date().toISOString(),
pageTitle: document.title,
pageUrl: window.location.href
};
}
""")
return data
async def crawl_creative_guidance_api(category: str, headless: bool, cookie_file: str, output_dir: str):
    """Crawl the Douyin creative-guidance page (API variant).

    Opens the page in Chromium, optionally selects a category, extracts the
    video cards and, when at least one was found, writes them to a
    timestamped JSON file in ``output_dir``.

    Args:
        category: Category label to click on the page. An empty value or
            "全部" skips category selection.
        headless: Whether Chromium is launched in headless mode.
        cookie_file: Path of a JSON cookie file. When it exists and loads to
            a non-empty value, it is used as the context's cookies.
        output_dir: Directory for the result file; created when missing.

    Returns:
        The dict returned by ``extract_creative_video_data``.
    """
    url = "https://creator.douyin.com/creator-micro/creative-guidance"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        # Everything after launch is inside try/finally so the browser is
        # closed even when creating the context or page fails.
        try:
            context_options = {}
            if Path(cookie_file).exists():
                cookies = await load_cookies_for_creative(cookie_file)
                if cookies:
                    context_options['storage_state'] = {'cookies': cookies}

            context = await browser.new_context(**context_options)
            page = await context.new_page()

            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Fixed pause after navigation, presumably to let the page render.
            await asyncio.sleep(10)

            if category and category != "全部":
                # Expand the category list: click every toggle button whose
                # text does not already contain "收起" (collapse).
                await page.evaluate("""
                    () => {
                        const showButtons = document.querySelectorAll('.show-button-sDo51G');
                        showButtons.forEach(btn => {
                            const text = btn.textContent.trim();
                            if (!text.includes('收起')) {
                                btn.click();
                            }
                        });
                    }
                """)
                await asyncio.sleep(1)

                # The category comes from the request, so it is passed as an
                # evaluate() argument rather than formatted into the script;
                # quotes or script fragments in it cannot alter the
                # JavaScript that runs in the page.
                category_clicked = await page.evaluate(
                    """
                    (targetCategory) => {
                        const categoryDivs = Array.from(document.querySelectorAll('.each-kind-MR__DN'));
                        const targetDiv = categoryDivs.find(div =>
                            div.textContent.trim() === targetCategory
                        );
                        if (targetDiv) {
                            targetDiv.click();
                            return true;
                        }
                        return false;
                    }
                    """,
                    category,
                )
                if category_clicked:
                    # Fixed pause after the click before extracting.
                    await asyncio.sleep(8)

            data = await extract_creative_video_data(page)

            if data['total'] > 0:
                output_path = Path(output_dir)
                output_path.mkdir(parents=True, exist_ok=True)

                now = datetime.now()
                timestamp = now.strftime("%Y%m%d_%H%M%S")
                # Keep only letters, digits, '-' and '_' in the file name so a
                # category containing path separators cannot escape
                # output_dir. Plain category labels are left unchanged.
                safe_category = "".join(
                    ch if ch.isalnum() or ch in "-_" else "_" for ch in (category or "")
                )
                category_suffix = f"_{safe_category}" if safe_category else ""
                json_file = output_path / f"douyin_creative_guidance{category_suffix}_{timestamp}.json"

                result_data = {
                    'page_url': url,
                    'page_title': '抖音创作指导 - 热门视频',
                    'category': category or '全部',
                    'platform': 'douyin',
                    'crawl_time': now.isoformat(),
                    'total_videos': len(data['videos']),
                    'videos': data['videos']
                }
                with open(json_file, "w", encoding="utf-8") as f:
                    json.dump(result_data, f, ensure_ascii=False, indent=2)

            return data
        finally:
            await browser.close()
@app.post("/api/creative-guidance", response_model=CreativeGuidanceResponse)
async def get_creative_guidance(request: CreativeGuidanceRequest):
    """
    Fetch Douyin creative-guidance data.

    - **category**: category label (全部/美食/旅行/泛生活/汽车/科技/游戏/二次元), default "全部"
    - **headless**: whether to run the browser headless, default True
    - **cookie_file**: path of the cookie file
    - **output_dir**: output directory

    Responds with HTTP 400 when the cookie file does not exist and HTTP 500
    when the crawl raises. A crawl that yields no videos returns
    ``success=False`` with an explanatory message.
    """
    try:
        if not Path(request.cookie_file).exists():
            raise HTTPException(
                status_code=400,
                detail=f"Cookie文件不存在: {request.cookie_file}"
            )

        data = await crawl_creative_guidance_api(
            category=request.category,
            headless=request.headless,
            cookie_file=request.cookie_file,
            output_dir=request.output_dir
        )

        if not data or data['total'] == 0:
            return CreativeGuidanceResponse(
                success=False,
                category=request.category,
                total_count=0,
                videos=[],
                page_url="https://creator.douyin.com/creator-micro/creative-guidance",
                crawl_time=datetime.now().isoformat(),
                message="未获取到任何视频数据,请确保已登录并且页面加载完成"
            )

        return CreativeGuidanceResponse(
            success=True,
            category=request.category,
            total_count=data['total'],
            videos=data['videos'],
            page_url=data['pageUrl'],
            crawl_time=data['crawlTime'],
            message=f"成功获取 {data['total']} 个视频"
        )
    except HTTPException:
        # Let deliberate HTTP errors (the 400 above) through unchanged;
        # without this the generic handler below would rewrap them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取创作指导数据时出错: {str(e)}") from e
@app.post("/api/search", response_model=SearchResponse)
async def search_douyin(request: SearchRequest):
    """
    Search Douyin videos.

    - **keyword**: search keyword (required)
    - **max_scroll**: maximum number of scrolls, default 5
    - **headless**: whether to run the browser headless, default True
    - **cookie_file**: path of the cookie file

    Responds with HTTP 400 when the cookie file cannot be loaded and HTTP 500
    when the crawl raises. A search that yields no videos returns
    ``success=False``. The crawler is always closed before returning.
    """
    crawler = DouyinSearchCrawler(headless=request.headless)

    try:
        await crawler.init_browser()

        cookie_loaded = await crawler.load_cookies(request.cookie_file)
        if not cookie_loaded:
            raise HTTPException(
                status_code=400,
                detail=f"无法加载Cookie文件: {request.cookie_file}"
            )

        videos = await crawler.search_videos(request.keyword, max_scroll=request.max_scroll)

        if not videos:
            return SearchResponse(
                success=False,
                keyword=request.keyword,
                total_count=0,
                videos=[],
                message="未获取到任何视频数据"
            )

        # Persist the results through the crawler before responding.
        await crawler.save_results(request.keyword, videos)

        return SearchResponse(
            success=True,
            keyword=request.keyword,
            total_count=len(videos),
            videos=videos,
            message="搜索成功"
        )
    except HTTPException:
        # Let deliberate HTTP errors (the 400 above) through unchanged;
        # without this the generic handler below would rewrap them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"搜索过程中出错: {str(e)}") from e
    finally:
        await crawler.close()
@app.post("/api/analyze", response_model=AnalyzeResponse)
async def analyze_videos(request: AnalyzeRequest):
    """
    Analyse video data with the AI analyzer.

    - **videos**: list of video records (required)
    - **prompt_file**: path of the prompt file, default "prompts/analyze_prompt.md"
    - **custom_instruction**: custom analysis instruction (optional)
    - **model**: model name, default "qwen-plus"
    - **api_key**: Alibaba Cloud Bailian API key (optional; by default read from the DASHSCOPE_API_KEY environment variable)
    """
    try:
        # Guard: the prompt file has to exist.
        prompt_path = Path(request.prompt_file)
        if not prompt_path.exists():
            raise HTTPException(
                status_code=400,
                detail=f"提示词文件不存在: {request.prompt_file}",
            )

        # A ValueError from the analyzer constructor is reported as a 400.
        try:
            analyzer = AIAnalyzer(api_key=request.api_key, model=request.model)
        except ValueError as init_error:
            raise HTTPException(status_code=400, detail=str(init_error))

        outcome = analyzer.analyze(
            videos=request.videos,
            prompt_file=request.prompt_file,
            custom_instruction=request.custom_instruction,
        )
        return AnalyzeResponse(**outcome)
    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as unexpected:
        raise HTTPException(status_code=500, detail=f"分析过程中出错: {unexpected}")
@app.post("/api/analyze-file", response_model=AnalyzeResponse)
async def analyze_from_file(request: AnalyzeFileRequest):
    """
    Read video data from a JSON file and analyse it with the AI analyzer.

    - **json_file**: path of the JSON data file (required)
    - **prompt_file**: path of the prompt file, default "prompts/analyze_prompt.md"
    - **custom_instruction**: custom analysis instruction (optional)
    - **model**: model name, default "qwen-plus"
    - **api_key**: Alibaba Cloud Bailian API key (optional; by default read from the DASHSCOPE_API_KEY environment variable)
    """
    try:
        # Both input files must exist; the data file is checked first.
        required_files = (
            ("JSON文件不存在", request.json_file),
            ("提示词文件不存在", request.prompt_file),
        )
        for missing_label, file_path in required_files:
            if not Path(file_path).exists():
                raise HTTPException(
                    status_code=400,
                    detail=f"{missing_label}: {file_path}",
                )

        # A ValueError from the analyzer constructor is reported as a 400.
        try:
            analyzer = AIAnalyzer(api_key=request.api_key, model=request.model)
        except ValueError as init_error:
            raise HTTPException(status_code=400, detail=str(init_error))

        outcome = analyzer.analyze_from_file(
            json_file=request.json_file,
            prompt_file=request.prompt_file,
            custom_instruction=request.custom_instruction,
        )
        return AnalyzeResponse(**outcome)
    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as unexpected:
        raise HTTPException(status_code=500, detail=f"分析过程中出错: {unexpected}")
@app.post("/api/agent", response_model=AgentResponse)
async def run_agent(request: AgentRequest):
    """
    Run the AI agent - creative inspiration generation.

    Based on the user's description the agent will:
    1. Understand the request and extract the content category
    2. Fetch related trending video data
    3. Produce 9 concrete, actionable creative ideas
    4. Offer more ideas if the user is not satisfied

    **Supported content categories** (stated as 26 in total; 24 are listed here):
    - 美食、旅行、泛生活、汽车、科技、游戏、二次元
    - 娱乐、明星、体育、文化教育、校园、政务
    - 时尚、才艺、随拍、动植物、图文控
    - 剧情、亲子、三农、创意、户外、公益

    **Parameters**:
    - **query**: user description (required), for example:
      - "I want to make food short videos, mainly home-cooking tutorials"
      - "I want to shoot fun videos about university life and record campus days"
      - "I want to make fitness short videos sharing simple home workouts"
    - **system_prompt_file**: system prompt file, default "prompts/agent_prompt.md"
    - **max_iterations**: maximum number of iterations, default 10 (15 is suggested to cover the full flow)
    - **model**: model name, default "qwen-plus"
    - **api_key**: Alibaba Cloud Bailian API key (optional)

    **Returns**:
    - The recognised content category
    - 9 creative ideas (title, core idea, execution advice, trending tags)
    - Creation tips and suggestions

    **Example**:
    ```json
    {
        "query": "我想做一些美食相关的短视频,主要是家常菜的制作教程",
        "max_iterations": 15
    }
    ```
    """
    try:
        # Guard: the system prompt file has to exist.
        prompt_path = Path(request.system_prompt_file)
        if not prompt_path.exists():
            raise HTTPException(
                status_code=400,
                detail=f"系统提示词文件不存在: {request.system_prompt_file}",
            )

        # Build the agent; a ValueError during setup is reported as a 400.
        try:
            agent = create_agent()
            override_key = request.api_key
            if override_key:
                agent.api_key = override_key
                import dashscope
                # NOTE(review): this assigns the module-level dashscope.api_key,
                # so the key stays set after this request - confirm intended.
                dashscope.api_key = override_key
            agent.model = request.model
        except ValueError as config_error:
            raise HTTPException(status_code=400, detail=str(config_error))

        outcome = await agent.run(
            user_input=request.query,
            system_prompt_file=request.system_prompt_file,
            max_iterations=request.max_iterations,
        )

        # "success" and "iteration" are required keys of the result;
        # the remaining ones are optional.
        return AgentResponse(
            success=outcome["success"],
            final_answer=outcome.get("final_answer"),
            iteration=outcome["iteration"],
            tool_calls=outcome.get("tool_calls", []),
            error=outcome.get("error"),
        )
    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as unexpected:
        raise HTTPException(status_code=500, detail=f"Agent执行过程中出错: {unexpected}")
if __name__ == "__main__":
    # Executed as a script: serve the app with uvicorn on all interfaces,
    # port 8001.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8001,
    )