You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
575 lines
22 KiB
575 lines
22 KiB
"""
|
|
抖音创作指导页面爬虫 - 使用Playwright
|
|
支持Cookie登录、手动登录和分类抓取
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from playwright.async_api import async_playwright
|
|
|
|
# On Windows the console streams often default to a legacy code page, which
# cannot encode the Chinese text and symbols this script prints.
if sys.platform == "win32":
    # Only affects child processes; the current interpreter has already
    # created its standard streams.
    os.environ["PYTHONIOENCODING"] = "utf-8"
    import io

    for _stream_name in ("stdout", "stderr"):
        _stream = getattr(sys, _stream_name, None)
        if hasattr(_stream, "reconfigure"):
            # Switch the encoding in place: this keeps the stream's existing
            # buffering mode, so progress messages still appear immediately.
            _stream.reconfigure(encoding="utf-8")
        elif hasattr(_stream, "buffer"):
            # Fallback for stream objects without reconfigure(). Line
            # buffering is requested explicitly because a bare TextIOWrapper
            # would hold output back until its buffer fills.
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(_stream.buffer, encoding="utf-8", line_buffering=True),
            )
        # Streams that are None or expose neither attribute are left alone.
    del _stream_name, _stream
|
|
|
|
|
|
async def get_available_categories(page):
    """Return the text of every category tag found on the page.

    First clicks every expand button (elements matching
    ``.show-button-sDo51G`` whose text does not contain "收起"), so that
    collapsed categories become visible, then collects the trimmed,
    non-empty text of all ``.each-kind-MR__DN`` elements.

    Args:
        page: Object exposing an awaitable ``evaluate(script)`` method
            (a Playwright page in this script).

    Returns:
        The list produced by the in-page script, or ``[]`` if any step
        raises; the error is printed rather than propagated.
    """
    expand_script = """
        () => {
            const showButtons = document.querySelectorAll('.show-button-sDo51G');
            let clicked = false;
            showButtons.forEach(btn => {
                const text = btn.textContent.trim();
                // 如果按钮不是"收起",说明需要展开
                if (!text.includes('收起')) {
                    btn.click();
                    clicked = true;
                }
            });
            return clicked;
        }
    """
    collect_script = """
        () => {
            const categoryDivs = Array.from(document.querySelectorAll('.each-kind-MR__DN'));
            return categoryDivs.map(div => div.textContent.trim()).filter(text => text);
        }
    """

    try:
        # A button that already reads "收起" is expanded; the script only
        # clicks the others and reports whether it clicked anything.
        if await page.evaluate(expand_script):
            print("已展开分类列表")
            # Give the page a moment to render the newly revealed tags.
            await asyncio.sleep(1)

        names = await page.evaluate(collect_script)
    except Exception as err:
        print(f"获取分类列表失败: {err}")
        return []
    return names
|
|
|
|
|
|
async def list_categories(cookie_file: str = None):
    """Open the creative-guidance page headlessly and print its categories.

    Args:
        cookie_file: Optional path to a cookie JSON file. It is only loaded
            when the path exists, and only applied when loading yields a
            truthy value.

    Returns:
        The category list returned by ``get_available_categories``.
    """
    url = "https://creator.douyin.com/creator-micro/creative-guidance"
    rule = "=" * 60

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)

        # Cookies are optional: without them a plain context is created.
        saved_cookies = None
        if cookie_file and Path(cookie_file).exists():
            saved_cookies = await load_cookies(cookie_file)

        if saved_cookies:
            context = await browser.new_context(storage_state={'cookies': saved_cookies})
        else:
            context = await browser.new_context()
        page = await context.new_page()

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Fixed pause after DOMContentLoaded before querying the page.
            await asyncio.sleep(5)

            found = await get_available_categories(page)

            print("\n可用的分类列表:")
            print(rule)
            for number, name in enumerate(found, 1):
                print(f"{number:2d}. {name}")
            print(rule)
            print(f"总计: {len(found)} 个分类\n")

            return found
        finally:
            # Always release the browser, even if navigation failed.
            await browser.close()
|
|
|
|
|
|
async def load_cookies(cookie_file: str):
    """Load browser cookies from a JSON file.

    Two layouts are accepted:

    * a bare JSON list of cookie objects, and
    * a storage-state style object whose ``"cookies"`` key holds that list.

    Args:
        cookie_file: Path of the JSON file to read (UTF-8).

    Returns:
        The list of cookie dicts, or ``None`` when the file cannot be read,
        is not valid JSON, or does not contain a list of objects. Failures
        are reported on stdout instead of being raised.
    """
    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: bad JSON or bad
        # UTF-8; TypeError: a non-path argument such as None.
        print(f"✗ 加载Cookie失败: {e}")
        return None

    # Callers wrap the result as {'cookies': <result>}, so a storage-state
    # object must be unwrapped to its inner list first.
    if isinstance(payload, dict) and "cookies" in payload:
        payload = payload["cookies"]

    if not isinstance(payload, list) or not all(isinstance(item, dict) for item in payload):
        print("✗ 加载Cookie失败: Cookie文件格式无效(需要Cookie对象列表)")
        return None

    print(f"✓ 已加载Cookie文件: {cookie_file}")
    return payload
|
|
|
|
|
|
async def extract_video_data(page, debug_mode=False):
    """Scrape the video cards currently rendered on the page.

    Runs two in-page scripts. The first gathers diagnostics (link counts, a
    text preview, and an HTML sample of the first card) which are printed;
    the second walks every author link matching
    ``a[href*="iesdouyin.com/share/user/"]`` and builds one record per card.

    Args:
        page: Object exposing an awaitable ``evaluate(script)`` method
            (a Playwright page in this script).
        debug_mode: When true and an HTML sample was captured, the sample is
            printed as well.

    Returns:
        The object built by the second script, with keys ``total``,
        ``videos``, ``crawlTime``, ``pageTitle`` and ``pageUrl``. Each video
        has ``index``, ``author``, ``description``, ``authorLink``,
        ``duration``, ``hot``, ``plays``, ``likes``, ``comments``,
        ``hotWords`` and ``hashTags``. Cards lacking an author or a
        description are skipped by the script.
    """
    # Diagnostic probe: counts links and captures up to 2000 characters of
    # the first card's markup.
    probe_script = """
        () => {
            const authorLinks = document.querySelectorAll('a[href*="iesdouyin.com/share/user/"]');
            const allLinks = document.querySelectorAll('a');

            // 获取第一个视频容器的HTML结构用于调试
            let sampleHTML = '';
            if (authorLinks.length > 0) {
                let container = authorLinks[0].closest('div[class*="video"]') ||
                    authorLinks[0].closest('div[class*="item"]') ||
                    authorLinks[0].closest('div[class*="card"]');

                if (!container) {
                    container = authorLinks[0].parentElement;
                    for (let i = 0; i < 5; i++) {
                        if (container.querySelector('p')) break;
                        container = container.parentElement;
                        if (!container) break;
                    }
                }

                if (container) {
                    sampleHTML = container.outerHTML.substring(0, 2000);
                }
            }

            return {
                authorLinksCount: authorLinks.length,
                allLinksCount: allLinks.length,
                pageText: document.body.innerText.substring(0, 500),
                sampleHTML: sampleHTML
            };
        }
    """

    # Extraction: for each author link, climb at most 10 ancestors looking
    # for one containing '.contain-info-LpWGHS', then read fields from it.
    # Per-card errors are swallowed inside the script.
    harvest_script = """
        () => {
            const videos = [];

            // 查找所有作者链接
            const authorLinks = Array.from(document.querySelectorAll('a[href*="iesdouyin.com/share/user/"]'));

            authorLinks.forEach((authorLink, index) => {
                try {
                    // 获取作者名
                    const author = authorLink.textContent.trim();

                    // 向上查找包含 contain-info-LpWGHS 的容器
                    let container = authorLink.parentElement;
                    let maxLevels = 10;

                    while (container && maxLevels > 0) {
                        if (container.querySelector('.contain-info-LpWGHS')) {
                            break;
                        }
                        container = container.parentElement;
                        maxLevels--;
                    }

                    if (!container) return;

                    // 获取描述
                    const paragraphs = Array.from(container.querySelectorAll('p'));
                    let description = '';
                    for (let p of paragraphs) {
                        const text = p.textContent.trim();
                        if (text && text !== '|' && text.length > 5 && !text.includes('万') && !text.includes(':')) {
                            description = text;
                            break;
                        }
                    }

                    // 提取互动数据
                    let hot = '', plays = '', likes = '', comments = '';

                    const infoContainer = container.querySelector('.contain-info-LpWGHS');
                    if (infoContainer) {
                        const infoItems = infoContainer.querySelectorAll('.each-info-TpmTI0');

                        infoItems.forEach(item => {
                            const img = item.querySelector('img');
                            const text = item.textContent.trim();

                            if (img && img.src) {
                                if (img.src.includes('hot_first') || img.src.includes('hot_second') || img.src.includes('hot_third') || img.src.includes('hot_')) {
                                    hot = text;
                                } else if (img.src.includes('play')) {
                                    plays = text;
                                } else if (img.src.includes('digg')) {
                                    likes = text;
                                } else if (img.src.includes('comment')) {
                                    comments = text;
                                }
                            }
                        });
                    }

                    // 获取热词
                    const hotWords = [];
                    const hotWordElements = container.querySelectorAll('.other-text-XeleRf');
                    hotWordElements.forEach((el, i) => {
                        const text = el.textContent.trim();
                        if (i === 0 && text.includes('热词')) {
                            // 跳过"热词 :"标签
                        } else if (text && !text.includes('热词')) {
                            hotWords.push(text);
                        }
                    });

                    // 查找标签
                    const hashTags = description.match(/#[^\\s#]+/g) || [];

                    // 查找视频时长
                    let duration = '';
                    const timeElements = container.querySelectorAll('.time-text-mask-WmpK85 p');
                    if (timeElements.length > 0) {
                        duration = timeElements[0].textContent.trim();
                    }

                    if (author && description) {
                        videos.push({
                            index: index + 1,
                            author: author,
                            description: description,
                            authorLink: authorLink.href,
                            duration: duration,
                            hot: hot,
                            plays: plays,
                            likes: likes,
                            comments: comments,
                            hotWords: hotWords,
                            hashTags: hashTags
                        });
                    }
                } catch (e) {
                    // 静默处理错误
                }
            });

            return {
                total: videos.length,
                videos: videos,
                crawlTime: new Date().toISOString(),
                pageTitle: document.title,
                pageUrl: window.location.href
            };
        }
    """

    print("正在提取视频数据...")

    probe = await page.evaluate(probe_script)
    sample_html = probe['sampleHTML']

    print("调试信息:")
    print(f" 作者链接数量: {probe['authorLinksCount']}")
    print(f" 所有链接数量: {probe['allLinksCount']}")
    print(f" 页面文本预览: {probe['pageText'][:200]}...")

    if debug_mode and sample_html:
        print("\n第一个视频容器HTML结构(前2000字符):")
        print(sample_html)
        print()

    print()

    return await page.evaluate(harvest_script)
|
|
|
|
|
|
async def crawl_creative_guidance(output_dir: str, headless: bool = True, manual_mode: bool = False, cookie_file: str = None, category: str = None, debug_mode: bool = False):
    """Scrape the Douyin creative-guidance page and save the results.

    Args:
        output_dir: Directory for the output files; created if missing.
        headless: Passed to ``chromium.launch``.
        manual_mode: When true, pause until the user presses Enter so they
            can log in, then re-navigate if the browser left the target URL.
            When false, wait a fixed 10 seconds instead.
        cookie_file: Optional cookie JSON path; used only if the file exists
            and ``load_cookies`` returns a truthy value.
        category: Optional category tag to click before extracting. ``None``
            or "全部" leaves the page's current selection.
        debug_mode: Forwarded to ``extract_video_data``.

    Returns:
        ``None``. Results are written through ``save_results``; errors during
        the crawl are printed with a traceback rather than raised.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    url = "https://creator.douyin.com/creator-micro/creative-guidance"

    print(f"正在抓取抖音创作指导页面: {url}")
    if category:
        print(f"目标分类: {category}")
    if debug_mode:
        print("调试模式已启用")
    print()

    async with async_playwright() as p:
        # Launch the browser.
        browser = await p.chromium.launch(headless=headless)

        # Build the context, preloading cookies when a usable file is given.
        context_options = {}

        if cookie_file and Path(cookie_file).exists():
            cookies = await load_cookies(cookie_file)
            if cookies:
                context_options['storage_state'] = {'cookies': cookies}

        context = await browser.new_context(**context_options)
        page = await context.new_page()

        try:
            # Open the page.
            print("正在打开页面...")
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)

            if manual_mode:
                # Manual mode: let the user log in before continuing.
                print("\n" + "=" * 80)
                print("请在浏览器中完成以下操作:")
                print("1. 登录抖音账号(如果需要)")
                print("2. 等待页面完全加载,确保能看到热门视频列表")
                if category:
                    print(f"3. (可选)手动点击【{category}】分类,或者等待脚本自动点击")
                    print("4. 完成后,回到这里按回车键继续")
                else:
                    print("3. 完成后,回到这里按回车键继续")
                print("=" * 80 + "\n")

                input("按回车键开始抓取数据...")
                print()

                # Let the page settle after the user's interaction.
                print("等待页面稳定...")
                await asyncio.sleep(3)

                # Logging in may have redirected the tab; go back if so.
                current_url = page.url
                if current_url != url:
                    print(f"检测到页面跳转到: {current_url}")
                    print("重新导航到目标页面...")
                    await page.goto(url, wait_until="domcontentloaded", timeout=60000)
                    await asyncio.sleep(5)
            else:
                # Automatic mode: fixed wait for the page to load.
                print("等待页面加载...")
                await asyncio.sleep(10)

            # Switch category when one other than "all" was requested.
            if category and category != "全部":
                print(f"正在切换到【{category}】分类...")

                # Expand the category list first so hidden tags are clickable.
                await page.evaluate("""
                    () => {
                        const showButtons = document.querySelectorAll('.show-button-sDo51G');
                        showButtons.forEach(btn => {
                            const text = btn.textContent.trim();
                            // 如果按钮不是"收起",说明需要展开
                            if (!text.includes('收起')) {
                                btn.click();
                            }
                        });
                    }
                """)
                await asyncio.sleep(1)

                try:
                    # The category is passed as an evaluate() argument, not
                    # spliced into the script text, so quotes or backslashes
                    # in the name cannot break or alter the JavaScript.
                    category_clicked = await page.evaluate(
                        """
                        (targetName) => {
                            const categoryDivs = Array.from(document.querySelectorAll('.each-kind-MR__DN'));
                            const targetDiv = categoryDivs.find(div =>
                                div.textContent.trim() === targetName
                            );
                            if (targetDiv) {
                                targetDiv.click();
                                return true;
                            }
                            return false;
                        }
                        """,
                        category,
                    )

                    if category_clicked:
                        print(f"✓ 已点击【{category}】分类")
                        print("等待分类内容加载...")
                        await asyncio.sleep(8)  # longer wait for the category content
                    else:
                        print(f"⚠ 未找到【{category}】分类标签")
                        print("提示:使用 --list-categories 查看所有可用分类")
                except Exception as e:
                    # Best effort: fall back to whatever is currently shown.
                    print(f"⚠ 点击分类失败: {e}")
                    print("将抓取当前显示的内容")

            # Extract the data.
            data = await extract_video_data(page, debug_mode=debug_mode)

            if data['total'] == 0:
                print("⚠ 未提取到视频数据")
                print("提示:")
                print(" 1. 确保已登录抖音账号")
                print(" 2. 确保页面已完全加载")
                print(" 3. 尝试使用 --manual 参数手动控制")
                if not cookie_file:
                    print(" 4. 或者使用 --cookie 参数提供Cookie文件")
                return

            print(f"✓ 提取到 {data['total']} 个视频\n")

            # Save the results.
            save_results(data['videos'], url, output_path, category)

        except Exception as e:
            # Top-level boundary for the crawl: report and fall through to
            # cleanup instead of propagating.
            print(f"✗ 错误: {e}")
            import traceback
            traceback.print_exc()
        finally:
            await browser.close()
|
|
|
|
|
|
def save_results(videos, url, output_path, category=None):
    """Write the scraped videos as JSON, plain text and CSV files.

    All three files share the base name
    ``douyin_creative_guidance[_<category>]_<YYYYmmdd_HHMMSS>``.

    Args:
        videos: List of video dicts. Each must have an ``index`` key; the
            other fields are optional.
        url: Page URL recorded in the JSON and text outputs.
        output_path: ``pathlib.Path`` of an existing output directory.
        category: Optional category name. It is stored verbatim in the file
            contents; in the file name, characters that are path separators
            or reserved on common filesystems are replaced with ``_``.

    Returns:
        ``None``. Saved paths and summary statistics are printed.
    """
    # Local import keeps this block self-contained.
    import csv

    # Sample the clock once so file name and recorded times agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    # File name carries the category; unsafe characters would otherwise point
    # into a non-existent subdirectory or be rejected by the OS.
    category_suffix = ""
    if category:
        unsafe = '\\/:*?"<>|'
        safe_category = "".join(
            "_" if (ch in unsafe or ord(ch) < 32) else ch for ch in str(category)
        )
        category_suffix = f"_{safe_category}"

    # JSON output.
    json_file = output_path / f"douyin_creative_guidance{category_suffix}_{timestamp}.json"
    result_data = {
        'page_url': url,
        'page_title': '抖音创作指导 - 热门视频',
        'category': category or '全部',
        'platform': 'douyin',
        'crawl_time': now.isoformat(),
        'total_videos': len(videos),
        'videos': videos,
    }

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(result_data, f, ensure_ascii=False, indent=2)

    print(f"✓ JSON已保存: {json_file}")

    # Plain-text output.
    txt_file = json_file.with_suffix('.txt')
    with open(txt_file, "w", encoding="utf-8") as f:
        f.write("抖音创作指导 - 热门视频列表\n")
        f.write(f"页面: {url}\n")
        if category:
            f.write(f"分类: {category}\n")
        f.write(f"抓取时间: {now:%Y-%m-%d %H:%M:%S}\n")
        f.write(f"视频总数: {len(videos)}\n")
        f.write("=" * 80 + "\n\n")

        for video in videos:
            f.write(f"视频 {video['index']}:\n")
            f.write(f" 作者: {video.get('author', 'N/A')}\n")
            f.write(f" 描述: {video.get('description', 'N/A')}\n")

            if video.get('duration'):
                f.write(f" 时长: {video['duration']}\n")

            if video.get('authorLink'):
                f.write(f" 作者链接: {video['authorLink']}\n")

            if video.get('hot'):
                f.write(f" 热度: {video['hot']}\n")
            if video.get('plays'):
                f.write(f" 播放量: {video['plays']}\n")
            if video.get('likes'):
                f.write(f" 点赞: {video['likes']}\n")
            if video.get('comments'):
                f.write(f" 评论: {video['comments']}\n")

            if video.get('hashTags'):
                f.write(f" 标签: {', '.join(video['hashTags'])}\n")

            if video.get('hotWords'):
                f.write(f" 热词: {', '.join(video['hotWords'])}\n")

            f.write("-" * 80 + "\n")

    print(f"✓ 文本已保存: {txt_file}")

    # CSV output. The csv module doubles embedded quotes and keeps embedded
    # newlines inside one field. QUOTE_NONNUMERIC leaves a numeric index bare
    # and quotes every text field. utf-8-sig adds a BOM for spreadsheet tools.
    csv_file = json_file.with_suffix('.csv')
    with open(csv_file, "w", encoding="utf-8-sig", newline="") as f:
        f.write("序号,作者,描述,时长,热度,播放量,点赞,评论,标签,作者链接\n")
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator="\n")
        for video in videos:
            tags = '|'.join(video.get('hashTags', []))
            writer.writerow([
                video['index'],
                video.get("author", ""),
                video.get("description", ""),
                video.get("duration", ""),
                video.get("hot", ""),
                video.get("plays", ""),
                video.get("likes", ""),
                video.get("comments", ""),
                tags,
                video.get("authorLink", ""),
            ])

    print(f"✓ CSV已保存: {csv_file}\n")

    print("统计:")
    print(f" 总视频数: {len(videos)}")
    print(f" 有标签: {sum(1 for v in videos if v.get('hashTags'))}")
    print(f" 有互动数据: {sum(1 for v in videos if v.get('plays') or v.get('likes'))}")
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments and run the chosen mode.

    With ``--list-categories`` only the category listing runs. Otherwise the
    selected options are echoed and the crawl is started. In both modes a
    keyboard interrupt or any other exception is reported on stdout instead
    of propagating.
    """
    def run_guarded(coro):
        # Shared wrapper for both modes: drive the coroutine to completion
        # and turn failures into printed diagnostics.
        try:
            asyncio.run(coro)
        except KeyboardInterrupt:
            print("\n已中断")
        except Exception as err:
            print(f"\n错误: {err}")
            import traceback
            traceback.print_exc()

    parser = argparse.ArgumentParser(description="抖音创作指导页面爬虫 (Playwright版)")
    parser.add_argument("--output-dir", "-o", default="douyin_data", help="输出目录")
    parser.add_argument("--headless", action="store_true", help="无头模式(不显示浏览器)")
    parser.add_argument("--manual", "-m", action="store_true", help="手动模式:等待用户登录后按回车继续")
    parser.add_argument("--cookie", "-c", help="Cookie文件路径(JSON格式)")
    parser.add_argument("--category", "-t", help="分类标签")
    parser.add_argument("--debug", "-d", action="store_true", help="调试模式:显示页面HTML结构")
    parser.add_argument("--list-categories", "-l", action="store_true", help="列出所有可用的分类")
    args = parser.parse_args()

    banner = "=" * 80
    print(banner)
    print("抖音创作指导页面爬虫 (Playwright版)")
    print(banner)
    print()

    # Listing mode short-circuits everything else.
    if args.list_categories:
        run_guarded(list_categories(cookie_file=args.cookie))
        return

    if args.cookie:
        cookie_found = Path(args.cookie).exists()
        print(
            f"✓ 将使用Cookie文件: {args.cookie}"
            if cookie_found
            else f"⚠ Cookie文件不存在: {args.cookie}"
        )
        print()

    if args.category:
        print(f"✓ 目标分类: {args.category}")
        print()

    if not args.headless:
        print("⚠ 浏览器模式已启用")
        print(
            " 手动模式:登录后按回车键继续抓取"
            if args.manual
            else " 自动模式:将自动等待10秒后抓取"
        )
        print()

    run_guarded(
        crawl_creative_guidance(
            output_dir=args.output_dir,
            headless=args.headless,
            manual_mode=args.manual,
            cookie_file=args.cookie,
            category=args.category,
            debug_mode=args.debug,
        )
    )
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|