#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
|
|
抖音搜索页面数据爬取脚本
|
|
支持按关键词搜索并爬取视频信息
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import re
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from playwright.async_api import async_playwright
|
|
import argparse
|
|
|
|
|
|
class DouyinSearchCrawler:
    """Crawl video metadata from Douyin's keyword-search results page.

    Drives Chromium through Playwright (async API), optionally loads a
    cookie file for an authenticated session, scrolls the search page to
    trigger lazy loading, and extracts video URL / title / author /
    publish time / duration / play count / tags from the DOM.
    """

    def __init__(self, headless=False):
        """Create a crawler.

        Args:
            headless: run Chromium without a visible window.
        """
        self.headless = headless
        # Populated by init_browser(); None until then.
        self.browser = None
        self.context = None
        self.page = None

    async def init_browser(self):
        """初始化浏览器 — start Playwright, launch Chromium, open a page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless,
            args=[
                # Hide the automation fingerprint; the sandbox flags allow
                # running inside containers without user namespaces.
                '--disable-blink-features=AutomationControlled',
                '--no-sandbox',
                '--disable-setuid-sandbox'
            ]
        )

        # Desktop-sized viewport with a realistic Chrome user agent.
        self.context = await self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )

        self.page = await self.context.new_page()

        # Default timeout for page operations, in milliseconds.
        self.page.set_default_timeout(30000)

        # Prefer Chinese content, as a real Douyin user would.
        await self.page.set_extra_http_headers({
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })

    async def load_cookies(self, cookie_file='douyin_cookie.json'):
        """加载cookies — load a JSON cookie file into the browser context.

        Args:
            cookie_file: path to a JSON list of Playwright cookie dicts.

        Returns:
            True on success, False if the file is missing or unreadable.
        """
        cookie_path = Path(cookie_file)
        if not cookie_path.exists():
            print(f"✗ Cookie 文件不存在: {cookie_file}")
            return False
        try:
            with open(cookie_path, 'r', encoding='utf-8') as f:
                cookies = json.load(f)
            await self.context.add_cookies(cookies)
            print(f"✓ 成功加载 cookies: {cookie_file}")
            return True
        except Exception as e:
            print(f"✗ 加载 cookies 失败: {e}")
            return False

    async def search_videos(self, keyword, max_scroll=5):
        """搜索视频 — search `keyword` and collect video entries.

        Scrolls the results page up to `max_scroll` times to trigger lazy
        loading, extracting and deduplicating (by URL) after each scroll.

        Returns:
            List of video dicts as produced by extract_videos().
        """
        print(f"\n开始搜索关键词: {keyword}")

        # 访问搜索页面
        search_url = f"https://www.douyin.com/search/{keyword}?type=video"
        print(f"访问URL: {search_url}")

        try:
            await self.page.goto(search_url, wait_until='networkidle', timeout=60000)
            print("✓ 页面加载完成")
        except Exception as e:
            # Douyin keeps long-polling connections open, so 'networkidle'
            # may time out even when the content is already usable.
            print(f"⚠ 页面加载超时,继续尝试: {e}")

        # Generous initial wait so the first batch of results renders.
        print("等待页面内容加载...")
        await asyncio.sleep(5)

        # Wait for at least one video link to appear (best effort).
        try:
            await self.page.wait_for_selector('a[href*="/video/"]', timeout=10000)
            print("✓ 检测到视频元素")
        except Exception as e:
            print(f"⚠ 未检测到视频元素: {e}")

        all_videos = []
        seen_urls = set()  # O(1) dedup instead of rescanning all_videos each time
        no_new_data_count = 0  # consecutive scrolls that yielded nothing new

        # 滚动加载更多内容
        for scroll_count in range(max_scroll):
            print(f"\n第 {scroll_count + 1}/{max_scroll} 次滚动加载...")

            previous_count = len(all_videos)

            # 提取当前页面的视频数据
            videos = await self.extract_videos()

            # 去重并添加到结果中 (dedup by video URL)
            for video in videos:
                if video['url'] not in seen_urls:
                    seen_urls.add(video['url'])
                    all_videos.append(video)

            new_count = len(all_videos) - previous_count
            print(f"本次新增 {new_count} 个视频,当前已获取 {len(all_videos)} 个视频")

            # Give up after 3 consecutive fruitless scrolls.
            if new_count == 0:
                no_new_data_count += 1
                if no_new_data_count >= 3:
                    print("连续3次未获取到新数据,停止滚动")
                    break
            else:
                no_new_data_count = 0

            # 滚动到页面底部 to trigger the next lazy-load batch.
            await self.page.evaluate('window.scrollTo(0, document.body.scrollHeight)')

            # Let the newly loaded results render before re-extracting.
            await asyncio.sleep(3)

        return all_videos

    async def extract_videos(self):
        """提取页面上的视频信息 — scrape video data from the current DOM.

        Parses each anchor whose href contains "/video/", using regexes on
        the link text to pull out duration, play count, publish time,
        author and hashtags. Entries with an empty URL or title are
        dropped.

        Returns:
            List of dicts with keys: url, title, author, publishTime,
            duration, playCount, tags.
        """
        # Short pause so the DOM settles after a scroll.
        await asyncio.sleep(1)

        videos_data = await self.page.evaluate('''() => {
            const links = Array.from(document.querySelectorAll('a[href*="/video/"]'));

            return links.map(link => {
                const text = link.textContent?.trim() || '';
                const url = link.getAttribute('href') || '';

                // 提取时长 (格式: HH:MM 或 MM:SS)
                const durationMatch = text.match(/(\\d{1,2}:\\d{2})/);
                const duration = durationMatch ? durationMatch[1] : '';

                // 提取播放量
                const playCountMatch = text.match(/([\\d.]+万|[\\d]+)(?!:)/);
                const playCount = playCountMatch ? playCountMatch[1] : '';

                // 提取发布时间
                const timeMatch = text.match(/(\\d+天前|\\d+小时前|\\d+分钟前|\\d{4}-\\d{2}-\\d{2})/);
                const publishTime = timeMatch ? timeMatch[1] : '';

                // 提取作者 - 先提取,然后移除时间信息
                const authorMatch = text.match(/@([^\\s@]+)/);
                let author = authorMatch ? authorMatch[1] : '';
                // 从作者中移除时间信息(如果存在)
                if (author && publishTime) {
                    author = author.replace(publishTime, '').trim();
                }
                // 移除可能残留的数字和"前"字
                author = author.replace(/\\d+(天|小时|分钟)前$/, '').trim();

                // 提取标题和标签
                let title = text;
                // 移除时长、播放量、作者、时间等信息
                title = title.replace(/(\\d{1,2}:\\d{2})/g, '');
                title = title.replace(/([\\d.]+万|[\\d]+)(?!:)/g, '');
                title = title.replace(/@[^\\s@]+/g, '');
                title = title.replace(/(\\d+天前|\\d+小时前|\\d+分钟前|\\d{4}-\\d{2}-\\d{2})/g, '');
                title = title.replace(/合集/g, '');
                title = title.trim();

                // 提取标签
                const tags = [];
                const tagMatches = title.matchAll(/#([^#\\s]+)/g);
                for (const match of tagMatches) {
                    tags.push(match[1]);
                }

                return {
                    url: url.startsWith('//') ? 'https:' + url : url,
                    title: title,
                    author: author,
                    publishTime: publishTime,
                    duration: duration,
                    playCount: playCount,
                    tags: tags
                };
            });
        }''')

        # 过滤掉空数据 — drop entries missing a URL or title.
        videos_data = [v for v in videos_data if v['url'] and v['title']]

        return videos_data

    async def save_results(self, keyword, videos):
        """保存结果到JSON文件 — write the crawl results to disk.

        Args:
            keyword: the search keyword (embedded in the filename).
            videos: list of video dicts from search_videos().

        Returns:
            Path of the written JSON file (douyin_data/douyin_search_<kw>_<ts>.json).
        """
        # 创建输出目录
        output_dir = Path('douyin_data')
        output_dir.mkdir(exist_ok=True)

        # Timestamped filename so repeated runs never overwrite each other.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = output_dir / f'douyin_search_{keyword}_{timestamp}.json'

        # 准备数据
        data = {
            'keyword': keyword,
            'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_count': len(videos),
            'videos': videos
        }

        # 保存到文件 (ensure_ascii=False keeps Chinese text readable).
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # BUGFIX: the message previously printed a placeholder instead of
        # the actual output path.
        print(f"\n✓ 数据已保存到: {filename}")
        print(f"✓ 共爬取 {len(videos)} 个视频")

        return filename

    async def close(self):
        """关闭浏览器 — tear down page, context, browser and Playwright."""
        if self.page:
            await self.page.close()
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        # init_browser() may never have run, so the attribute can be absent.
        if hasattr(self, 'playwright'):
            await self.playwright.stop()
|
|
|
|
|
|
async def main():
    """CLI entry point: parse arguments, run the crawler, save results."""
    arg_parser = argparse.ArgumentParser(description='抖音搜索页面爬虫')
    arg_parser.add_argument('keyword', help='搜索关键词')
    arg_parser.add_argument('--max-scroll', type=int, default=5, help='最大滚动次数 (默认: 5)')
    arg_parser.add_argument('--headless', action='store_true', help='无头模式运行')
    arg_parser.add_argument('--cookie', default='douyin_cookie.json', help='Cookie文件路径')
    opts = arg_parser.parse_args()

    bot = DouyinSearchCrawler(headless=opts.headless)
    try:
        # Browser setup, then best-effort cookie login.
        await bot.init_browser()
        await bot.load_cookies(opts.cookie)

        # Crawl, then persist whatever was found.
        found = await bot.search_videos(opts.keyword, max_scroll=opts.max_scroll)
        if found:
            await bot.save_results(opts.keyword, found)
        else:
            print("\n✗ 未获取到任何视频数据")
    except Exception as e:
        print(f"\n✗ 爬取过程中出错: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Always release the browser, even after a failure.
        await bot.close()
|
|
|
|
|
|
# Script entry point: run the async main() coroutine to completion.
if __name__ == '__main__':
    asyncio.run(main())
|
|
|