"""Douyin search-page crawler.

Searches videos on douyin.com by keyword with Playwright and scrapes
per-video metadata (URL, title, author, publish time, duration, play
count, hashtags), saving the results as JSON under ``douyin_data/``.

Usage:
    python crawler.py <keyword> [--max-scroll N] [--headless] [--cookie FILE]
"""
import asyncio
import json
import re
from datetime import datetime
from pathlib import Path
from playwright.async_api import async_playwright
import argparse


class DouyinSearchCrawler:
    """Drives a Chromium instance to scrape Douyin search results."""

    def __init__(self, headless=False):
        self.headless = headless
        self.browser = None
        self.context = None
        self.page = None

    async def init_browser(self):
        """Launch Chromium and prepare a desktop-profile page.

        Sets a desktop viewport/user-agent and zh-CN Accept-Language so
        the search page serves the expected desktop layout.
        """
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless,
            args=[
                # Hide the webdriver automation fingerprint from basic bot checks.
                '--disable-blink-features=AutomationControlled',
                '--no-sandbox',
                '--disable-setuid-sandbox'
            ]
        )
        # Fresh browser context with a desktop viewport and UA string.
        self.context = await self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
        self.page = await self.context.new_page()
        # Default timeout for all page operations (ms).
        self.page.set_default_timeout(30000)
        await self.page.set_extra_http_headers({
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })

    async def load_cookies(self, cookie_file='douyin_cookie.json'):
        """Load cookies from a JSON file into the browser context.

        Returns True on success, False if the file is missing or invalid.
        Failure is non-fatal: the crawl proceeds without authentication.
        """
        cookie_path = Path(cookie_file)
        if cookie_path.exists():
            try:
                with open(cookie_path, 'r', encoding='utf-8') as f:
                    cookies = json.load(f)
                await self.context.add_cookies(cookies)
                print(f"✓ 成功加载 cookies: {cookie_file}")
                return True
            except Exception as e:
                # Best-effort: a corrupt cookie file should not abort the run.
                print(f"✗ 加载 cookies 失败: {e}")
                return False
        else:
            print(f"✗ Cookie 文件不存在: {cookie_file}")
            return False

    async def search_videos(self, keyword, max_scroll=5):
        """Search *keyword* and collect videos over up to *max_scroll* scrolls.

        Scrolls to the page bottom between extraction passes to trigger
        lazy loading; stops early after 3 consecutive passes with no new
        videos. Returns a list of video dicts (deduplicated by URL).
        """
        print(f"\n开始搜索关键词: {keyword}")
        search_url = f"https://www.douyin.com/search/{keyword}?type=video"
        print(f"访问URL: {search_url}")
        try:
            await self.page.goto(search_url, wait_until='networkidle', timeout=60000)
            print("✓ 页面加载完成")
        except Exception as e:
            # networkidle often times out on busy pages; content may still be usable.
            print(f"⚠ 页面加载超时,继续尝试: {e}")

        # Extra settle time so dynamically rendered results appear.
        print("等待页面内容加载...")
        await asyncio.sleep(5)

        try:
            await self.page.wait_for_selector('a[href*="/video/"]', timeout=10000)
            print("✓ 检测到视频元素")
        except Exception as e:
            print(f"⚠ 未检测到视频元素: {e}")

        all_videos = []
        seen_urls = set()        # O(1) URL dedup (was an O(n^2) list rescan)
        no_new_data_count = 0    # consecutive passes that yielded nothing new

        for scroll_count in range(max_scroll):
            print(f"\n第 {scroll_count + 1}/{max_scroll} 次滚动加载...")
            previous_count = len(all_videos)

            videos = await self.extract_videos()
            for video in videos:
                if video['url'] not in seen_urls:
                    seen_urls.add(video['url'])
                    all_videos.append(video)

            new_count = len(all_videos) - previous_count
            print(f"本次新增 {new_count} 个视频,当前已获取 {len(all_videos)} 个视频")

            # Stop after 3 consecutive empty passes — the feed is exhausted.
            if new_count == 0:
                no_new_data_count += 1
                if no_new_data_count >= 3:
                    print("连续3次未获取到新数据,停止滚动")
                    break
            else:
                no_new_data_count = 0

            # Scroll to the bottom to trigger lazy loading, then wait.
            await self.page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            await asyncio.sleep(3)

        return all_videos

    async def extract_videos(self):
        """Extract video metadata from all result links on the current page.

        Parsing runs in the browser: each ``a[href*="/video/"]`` link's
        concatenated text is mined with regexes for duration, play count,
        publish time, author and hashtags; the title is whatever remains.
        Entries missing a URL or title are dropped.
        """
        # Give the DOM a moment to settle after scrolling.
        await asyncio.sleep(1)

        videos_data = await self.page.evaluate('''() => {
            const links = Array.from(document.querySelectorAll('a[href*="/video/"]'));
            return links.map(link => {
                const text = link.textContent?.trim() || '';
                const url = link.getAttribute('href') || '';

                // Duration (format: HH:MM or MM:SS)
                const durationMatch = text.match(/(\\d{1,2}:\\d{2})/);
                const duration = durationMatch ? durationMatch[1] : '';

                // Play count (e.g. "3.4万" or a bare number not part of a timestamp)
                const playCountMatch = text.match(/([\\d.]+万|[\\d]+)(?!:)/);
                const playCount = playCountMatch ? playCountMatch[1] : '';

                // Publish time (relative "N天前/小时前/分钟前" or absolute date)
                const timeMatch = text.match(/(\\d+天前|\\d+小时前|\\d+分钟前|\\d{4}-\\d{2}-\\d{2})/);
                const publishTime = timeMatch ? timeMatch[1] : '';

                // Author handle — extracted first, then scrubbed of any
                // time fragments that got glued onto it in the link text.
                const authorMatch = text.match(/@([^\\s@]+)/);
                let author = authorMatch ? authorMatch[1] : '';
                if (author && publishTime) {
                    author = author.replace(publishTime, '').trim();
                }
                author = author.replace(/\\d+(天|小时|分钟)前$/, '').trim();

                // Title: strip duration, counts, author, time and "合集" markers.
                let title = text;
                title = title.replace(/(\\d{1,2}:\\d{2})/g, '');
                title = title.replace(/([\\d.]+万|[\\d]+)(?!:)/g, '');
                title = title.replace(/@[^\\s@]+/g, '');
                title = title.replace(/(\\d+天前|\\d+小时前|\\d+分钟前|\\d{4}-\\d{2}-\\d{2})/g, '');
                title = title.replace(/合集/g, '');
                title = title.trim();

                // Hashtags remaining in the cleaned title.
                const tags = [];
                const tagMatches = title.matchAll(/#([^#\\s]+)/g);
                for (const match of tagMatches) {
                    tags.push(match[1]);
                }

                return {
                    url: url.startsWith('//') ? 'https:' + url : url,
                    title: title,
                    author: author,
                    publishTime: publishTime,
                    duration: duration,
                    playCount: playCount,
                    tags: tags
                };
            });
        }''')

        # Drop entries with no URL or no title.
        videos_data = [v for v in videos_data if v['url'] and v['title']]
        return videos_data

    async def save_results(self, keyword, videos):
        """Write the crawl results to a timestamped JSON file.

        Files land in ``douyin_data/douyin_search_<keyword>_<ts>.json``.
        Returns the Path of the file written.
        """
        output_dir = Path('douyin_data')
        output_dir.mkdir(exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = output_dir / f'douyin_search_{keyword}_{timestamp}.json'

        data = {
            'keyword': keyword,
            'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'total_count': len(videos),
            'videos': videos
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # BUG FIX: previously printed a literal placeholder instead of the
        # actual output path that was just written.
        print(f"\n✓ 数据已保存到: {filename}")
        print(f"✓ 共爬取 {len(videos)} 个视频")
        return filename

    async def close(self):
        """Tear down page, context, browser and the Playwright driver."""
        if self.page:
            await self.page.close()
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        # self.playwright only exists if init_browser() ran.
        if hasattr(self, 'playwright'):
            await self.playwright.stop()


async def main():
    """CLI entry point: parse args, run the crawl, save results."""
    parser = argparse.ArgumentParser(description='抖音搜索页面爬虫')
    parser.add_argument('keyword', help='搜索关键词')
    parser.add_argument('--max-scroll', type=int, default=5, help='最大滚动次数 (默认: 5)')
    parser.add_argument('--headless', action='store_true', help='无头模式运行')
    parser.add_argument('--cookie', default='douyin_cookie.json', help='Cookie文件路径')
    args = parser.parse_args()

    crawler = DouyinSearchCrawler(headless=args.headless)
    try:
        await crawler.init_browser()
        # Cookies are optional; failure to load is non-fatal.
        await crawler.load_cookies(args.cookie)
        videos = await crawler.search_videos(args.keyword, max_scroll=args.max_scroll)
        if videos:
            await crawler.save_results(args.keyword, videos)
        else:
            print("\n✗ 未获取到任何视频数据")
    except Exception as e:
        print(f"\n✗ 爬取过程中出错: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Always release the browser, even after an error.
        await crawler.close()


if __name__ == '__main__':
    asyncio.run(main())