"""
抖音创作指导页面爬虫 - 使用Playwright
支持Cookie登录、手动登录和分类抓取
"""

import argparse
import asyncio
import json
import os
import sys
from datetime import datetime
from pathlib import Path

from playwright.async_api import async_playwright

# Windows consoles frequently default to a legacy code page that cannot encode
# the Chinese text and the ✓/✗/⚠ symbols this script prints, so force UTF-8.
if sys.platform == "win32":
    # Inherited by any child process started from this script.
    os.environ["PYTHONIOENCODING"] = "utf-8"
    for _stream in (sys.stdout, sys.stderr):
        # reconfigure() changes the encoding in place and leaves the stream's
        # other settings (notably line buffering) alone, so progress messages
        # still show up immediately. Wrapping sys.stdout.buffer in a brand-new
        # TextIOWrapper would silently switch to block buffering, and would
        # crash when a stream is None or has no ``buffer`` attribute.
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(encoding="utf-8")


async def get_available_categories(page):
    """Collect the category names offered on the page.

    Any collapsed category list is expanded first, then the trimmed,
    non-empty text of every category element is gathered.

    Args:
        page: Object exposing an awaitable ``evaluate(script)`` method
            (a Playwright page in this script).

    Returns:
        Whatever the collecting script yields (a list of category names),
        or an empty list if anything goes wrong; the failure is printed,
        never raised.
    """
    try:
        # A toggle whose label contains "收起" is already expanded; every
        # other toggle gets clicked so hidden categories become visible.
        opened_any = await page.evaluate("""
            () => {
                const showButtons = document.querySelectorAll('.show-button-sDo51G');
                let clicked = false;
                showButtons.forEach(btn => {
                    const text = btn.textContent.trim();
                    // 如果按钮不是"收起"，说明需要展开
                    if (!text.includes('收起')) {
                        btn.click();
                        clicked = true;
                    }
                });
                return clicked;
            }
        """)

        # Only pause when something was actually clicked.
        if opened_any:
            print("已展开分类列表")
            await asyncio.sleep(1)

        # Read the label of every category element.
        return await page.evaluate("""
            () => {
                const categoryDivs = Array.from(document.querySelectorAll('.each-kind-MR__DN'));
                return categoryDivs.map(div => div.textContent.trim()).filter(text => text);
            }
        """)
    except Exception as exc:
        print(f"获取分类列表失败: {exc}")
        return []


async def list_categories(cookie_file: str = None):
    """Open the creative-guidance page headlessly and print its categories.

    Args:
        cookie_file: Optional path to a JSON cookie file. It is only used
            when the path exists and ``load_cookies`` returns a non-empty
            value.

    Returns:
        The category list produced by ``get_available_categories``.
    """
    target_url = "https://creator.douyin.com/creator-micro/creative-guidance"

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)

        # Seed the browser context with stored cookies when available.
        new_context_kwargs = {}
        if cookie_file and Path(cookie_file).exists():
            stored = await load_cookies(cookie_file)
            if stored:
                new_context_kwargs['storage_state'] = {'cookies': stored}

        context = await browser.new_context(**new_context_kwargs)
        page = await context.new_page()

        try:
            await page.goto(target_url, wait_until="domcontentloaded", timeout=60000)
            # Fixed pause to let the page render after navigation.
            await asyncio.sleep(5)

            categories = await get_available_categories(page)

            # Assemble the whole listing, then emit it in one go.
            rule = "=" * 60
            report = ["\n可用的分类列表：", rule]
            report.extend(f"{i:2d}. {cat}" for i, cat in enumerate(categories, 1))
            report.append(rule)
            report.append(f"总计: {len(categories)} 个分类\n")
            print("\n".join(report))

            return categories
        finally:
            await browser.close()


async def load_cookies(cookie_file: str):
    """Load browser cookies from a UTF-8 JSON file.

    Two layouts are accepted:

    * a bare JSON array of cookie objects, returned as-is;
    * a storage-state style object such as ``{"cookies": [...], ...}``, from
      which only the ``cookies`` array is returned. Callers wrap the result
      in ``{'cookies': ...}`` themselves, so handing back the whole object
      would nest it twice.

    Args:
        cookie_file: Path of the JSON file to read.

    Returns:
        The list of cookie objects, or ``None`` when the file cannot be read,
        is not valid JSON, or does not contain a cookie array. Failures are
        printed rather than raised.
    """
    try:
        with open(cookie_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"✗ 加载Cookie失败: {e}")
        return None

    # Unwrap the storage-state layout down to its cookie array.
    if isinstance(payload, dict) and isinstance(payload.get('cookies'), list):
        payload = payload['cookies']

    if not isinstance(payload, list):
        print("✗ 加载Cookie失败: Cookie文件格式不正确（需要Cookie数组）")
        return None

    print(f"✓ 已加载Cookie文件: {cookie_file}")
    return payload


async def extract_video_data(page, debug_mode=False):
    """Scrape the video entries currently rendered on the page.

    Two scripts run in the page. The first gathers diagnostics (link counts,
    a preview of the body text, and a sample of the first video container's
    HTML), which are printed. The second walks every author link and builds
    one record per video.

    Args:
        page: Object exposing an awaitable ``evaluate(script)`` method
            (a Playwright page in this script).
        debug_mode: When true, also print the sampled container HTML if
            one was found.

    Returns:
        The extraction script's result: a mapping with ``total``,
        ``videos``, ``crawlTime``, ``pageTitle`` and ``pageUrl``.
    """
    print("正在提取视频数据...")

    # Diagnostic pass: what does the page currently contain?
    probe = await page.evaluate("""
        () => {
            const authorLinks = document.querySelectorAll('a[href*="iesdouyin.com/share/user/"]');
            const allLinks = document.querySelectorAll('a');
            
            // 获取第一个视频容器的HTML结构用于调试
            let sampleHTML = '';
            if (authorLinks.length > 0) {
                let container = authorLinks[0].closest('div[class*="video"]') || 
                               authorLinks[0].closest('div[class*="item"]') ||
                               authorLinks[0].closest('div[class*="card"]');
                
                if (!container) {
                    container = authorLinks[0].parentElement;
                    for (let i = 0; i < 5; i++) {
                        if (container.querySelector('p')) break;
                        container = container.parentElement;
                        if (!container) break;
                    }
                }
                
                if (container) {
                    sampleHTML = container.outerHTML.substring(0, 2000);
                }
            }
            
            return {
                authorLinksCount: authorLinks.length,
                allLinksCount: allLinks.length,
                pageText: document.body.innerText.substring(0, 500),
                sampleHTML: sampleHTML
            };
        }
    """)

    print("调试信息：")
    print(f"  作者链接数量: {probe['authorLinksCount']}")
    print(f"  所有链接数量: {probe['allLinksCount']}")
    print(f"  页面文本预览: {probe['pageText'][:200]}...")

    # The HTML sample is only looked at (and shown) in debug mode.
    if debug_mode:
        sample_html = probe['sampleHTML']
        if sample_html:
            print("\n第一个视频容器HTML结构（前2000字符）：")
            print(sample_html)
            print()

    print()

    # Extraction pass: hand the script's result straight back to the caller.
    return await page.evaluate("""
        () => {
            const videos = [];
            
            // 查找所有作者链接
            const authorLinks = Array.from(document.querySelectorAll('a[href*="iesdouyin.com/share/user/"]'));
            
            authorLinks.forEach((authorLink, index) => {
                try {
                    // 获取作者名
                    const author = authorLink.textContent.trim();
                    
                    // 向上查找包含 contain-info-LpWGHS 的容器
                    let container = authorLink.parentElement;
                    let maxLevels = 10;
                    
                    while (container && maxLevels > 0) {
                        if (container.querySelector('.contain-info-LpWGHS')) {
                            break;
                        }
                        container = container.parentElement;
                        maxLevels--;
                    }
                    
                    if (!container) return;
                    
                    // 获取描述
                    const paragraphs = Array.from(container.querySelectorAll('p'));
                    let description = '';
                    for (let p of paragraphs) {
                        const text = p.textContent.trim();
                        if (text && text !== '|' && text.length > 5 && !text.includes('万') && !text.includes(':')) {
                            description = text;
                            break;
                        }
                    }
                    
                    // 提取互动数据
                    let hot = '', plays = '', likes = '', comments = '';
                    
                    const infoContainer = container.querySelector('.contain-info-LpWGHS');
                    if (infoContainer) {
                        const infoItems = infoContainer.querySelectorAll('.each-info-TpmTI0');
                        
                        infoItems.forEach(item => {
                            const img = item.querySelector('img');
                            const text = item.textContent.trim();
                            
                            if (img && img.src) {
                                if (img.src.includes('hot_first') || img.src.includes('hot_second') || img.src.includes('hot_third') || img.src.includes('hot_')) {
                                    hot = text;
                                } else if (img.src.includes('play')) {
                                    plays = text;
                                } else if (img.src.includes('digg')) {
                                    likes = text;
                                } else if (img.src.includes('comment')) {
                                    comments = text;
                                }
                            }
                        });
                    }
                    
                    // 获取热词
                    const hotWords = [];
                    const hotWordElements = container.querySelectorAll('.other-text-XeleRf');
                    hotWordElements.forEach((el, i) => {
                        const text = el.textContent.trim();
                        if (i === 0 && text.includes('热词')) {
                            // 跳过"热词 :"标签
                        } else if (text && !text.includes('热词')) {
                            hotWords.push(text);
                        }
                    });
                    
                    // 查找标签
                    const hashTags = description.match(/#[^\\s#]+/g) || [];
                    
                    // 查找视频时长
                    let duration = '';
                    const timeElements = container.querySelectorAll('.time-text-mask-WmpK85 p');
                    if (timeElements.length > 0) {
                        duration = timeElements[0].textContent.trim();
                    }
                    
                    if (author && description) {
                        videos.push({
                            index: index + 1,
                            author: author,
                            description: description,
                            authorLink: authorLink.href,
                            duration: duration,
                            hot: hot,
                            plays: plays,
                            likes: likes,
                            comments: comments,
                            hotWords: hotWords,
                            hashTags: hashTags
                        });
                    }
                } catch (e) {
                    // 静默处理错误
                }
            });
            
            return {
                total: videos.length,
                videos: videos,
                crawlTime: new Date().toISOString(),
                pageTitle: document.title,
                pageUrl: window.location.href
            };
        }
    """)


async def crawl_creative_guidance(output_dir: str, headless: bool = True, manual_mode: bool = False, cookie_file: str = None, category: str = None, debug_mode: bool = False):
    """Scrape the Douyin creative-guidance page and save the videos found.

    Args:
        output_dir: Directory for the result files; created if missing.
        headless: Launch the browser without a visible window.
        manual_mode: Pause for the user to log in / prepare the page in the
            browser and press Enter before scraping starts.
        cookie_file: Optional path to a JSON cookie file used to seed the
            browser context (only when the file exists and loads).
        category: Optional category label to switch to before scraping.
            ``None``, empty, or "全部" leaves the page on its default view.
        debug_mode: Forwarded to ``extract_video_data`` to print extra
            diagnostics.

    Returns:
        None. Results are written through ``save_results``; errors raised
        while working with the page are printed with a traceback, not
        propagated.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    url = "https://creator.douyin.com/creator-micro/creative-guidance"

    print(f"正在抓取抖音创作指导页面: {url}")
    if category:
        print(f"目标分类: {category}")
    if debug_mode:
        print("调试模式已启用")
    print()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)

        # Seed the context with stored cookies when a usable file was given.
        context_options = {}
        if cookie_file and Path(cookie_file).exists():
            cookies = await load_cookies(cookie_file)
            if cookies:
                context_options['storage_state'] = {'cookies': cookies}

        context = await browser.new_context(**context_options)
        page = await context.new_page()

        try:
            print("正在打开页面...")
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)

            if manual_mode:
                # Manual mode: the user drives the browser, then confirms.
                print("\n" + "=" * 80)
                print("请在浏览器中完成以下操作：")
                print("1. 登录抖音账号（如果需要）")
                print("2. 等待页面完全加载，确保能看到热门视频列表")
                if category:
                    print(f"3. （可选）手动点击【{category}】分类，或者等待脚本自动点击")
                    print("4. 完成后，回到这里按回车键继续")
                else:
                    print("3. 完成后，回到这里按回车键继续")
                print("=" * 80 + "\n")

                input("按回车键开始抓取数据...")
                print()

                print("等待页面稳定...")
                await asyncio.sleep(3)

                # Logging in may have redirected the tab elsewhere; return to
                # the target page if so.
                current_url = page.url
                if current_url != url:
                    print(f"检测到页面跳转到: {current_url}")
                    print("重新导航到目标页面...")
                    await page.goto(url, wait_until="domcontentloaded", timeout=60000)
                    await asyncio.sleep(5)
            else:
                # Automatic mode: fixed wait for the page to finish loading.
                print("等待页面加载...")
                await asyncio.sleep(10)

            # Switch category when a specific one was requested.
            if category and category != "全部":
                print(f"正在切换到【{category}】分类...")

                # Expand any collapsed category list so the target is clickable.
                await page.evaluate("""
                    () => {
                        const showButtons = document.querySelectorAll('.show-button-sDo51G');
                        showButtons.forEach(btn => {
                            const text = btn.textContent.trim();
                            // 如果按钮不是"收起"，说明需要展开
                            if (!text.includes('收起')) {
                                btn.click();
                            }
                        });
                    }
                """)
                await asyncio.sleep(1)

                try:
                    # The label is passed as the evaluate() argument rather
                    # than spliced into the script text, so quotes or
                    # backslashes in it cannot break or alter the JavaScript.
                    category_clicked = await page.evaluate("""
                        (target) => {
                            const categoryDivs = Array.from(document.querySelectorAll('.each-kind-MR__DN'));
                            const targetDiv = categoryDivs.find(div =>
                                div.textContent.trim() === target
                            );
                            if (targetDiv) {
                                targetDiv.click();
                                return true;
                            }
                            return false;
                        }
                    """, category)

                    if category_clicked:
                        print(f"✓ 已点击【{category}】分类")
                        print("等待分类内容加载...")
                        # Longer pause: the category's videos load after the click.
                        await asyncio.sleep(8)
                    else:
                        print(f"⚠ 未找到【{category}】分类标签")
                        print("提示：使用 --list-categories 查看所有可用分类")
                except Exception as e:
                    # Best effort: fall through and scrape whatever is shown.
                    print(f"⚠ 点击分类失败: {e}")
                    print("将抓取当前显示的内容")

            data = await extract_video_data(page, debug_mode=debug_mode)

            if data['total'] == 0:
                print("⚠ 未提取到视频数据")
                print("提示：")
                print("  1. 确保已登录抖音账号")
                print("  2. 确保页面已完全加载")
                print("  3. 尝试使用 --manual 参数手动控制")
                if not cookie_file:
                    print("  4. 或者使用 --cookie 参数提供Cookie文件")
                return

            print(f"✓ 提取到 {data['total']} 个视频\n")

            save_results(data['videos'], url, output_path, category)

        except Exception as e:
            print(f"✗ 错误: {e}")
            import traceback
            traceback.print_exc()
        finally:
            await browser.close()


# Characters that are path separators or otherwise rejected in file names on
# common platforms (the list is the Windows set, which is the strictest).
_UNSAFE_FILENAME_CHARS = frozenset('\\/:*?"<>|')


def _filename_safe(text):
    """Return *text* with file-name-hostile characters replaced by ``_``."""
    return "".join(
        "_" if ch in _UNSAFE_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in str(text)
    )


def _write_text_report(txt_file, videos, url, category, crawled_at):
    """Write the human-readable listing of *videos* to *txt_file*."""
    with open(txt_file, "w", encoding="utf-8") as f:
        f.write("抖音创作指导 - 热门视频列表\n")
        f.write(f"页面: {url}\n")
        if category:
            f.write(f"分类: {category}\n")
        f.write(f"抓取时间: {crawled_at:%Y-%m-%d %H:%M:%S}\n")
        f.write(f"视频总数: {len(videos)}\n")
        f.write("=" * 80 + "\n\n")

        for video in videos:
            f.write(f"视频 {video['index']}:\n")
            f.write(f"  作者: {video.get('author', 'N/A')}\n")
            f.write(f"  描述: {video.get('description', 'N/A')}\n")

            # Optional fields are listed only when they carry a value.
            if video.get('duration'):
                f.write(f"  时长: {video['duration']}\n")

            if video.get('authorLink'):
                f.write(f"  作者链接: {video['authorLink']}\n")

            if video.get('hot'):
                f.write(f"  热度: {video['hot']}\n")
            if video.get('plays'):
                f.write(f"  播放量: {video['plays']}\n")
            if video.get('likes'):
                f.write(f"  点赞: {video['likes']}\n")
            if video.get('comments'):
                f.write(f"  评论: {video['comments']}\n")

            if video.get('hashTags'):
                f.write(f"  标签: {', '.join(video['hashTags'])}\n")

            if video.get('hotWords'):
                f.write(f"  热词: {', '.join(video['hotWords'])}\n")

            f.write("-" * 80 + "\n")


def _write_csv_report(csv_file, videos):
    """Write *videos* to *csv_file* as CSV with properly escaped fields."""
    import csv

    # newline="" lets the csv module own line endings, so newlines embedded
    # in quoted fields are written untouched; os.linesep keeps the platform's
    # usual row terminator. utf-8-sig adds a BOM at the start of the file.
    with open(csv_file, "w", encoding="utf-8-sig", newline="") as f:
        f.write("序号,作者,描述,时长,热度,播放量,点赞,评论,标签,作者链接" + os.linesep)
        # QUOTE_NONNUMERIC keeps the layout of a bare numeric index followed
        # by quoted text fields, and doubles any quote inside a field.
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator=os.linesep)
        for video in videos:
            writer.writerow([
                video['index'],
                str(video.get("author", "")),
                str(video.get("description", "")),
                str(video.get("duration", "")),
                str(video.get("hot", "")),
                str(video.get("plays", "")),
                str(video.get("likes", "")),
                str(video.get("comments", "")),
                '|'.join(video.get('hashTags', [])),
                str(video.get("authorLink", "")),
            ])


def save_results(videos, url, output_path, category=None):
    """Save the scraped videos as JSON, plain-text and CSV files.

    The three files share one base name made of a fixed prefix, the category
    (when given, with characters unsuitable for file names replaced by
    ``_``) and a timestamp; they differ only in extension.

    Args:
        videos: List of video dicts. ``index`` is required on each; all
            other keys are optional.
        url: Page URL recorded in the JSON and text outputs.
        output_path: Existing directory to write into (``Path`` or ``str``).
        category: Optional category label recorded in the outputs and
            included in the file names.

    Raises:
        OSError: If a file cannot be written.
    """
    # One timestamp for the file names and the recorded crawl time, so that
    # all outputs agree with each other.
    crawled_at = datetime.now()
    category_suffix = f"_{_filename_safe(category)}" if category else ""

    json_file = Path(output_path) / (
        f"douyin_creative_guidance{category_suffix}_{crawled_at:%Y%m%d_%H%M%S}.json"
    )
    result_data = {
        'page_url': url,
        'page_title': '抖音创作指导 - 热门视频',
        'category': category or '全部',
        'platform': 'douyin',
        'crawl_time': crawled_at.isoformat(),
        'total_videos': len(videos),
        'videos': videos
    }

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(result_data, f, ensure_ascii=False, indent=2)
    print(f"✓ JSON已保存: {json_file}")

    txt_file = json_file.with_suffix('.txt')
    _write_text_report(txt_file, videos, url, category, crawled_at)
    print(f"✓ 文本已保存: {txt_file}")

    csv_file = json_file.with_suffix('.csv')
    _write_csv_report(csv_file, videos)
    print(f"✓ CSV已保存: {csv_file}\n")

    print("统计:")
    print(f"  总视频数: {len(videos)}")
    print(f"  有标签: {sum(1 for v in videos if v.get('hashTags'))}")
    print(f"  有互动数据: {sum(1 for v in videos if v.get('plays') or v.get('likes'))}")


def _run_guarded(coro):
    """Run *coro* to completion, printing Ctrl+C and errors instead of raising."""
    try:
        asyncio.run(coro)
    except KeyboardInterrupt:
        print("\n已中断")
    except Exception as e:
        print(f"\n错误: {e}")
        import traceback
        traceback.print_exc()


def main(argv=None):
    """Command-line entry point: parse options, then list categories or crawl.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="抖音创作指导页面爬虫 (Playwright版)")
    parser.add_argument("--output-dir", "-o", default="douyin_data", help="输出目录")
    parser.add_argument("--headless", action="store_true", help="无头模式（不显示浏览器）")
    parser.add_argument("--manual", "-m", action="store_true", help="手动模式：等待用户登录后按回车继续")
    parser.add_argument("--cookie", "-c", help="Cookie文件路径（JSON格式）")
    parser.add_argument("--category", "-t", help="分类标签")
    parser.add_argument("--debug", "-d", action="store_true", help="调试模式：显示页面HTML结构")
    parser.add_argument("--list-categories", "-l", action="store_true", help="列出所有可用的分类")
    args = parser.parse_args(argv)

    print("=" * 80)
    print("抖音创作指导页面爬虫 (Playwright版)")
    print("=" * 80)
    print()

    # Listing categories is a separate, self-contained mode.
    if args.list_categories:
        _run_guarded(list_categories(cookie_file=args.cookie))
        return

    if args.cookie:
        if Path(args.cookie).exists():
            print(f"✓ 将使用Cookie文件: {args.cookie}")
        else:
            print(f"⚠ Cookie文件不存在: {args.cookie}")
        print()

    if args.category:
        print(f"✓ 目标分类: {args.category}")
        print()

    # Manual mode asks the user to act inside the browser window, which is
    # impossible when the browser is headless, so --manual wins.
    headless = args.headless
    if args.manual and headless:
        print("⚠ 手动模式需要可见的浏览器窗口，已忽略 --headless")
        print()
        headless = False

    if not headless:
        print("⚠ 浏览器模式已启用")
        if args.manual:
            print("   手动模式：登录后按回车键继续抓取")
        else:
            print("   自动模式：将自动等待10秒后抓取")
        print()

    _run_guarded(crawl_creative_guidance(
        output_dir=args.output_dir,
        headless=headless,
        manual_mode=args.manual,
        cookie_file=args.cookie,
        category=args.category,
        debug_mode=args.debug,
    ))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()