#!/usr/bin/env python3 """One-command pipeline: fetch titles -> batch analyze -> outputs. Pipeline outputs: 1) source/output/reports/1_up_titles_report.md 2) source/output/reports/2_up_analysis_full_auto.md 3) source/output/reports/3_up_keep_follow_only.md 4) source/output/uids/4_unfollow_mids_list.txt (+ split files) Pipeline steps: 1) 抓取视频标题 (analyze_up_content.py) 2) 分批AI分析 (batch_ai_summary_from_report.py) 3) 生成保留关注报告 (extract_keep_follow_doc.py) 4) 生成取关UID列表 (extract_unfollow_list.py) 5) 按首字母排序 (sort_up_main.py) 6) 提取分组信息 (extract_group_info.py) """ from __future__ import annotations import argparse import subprocess import sys from pathlib import Path def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="一键运行完整功能链") parser.add_argument( "--input-json", default="source/resources/export_uids.json", help="UP资源文件路径,默认: source/resources/export_uids.json", ) parser.add_argument( "--titles-report", default="source/output/reports/1_up_titles_report.md", help="标题抓取报告输出路径", ) parser.add_argument( "--analysis-report", default="source/output/reports/2_up_analysis_full_auto.md", help="分批分析报告输出路径", ) parser.add_argument( "--keep-report", default="source/output/reports/3_up_keep_follow_only.md", help="保留关注报告输出路径", ) parser.add_argument( "--unfollow-uids", default="source/output/uids/4_unfollow_mids_list.txt", help="取关UID输出路径", ) parser.add_argument( "--group_info", default="source/output/uids/only_group_info.md", help="分组信息输出路径", ) parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量") parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数") parser.add_argument("--workers", type=int, default=6, help="并发请求数") parser.add_argument("--max-retries", type=int, default=2, help="单条分析重试次数") parser.add_argument("--request-timeout", type=float, default=60.0, help="单次请求超时") parser.add_argument("--split-size", type=int, default=100, help="取关UID拆分分组大小") parser.add_argument("--sleep-seconds", type=float, default=0.0, help="任务间隔秒数") parser.add_argument("--retry-times", type=int, default=3, help="抓取重试次数") parser.add_argument("--fetch-mode", choices=["auto", "api", "html"], default="auto", help="标题抓取模式") parser.add_argument("--only-tag", default="", help="可选:仅处理包含该标签的UP") parser.add_argument("--max-ups", type=int, default=0, help="可选:限制处理UP数量") parser.add_argument("--bili-cookie", default="", help="可选:运行时传入B站Cookie") parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告") parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取") parser.add_argument("--skip-sort", action="store_true", help="跳过排序阶段") parser.add_argument("--skip-group", action="store_true", help="跳过提取分组阶段") parser.add_argument("--python", default=sys.executable, help="指定Python解释器") return parser.parse_args() def run_cmd(cmd: list[str], title: str) -> None: print(f"\n=== {title} ===") print("$", " ".join(cmd)) subprocess.run(cmd, check=True) def main() -> int: args = parse_args() for p in [ Path(args.titles_report).parent, Path(args.analysis_report).parent, Path(args.keep_report).parent, Path(args.unfollow_uids).parent, ]: p.mkdir(parents=True, exist_ok=True) if not args.skip_fetch: fetch_cmd = [ args.python, "source/scripts/analyze_up_content.py", "--input", args.input_json, "--output", args.titles_report, "--titles-per-up", str(max(1, args.titles_per_up)), "--retry-times", str(max(1, args.retry_times)), "--fetch-mode", args.fetch_mode, "--sleep-seconds", str(max(0.0, args.sleep_seconds)), "--skip-ai", ] if args.only_tag: fetch_cmd += ["--only-tag", args.only_tag] if args.max_ups > 0: fetch_cmd += ["--max-ups", str(args.max_ups)] if args.bili_cookie: fetch_cmd += ["--bili-cookie", args.bili_cookie] run_cmd(fetch_cmd, "步骤1/6 抓取视频标题") if not args.skip_analyze: analyze_cmd = [ args.python, "source/scripts/batch_ai_summary_from_report.py", "--input-report", args.titles_report, "--output-report", args.analysis_report, "--batch-size", str(max(1, args.batch_size)), "--run-all-batches", "--workers", str(max(1, args.workers)), "--max-retries", str(max(1, args.max_retries)), "--request-timeout", str(max(1.0, args.request_timeout)), "--sleep-seconds", str(max(0.0, args.sleep_seconds)), ] run_cmd(analyze_cmd, "步骤2/6 分批AI分析") keep_cmd = [ args.python, "source/scripts/extract_keep_follow_doc.py", "--input-report", args.analysis_report, "--output-report", args.keep_report, ] run_cmd(keep_cmd, "步骤3/6 生成保留关注报告") uid_cmd = [ args.python, "source/scripts/extract_unfollow_list.py", "--input-report", args.analysis_report, "--output-csv", args.unfollow_uids, "--format", "mid-only", "--split-size", str(max(0, args.split_size)), ] run_cmd(uid_cmd, "步骤4/6 生成取关UID列表") sorted_report = "source/output/reports/5_sorted_up_analysis.md" group_report = "source/output/reports/6_group_info.md" if not args.skip_sort: sort_cmd = [ args.python, "source/scripts/sort_up_main.py", "--input", args.analysis_report, "--output", sorted_report, ] run_cmd(sort_cmd, "步骤5/6 按首字母排序") if not args.skip_group: input_for_group = sorted_report if not args.skip_sort else args.analysis_report group_cmd = [ args.python, "source/scripts/extract_group_info.py", "--input", input_for_group, "--output", group_report, ] run_cmd(group_cmd, "步骤6/6 提取分组信息") print("\n流水线完成。") print(f"- 1 标题报告: {args.titles_report}") print(f"- 2 分析报告: {args.analysis_report}") print(f"- 3 保留报告: {args.keep_report}") print(f"- 4 取关UID: {args.unfollow_uids}") if not args.skip_sort: print(f"- 5 排序报告: {sorted_report}") if not args.skip_group: print(f"- 6 分组报告: {group_report}") return 0 if __name__ == "__main__": raise SystemExit(main())