Files
bili_follow_group/source/scripts/run_pipeline.py

209 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""One-command pipeline: fetch titles -> batch analyze -> outputs.
Pipeline outputs (default paths):
1) source/output/reports/1_up_titles_report.md
2) source/output/reports/2_up_analysis_full_auto.md
3) source/output/reports/3_up_keep_follow_only.md
4) source/output/uids/4_unfollow_mids_list.txt (+ split files)
5) source/output/reports/5_sorted_up_analysis.md (unless --skip-sort)
6) group info report (unless --skip-group)
Pipeline steps:
1) Fetch video titles (analyze_up_content.py)
2) Batch AI analysis (batch_ai_summary_from_report.py)
3) Generate the keep-following report (extract_keep_follow_doc.py)
4) Generate the unfollow UID list (extract_unfollow_list.py)
5) Sort by first letter (sort_up_main.py)
6) Extract group info (extract_group_info.py)
"""
from __future__ import annotations
import argparse
import shlex
import subprocess
import sys
from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the one-command pipeline.

    Options are read from ``sys.argv``. Numeric options are returned exactly
    as given; no range checking happens here.

    The ``--group_info`` default matches the location the pipeline has always
    written the step-6 report to
    (``source/output/reports/6_group_info.md``), so the declared default and
    the actual output agree.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="一键运行完整功能链")

    # --- Input and output locations -------------------------------------
    parser.add_argument(
        "--input-json",
        default="source/resources/export_uids.json",
        help="UP资源文件路径默认: source/resources/export_uids.json",
    )
    parser.add_argument(
        "--titles-report",
        default="source/output/reports/1_up_titles_report.md",
        help="标题抓取报告输出路径",
    )
    parser.add_argument(
        "--analysis-report",
        default="source/output/reports/2_up_analysis_full_auto.md",
        help="分批分析报告输出路径",
    )
    parser.add_argument(
        "--keep-report",
        default="source/output/reports/3_up_keep_follow_only.md",
        help="保留关注报告输出路径",
    )
    parser.add_argument(
        "--unfollow-uids",
        default="source/output/uids/4_unfollow_mids_list.txt",
        help="取关UID输出路径",
    )
    # The flag keeps its historical underscore spelling so existing
    # invocations continue to work.
    parser.add_argument(
        "--group_info",
        default="source/output/reports/6_group_info.md",
        help="分组信息输出路径",
    )

    # --- Tuning knobs forwarded to the step scripts ---------------------
    parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量")
    parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数")
    parser.add_argument("--workers", type=int, default=6, help="并发请求数")
    parser.add_argument("--max-retries", type=int, default=2, help="单条分析重试次数")
    parser.add_argument("--request-timeout", type=float, default=60.0, help="单次请求超时")
    parser.add_argument("--split-size", type=int, default=100, help="取关UID拆分分组大小")
    parser.add_argument("--sleep-seconds", type=float, default=0.0, help="任务间隔秒数")
    parser.add_argument("--retry-times", type=int, default=3, help="抓取重试次数")
    parser.add_argument("--fetch-mode", choices=["auto", "api", "html"], default="auto", help="标题抓取模式")
    parser.add_argument("--only-tag", default="", help="可选仅处理包含该标签的UP")
    parser.add_argument("--max-ups", type=int, default=0, help="可选限制处理UP数量")
    parser.add_argument("--bili-cookie", default="", help="可选运行时传入B站Cookie")

    # --- Step toggles ---------------------------------------------------
    parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告")
    parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取")
    parser.add_argument("--skip-sort", action="store_true", help="跳过排序阶段")
    parser.add_argument("--skip-group", action="store_true", help="跳过提取分组阶段")

    # Interpreter used to launch every step script.
    parser.add_argument("--python", default=sys.executable, help="指定Python解释器")
    return parser.parse_args()
def run_cmd(
    cmd: list[str],
    title: str,
    *,
    secret_flags: tuple[str, ...] = ("--bili-cookie",),
) -> None:
    """Print a step banner and the command line, then run the command.

    The echoed command line is shell-quoted so it can be copy-pasted, and the
    value of any flag listed in ``secret_flags`` is shown as ``***`` so that
    credentials do not end up in terminal scrollback or log files. Only the
    echo is masked; the child process receives ``cmd`` unchanged.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).
        title: Step label shown inside the ``=== ... ===`` banner.
        secret_flags: Flags whose value (the following argument, or the part
            after ``=`` in ``flag=value`` form) is masked in the echo.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    shown: list[str] = []
    hide_next = False
    for arg in cmd:
        if hide_next:
            shown.append("***")
            hide_next = False
            continue
        flag, sep, _ = arg.partition("=")
        if sep and flag in secret_flags:
            shown.append(f"{flag}=***")
        else:
            shown.append(shlex.quote(arg))
            hide_next = arg in secret_flags

    # Flush before spawning the child: when stdout is a pipe or file, Python
    # buffers these lines and they would otherwise appear after the child's
    # own output.
    print(f"\n=== {title} ===", flush=True)
    print("$", " ".join(shown), flush=True)
    subprocess.run(cmd, check=True)
def main() -> int:
    """Run the pipeline steps in order, each as a child Python process.

    Steps 1, 2, 5 and 6 can be disabled with the ``--skip-*`` switches;
    steps 3 and 4 always run. Every step script is launched through
    ``run_cmd`` with the interpreter given by ``--python``. Script paths and
    default output paths are relative to the current working directory.

    The step-6 output path comes from ``--group_info``. The step-5 output
    path is fixed. Parent directories of every output that will be written
    are created up front.

    Returns:
        0 once every executed step has finished.

    Raises:
        subprocess.CalledProcessError: Propagated from ``run_cmd`` when a
            step exits with a non-zero status; later steps are not run.
    """
    args = parse_args()

    # Step 5 has no CLI option for its output; step 6 honours --group_info.
    sorted_report = "source/output/reports/5_sorted_up_analysis.md"
    group_report = args.group_info

    # Make sure every output location exists before any step tries to write.
    output_paths = [
        args.titles_report,
        args.analysis_report,
        args.keep_report,
        args.unfollow_uids,
    ]
    if not args.skip_sort:
        output_paths.append(sorted_report)
    if not args.skip_group:
        output_paths.append(group_report)
    for out_path in output_paths:
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    if not args.skip_fetch:
        # Numeric options are clamped to their minimum sensible value before
        # being forwarded. --skip-ai is always passed at this stage.
        fetch_cmd = [
            args.python,
            "source/scripts/analyze_up_content.py",
            "--input",
            args.input_json,
            "--output",
            args.titles_report,
            "--titles-per-up",
            str(max(1, args.titles_per_up)),
            "--retry-times",
            str(max(1, args.retry_times)),
            "--fetch-mode",
            args.fetch_mode,
            "--sleep-seconds",
            str(max(0.0, args.sleep_seconds)),
            "--skip-ai",
        ]
        # Optional filters are only forwarded when the user supplied them.
        if args.only_tag:
            fetch_cmd += ["--only-tag", args.only_tag]
        if args.max_ups > 0:
            fetch_cmd += ["--max-ups", str(args.max_ups)]
        if args.bili_cookie:
            fetch_cmd += ["--bili-cookie", args.bili_cookie]
        run_cmd(fetch_cmd, "步骤1/6 抓取视频标题")

    if not args.skip_analyze:
        analyze_cmd = [
            args.python,
            "source/scripts/batch_ai_summary_from_report.py",
            "--input-report",
            args.titles_report,
            "--output-report",
            args.analysis_report,
            "--batch-size",
            str(max(1, args.batch_size)),
            "--run-all-batches",
            "--workers",
            str(max(1, args.workers)),
            "--max-retries",
            str(max(1, args.max_retries)),
            "--request-timeout",
            str(max(1.0, args.request_timeout)),
            "--sleep-seconds",
            str(max(0.0, args.sleep_seconds)),
        ]
        run_cmd(analyze_cmd, "步骤2/6 分批AI分析")

    # Steps 3 and 4 both read the analysis report and always run.
    keep_cmd = [
        args.python,
        "source/scripts/extract_keep_follow_doc.py",
        "--input-report",
        args.analysis_report,
        "--output-report",
        args.keep_report,
    ]
    run_cmd(keep_cmd, "步骤3/6 生成保留关注报告")

    uid_cmd = [
        args.python,
        "source/scripts/extract_unfollow_list.py",
        "--input-report",
        args.analysis_report,
        "--output-csv",
        args.unfollow_uids,
        "--format",
        "mid-only",
        "--split-size",
        str(max(0, args.split_size)),
    ]
    run_cmd(uid_cmd, "步骤4/6 生成取关UID列表")

    if not args.skip_sort:
        sort_cmd = [
            args.python,
            "source/scripts/sort_up_main.py",
            "--input",
            args.analysis_report,
            "--output",
            sorted_report,
        ]
        run_cmd(sort_cmd, "步骤5/6 按首字母排序")

    if not args.skip_group:
        # Use the sorted report when step 5 ran; otherwise fall back to the
        # unsorted analysis report.
        input_for_group = args.analysis_report if args.skip_sort else sorted_report
        group_cmd = [
            args.python,
            "source/scripts/extract_group_info.py",
            "--input",
            input_for_group,
            "--output",
            group_report,
        ]
        run_cmd(group_cmd, "步骤6/6 提取分组信息")

    # Final summary of the configured output paths.
    print("\n流水线完成。")
    print(f"- 1 标题报告: {args.titles_report}")
    print(f"- 2 分析报告: {args.analysis_report}")
    print(f"- 3 保留报告: {args.keep_report}")
    print(f"- 4 取关UID: {args.unfollow_uids}")
    if not args.skip_sort:
        print(f"- 5 排序报告: {sorted_report}")
    if not args.skip_group:
        print(f"- 6 分组报告: {group_report}")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())