Files
bili_follow_group/source/run_pipeline.py
digouyou b34239f5ea first_test
Co-authored-by: Copilot <copilot@github.com>
2026-04-26 19:26:17 +08:00

164 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""One-command pipeline: fetch titles -> batch analyze -> outputs.
Pipeline outputs:
1) source/output/reports/up_titles_report.md
2) source/output/reports/up_analysis_full_auto.md
3) source/output/reports/up_keep_follow_only.md
4) source/output/uids/unfollow_mids_list.txt (+ split files)
"""
from __future__ import annotations
import argparse
import subprocess
import sys
from pathlib import Path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="一键运行完整功能链")
parser.add_argument(
"--input-json",
default="source/resources/export_uids.json",
help="UP资源文件路径默认: source/resources/export_uids.json",
)
parser.add_argument(
"--titles-report",
default="source/output/reports/up_titles_report.md",
help="标题抓取报告输出路径",
)
parser.add_argument(
"--analysis-report",
default="source/output/reports/up_analysis_full_auto.md",
help="分批分析报告输出路径",
)
parser.add_argument(
"--keep-report",
default="source/output/reports/up_keep_follow_only.md",
help="保留关注报告输出路径",
)
parser.add_argument(
"--unfollow-uids",
default="source/output/uids/unfollow_mids_list.txt",
help="取关UID输出路径",
)
parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量")
parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数")
parser.add_argument("--workers", type=int, default=6, help="并发请求数")
parser.add_argument("--max-retries", type=int, default=2, help="单条分析重试次数")
parser.add_argument("--request-timeout", type=float, default=60.0, help="单次请求超时")
parser.add_argument("--split-size", type=int, default=100, help="取关UID拆分分组大小")
parser.add_argument("--sleep-seconds", type=float, default=0.0, help="任务间隔秒数")
parser.add_argument("--retry-times", type=int, default=3, help="抓取重试次数")
parser.add_argument("--fetch-mode", choices=["auto", "api", "html"], default="auto", help="标题抓取模式")
parser.add_argument("--only-tag", default="", help="可选仅处理包含该标签的UP")
parser.add_argument("--max-ups", type=int, default=0, help="可选限制处理UP数量")
parser.add_argument("--bili-cookie", default="", help="可选运行时传入B站Cookie")
parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告")
parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取")
parser.add_argument("--python", default=sys.executable, help="指定Python解释器")
return parser.parse_args()
def run_cmd(
    cmd: list[str],
    title: str,
    *,
    secret_flags: tuple[str, ...] = ("--bili-cookie",),
) -> None:
    """Echo one pipeline step and run its command, raising if it fails.

    Args:
        cmd: Program and arguments, handed to ``subprocess.run`` as a list
            (no shell is involved).
        title: Heading printed before the command line.
        secret_flags: Flags whose *following* argument is shown as ``***``
            in the echoed line. Only the echo is masked; the child process
            still receives the real value.

    Raises:
        subprocess.CalledProcessError: The command exited with a non-zero
            status (``check=True``).
    """
    import shlex  # local import: the module's import block does not provide it

    shown: list[str] = []
    hide_next = False
    for token in cmd:
        if hide_next:
            # Value of a secret flag (e.g. the cookie): keep it out of logs.
            shown.append("***")
            hide_next = False
        else:
            # Quote so arguments containing spaces stay unambiguous.
            shown.append(shlex.quote(token))
            hide_next = token in secret_flags

    # Flush explicitly: when stdout is a pipe or file, Python buffers these
    # lines while the child writes straight to the descriptor, so without a
    # flush the heading could show up after the child's own output.
    print(f"\n=== {title} ===", flush=True)
    print("$", " ".join(shown), flush=True)
    subprocess.run(cmd, check=True)
def main() -> int:
    """Run the pipeline stages in order and return exit status 0.

    Stages, each launched through ``run_cmd`` with the interpreter given by
    ``--python``:

    1. ``source/analyze_up_content.py`` (skipped with ``--skip-fetch``)
    2. ``source/batch_ai_summary_from_report.py`` (skipped with
       ``--skip-analyze``)
    3. ``source/extract_keep_follow_doc.py``
    4. ``source/extract_unfollow_list.py``

    Script paths are relative, so they are resolved against the current
    working directory. ``run_cmd`` runs each command with ``check=True``;
    a failing stage therefore raises and the later stages do not run.
    """
    opts = parse_args()

    # Make sure the directory of every output file exists up front.
    output_dirs = [
        Path(target).parent
        for target in (
            opts.titles_report,
            opts.analysis_report,
            opts.keep_report,
            opts.unfollow_uids,
        )
    ]
    for directory in output_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    interpreter = opts.python
    # Shared by the fetch and analysis stages; negative values clamp to 0.
    pause = str(max(0.0, opts.sleep_seconds))

    if not opts.skip_fetch:
        fetch_cmd = [
            interpreter, "source/analyze_up_content.py",
            "--input", opts.input_json,
            "--output", opts.titles_report,
            "--titles-per-up", str(max(1, opts.titles_per_up)),
            "--retry-times", str(max(1, opts.retry_times)),
            "--fetch-mode", opts.fetch_mode,
            "--sleep-seconds", pause,
            "--skip-ai",
        ]
        # Optional filters are forwarded only when the user supplied them.
        if opts.only_tag:
            fetch_cmd.extend(["--only-tag", opts.only_tag])
        if opts.max_ups > 0:
            fetch_cmd.extend(["--max-ups", str(opts.max_ups)])
        if opts.bili_cookie:
            fetch_cmd.extend(["--bili-cookie", opts.bili_cookie])
        run_cmd(fetch_cmd, "步骤1/4 抓取视频标题")

    if not opts.skip_analyze:
        analyze_cmd = [
            interpreter, "source/batch_ai_summary_from_report.py",
            "--input-report", opts.titles_report,
            "--output-report", opts.analysis_report,
            "--batch-size", str(max(1, opts.batch_size)),
            "--run-all-batches",
            "--workers", str(max(1, opts.workers)),
            "--max-retries", str(max(1, opts.max_retries)),
            "--request-timeout", str(max(1.0, opts.request_timeout)),
            "--sleep-seconds", pause,
        ]
        run_cmd(analyze_cmd, "步骤2/4 分批AI分析")

    # The two extraction stages always run, reading the analysis report.
    keep_cmd = [
        interpreter, "source/extract_keep_follow_doc.py",
        "--input-report", opts.analysis_report,
        "--output-report", opts.keep_report,
    ]
    run_cmd(keep_cmd, "步骤3/4 生成保留关注报告")

    uid_cmd = [
        interpreter, "source/extract_unfollow_list.py",
        "--input-report", opts.analysis_report,
        "--output-csv", opts.unfollow_uids,
        "--format", "mid-only",
        "--split-size", str(max(0, opts.split_size)),
    ]
    run_cmd(uid_cmd, "步骤4/4 生成取关UID列表")

    # Final summary of where each artifact was written.
    print("\n流水线完成。")
    print(f"- 标题报告: {opts.titles_report}")
    print(f"- 分析报告: {opts.analysis_report}")
    print(f"- 保留报告: {opts.keep_report}")
    print(f"- 取关UID: {opts.unfollow_uids}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())