更新pipeline

This commit is contained in:
2026-04-26 20:50:21 +08:00
parent 27d6df9d8b
commit 474ff816c1
2 changed files with 67 additions and 19 deletions

View File

@@ -6,6 +6,14 @@ Pipeline outputs:
2) source/output/reports/up_analysis_full_auto.md
3) source/output/reports/up_keep_follow_only.md
4) source/output/uids/unfollow_mids_list.txt (+ split files)
Pipeline steps:
1) 抓取视频标题 (analyze_up_content.py)
2) 分批AI分析 (batch_ai_summary_from_report.py)
3) 生成保留关注报告 (extract_keep_follow_doc.py)
4) 生成取关UID列表 (extract_unfollow_list.py)
5) 按首字母排序 (sort_up_main.py)
6) 提取分组信息 (extract_group_info.py)
"""
from __future__ import annotations
@@ -43,6 +51,11 @@ def parse_args() -> argparse.Namespace:
default="source/output/uids/unfollow_mids_list.txt",
help="取关UID输出路径",
)
parser.add_argument(
"--group_info",
default="source/output/uids/only_group_info.md",
help="分组信息输出路径",
)
parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量")
parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数")
parser.add_argument("--workers", type=int, default=6, help="并发请求数")
@@ -57,6 +70,8 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--bili-cookie", default="", help="可选运行时传入B站Cookie")
parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告")
parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取")
parser.add_argument("--skip-sort", action="store_true", help="跳过排序阶段")
parser.add_argument("--skip-group", action="store_true", help="跳过提取分组阶段")
parser.add_argument("--python", default=sys.executable, help="指定Python解释器")
return parser.parse_args()
@@ -103,7 +118,7 @@ def main() -> int:
if args.bili_cookie:
fetch_cmd += ["--bili-cookie", args.bili_cookie]
run_cmd(fetch_cmd, "步骤1/4 抓取视频标题")
run_cmd(fetch_cmd, "步骤1/6 抓取视频标题")
if not args.skip_analyze:
analyze_cmd = [
@@ -125,7 +140,7 @@ def main() -> int:
"--sleep-seconds",
str(max(0.0, args.sleep_seconds)),
]
run_cmd(analyze_cmd, "步骤2/4 分批AI分析")
run_cmd(analyze_cmd, "步骤2/6 分批AI分析")
keep_cmd = [
args.python,
@@ -135,7 +150,7 @@ def main() -> int:
"--output-report",
args.keep_report,
]
run_cmd(keep_cmd, "步骤3/4 生成保留关注报告")
run_cmd(keep_cmd, "步骤3/6 生成保留关注报告")
uid_cmd = [
args.python,
@@ -149,7 +164,29 @@ def main() -> int:
"--split-size",
str(max(0, args.split_size)),
]
run_cmd(uid_cmd, "步骤4/4 生成取关UID列表")
run_cmd(uid_cmd, "步骤4/6 生成取关UID列表")
if not args.skip_sort:
sort_cmd = [
args.python,
"source/sort_up_main.py",
"--input",
args.analysis_report,
"--output",
args.analysis_report,
]
run_cmd(sort_cmd, "步骤5/6 按首字母排序")
if not args.skip_group:
group_cmd = [
args.python,
"source/extract_group_info.py",
"--input",
args.analysis_report,
"--output",
args.group_info,
]
run_cmd(group_cmd, "步骤6/6 提取分组信息")
print("\n流水线完成。")
print(f"- 标题报告: {args.titles_report}")