From 474ff816c15330d0c7c2a09902cad47d1df2afd6 Mon Sep 17 00:00:00 2001 From: digouyou <2074920584@qq.com> Date: Sun, 26 Apr 2026 20:50:21 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- readme.md | 41 ++++++++++++++++++++++++-------------- source/run_pipeline.py | 45 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 19 deletions(-) diff --git a/readme.md b/readme.md index 6693684..956475e 100644 --- a/readme.md +++ b/readme.md @@ -1,11 +1,37 @@ # B站关注清理工具(优化版) +> 一键命令运行全流程:`python source/run_pipeline.py` + 本项目保留并聚焦一条可用功能链: 1. 抓取视频标题 2. 分批AI分析 3. 生成取关UID(支持按100拆分) 4. 生成保留关注报告 +5. 按首字母排序 +6. 提取分组信息 + +## 快速开始 + +```powershell +# 完整流程(推荐) +python source/run_pipeline.py + +# 速度优先 +python source/run_pipeline.py --workers 8 --batch-size 30 --sleep-seconds 0 + +# 试跑30个UP +python source/run_pipeline.py --max-ups 30 + +# 跳过抓取,使用已有标题报告 +python source/run_pipeline.py --skip-fetch + +# 跳过分析,仅生成产物 +python source/run_pipeline.py --skip-analyze + +# 跳过排序/分组 +python source/run_pipeline.py --skip-sort --skip-group +``` ## 目录结构 @@ -46,21 +72,6 @@ VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3" `batch_ai_summary_from_report.py` 会自动读取该配置。 -## 一键推荐用法 - -在项目根目录运行: - -```powershell -python source/run_pipeline.py -``` - -默认会完成: - -1. 从 [source/resources/export_uids.json](source/resources/export_uids.json) 抓取标题到 [source/output/reports/up_titles_report.md](source/output/reports/up_titles_report.md) -2. 分批分析到 [source/output/reports/up_analysis_full_auto.md](source/output/reports/up_analysis_full_auto.md) -3. 生成保留关注报告 [source/output/reports/up_keep_follow_only.md](source/output/reports/up_keep_follow_only.md) -4. 
生成取关UID [source/output/uids/unfollow_mids_list.txt](source/output/uids/unfollow_mids_list.txt) 并按100拆分 - ## 常用参数 ```powershell diff --git a/source/run_pipeline.py b/source/run_pipeline.py index 7178ae5..472a00f 100644 --- a/source/run_pipeline.py +++ b/source/run_pipeline.py @@ -6,6 +6,14 @@ Pipeline outputs: 2) source/output/reports/up_analysis_full_auto.md 3) source/output/reports/up_keep_follow_only.md 4) source/output/uids/unfollow_mids_list.txt (+ split files) + +Pipeline steps: +1) 抓取视频标题 (analyze_up_content.py) +2) 分批AI分析 (batch_ai_summary_from_report.py) +3) 生成保留关注报告 (extract_keep_follow_doc.py) +4) 生成取关UID列表 (extract_unfollow_list.py) +5) 按首字母排序 (sort_up_main.py) +6) 提取分组信息 (extract_group_info.py) """ from __future__ import annotations @@ -43,6 +51,11 @@ def parse_args() -> argparse.Namespace: default="source/output/uids/unfollow_mids_list.txt", help="取关UID输出路径", ) + parser.add_argument( + "--group_info", + default="source/output/uids/only_group_info.md", + help="分组信息输出路径", + ) parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量") parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数") parser.add_argument("--workers", type=int, default=6, help="并发请求数") @@ -57,6 +70,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--bili-cookie", default="", help="可选:运行时传入B站Cookie") parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告") parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取") + parser.add_argument("--skip-sort", action="store_true", help="跳过排序阶段") + parser.add_argument("--skip-group", action="store_true", help="跳过提取分组阶段") parser.add_argument("--python", default=sys.executable, help="指定Python解释器") return parser.parse_args() @@ -103,7 +118,7 @@ def main() -> int: if args.bili_cookie: fetch_cmd += ["--bili-cookie", args.bili_cookie] - run_cmd(fetch_cmd, "步骤1/4 抓取视频标题") + run_cmd(fetch_cmd, "步骤1/6 抓取视频标题") if not args.skip_analyze: analyze_cmd = [ 
@@ -125,7 +140,7 @@ def main() -> int:
             "--sleep-seconds",
             str(max(0.0, args.sleep_seconds)),
         ]
-        run_cmd(analyze_cmd, "步骤2/4 分批AI分析")
+        run_cmd(analyze_cmd, "步骤2/6 分批AI分析")
 
     keep_cmd = [
         args.python,
@@ -135,7 +150,7 @@ def main() -> int:
         "--output-report",
         args.keep_report,
     ]
-    run_cmd(keep_cmd, "步骤3/4 生成保留关注报告")
+    run_cmd(keep_cmd, "步骤3/6 生成保留关注报告")
 
     uid_cmd = [
         args.python,
@@ -149,7 +164,29 @@ def main() -> int:
         "--split-size",
         str(max(0, args.split_size)),
     ]
-    run_cmd(uid_cmd, "步骤4/4 生成取关UID列表")
+    run_cmd(uid_cmd, "步骤4/6 生成取关UID列表")
+
+    if not args.skip_sort:
+        sort_cmd = [
+            args.python,
+            "source/sort_up_main.py",
+            "--input",
+            args.analysis_report,
+            "--output",
+            args.analysis_report,  # same path as --input: the report is rewritten in place
+        ]
+        run_cmd(sort_cmd, "步骤5/6 按首字母排序")
+
+    if not args.skip_group:
+        group_cmd = [
+            args.python,
+            "source/extract_group_info.py",
+            "--input",
+            args.analysis_report,
+            "--output",
+            args.group_info,  # write to the --group_info path so the analysis report is not overwritten
+        ]
+        run_cmd(group_cmd, "步骤6/6 提取分组信息")
 
     print("\n流水线完成。")
     print(f"- 标题报告: {args.titles_report}")