更新pipeline
This commit is contained in:
41
readme.md
41
readme.md
@@ -1,11 +1,37 @@
|
|||||||
# B站关注清理工具(优化版)
|
# B站关注清理工具(优化版)
|
||||||
|
|
||||||
|
> 一键命令运行全流程:`python source/run_pipeline.py`
|
||||||
|
|
||||||
本项目保留并聚焦一条可用功能链:
|
本项目保留并聚焦一条可用功能链:
|
||||||
|
|
||||||
1. 抓取视频标题
|
1. 抓取视频标题
|
||||||
2. 分批AI分析
|
2. 分批AI分析
|
||||||
3. 生成取关UID(支持按100拆分)
|
3. 生成取关UID(支持按100拆分)
|
||||||
4. 生成保留关注报告
|
4. 生成保留关注报告
|
||||||
|
5. 按首字母排序
|
||||||
|
6. 提取分组信息
|
||||||
|
|
||||||
|
## 快速开始
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
# 完整流程(推荐)
|
||||||
|
python source/run_pipeline.py
|
||||||
|
|
||||||
|
# 速度优先
|
||||||
|
python source/run_pipeline.py --workers 8 --batch-size 30 --sleep-seconds 0
|
||||||
|
|
||||||
|
# 试跑30个UP
|
||||||
|
python source/run_pipeline.py --max-ups 30
|
||||||
|
|
||||||
|
# 跳过抓取,使用已有标题报告
|
||||||
|
python source/run_pipeline.py --skip-fetch
|
||||||
|
|
||||||
|
# 跳过分析,仅生成产物
|
||||||
|
python source/run_pipeline.py --skip-analyze
|
||||||
|
|
||||||
|
# 跳过排序/分组
|
||||||
|
python source/run_pipeline.py --skip-sort --skip-group
|
||||||
|
```
|
||||||
|
|
||||||
## 目录结构
|
## 目录结构
|
||||||
|
|
||||||
@@ -46,21 +72,6 @@ VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
|
|||||||
|
|
||||||
`batch_ai_summary_from_report.py` 会自动读取该配置。
|
`batch_ai_summary_from_report.py` 会自动读取该配置。
|
||||||
|
|
||||||
## 一键推荐用法
|
|
||||||
|
|
||||||
在项目根目录运行:
|
|
||||||
|
|
||||||
```powershell
|
|
||||||
python source/run_pipeline.py
|
|
||||||
```
|
|
||||||
|
|
||||||
默认会完成:
|
|
||||||
|
|
||||||
1. 从 [source/resources/export_uids.json](source/resources/export_uids.json) 抓取标题到 [source/output/reports/up_titles_report.md](source/output/reports/up_titles_report.md)
|
|
||||||
2. 分批分析到 [source/output/reports/up_analysis_full_auto.md](source/output/reports/up_analysis_full_auto.md)
|
|
||||||
3. 生成保留关注报告 [source/output/reports/up_keep_follow_only.md](source/output/reports/up_keep_follow_only.md)
|
|
||||||
4. 生成取关UID [source/output/uids/unfollow_mids_list.txt](source/output/uids/unfollow_mids_list.txt) 并按100拆分
|
|
||||||
|
|
||||||
## 常用参数
|
## 常用参数
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
|
|||||||
@@ -6,6 +6,14 @@ Pipeline outputs:
|
|||||||
2) source/output/reports/up_analysis_full_auto.md
|
2) source/output/reports/up_analysis_full_auto.md
|
||||||
3) source/output/reports/up_keep_follow_only.md
|
3) source/output/reports/up_keep_follow_only.md
|
||||||
4) source/output/uids/unfollow_mids_list.txt (+ split files)
|
4) source/output/uids/unfollow_mids_list.txt (+ split files)
|
||||||
|
|
||||||
|
Pipeline steps:
|
||||||
|
1) 抓取视频标题 (analyze_up_content.py)
|
||||||
|
2) 分批AI分析 (batch_ai_summary_from_report.py)
|
||||||
|
3) 生成保留关注报告 (extract_keep_follow_doc.py)
|
||||||
|
4) 生成取关UID列表 (extract_unfollow_list.py)
|
||||||
|
5) 按首字母排序 (sort_up_main.py)
|
||||||
|
6) 提取分组信息 (extract_group_info.py)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -43,6 +51,11 @@ def parse_args() -> argparse.Namespace:
|
|||||||
default="source/output/uids/unfollow_mids_list.txt",
|
default="source/output/uids/unfollow_mids_list.txt",
|
||||||
help="取关UID输出路径",
|
help="取关UID输出路径",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--group_info",
|
||||||
|
default="source/output/uids/only_group_info.md",
|
||||||
|
help="分组信息输出路径",
|
||||||
|
)
|
||||||
parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量")
|
parser.add_argument("--titles-per-up", type=int, default=10, help="每个UP抓取标题数量")
|
||||||
parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数")
|
parser.add_argument("--batch-size", type=int, default=20, help="分批分析每批条数")
|
||||||
parser.add_argument("--workers", type=int, default=6, help="并发请求数")
|
parser.add_argument("--workers", type=int, default=6, help="并发请求数")
|
||||||
@@ -57,6 +70,8 @@ def parse_args() -> argparse.Namespace:
|
|||||||
parser.add_argument("--bili-cookie", default="", help="可选:运行时传入B站Cookie")
|
parser.add_argument("--bili-cookie", default="", help="可选:运行时传入B站Cookie")
|
||||||
parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告")
|
parser.add_argument("--skip-fetch", action="store_true", help="跳过抓取阶段,直接使用已有标题报告")
|
||||||
parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取")
|
parser.add_argument("--skip-analyze", action="store_true", help="跳过分析阶段,直接做产物提取")
|
||||||
|
parser.add_argument("--skip-sort", action="store_true", help="跳过排序阶段")
|
||||||
|
parser.add_argument("--skip-group", action="store_true", help="跳过提取分组阶段")
|
||||||
parser.add_argument("--python", default=sys.executable, help="指定Python解释器")
|
parser.add_argument("--python", default=sys.executable, help="指定Python解释器")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
@@ -103,7 +118,7 @@ def main() -> int:
|
|||||||
if args.bili_cookie:
|
if args.bili_cookie:
|
||||||
fetch_cmd += ["--bili-cookie", args.bili_cookie]
|
fetch_cmd += ["--bili-cookie", args.bili_cookie]
|
||||||
|
|
||||||
run_cmd(fetch_cmd, "步骤1/4 抓取视频标题")
|
run_cmd(fetch_cmd, "步骤1/6 抓取视频标题")
|
||||||
|
|
||||||
if not args.skip_analyze:
|
if not args.skip_analyze:
|
||||||
analyze_cmd = [
|
analyze_cmd = [
|
||||||
@@ -125,7 +140,7 @@ def main() -> int:
|
|||||||
"--sleep-seconds",
|
"--sleep-seconds",
|
||||||
str(max(0.0, args.sleep_seconds)),
|
str(max(0.0, args.sleep_seconds)),
|
||||||
]
|
]
|
||||||
run_cmd(analyze_cmd, "步骤2/4 分批AI分析")
|
run_cmd(analyze_cmd, "步骤2/6 分批AI分析")
|
||||||
|
|
||||||
keep_cmd = [
|
keep_cmd = [
|
||||||
args.python,
|
args.python,
|
||||||
@@ -135,7 +150,7 @@ def main() -> int:
|
|||||||
"--output-report",
|
"--output-report",
|
||||||
args.keep_report,
|
args.keep_report,
|
||||||
]
|
]
|
||||||
run_cmd(keep_cmd, "步骤3/4 生成保留关注报告")
|
run_cmd(keep_cmd, "步骤3/6 生成保留关注报告")
|
||||||
|
|
||||||
uid_cmd = [
|
uid_cmd = [
|
||||||
args.python,
|
args.python,
|
||||||
@@ -149,7 +164,29 @@ def main() -> int:
|
|||||||
"--split-size",
|
"--split-size",
|
||||||
str(max(0, args.split_size)),
|
str(max(0, args.split_size)),
|
||||||
]
|
]
|
||||||
run_cmd(uid_cmd, "步骤4/4 生成取关UID列表")
|
run_cmd(uid_cmd, "步骤4/6 生成取关UID列表")
|
||||||
|
|
||||||
|
if not args.skip_sort:
|
||||||
|
sort_cmd = [
|
||||||
|
args.python,
|
||||||
|
"source/sort_up_main.py",
|
||||||
|
"--input",
|
||||||
|
args.analysis_report,
|
||||||
|
"--output",
|
||||||
|
args.analysis_report,
|
||||||
|
]
|
||||||
|
run_cmd(sort_cmd, "步骤5/6 按首字母排序")
|
||||||
|
|
||||||
|
if not args.skip_group:
|
||||||
|
group_cmd = [
|
||||||
|
args.python,
|
||||||
|
"source/extract_group_info.py",
|
||||||
|
"--input",
|
||||||
|
args.analysis_report,
|
||||||
|
"--output",
|
||||||
|
args.analysis_report,
|
||||||
|
]
|
||||||
|
run_cmd(group_cmd, "步骤6/6 提取分组信息")
|
||||||
|
|
||||||
print("\n流水线完成。")
|
print("\n流水线完成。")
|
||||||
print(f"- 标题报告: {args.titles_report}")
|
print(f"- 标题报告: {args.titles_report}")
|
||||||
|
|||||||
Reference in New Issue
Block a user