恢复所有项目文件
This commit is contained in:
76
source/.gitignore
vendored
Normal file
76
source/.gitignore
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
# 1. 忽略操作系统自动生成的文件
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
*.lnk
|
||||
|
||||
# 2. 忽略编译/构建产物
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# 3. 忽略IDE配置
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# 4. 忽略日志文件和临时文件(注意:下面的 *.md 和 *.* 规则范围很宽,*.* 会匹配所有文件名中带点号的文件,请确认是否为本意)
|
||||
*.log
|
||||
*.tmp
|
||||
*.temp
|
||||
*.md
|
||||
*.*
|
||||
|
||||
# 5. 忽略敏感数据
|
||||
*.env
|
||||
*.key
|
||||
*.pem
|
||||
*.cert
|
||||
config.yaml
|
||||
secrets/
|
||||
|
||||
# 6. 忽略大型媒体文件
|
||||
*.mp4
|
||||
*.mov
|
||||
*.avi
|
||||
*.wav
|
||||
*.mp3
|
||||
*.zip
|
||||
*.tar
|
||||
*.gz
|
||||
*.7z
|
||||
*.rar
|
||||
|
||||
# 7. 忽略数据分析/机器学习特有
|
||||
*.model
|
||||
*.h5
|
||||
*.pkl
|
||||
*.joblib
|
||||
.ipynb_checkpoints/
|
||||
|
||||
# 8. 忽略你项目中的自动生成目录
|
||||
# 根据你的目录结构,忽略source/output/和source/reports/下的所有文件(注意:含斜杠的规则相对于本.gitignore所在目录解析;若本文件位于source/目录下,下面的规则可能需要改为 output/**/* 等形式,请核实)
|
||||
# 但保留目录结构本身(可以添加空的.gitkeep文件来保持空目录)
|
||||
source/output/**/*
|
||||
!source/output/.gitkeep
|
||||
source/reports/**/*
|
||||
!source/reports/.gitkeep
|
||||
1507
source/.note/20_07_group_only.md
Normal file
1507
source/.note/20_07_group_only.md
Normal file
File diff suppressed because it is too large
Load Diff
2
source/.note/note.md
Normal file
2
source/.note/note.md
Normal file
@@ -0,0 +1,2 @@
|
||||
不要让agent直接处理内容
|
||||
让agent编写脚本,自行测试,不然额度会不够
|
||||
16075
source/.test_output/17_46.md
Normal file
16075
source/.test_output/17_46.md
Normal file
File diff suppressed because it is too large
Load Diff
14919
source/.test_output/18_04.md
Normal file
14919
source/.test_output/18_04.md
Normal file
File diff suppressed because it is too large
Load Diff
14873
source/.test_output/18_12.md
Normal file
14873
source/.test_output/18_12.md
Normal file
File diff suppressed because it is too large
Load Diff
14871
source/.test_output/19_02.md
Normal file
14871
source/.test_output/19_02.md
Normal file
File diff suppressed because it is too large
Load Diff
16455
source/.test_output/19_06_all.md
Normal file
16455
source/.test_output/19_06_all.md
Normal file
File diff suppressed because it is too large
Load Diff
4585
source/.test_output/19_30_keep_follow.md
Normal file
4585
source/.test_output/19_30_keep_follow.md
Normal file
File diff suppressed because it is too large
Load Diff
4572
source/.test_output/19_39_sorted.md
Normal file
4572
source/.test_output/19_39_sorted.md
Normal file
File diff suppressed because it is too large
Load Diff
7394
source/.test_output/19_45_no_titles.md
Normal file
7394
source/.test_output/19_45_no_titles.md
Normal file
File diff suppressed because it is too large
Load Diff
4572
source/.test_output/19_53_no_titles.md
Normal file
4572
source/.test_output/19_53_no_titles.md
Normal file
File diff suppressed because it is too large
Load Diff
1507
source/.test_output/20_07_group_only.md
Normal file
1507
source/.test_output/20_07_group_only.md
Normal file
File diff suppressed because it is too large
Load Diff
1
source/.test_output/all_i_need/export_uids.json
Normal file
1
source/.test_output/all_i_need/export_uids.json
Normal file
File diff suppressed because one or more lines are too long
1
source/.test_output/all_i_need/export_uids.txt
Normal file
1
source/.test_output/all_i_need/export_uids.txt
Normal file
File diff suppressed because one or more lines are too long
1
source/.test_output/all_i_need/unfollow_mids_list.txt
Normal file
1
source/.test_output/all_i_need/unfollow_mids_list.txt
Normal file
@@ -0,0 +1 @@
|
||||
1044673687,1481344732,1858861103,444728505,23947287,35807625,111714204,1587138171,440798355,33291981,11914415,436700803,3493282273299102,612593877,2125857107,2000819931,507448807,505935166,14524124,385200931,1769820463,1562896062,3493285194632125,3493264275540254,479424216,604710494,1016523517,1428318343,700817047,543931674,1590538073,1574721168,432752294,3494376355400290,1795221360,4848323,495224316,3493258518858434,379247856,32360194,381653678,274928598,475656605,365212208,3546378525477862,35339643,1747335,1263990139,3493263038220393,251642119,387412319,1212367465,589747109,1025542770,23770618,3494350482836026,54091976,599449178,1715594148,3493127266503448,1767282898,487505057,630874464,1264711195,3537118481615036,319358609,518742534,385172962,4401694,474803476,525382468,3546595513600180,295993972,476819048,21435789,1725223092,2114928296,174471602,1480366563,17095888,295100453,1305776725,25694274,14797570,166828,385126080,3461582166166488,3537120815745590,489302782,73674032,1500074803,68134500,1047158092,3546571071293861,124806013,26055664,441631812,243680430,601300995,108526737,2100151539,3546603229023143,1749224369,3493133887211865,56300844,255139870,23244398,3493291869866324,3494354444355822,3546593938639500,1098004826,94577838,21849780,35105301,423319981,535023713,224560702,3546637651675315,3494361759221832,1640934198,1710911403,14342271,2031277323,603430640,3546568640694467,1741962246,1304346514,283389925,3461575868418125,3546622413768823,3494364269513335,185549749,502539494,73528331,510767506,3461579156752681,238171381,3546627212052911,448165099,1975692083,542824499,16243913,3494354016537425,316627722,1944667205,1433031509,3546387566299549,496787581,3546643550963789,382423121,600428973,430426421,325848853,735958,35162124,668794433,3546390949005555,478548163,3546672034482563,250584301,485234598,1555665460,6776617,108709998,437840703,28378491,67079745,1606682745,629101318,452161580,3493089637305282,374377163,213845897,323713206,272107494,622986240,1773278179,3
546656899336980,67141499,318331,285027361,114366178,203983793,1283676771,1965933018,470624011,3546583482239276,3493281239402498,1475977561,2016676980,1209319826,1335124945,416206486,129860965,1780480185,1809567655,245645656,1937416537,1060544882,1335713025,3546617688886097,3546752326044595,3546613148551357,652060948,2116071253,97407861,3546908731639909,3546693165386233,278761367,323588182,486989780,3494353494345852,96609715,264869770,478849208,1679822121,19414347,3493127314737312,702915816,482867012,3546969421122388,3546590214097572,501642082,458165375,3546662484052067,481153145,1159873315,3546857594685834,1508100119,111900,1732848825,3546606469123022,106685726,490494088,1511660367
|
||||
1
source/.test_output/all_i_need/unfollow_mids_list_1.txt
Normal file
1
source/.test_output/all_i_need/unfollow_mids_list_1.txt
Normal file
@@ -0,0 +1 @@
|
||||
1044673687,1481344732,1858861103,444728505,23947287,35807625,111714204,1587138171,440798355,33291981,11914415,436700803,3493282273299102,612593877,2125857107,2000819931,507448807,505935166,14524124,385200931,1769820463,1562896062,3493285194632125,3493264275540254,479424216,604710494,1016523517,1428318343,700817047,543931674,1590538073,1574721168,432752294,3494376355400290,1795221360,4848323,495224316,3493258518858434,379247856,32360194,381653678,274928598,475656605,365212208,3546378525477862,35339643,1747335,1263990139,3493263038220393,251642119,387412319,1212367465,589747109,1025542770,23770618,3494350482836026,54091976,599449178,1715594148,3493127266503448,1767282898,487505057,630874464,1264711195,3537118481615036,319358609,518742534,385172962,4401694,474803476,525382468,3546595513600180,295993972,476819048,21435789,1725223092,2114928296,174471602,1480366563,17095888,295100453,1305776725,25694274,14797570,166828,385126080,3461582166166488,3537120815745590,489302782,73674032,1500074803,68134500,1047158092,3546571071293861,124806013,26055664,441631812,243680430,601300995,108526737
|
||||
1
source/.test_output/all_i_need/unfollow_mids_list_2.txt
Normal file
1
source/.test_output/all_i_need/unfollow_mids_list_2.txt
Normal file
@@ -0,0 +1 @@
|
||||
2100151539,3546603229023143,1749224369,3493133887211865,56300844,255139870,23244398,3493291869866324,3494354444355822,3546593938639500,1098004826,94577838,21849780,35105301,423319981,535023713,224560702,3546637651675315,3494361759221832,1640934198,1710911403,14342271,2031277323,603430640,3546568640694467,1741962246,1304346514,283389925,3461575868418125,3546622413768823,3494364269513335,185549749,502539494,73528331,510767506,3461579156752681,238171381,3546627212052911,448165099,1975692083,542824499,16243913,3494354016537425,316627722,1944667205,1433031509,3546387566299549,496787581,3546643550963789,382423121,600428973,430426421,325848853,735958,35162124,668794433,3546390949005555,478548163,3546672034482563,250584301,485234598,1555665460,6776617,108709998,437840703,28378491,67079745,1606682745,629101318,452161580,3493089637305282,374377163,213845897,323713206,272107494,622986240,1773278179,3546656899336980,67141499,318331,285027361,114366178,203983793,1283676771,1965933018,470624011,3546583482239276,3493281239402498,1475977561,2016676980,1209319826,1335124945,416206486,129860965,1780480185,1809567655,245645656,1937416537,1060544882,1335713025
|
||||
1
source/.test_output/all_i_need/unfollow_mids_list_3.txt
Normal file
1
source/.test_output/all_i_need/unfollow_mids_list_3.txt
Normal file
@@ -0,0 +1 @@
|
||||
3546617688886097,3546752326044595,3546613148551357,652060948,2116071253,97407861,3546908731639909,3546693165386233,278761367,323588182,486989780,3494353494345852,96609715,264869770,478849208,1679822121,19414347,3493127314737312,702915816,482867012,3546969421122388,3546590214097572,501642082,458165375,3546662484052067,481153145,1159873315,3546857594685834,1508100119,111900,1732848825,3546606469123022,106685726,490494088,1511660367
|
||||
16443
source/.test_output/all_i_need/up_analysis_full_auto copy.md
Normal file
16443
source/.test_output/all_i_need/up_analysis_full_auto copy.md
Normal file
File diff suppressed because it is too large
Load Diff
14835
source/.test_output/all_i_need/up_analysis_full_auto.md
Normal file
14835
source/.test_output/all_i_need/up_analysis_full_auto.md
Normal file
File diff suppressed because it is too large
Load Diff
126
source/.test_output/all_i_need/up_keep_follow_only.md
Normal file
126
source/.test_output/all_i_need/up_keep_follow_only.md
Normal file
@@ -0,0 +1,126 @@
|
||||
# 保留关注UP主分析与分组建议
|
||||
|
||||
- 生成时间: 2026-04-26 15:12:23
|
||||
- 来源文件: up_analysis_full_auto.md
|
||||
- 条目数: 88
|
||||
|
||||
## 1. AI光影社 (mid: 1957113893)
|
||||
|
||||
### AI分析
|
||||
|
||||
AI光影社专注于人工智能领域的最新进展与深度思考,涵盖GPT模型迭代、AI智能体协作、科学应用及社会影响等硬核内容。其标题体现对技术本质和行业趋势的剖析,兼具前沿性和批判性。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容聚焦AI技术原理与科学应用,关键词'模型''科学''技术'命中硬核知识分组规则,但缺乏编程实操细节,未达更高分组标准,建议保留。
|
||||
|
||||
## 2. DeeparchWorks (mid: 276483261)
|
||||
|
||||
### AI分析
|
||||
|
||||
DeeparchWorks专注于Linux内核深度技术解析,涵盖调度器、网络、文件系统等核心模块优化与问题修复。内容专业性强,每期结合具体补丁或新特性进行多层原理剖析,适合对系统底层有深入研究的开发者。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 标题均涉及Linux内核机制、性能优化与算法细节,命中'硬核知识'关键词(如调度器、零拷贝、BPF验证器),内容深度符合技术保留标准,但未达到每日必读的核心级别。
|
||||
|
||||
## 3. Ele实验室 (mid: 481434238)
|
||||
|
||||
### AI分析
|
||||
|
||||
Ele实验室专注于计算机科学领域的硬核知识科普,内容涵盖向量数据库、自然语言处理、概率模拟等技术深度解析,结合历史视角与实用工具开发,适合对计算机原理和前沿技术有深入兴趣的观众。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 标题涉及向量数据库、NLP、加密技术等高密度技术主题,符合硬核知识定位,内容优质且无娱乐化倾向,值得保留。
|
||||
|
||||
## 4. EnglishVista2023 (mid: 488218866)
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主EnglishVista2023专注于英语词汇学习,内容涵盖雅思词汇系列(如第137-143天)和主题词汇速记(如天气预报、打印机相关)。视频标题规律性强,以教育类硬核知识为主,无娱乐或营销内容,适合长期英语学习者。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容为系统性英语词汇教学,专业度中等且更新稳定,符合技能学习分组的定位,适合有明确学习需求的用户持续关注。
|
||||
|
||||
## 5. Git源宝 (mid: 38061207)
|
||||
|
||||
### AI分析
|
||||
|
||||
Git源宝专注于AI编程与科技热点,内容涵盖DeepSeek解析、AI编程教学、AIGC率优化等硬核技术,同时涉及AI应用娱乐化观察,整体偏向实用技能与前沿资讯混合。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 标题高频命中'AI编程''教学''技术解析'等硬核关键词,但部分内容偏向娱乐观察,未达每日必读或纯干货标准,建议保留核心知识价值内容。
|
||||
|
||||
## 6. Java面试突击-Mic (mid: 1031543543)
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主专注于Java面试技巧、职业发展建议及行业趋势分析,内容涵盖简历优化、面试问题解析、AI对程序员影响等,针对求职者和学习者提供实用指导。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 标题高频命中'面试'、'Java'、'技能'等关键词,内容聚焦求职技能提升,符合B组定位;虽非顶级硬核知识,但对学习者有持续价值,建议保留。
|
||||
|
||||
## 7. math也是柠檬精 (mid: 457983079)
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主专注于计算机科学和数学领域,内容涵盖408考试相关知识点、数据结构与算法、计算机组成原理等硬核技术主题。视频标题显示其内容具有高度专业性和学习价值,适合需要系统学习相关知识的观众。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容聚焦技能学习,标题如'补码的顶级理解'、'红黑树'等命中B组关键词,专业性强,适合长期学习参考。
|
||||
|
||||
## 8. NotOnlySuccess (mid: 3546647317448859)
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主NotOnlySuccess专注于算法与数据结构深度解析,涵盖红黑树、动态规划优化、数学思维与编程结合等硬核主题。内容专业性强,适合有算法基础的观众,但缺乏日常实用技巧或娱乐性内容,定位较为垂直。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 标题高频命中'红黑树''DP优化''数学思维'等硬核关键词,符合A组规则。内容质量高但受众狭窄,适合算法爱好者长期学习,故建议保留。
|
||||
|
||||
## 9. OK语文 (mid: 3493281113573620)
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主OK语文专注高考语文备考,内容覆盖阅读、作文、选择题等核心题型,提供实用技巧与素材。标题高频出现“高考”“满分”“神技”等词,专业性强且稳定聚焦应试提分。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容垂直高考语文,专业度高且持续输出解题技巧与素材,符合硬核知识定位,适合备考学生长期参考。
|
||||
|
||||
## 10. Python万能胶 (mid: 1577562848)
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主Python万能胶专注于机器学习、深度学习、强化学习等实战案例与流程讲解,内容涵盖特征工程、模型调参、Tensorflow应用等硬核技术,适合编程学习者系统跟进。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: AA_编程信息干货必留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 标题高频命中'机器学习''实战案例''特征工程'等编程干货关键词,内容垂直且实用,符合AA组核心规则,属于优质技术资源。
|
||||
|
||||
12492
source/.test_output/all_i_need/up_titles_report.md
Normal file
12492
source/.test_output/all_i_need/up_titles_report.md
Normal file
File diff suppressed because it is too large
Load Diff
1507
source/.test_output/group_only.md
Normal file
1507
source/.test_output/group_only.md
Normal file
File diff suppressed because it is too large
Load Diff
180
source/output/reports/1_up_titles_report.md
Normal file
180
source/output/reports/1_up_titles_report.md
Normal file
@@ -0,0 +1,180 @@
|
||||
# UP主内容分析报告
|
||||
|
||||
- 生成时间: 2026-04-26 22:10:14
|
||||
- 分析数量: 8
|
||||
|
||||
## 1. 考研英语马天艺老师 (mid: 1357612844)
|
||||
|
||||
- 主页: https://space.bilibili.com/1357612844/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 欢迎来到艺哥的四六级高分实战营 | 四级词汇part1
|
||||
- 四六级把我当场通过!全部高分!证书归我!
|
||||
- 27考研英语常见的4个误区,你中招了几个?
|
||||
- 考研单词背了就忘?不是你记性差,是用错了记忆方法1
|
||||
- 考研英语语法零基础,该从哪里入手?给你一套可落地的学习路径
|
||||
- 27 考研英语入门,先搞懂这 1 个核心问题,比盲目背单词更重要
|
||||
- Rural vs Royal怎么记才不会搞混?
|
||||
- 30秒教你记考研核心单词
|
||||
- Rural vs Royal怎么记才不会弄混?
|
||||
- 每日一词|integrity&responsibility考研写作核心词如何用对?
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 2. 我是蓝同学啊 (mid: 321583894)
|
||||
|
||||
- 主页: https://space.bilibili.com/321583894/video
|
||||
- 标签: 实力派
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 【22年8月更新】PyCharm专业版安装激活(macOS)
|
||||
- 【避坑】M1 Mac安装Matlab 2021的教程(有安装包)
|
||||
- 超详细的UG 12.0安装教程(帮你避坑)
|
||||
- 【避坑】Altium Designer 2021下载安装(永久激活)
|
||||
- Matlab 2019下载安装
|
||||
- 强烈推荐!Matlab 2020中文版下载安装!避坑
|
||||
- Matlab 2021下载安装(个人推荐)
|
||||
- 免费好用的录屏软件oCam,强烈推荐!
|
||||
- 最强工科软件Matlab 2022下载安装(中文版)
|
||||
- 【避坑】Visual Studio 2022下载安装教程
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 3. 中国食品报融媒体 (mid: 439478093)
|
||||
|
||||
- 主页: https://space.bilibili.com/439478093/video
|
||||
- 标签: 纪录片, 新闻
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 未获国家批准、照搬体外数据:麦角硫因还有哪些营销乱象?
|
||||
- 为什么连锁餐饮会青睐中央厨房?【走进食品第一线】
|
||||
- 香蕉,真的会灭绝吗?
|
||||
- 香蕉黑斑里是果蝇幼虫?营销号莫再造谣伤农!
|
||||
- 盗视频、伪科学、夸大宣传,某些鱼油营销有多猖狂?
|
||||
- 三省三厂、不打招呼,带你突击检查无骨鸡爪的生产一线!
|
||||
- 科技的尽头是:母猪的产后护理!
|
||||
- 光鲜带货背后的肮脏车间:直击网红毛肚生产乱象
|
||||
- 三十年的智商税,都在这袋“超能量子以太核聚变富氢负氘水”里了
|
||||
- 千万网红12秒直播连踩四个坑?带货话术雷区别乱蹦!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 4. 心理述 (mid: 1044673687)
|
||||
|
||||
- 主页: https://space.bilibili.com/1044673687/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 你会不会忽然的出现
|
||||
- 你会不会忽然的出现……
|
||||
- 你要披荆斩棘,越挫越勇!每日阅读提升自己
|
||||
- 超治愈的一段话。“我们穷极一生追求的幸福,不在过去,也不在末来。而是在当下眼中景,碗中餐,身边人”
|
||||
- 静下心来,用心去感受央视文案带来的思想“浪潮”。
|
||||
- “路虽远 行则将至 事虽难 做则可成”用心感受央视的每一篇文案
|
||||
- 关于爱情?央视文案给出了一个很好的标准!
|
||||
- 人民日报的金句,值得我们每一个人去用心感受!
|
||||
- 人民日报的金句,值得每一个人反覆去阅读
|
||||
- 央视的顶级文案,句句都充满了力量
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 5. 栗之from一直夫妇 (mid: 62224043)
|
||||
|
||||
- 主页: https://space.bilibili.com/62224043/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 【四六级】2 个月,如何拿高分? | 四六级规划 | 备考资料
|
||||
- 【100天经济学人精读计划】Day101 | 避孕产品要交税了 | 外刊逐句精讲+语法
|
||||
- 【翻译预测】六级翻译,五篇!
|
||||
- 【翻译预测】四级翻译,五篇!
|
||||
- 【考前急救】写作高级词汇,四六级、考研都能用
|
||||
- 【四六级翻译预测】社会热点,超全总结!
|
||||
- 【北大学姐】三个月,如何备考四六级? | 四六级规划 | 备考资料
|
||||
- 【考前抢分】四六级热点话题汇总,翻译、写作、阅读、听力通用
|
||||
- 【写作】高级感写作替换词,四六级、考研都需要
|
||||
- 【四六级翻译】最常考的那些词、短语、句型,帮你总结了!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 6. Java面试突击-Mic (mid: 1031543543)
|
||||
|
||||
- 主页: https://space.bilibili.com/1031543543/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- Java面试必备:3个润色个人经历的方法让你轻松拿Offer
|
||||
- Java又双叒叕已死啦?那Java还能不能学,要不要转AI?
|
||||
- 老码农都懂,新人却大概率会踩坑的问题!程序员为什么不能一次把代码写好?
|
||||
- 金三银四面试的众生相,简历改了100遍还是被拒?核心问题在这里
|
||||
- 4月面试,90% 的人面试Java岗位都栽在这5 个致命问题
|
||||
- 一个没人敢告诉你的真相:90%的计算机学生,死在了这条看不见的“斩杀线”上
|
||||
- 26年java求职,AI 真的能替代Java 程序员吗?
|
||||
- 4月Java面试你就这么做,做好这3个润色个人经历的方法让你轻松拿Offer!
|
||||
- 4月开始跳槽,面试押题:为什么一定要做限流?
|
||||
- 4月准备跳槽面试,你千万不能忽略的问题,Java程序员面试中最大的杀手!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 7. 我们都是社畜 (mid: 1481344732)
|
||||
|
||||
- 主页: https://space.bilibili.com/1481344732/video
|
||||
- 标签: 准备取关
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 我在等复活甲,你在等什么
|
||||
- 头一次上榜,等了6亿年
|
||||
- 一只小蜥蜴决定走出冷血时代
|
||||
- “致敬我们星球上所有存在过的生命” 【地球往事纯享版】
|
||||
- 格林:社会毒打都是狗给的
|
||||
- 天冷了,我可什么都干得出来!
|
||||
- 《 回家的诱惑 》
|
||||
- 年轻的时候建议多去外星看看
|
||||
- 什么球,脑洞那么大
|
||||
- 什么球,压力这么大!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
## 8. 黑毛羊驼 (mid: 475443398)
|
||||
|
||||
- 主页: https://space.bilibili.com/475443398/video
|
||||
- 标签: 准备取关
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 原始欧洲人居然是深色的皮肤?欧洲人的金发碧眼是怎么来的呢?【片羽寻灵05】
|
||||
- 为什么我们的耳朵比恐龙多了两坨肉?哺乳动物曾经能捕杀恐龙?【兽族时代】 #合弓纲篇05
|
||||
- 鲸的器官都是XXXXXL号的?为何须鲸能演化成百吨巨兽?【兽族时代】 #古兽篇19
|
||||
- 从鱼到人的演化给我们带来了什么?人类身体结构的bug源自哪里?
|
||||
- 棘龙新种出世!最早的硬骨鱼填补了“从鱼到人”的演化空白?【朝化析拾30】
|
||||
- 非洲巨鼠居然能帮人类排地雷?哪些动物能“替代”人类工作?
|
||||
- 马的起源地北美居然成了无马之地?老马啊!5600万年的老马【兽族时代】 #古兽篇18
|
||||
- 【网络热门神秘生物32】澳大利亚最诡异的水怪?本耶普的真面目到底是什么?
|
||||
- 霸王龙真有机会长到15米?700万年前人族就能直立行走了?【朝化析拾29】
|
||||
- 为什么人类痴迷于外星生物?或许好奇才是星际文明的驱动力!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
243
source/output/reports/2_up_analysis_full_auto.md
Normal file
243
source/output/reports/2_up_analysis_full_auto.md
Normal file
@@ -0,0 +1,243 @@
|
||||
# UP主内容分析报告(分批AI总结)
|
||||
|
||||
- 生成时间: 2026-04-26 22:10:28
|
||||
- 分析数量: 8
|
||||
- 处理说明: 自动连续模式完成: 成功7, 失败1, 处理批次数=1, 待分析总数=8
|
||||
|
||||
## 分组统计
|
||||
|
||||
- AAA_核心每日必读: 0
|
||||
- AA_编程信息干货必留: 0
|
||||
- A_硬核知识保留: 1
|
||||
- B_技能学习保留: 3
|
||||
- C_资讯快餐观察: 1
|
||||
- D_娱乐消遣可取关: 2
|
||||
- E_营销带货谨慎: 0
|
||||
- 保留关注: 2
|
||||
- 可以取关: 5
|
||||
|
||||
## 1. 考研英语马天艺老师 (mid: 1357612844)
|
||||
|
||||
- 主页: https://space.bilibili.com/1357612844/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 欢迎来到艺哥的四六级高分实战营 | 四级词汇part1
|
||||
- 四六级把我当场通过!全部高分!证书归我!
|
||||
- 27考研英语常见的4个误区,你中招了几个?
|
||||
- 考研单词背了就忘?不是你记性差,是用错了记忆方法1
|
||||
- 考研英语语法零基础,该从哪里入手?给你一套可落地的学习路径
|
||||
- 27 考研英语入门,先搞懂这 1 个核心问题,比盲目背单词更重要
|
||||
- Rural vs Royal怎么记才不会搞混?
|
||||
- 30秒教你记考研核心单词
|
||||
- Rural vs Royal怎么记才不会弄混?
|
||||
- 每日一词|integrity&responsibility考研写作核心词如何用对?
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主专注于考研英语和四六级考试技巧,内容涵盖词汇记忆、语法学习、常见误区解析等实用技能。视频标题多为具体学习方法与高频考点,适合备考学生系统学习,但缺乏更广泛的知识拓展或深度内容。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容高度匹配技能学习主题,如词汇记忆、语法路径等关键词命中B组规则。作为备考辅助资源实用性强,但未达到核心必读标准,故建议保留而非取关。
|
||||
|
||||
## 2. 我是蓝同学啊 (mid: 321583894)
|
||||
|
||||
- 主页: https://space.bilibili.com/321583894/video
|
||||
- 标签: 实力派
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 【22年8月更新】PyCharm专业版安装激活(macOS)
|
||||
- 【避坑】M1 Mac安装Matlab 2021的教程(有安装包)
|
||||
- 超详细的UG 12.0安装教程(帮你避坑)
|
||||
- 【避坑】Altium Designer 2021下载安装(永久激活)
|
||||
- Matlab 2019下载安装
|
||||
- 强烈推荐!Matlab 2020中文版下载安装!避坑
|
||||
- Matlab 2021下载安装(个人推荐)
|
||||
- 免费好用的录屏软件oCam,强烈推荐!
|
||||
- 最强工科软件Matlab 2022下载安装(中文版)
|
||||
- 【避坑】Visual Studio 2022下载安装教程
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主'我是蓝同学啊'专注于工科软件安装教程,内容涵盖PyCharm、Matlab、UG、Altium Designer等专业工具,强调'避坑'和'永久激活',实用性较强但缺乏原创技术深度,整体偏向基础操作指导。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容虽涉及编程工具,但以安装教程为主,属于基础技能学习范畴,未达到'编程干货'的核心标准;建议取关以聚焦更高阶内容。
|
||||
|
||||
## 3. 中国食品报融媒体 (mid: 439478093)
|
||||
|
||||
- 主页: https://space.bilibili.com/439478093/video
|
||||
- 标签: 纪录片, 新闻
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 未获国家批准、照搬体外数据:麦角硫因还有哪些营销乱象?
|
||||
- 为什么连锁餐饮会青睐中央厨房?【走进食品第一线】
|
||||
- 香蕉,真的会灭绝吗?
|
||||
- 香蕉黑斑里是果蝇幼虫?营销号莫再造谣伤农!
|
||||
- 盗视频、伪科学、夸大宣传,某些鱼油营销有多猖狂?
|
||||
- 三省三厂、不打招呼,带你突击检查无骨鸡爪的生产一线!
|
||||
- 科技的尽头是:母猪的产后护理!
|
||||
- 光鲜带货背后的肮脏车间:直击网红毛肚生产乱象
|
||||
- 三十年的智商税,都在这袋“超能量子以太核聚变富氢负氘水”里了
|
||||
- 千万网红12秒直播连踩四个坑?带货话术雷区别乱蹦!
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主聚焦食品行业新闻与纪录片,内容涵盖食品安全、营销乱象、生产现场等,兼具科普与打假性质。标题多涉及营销陷阱、行业揭秘,但部分内容偏向揭露性新闻,整体偏向资讯观察与警示。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: C_资讯快餐观察
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容以食品行业资讯和揭露营销乱象为主,虽具科普性但偏向快餐式观察,未达到核心知识或硬核干货标准,建议取关以精简关注列表。
|
||||
|
||||
## 4. 心理述 (mid: 1044673687)
|
||||
|
||||
- 主页: https://space.bilibili.com/1044673687/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 你会不会忽然的出现
|
||||
- 你会不会忽然的出现……
|
||||
- 你要披荆斩棘,越挫越勇!每日阅读提升自己
|
||||
- 超治愈的一段话。“我们穷极一生追求的幸福,不在过去,也不在末来。而是在当下眼中景,碗中餐,身边人”
|
||||
- 静下心来,用心去感受央视文案带来的思想“浪潮”。
|
||||
- “路虽远 行则将至 事虽难 做则可成”用心感受央视的每一篇文案
|
||||
- 关于爱情?央视文案给出了一个很好的标准!
|
||||
- 人民日报的金句,值得我们每一个人去用心感受!
|
||||
- 人民日报的金句,值得每一个人反覆去阅读
|
||||
- 央视的顶级文案,句句都充满了力量
|
||||
|
||||
### AI分析
|
||||
|
||||
心理述UP主以分享央视、人民日报等官方媒体金句为主,内容聚焦励志、治愈和生活感悟,缺乏专业心理学知识或原创深度分析,更新频率较高但内容同质化明显。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: D_娱乐消遣可取关
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容为碎片化鸡汤文案,缺乏专业性和稳定性,属于低信息密度的情感消遣,不符合核心优质内容标准。
|
||||
|
||||
## 5. 栗之from一直夫妇 (mid: 62224043)
|
||||
|
||||
- 主页: https://space.bilibili.com/62224043/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 【四六级】2 个月,如何拿高分? | 四六级规划 | 备考资料
|
||||
- 【100天经济学人精读计划】Day101 | 避孕产品要交税了 | 外刊逐句精讲+语法
|
||||
- 【翻译预测】六级翻译,五篇!
|
||||
- 【翻译预测】四级翻译,五篇!
|
||||
- 【考前急救】写作高级词汇,四六级、考研都能用
|
||||
- 【四六级翻译预测】社会热点,超全总结!
|
||||
- 【北大学姐】三个月,如何备考四六级? | 四六级规划 | 备考资料
|
||||
- 【考前抢分】四六级热点话题汇总,翻译、写作、阅读、听力通用
|
||||
- 【写作】高级感写作替换词,四六级、考研都需要
|
||||
- 【四六级翻译】最常考的那些词、短语、句型,帮你总结了!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
### 分组建议
|
||||
|
||||
- (待分组)
|
||||
|
||||
### 异常
|
||||
|
||||
- AI返回未知group: B_技能学习保留(规则词:技能学习、备考、考试、翻译、写作)
|
||||
|
||||
## 6. Java面试突击-Mic (mid: 1031543543)
|
||||
|
||||
- 主页: https://space.bilibili.com/1031543543/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- Java面试必备:3个润色个人经历的方法让你轻松拿Offer
|
||||
- Java又双叒叕已死啦?那Java还能不能学,要不要转AI?
|
||||
- 老码农都懂,新人却大概率会踩坑的问题!程序员为什么不能一次把代码写好?
|
||||
- 金三银四面试的众生相,简历改了100遍还是被拒?核心问题在这里
|
||||
- 4月面试,90% 的人面试Java岗位都栽在这5 个致命问题
|
||||
- 一个没人敢告诉你的真相:90%的计算机学生,死在了这条看不见的“斩杀线”上
|
||||
- 26年java求职,AI 真的能替代Java 程序员吗?
|
||||
- 4月Java面试你就这么做,做好这3个润色个人经历的方法让你轻松拿Offer!
|
||||
- 4月开始跳槽,面试押题:为什么一定要做限流?
|
||||
- 4月准备跳槽面试,你千万不能忽略的问题,Java程序员面试中最大的杀手!
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主Mic专注于Java面试技巧与行业动态,内容涵盖简历润色、面试避坑、技术趋势分析等,针对求职季提供实用建议,但缺乏深度技术解析,偏向速成指导。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容以面试速成技能为主,命中'技能学习'规则词,但缺乏硬核知识深度,非核心优质UP,建议取关以聚焦更高价值内容。
|
||||
|
||||
## 7. 我们都是社畜 (mid: 1481344732)
|
||||
|
||||
- 主页: https://space.bilibili.com/1481344732/video
|
||||
- 标签: 准备取关
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 我在等复活甲,你在等什么
|
||||
- 头一次上榜,等了6亿年
|
||||
- 一只小蜥蜴决定走出冷血时代
|
||||
- “致敬我们星球上所有存在过的生命” 【地球往事纯享版】
|
||||
- 格林:社会毒打都是狗给的
|
||||
- 天冷了,我可什么都干得出来!
|
||||
- 《 回家的诱惑 》
|
||||
- 年轻的时候建议多去外星看看
|
||||
- 什么球,脑洞那么大
|
||||
- 什么球,压力这么大!
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主“我们都是社畜”以拟人化、幽默风格创作动物或科幻主题短内容,标题如“一只小蜥蜴决定走出冷血时代”充满娱乐脑洞,缺乏专业知识深度,内容稳定性低,偏向轻松消遣。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: D_娱乐消遣可取关
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 标题多为娱乐化表达,未涉及硬核知识或技能干货,符合D组‘娱乐消遣’特征;内容浅显且波动大,建议取关以聚焦核心优质资源。
|
||||
|
||||
## 8. 黑毛羊驼 (mid: 475443398)
|
||||
|
||||
- 主页: https://space.bilibili.com/475443398/video
|
||||
- 标签: 准备取关
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 原始欧洲人居然是深色的皮肤?欧洲人的金发碧眼是怎么来的呢?【片羽寻灵05】
|
||||
- 为什么我们的耳朵比恐龙多了两坨肉?哺乳动物曾经能捕杀恐龙?【兽族时代】 #合弓纲篇05
|
||||
- 鲸的器官都是XXXXXL号的?为何须鲸能演化成百吨巨兽?【兽族时代】 #古兽篇19
|
||||
- 从鱼到人的演化给我们带来了什么?人类身体结构的bug源自哪里?
|
||||
- 棘龙新种出世!最早的硬骨鱼填补了“从鱼到人”的演化空白?【朝化析拾30】
|
||||
- 非洲巨鼠居然能帮人类排地雷?哪些动物能“替代”人类工作?
|
||||
- 马的起源地北美居然成了无马之地?老马啊!5600万年的老马【兽族时代】 #古兽篇18
|
||||
- 【网络热门神秘生物32】澳大利亚最诡异的水怪?本耶普的真面目到底是什么?
|
||||
- 霸王龙真有机会长到15米?700万年前人族就能直立行走了?【朝化析拾29】
|
||||
- 为什么人类痴迷于外星生物?或许好奇才是星际文明的驱动力!
|
||||
|
||||
### AI分析
|
||||
|
||||
黑毛羊驼专注于古生物、人类演化和神秘生物等硬核科普内容,涵盖恐龙、哺乳动物、人类起源等专业领域,视频标题显示深度知识性和系统性,适合对古生物学和演化史感兴趣的观众。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容专业度高,主题稳定围绕古生物和演化知识,命中硬核知识保留规则,虽非每日更新但质量可靠。
|
||||
43
source/output/reports/3_up_keep_follow_only.md
Normal file
43
source/output/reports/3_up_keep_follow_only.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 保留关注UP主分析与分组建议
|
||||
|
||||
- 生成时间: 2026-04-26 22:10:28
|
||||
- 来源文件: 2_up_analysis_full_auto.md
|
||||
- 条目数: 3
|
||||
|
||||
## 1. 栗之from一直夫妇 (mid: 62224043)
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
### 分组建议
|
||||
|
||||
- (待分组)
|
||||
|
||||
### 异常
|
||||
|
||||
- AI返回未知group: B_技能学习保留(规则词:技能学习、备考、考试、翻译、写作)
|
||||
|
||||
## 2. 考研英语马天艺老师 (mid: 1357612844)
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主专注于考研英语和四六级考试技巧,内容涵盖词汇记忆、语法学习、常见误区解析等实用技能。视频标题多为具体学习方法与高频考点,适合备考学生系统学习,但缺乏更广泛的知识拓展或深度内容。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容高度匹配技能学习主题,如词汇记忆、语法路径等关键词命中B组规则。作为备考辅助资源实用性强,但未达到核心必读标准,故建议保留而非取关。
|
||||
|
||||
## 3. 黑毛羊驼 (mid: 475443398)
|
||||
|
||||
### AI分析
|
||||
|
||||
黑毛羊驼专注于古生物、人类演化和神秘生物等硬核科普内容,涵盖恐龙、哺乳动物、人类起源等专业领域,视频标题显示深度知识性和系统性,适合对古生物学和演化史感兴趣的观众。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容专业度高,主题稳定围绕古生物和演化知识,命中硬核知识保留规则,虽非每日更新但质量可靠。
|
||||
232
source/output/reports/5_sorted_up_analysis.md
Normal file
232
source/output/reports/5_sorted_up_analysis.md
Normal file
@@ -0,0 +1,232 @@
|
||||
# UP主内容分析报告(分批AI总结)
|
||||
|
||||
- 生成时间: 2026-04-26 22:10:28
|
||||
- 分析数量: 8
|
||||
- 处理说明: 自动连续模式完成: 成功7, 失败1, 处理批次数=1, 待分析总数=8
|
||||
|
||||
## 6. Java面试突击-Mic (mid: 1031543543)
|
||||
|
||||
- 主页: https://space.bilibili.com/1031543543/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- Java面试必备:3个润色个人经历的方法让你轻松拿Offer
|
||||
- Java又双叒叕已死啦?那Java还能不能学,要不要转AI?
|
||||
- 老码农都懂,新人却大概率会踩坑的问题!程序员为什么不能一次把代码写好?
|
||||
- 金三银四面试的众生相,简历改了100遍还是被拒?核心问题在这里
|
||||
- 4月面试,90% 的人面试Java岗位都栽在这5 个致命问题
|
||||
- 一个没人敢告诉你的真相:90%的计算机学生,死在了这条看不见的“斩杀线”上
|
||||
- 26年java求职,AI 真的能替代Java 程序员吗?
|
||||
- 4月Java面试你就这么做,做好这3个润色个人经历的方法让你轻松拿Offer!
|
||||
- 4月开始跳槽,面试押题:为什么一定要做限流?
|
||||
- 4月准备跳槽面试,你千万不能忽略的问题,Java程序员面试中最大的杀手!
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主Mic专注于Java面试技巧与行业动态,内容涵盖简历润色、面试避坑、技术趋势分析等,针对求职季提供实用建议,但缺乏深度技术解析,偏向速成指导。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容以面试速成技能为主,命中'技能学习'规则词,但缺乏硬核知识深度,非核心优质UP,建议取关以聚焦更高价值内容。
|
||||
|
||||
## 3. 中国食品报融媒体 (mid: 439478093)
|
||||
|
||||
- 主页: https://space.bilibili.com/439478093/video
|
||||
- 标签: 纪录片, 新闻
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 未获国家批准、照搬体外数据:麦角硫因还有哪些营销乱象?
|
||||
- 为什么连锁餐饮会青睐中央厨房?【走进食品第一线】
|
||||
- 香蕉,真的会灭绝吗?
|
||||
- 香蕉黑斑里是果蝇幼虫?营销号莫再造谣伤农!
|
||||
- 盗视频、伪科学、夸大宣传,某些鱼油营销有多猖狂?
|
||||
- 三省三厂、不打招呼,带你突击检查无骨鸡爪的生产一线!
|
||||
- 科技的尽头是:母猪的产后护理!
|
||||
- 光鲜带货背后的肮脏车间:直击网红毛肚生产乱象
|
||||
- 三十年的智商税,都在这袋“超能量子以太核聚变富氢负氘水”里了
|
||||
- 千万网红12秒直播连踩四个坑?带货话术雷区别乱蹦!
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主聚焦食品行业新闻与纪录片,内容涵盖食品安全、营销乱象、生产现场等,兼具科普与打假性质。标题多涉及营销陷阱、行业揭秘,但部分内容偏向揭露性新闻,整体偏向资讯观察与警示。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: C_资讯快餐观察
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容以食品行业资讯和揭露营销乱象为主,虽具科普性但偏向快餐式观察,未达到核心知识或硬核干货标准,建议取关以精简关注列表。
|
||||
|
||||
## 4. 心理述 (mid: 1044673687)
|
||||
|
||||
- 主页: https://space.bilibili.com/1044673687/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 你会不会忽然的出现
|
||||
- 你会不会忽然的出现……
|
||||
- 你要披荆斩棘,越挫越勇!每日阅读提升自己
|
||||
- 超治愈的一段话。“我们穷极一生追求的幸福,不在过去,也不在末来。而是在当下眼中景,碗中餐,身边人”
|
||||
- 静下心来,用心去感受央视文案带来的思想“浪潮”。
|
||||
- “路虽远 行则将至 事虽难 做则可成”用心感受央视的每一篇文案
|
||||
- 关于爱情?央视文案给出了一个很好的标准!
|
||||
- 人民日报的金句,值得我们每一个人去用心感受!
|
||||
- 人民日报的金句,值得每一个人反覆去阅读
|
||||
- 央视的顶级文案,句句都充满了力量
|
||||
|
||||
### AI分析
|
||||
|
||||
心理述UP主以分享央视、人民日报等官方媒体金句为主,内容聚焦励志、治愈和生活感悟,缺乏专业心理学知识或原创深度分析,更新频率较高但内容同质化明显。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: D_娱乐消遣可取关
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容为碎片化鸡汤文案,缺乏专业性和稳定性,属于低信息密度的情感消遣,不符合核心优质内容标准。
|
||||
|
||||
## 2. 我是蓝同学啊 (mid: 321583894)
|
||||
|
||||
- 主页: https://space.bilibili.com/321583894/video
|
||||
- 标签: 实力派
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 【22年8月更新】PyCharm专业版安装激活(macOS)
|
||||
- 【避坑】M1 Mac安装Matlab 2021的教程(有安装包)
|
||||
- 超详细的UG 12.0安装教程(帮你避坑)
|
||||
- 【避坑】Altium Designer 2021下载安装(永久激活)
|
||||
- Matlab 2019下载安装
|
||||
- 强烈推荐!Matlab 2020中文版下载安装!避坑
|
||||
- Matlab 2021下载安装(个人推荐)
|
||||
- 免费好用的录屏软件oCam,强烈推荐!
|
||||
- 最强工科软件Matlab 2022下载安装(中文版)
|
||||
- 【避坑】Visual Studio 2022下载安装教程
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主'我是蓝同学啊'专注于工科软件安装教程,内容涵盖PyCharm、Matlab、UG、Altium Designer等专业工具,强调'避坑'和'永久激活',实用性较强但缺乏原创技术深度,整体偏向基础操作指导。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容虽涉及编程工具,但以安装教程为主,属于基础技能学习范畴,未达到'编程干货'的核心标准;建议取关以聚焦更高阶内容。
|
||||
|
||||
## 7. 我们都是社畜 (mid: 1481344732)
|
||||
|
||||
- 主页: https://space.bilibili.com/1481344732/video
|
||||
- 标签: 准备取关
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 我在等复活甲,你在等什么
|
||||
- 头一次上榜,等了6亿年
|
||||
- 一只小蜥蜴决定走出冷血时代
|
||||
- “致敬我们星球上所有存在过的生命” 【地球往事纯享版】
|
||||
- 格林:社会毒打都是狗给的
|
||||
- 天冷了,我可什么都干得出来!
|
||||
- 《 回家的诱惑 》
|
||||
- 年轻的时候建议多去外星看看
|
||||
- 什么球,脑洞那么大
|
||||
- 什么球,压力这么大!
|
||||
|
||||
### AI分析
|
||||
|
||||
UP主“我们都是社畜”以拟人化、幽默风格创作动物或科幻主题短内容,标题如“一只小蜥蜴决定走出冷血时代”充满娱乐脑洞,缺乏专业知识深度,内容稳定性低,偏向轻松消遣。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: D_娱乐消遣可取关
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 标题多为娱乐化表达,未涉及硬核知识或技能干货,符合D组‘娱乐消遣’特征;内容浅显且波动大,建议取关以聚焦核心优质资源。
|
||||
|
||||
## 5. 栗之from一直夫妇 (mid: 62224043)
|
||||
|
||||
- 主页: https://space.bilibili.com/62224043/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 【四六级】2 个月,如何拿高分? | 四六级规划 | 备考资料
|
||||
- 【100天经济学人精读计划】Day101 | 避孕产品要交税了 | 外刊逐句精讲+语法
|
||||
- 【翻译预测】六级翻译,五篇!
|
||||
- 【翻译预测】四级翻译,五篇!
|
||||
- 【考前急救】写作高级词汇,四六级、考研都能用
|
||||
- 【四六级翻译预测】社会热点,超全总结!
|
||||
- 【北大学姐】三个月,如何备考四六级? | 四六级规划 | 备考资料
|
||||
- 【考前抢分】四六级热点话题汇总,翻译、写作、阅读、听力通用
|
||||
- 【写作】高级感写作替换词,四六级、考研都需要
|
||||
- 【四六级翻译】最常考的那些词、短语、句型,帮你总结了!
|
||||
|
||||
### AI分析
|
||||
|
||||
测试模式已跳过AI分析
|
||||
|
||||
### 分组建议
|
||||
|
||||
- (待分组)
|
||||
|
||||
### 异常
|
||||
|
||||
- AI返回未知group: B_技能学习保留(规则词:技能学习、备考、考试、翻译、写作)
|
||||
|
||||
## 1. 考研英语马天艺老师 (mid: 1357612844)
|
||||
|
||||
- 主页: https://space.bilibili.com/1357612844/video
|
||||
- 标签: 无
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 欢迎来到艺哥的四六级高分实战营 | 四级词汇part1
|
||||
- 四六级把我当场通过!全部高分!证书归我!
|
||||
- 27考研英语常见的4个误区,你中招了几个?
|
||||
- 考研单词背了就忘?不是你记性差,是用错了记忆方法1
|
||||
- 考研英语语法零基础,该从哪里入手?给你一套可落地的学习路径
|
||||
- 27 考研英语入门,先搞懂这 1 个核心问题,比盲目背单词更重要
|
||||
- Rural vs Royal怎么记才不会搞混?
|
||||
- 30秒教你记考研核心单词
|
||||
- Rural vs Royal怎么记才不会弄混?
|
||||
- 每日一词|integrity&responsibility考研写作核心词如何用对?
|
||||
|
||||
### AI分析
|
||||
|
||||
该UP主专注于考研英语和四六级考试技巧,内容涵盖词汇记忆、语法学习、常见误区解析等实用技能。视频标题多为具体学习方法与高频考点,适合备考学生系统学习,但缺乏更广泛的知识拓展或深度内容。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容高度匹配技能学习主题,如词汇记忆、语法路径等关键词命中B组规则。作为备考辅助资源实用性强,但未达到核心必读标准,故建议保留而非取关。
|
||||
|
||||
## 8. 黑毛羊驼 (mid: 475443398)
|
||||
|
||||
- 主页: https://space.bilibili.com/475443398/video
|
||||
- 标签: 准备取关
|
||||
|
||||
### 最近10条标题
|
||||
|
||||
- 原始欧洲人居然是深色的皮肤?欧洲人的金发碧眼是怎么来的呢?【片羽寻灵05】
|
||||
- 为什么我们的耳朵比恐龙多了两坨肉?哺乳动物曾经能捕杀恐龙?【兽族时代】 #合弓纲篇05
|
||||
- 鲸的器官都是XXXXXL号的?为何须鲸能演化成百吨巨兽?【兽族时代】 #古兽篇19
|
||||
- 从鱼到人的演化给我们带来了什么?人类身体结构的bug源自哪里?
|
||||
- 棘龙新种出世!最早的硬骨鱼填补了“从鱼到人”的演化空白?【朝化析拾30】
|
||||
- 非洲巨鼠居然能帮人类排地雷?哪些动物能“替代”人类工作?
|
||||
- 马的起源地北美居然成了无马之地?老马啊!5600万年的老马【兽族时代】 #古兽篇18
|
||||
- 【网络热门神秘生物32】澳大利亚最诡异的水怪?本耶普的真面目到底是什么?
|
||||
- 霸王龙真有机会长到15米?700万年前人族就能直立行走了?【朝化析拾29】
|
||||
- 为什么人类痴迷于外星生物?或许好奇才是星际文明的驱动力!
|
||||
|
||||
### AI分析
|
||||
|
||||
黑毛羊驼专注于古生物、人类演化和神秘生物等硬核科普内容,涵盖恐龙、哺乳动物、人类起源等专业领域,视频标题显示深度知识性和系统性,适合对古生物学和演化史感兴趣的观众。
|
||||
|
||||
### 分组建议
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容专业度高,主题稳定围绕古生物和演化知识,命中硬核知识保留规则,虽非每日更新但质量可靠。
|
||||
|
||||
45
source/output/reports/6_group_info.md
Normal file
45
source/output/reports/6_group_info.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# UP主内容分析报告(分批AI总结)
|
||||
|
||||
- 生成时间: 2026-04-26 22:10:28
|
||||
- 分析数量: 8
|
||||
- 处理说明: 自动连续模式完成: 成功7, 失败1, 处理批次数=1, 待分析总数=8
|
||||
|
||||
## 3. 中国食品报融媒体 (mid: 439478093)
|
||||
|
||||
- 预设分组: C_资讯快餐观察
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容以食品行业资讯和揭露营销乱象为主,虽具科普性但偏向快餐式观察,未达到核心知识或硬核干货标准,建议取关以精简关注列表。
|
||||
|
||||
## 4. 心理述 (mid: 1044673687)
|
||||
|
||||
- 预设分组: D_娱乐消遣可取关
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容为碎片化鸡汤文案,缺乏专业性和稳定性,属于低信息密度的情感消遣,不符合核心优质内容标准。
|
||||
|
||||
## 7. 我们都是社畜 (mid: 1481344732)
|
||||
|
||||
- 预设分组: D_娱乐消遣可取关
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 标题多为娱乐化表达,未涉及硬核知识或技能干货,符合D组‘娱乐消遣’特征;内容浅显且波动大,建议取关以聚焦核心优质资源。
|
||||
|
||||
## 2. 我是蓝同学啊 (mid: 321583894)
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 可以取关
|
||||
- 判断依据: 内容虽涉及编程工具,但以安装教程为主,属于基础技能学习范畴,未达到'编程干货'的核心标准;建议取关以聚焦更高阶内容。
|
||||
|
||||
## 5. 栗之from一直夫妇 (mid: 62224043)
|
||||
|
||||
- 异常: B_技能学习保留(规则词:技能学习、备考、考试、翻译、写作)
|
||||
|
||||
## 1. 考研英语马天艺老师 (mid: 1357612844)
|
||||
|
||||
- 预设分组: B_技能学习保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容高度匹配技能学习主题,如词汇记忆、语法路径等关键词命中B组规则。作为备考辅助资源实用性强,但未达到核心必读标准,故建议保留而非取关。
|
||||
|
||||
## 8. 黑毛羊驼 (mid: 475443398)
|
||||
|
||||
- 预设分组: A_硬核知识保留
|
||||
- 建议动作: 保留关注
|
||||
- 判断依据: 内容专业度高,主题稳定围绕古生物和演化知识,命中硬核知识保留规则,虽非每日更新但质量可靠。
|
||||
1
source/output/uids/4_unfollow_mids_list.txt
Normal file
1
source/output/uids/4_unfollow_mids_list.txt
Normal file
@@ -0,0 +1 @@
|
||||
321583894,439478093,1044673687,1031543543,1481344732
|
||||
1
source/output/uids/4_unfollow_mids_list_1.txt
Normal file
1
source/output/uids/4_unfollow_mids_list_1.txt
Normal file
@@ -0,0 +1 @@
|
||||
321583894,439478093,1044673687,1031543543,1481344732
|
||||
1
source/output/uids/unfollow_mids_list.txt
Normal file
1
source/output/uids/unfollow_mids_list.txt
Normal file
@@ -0,0 +1 @@
|
||||
1044673687,1481344732,1858861103,444728505,23947287,35807625,111714204,1587138171,440798355,33291981,11914415,436700803,3493282273299102,612593877,2125857107,2000819931,507448807,505935166,14524124,385200931,1769820463,1562896062,3493285194632125,3493264275540254,479424216,604710494,1016523517,1428318343,700817047,543931674,1590538073,1574721168,432752294,3494376355400290,1795221360,4848323,495224316,3493258518858434,379247856,32360194,381653678,274928598,475656605,365212208,3546378525477862,35339643,1747335,1263990139,3493263038220393,251642119,387412319,1212367465,589747109,1025542770,23770618,3494350482836026,54091976,599449178,1715594148,3493127266503448,1767282898,487505057,630874464,1264711195,3537118481615036,319358609,518742534,385172962,4401694,474803476,525382468,3546595513600180,295993972,476819048,21435789,1725223092,2114928296,174471602,1480366563,17095888,295100453,1305776725,25694274,14797570,166828,385126080,3461582166166488,3537120815745590,489302782,73674032,1500074803,68134500,1047158092,3546571071293861,124806013,26055664,441631812,243680430,601300995,108526737,2100151539,3546603229023143,1749224369,3493133887211865,56300844,255139870,23244398,3493291869866324,3494354444355822,3546593938639500,1098004826,94577838,21849780,35105301,423319981,535023713,224560702,3546637651675315,3494361759221832,1640934198,1710911403,14342271,2031277323,603430640,3546568640694467,1741962246,1304346514,283389925,3461575868418125,3546622413768823,3494364269513335,185549749,502539494,73528331,510767506,3461579156752681,238171381,3546627212052911,448165099,1975692083,542824499,16243913,3494354016537425,316627722,1944667205,1433031509,3546387566299549,496787581,3546643550963789,382423121,600428973,430426421,325848853,735958,35162124,668794433,3546390949005555,478548163,3546672034482563,250584301,485234598,1555665460,6776617,108709998,437840703,28378491,67079745,1606682745,629101318,452161580,3493089637305282,374377163,213845897,323713206,272107494,622986240,1773278179,3
546656899336980,67141499,318331,285027361,114366178,203983793,1283676771,1965933018,470624011,3546583482239276,3493281239402498,1475977561,2016676980,1209319826,1335124945,416206486,129860965,1780480185,1809567655,245645656,1937416537,1060544882,1335713025,3546617688886097,3546752326044595,3546613148551357,652060948,2116071253,97407861,3546908731639909,3546693165386233,278761367,323588182,486989780,3494353494345852,96609715,264869770,478849208,1679822121,19414347,3493127314737312,702915816,482867012,3546969421122388,3546590214097572,501642082,458165375,3546662484052067,481153145,1159873315,3546857594685834,1508100119,111900,1732848825,3546606469123022,106685726,490494088,1511660367
|
||||
1
source/output/uids/unfollow_mids_list_1.txt
Normal file
1
source/output/uids/unfollow_mids_list_1.txt
Normal file
@@ -0,0 +1 @@
|
||||
1044673687,1481344732,1858861103,444728505,23947287,35807625,111714204,1587138171,440798355,33291981,11914415,436700803,3493282273299102,612593877,2125857107,2000819931,507448807,505935166,14524124,385200931,1769820463,1562896062,3493285194632125,3493264275540254,479424216,604710494,1016523517,1428318343,700817047,543931674,1590538073,1574721168,432752294,3494376355400290,1795221360,4848323,495224316,3493258518858434,379247856,32360194,381653678,274928598,475656605,365212208,3546378525477862,35339643,1747335,1263990139,3493263038220393,251642119,387412319,1212367465,589747109,1025542770,23770618,3494350482836026,54091976,599449178,1715594148,3493127266503448,1767282898,487505057,630874464,1264711195,3537118481615036,319358609,518742534,385172962,4401694,474803476,525382468,3546595513600180,295993972,476819048,21435789,1725223092,2114928296,174471602,1480366563,17095888,295100453,1305776725,25694274,14797570,166828,385126080,3461582166166488,3537120815745590,489302782,73674032,1500074803,68134500,1047158092,3546571071293861,124806013,26055664,441631812,243680430,601300995,108526737
|
||||
1
source/output/uids/unfollow_mids_list_2.txt
Normal file
1
source/output/uids/unfollow_mids_list_2.txt
Normal file
@@ -0,0 +1 @@
|
||||
2100151539,3546603229023143,1749224369,3493133887211865,56300844,255139870,23244398,3493291869866324,3494354444355822,3546593938639500,1098004826,94577838,21849780,35105301,423319981,535023713,224560702,3546637651675315,3494361759221832,1640934198,1710911403,14342271,2031277323,603430640,3546568640694467,1741962246,1304346514,283389925,3461575868418125,3546622413768823,3494364269513335,185549749,502539494,73528331,510767506,3461579156752681,238171381,3546627212052911,448165099,1975692083,542824499,16243913,3494354016537425,316627722,1944667205,1433031509,3546387566299549,496787581,3546643550963789,382423121,600428973,430426421,325848853,735958,35162124,668794433,3546390949005555,478548163,3546672034482563,250584301,485234598,1555665460,6776617,108709998,437840703,28378491,67079745,1606682745,629101318,452161580,3493089637305282,374377163,213845897,323713206,272107494,622986240,1773278179,3546656899336980,67141499,318331,285027361,114366178,203983793,1283676771,1965933018,470624011,3546583482239276,3493281239402498,1475977561,2016676980,1209319826,1335124945,416206486,129860965,1780480185,1809567655,245645656,1937416537,1060544882,1335713025
|
||||
1
source/output/uids/unfollow_mids_list_3.txt
Normal file
1
source/output/uids/unfollow_mids_list_3.txt
Normal file
@@ -0,0 +1 @@
|
||||
3546617688886097,3546752326044595,3546613148551357,652060948,2116071253,97407861,3546908731639909,3546693165386233,278761367,323588182,486989780,3494353494345852,96609715,264869770,478849208,1679822121,19414347,3493127314737312,702915816,482867012,3546969421122388,3546590214097572,501642082,458165375,3546662484052067,481153145,1159873315,3546857594685834,1508100119,111900,1732848825,3546606469123022,106685726,490494088,1511660367
|
||||
1
source/resources/export_uids.json
Normal file
1
source/resources/export_uids.json
Normal file
File diff suppressed because one or more lines are too long
1
source/resources/export_uids.txt
Normal file
1
source/resources/export_uids.txt
Normal file
File diff suppressed because one or more lines are too long
1
source/resources/export_uids_test5.json
Normal file
1
source/resources/export_uids_test5.json
Normal file
@@ -0,0 +1 @@
|
||||
[{"mid":1357612844,"name":"考研英语马天艺老师","tag":[]},{"mid":321583894,"name":"我是蓝同学啊","tag":["实力派"]},{"mid":439478093,"name":"中国食品报融媒体","tag":["纪录片","新闻"]},{"mid":1044673687,"name":"心理述","tag":[]},{"mid":62224043,"name":"栗之from一直夫妇","tag":[]},{"mid":1031543543,"name":"Java面试突击-Mic","tag":[]},{"mid":1481344732,"name":"我们都是社畜","tag":["准备取关"]},{"mid":475443398,"name":"黑毛羊驼","tag":["准备取关"]}]
|
||||
1
source/resources/export_uids_test5.txt
Normal file
1
source/resources/export_uids_test5.txt
Normal file
@@ -0,0 +1 @@
|
||||
1357612844,321583894,439478093,1044673687,62224043
|
||||
690
source/scripts/analyze_up_content.py
Normal file
690
source/scripts/analyze_up_content.py
Normal file
@@ -0,0 +1,690 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fetch recent Bilibili video titles for UIDs and analyze with Volcengine API.
|
||||
|
||||
Input JSON format (list of objects):
|
||||
[
|
||||
{"mid": 12345, "name": "UP Name", "tag": ["准备取关"]}
|
||||
]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import html
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib import error, parse, request
|
||||
|
||||
|
||||
import os  # used only by this config block to read secrets from the environment

# Bilibili endpoints: legacy space search, WBI-signed space search, and the
# nav endpoint that exposes the WBI key material.
BILIBILI_API = "https://api.bilibili.com/x/space/arc/search"
BILIBILI_WBI_API = "https://api.bilibili.com/x/space/wbi/arc/search"
BILIBILI_NAV_API = "https://api.bilibili.com/x/web-interface/nav"

# Optional browser cookie string, useful when requests keep getting HTTP 412.
# SECURITY: a live session cookie used to be hard-coded here and committed to
# the repository. Secrets must not live in source control; supply the cookie
# through the BILIBILI_COOKIE environment variable or the --bili-cookie flag.
# The previously committed cookie should be considered leaked and revoked.
BILIBILI_COOKIE = os.environ.get("BILIBILI_COOKIE", "")
# Set by main() from --bili-cookie; takes precedence over BILIBILI_COOKIE.
RUNTIME_BILIBILI_COOKIE = ""
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
# Index permutation applied to img_key + sub_key to derive the WBI mixin key.
MIXIN_KEY_ENC_TAB = [
    46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35,
    27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13,
    37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4,
    22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52,
]

# Volcengine Ark configuration.
# SECURITY: the API key used to be hard-coded here; it is now read from the
# VOLCENGINE_API_KEY environment variable. The previously committed key should
# be rotated.
VOLCENGINE_API_KEY = os.environ.get("VOLCENGINE_API_KEY", "")
VOLCENGINE_MODEL = "deepseek-v3-1-terminus"
VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
|
||||
|
||||
|
||||
@dataclass
class UpItem:
    """One uploader (UP) entry loaded from the exported follow list."""

    # Numeric member id; used to build https://space.bilibili.com/<mid>/video.
    mid: int
    # Display name of the uploader.
    name: str
    # Tags attached to the uploader in the export (e.g. "准备取关").
    tag: list[str]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the title-fetch/analysis script.

    Options are declared in a table and registered in order, so ``--help``
    lists them in the same sequence as the table.

    Returns:
        The namespace produced by ``argparse`` from ``sys.argv``.
    """
    option_table: list[tuple[str, dict[str, Any]]] = [
        (
            "--input",
            {
                "default": "./source/resources/export_uids.json",
                "help": "输入 JSON 文件路径,默认: ./source/resources/export_uids.json",
            },
        ),
        (
            "--output",
            {
                "default": "./source/output/reports/up_titles_report.md",
                "help": "输出 Markdown 报告路径,默认: ./source/output/reports/up_titles_report.md",
            },
        ),
        (
            "--titles-per-up",
            {"type": int, "default": 10, "help": "每个 UP 抓取的视频标题数量,默认: 10"},
        ),
        (
            "--max-ups",
            {"type": int, "default": 0, "help": "最多处理多少个 UP,0 表示全部"},
        ),
        (
            "--only-tag",
            {"default": "", "help": "只处理包含该标签的 UP,例如: 准备取关;留空表示不过滤"},
        ),
        (
            "--sleep-seconds",
            {"type": float, "default": 0.8, "help": "每个 UP 抓取后的等待秒数,默认: 0.8"},
        ),
        (
            "--retry-times",
            {"type": int, "default": 3, "help": "抓取重试次数(遇到412/-799时),默认: 3"},
        ),
        (
            "--test-mid",
            {"type": int, "default": 0, "help": "测试模式:只抓取这个mid,不读取输入文件"},
        ),
        (
            "--test-name",
            {"default": "TEST_UP", "help": "测试模式下显示名称,默认: TEST_UP"},
        ),
        (
            "--skip-ai",
            {"action": "store_true", "help": "只测试抓取,不调用AI分析"},
        ),
        (
            "--debug",
            {"action": "store_true", "help": "输出抓取调试信息"},
        ),
        (
            "--bili-cookie",
            {"default": "", "help": "可选:运行时传入B站Cookie,优先级高于脚本内BILIBILI_COOKIE"},
        ),
        (
            "--fetch-mode",
            {
                "choices": ["auto", "api", "html"],
                "default": "auto",
                "help": "抓取模式: auto(先API后HTML)/api/html,默认: auto",
            },
        ),
        (
            "--analyze-from-report",
            {
                "default": "",
                "help": "从已有报告读取标题并仅执行AI分析,例如: source/up_analysis_report.md",
            },
        ),
        (
            "--batch-size",
            {"type": int, "default": 30, "help": "分批分析时每批数量,默认: 30"},
        ),
        (
            "--batch-index",
            {"type": int, "default": 1, "help": "分批分析批次序号(从1开始),默认: 1"},
        ),
    ]

    cli = argparse.ArgumentParser(
        description="抓取 UP 前10个视频标题,并调用火山引擎 API 生成分析报告"
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def parse_report_items(report_path: Path) -> list[dict[str, Any]]:
    """Parse a previously generated Markdown report back into item dicts.

    Each ``## N. <name> (mid: <id>)`` heading starts a new item. Inside an
    item, the home-page and tag bullets are read, and the ``###`` sections
    for titles, AI analysis and error are collected; any other ``###``
    section is ignored.

    Args:
        report_path: Path of the UTF-8 Markdown report to read.

    Returns:
        One dict per uploader with the keys ``mid``, ``name``, ``tag``,
        ``url``, ``titles``, ``analysis`` and ``error``.
    """
    heading_re = re.compile(r"^##\s+\d+\.\s+(.*?)\s+\(mid:\s*(\d+)\)")
    home_prefix = "- 主页: "
    tag_prefix = "- 标签: "
    # Recognised section headings; every other "### " heading maps to "".
    section_by_heading = {
        "### 最近10条标题": "titles",
        "### AI分析": "analysis",
        "### 异常": "error",
    }

    parsed: list[dict[str, Any]] = []
    entry: dict[str, Any] | None = None
    active = ""

    for raw_line in report_path.read_text(encoding="utf-8").splitlines():
        heading = heading_re.match(raw_line)
        if heading is not None:
            # A new uploader heading closes the previous entry.
            if entry is not None:
                parsed.append(entry)
            mid_value = int(heading.group(2))
            entry = {
                "mid": mid_value,
                "name": heading.group(1).strip(),
                "tag": [],
                "url": f"https://space.bilibili.com/{mid_value}/video",
                "titles": [],
                "analysis": "",
                "error": "",
            }
            active = ""
            continue

        if entry is None:
            # Report preamble before the first uploader heading.
            continue

        if raw_line.startswith(home_prefix):
            entry["url"] = raw_line[len(home_prefix):].strip()
            continue

        if raw_line.startswith(tag_prefix):
            tag_text = raw_line[len(tag_prefix):].strip()
            if tag_text in ("", "无"):
                entry["tag"] = []
            else:
                entry["tag"] = [part.strip() for part in tag_text.split(",") if part.strip()]
            continue

        if raw_line.startswith("### "):
            active = section_by_heading.get(raw_line, "")
            continue

        if active == "titles":
            if raw_line.startswith("- "):
                title = raw_line[2:].strip()
                # Skip the placeholder written when no titles were fetched.
                if title and title != "(未抓取到标题)":
                    entry["titles"].append(title)
        elif active == "analysis":
            text = raw_line.strip()
            if text:
                # Non-empty analysis lines are re-joined with newlines.
                if entry["analysis"]:
                    entry["analysis"] += "\n" + text
                else:
                    entry["analysis"] = text
        elif active == "error":
            if raw_line.startswith("- "):
                entry["error"] = raw_line[2:].strip()

    if entry is not None:
        parsed.append(entry)
    return parsed
|
||||
|
||||
|
||||
def run_batch_analysis_from_report(args: argparse.Namespace, output_path: Path) -> int:
    """Run AI analysis for one batch of uploaders taken from an existing report.

    Items that have titles but no real analysis yet are "pending". The batch
    selected by ``--batch-index`` / ``--batch-size`` is analysed and the full
    report (all items) is rewritten to ``output_path``.

    Args:
        args: Parsed CLI options; reads ``analyze_from_report``,
            ``batch_size``, ``batch_index`` and ``sleep_seconds``.
        output_path: Destination of the regenerated Markdown report.

    Returns:
        ``0`` on success (including "nothing to do"), ``1`` when the source
        report is missing or contains no parsable items.
    """
    report_path = Path(args.analyze_from_report)
    if not report_path.exists():
        print(f"报告文件不存在: {report_path}", file=sys.stderr)
        return 1

    items = parse_report_items(report_path)
    if not items:
        print("报告中未解析到可分析条目", file=sys.stderr)
        return 1

    def write_report() -> None:
        # Always create the parent directory first: the early-return paths
        # used to write without it and failed on a fresh output location.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(build_report(items), encoding="utf-8")

    pending = [
        it for it in items
        if it.get("titles") and (not it.get("analysis") or it.get("analysis") == "测试模式已跳过AI分析")
    ]
    if not pending:
        print("报告中没有待分析条目(可能已全部分析完成)")
        write_report()
        return 0

    batch_size = max(args.batch_size, 1)
    batch_index = max(args.batch_index, 1)
    start = (batch_index - 1) * batch_size
    batch = pending[start:start + batch_size]
    if not batch:
        print(f"批次为空: batch-index={batch_index}, batch-size={batch_size}, 待分析总数={len(pending)}")
        write_report()
        return 0

    print(
        f"开始分批分析: 第{batch_index}批, 每批{batch_size}条, "
        f"本批{len(batch)}条, 待分析总数{len(pending)}"
    )

    for idx, it in enumerate(batch, start=1):
        print(f"[batch {idx}/{len(batch)}] AI分析: {it['name']} ({it['mid']})")
        # ``it`` is the very dict stored in ``items``, so it is updated in
        # place. The former "mid::name" lookup table only ever reached the
        # last entry when the report contained duplicate uploaders.
        try:
            analysis = analyze_titles(it["name"], it["url"], it["titles"])
        except Exception as exc:  # noqa: BLE001 - failure is recorded in the report
            it["error"] = str(exc)
        else:
            it["analysis"] = analysis
            it["error"] = ""
        time.sleep(max(args.sleep_seconds, 0.0))

    write_report()
    print(f"分批分析报告已生成: {output_path}")
    return 0
|
||||
|
||||
|
||||
def load_up_items(input_path: Path) -> list[UpItem]:
    """Load uploader entries from the exported JSON file.

    Entries without a ``mid``, or whose ``mid`` cannot be converted to an
    int, are skipped. A non-string ``name`` is stringified and a non-list
    ``tag`` is replaced by an empty list.

    Args:
        input_path: Path of the UTF-8 JSON file holding a list of objects.

    Returns:
        The usable entries as ``UpItem`` instances, in file order.

    Raises:
        ValueError: If the top-level JSON value is not a list, or an element
            is not an object.
    """
    payload = json.loads(input_path.read_text(encoding="utf-8"))
    if not isinstance(payload, list):
        raise ValueError("输入 JSON 必须是数组")

    loaded: list[UpItem] = []
    for position, entry in enumerate(payload, start=1):
        if not isinstance(entry, dict):
            raise ValueError(f"第 {position} 项不是对象")

        raw_mid = entry.get("mid")
        if raw_mid is None:
            continue
        try:
            mid_value = int(raw_mid)
        except (TypeError, ValueError):
            continue

        raw_name = entry.get("name", "")
        display_name = raw_name if isinstance(raw_name, str) else str(raw_name)

        raw_tags = entry.get("tag", [])
        tag_list = [str(t) for t in raw_tags] if isinstance(raw_tags, list) else []

        loaded.append(UpItem(mid=mid_value, name=display_name.strip(), tag=tag_list))
    return loaded
|
||||
|
||||
|
||||
def http_get_json(
    url: str,
    timeout: float = 20.0,
    referer: str = "https://space.bilibili.com/",
) -> dict[str, Any]:
    """GET ``url`` with browser-like headers and decode the body as JSON.

    The cookie passed at runtime wins over the module-level one; when both
    are empty no ``Cookie`` header is sent.

    Args:
        url: Full request URL.
        timeout: Socket timeout in seconds handed to ``urlopen``.
        referer: Value of the ``Referer`` header.

    Returns:
        The parsed JSON document.
    """
    request_headers = {
        "User-Agent": DEFAULT_USER_AGENT,
        "Referer": referer,
        "Origin": "https://www.bilibili.com",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    }
    cookie_value = RUNTIME_BILIBILI_COOKIE.strip() or BILIBILI_COOKIE.strip()
    if cookie_value:
        request_headers["Cookie"] = cookie_value

    get_request = request.Request(url, headers=request_headers, method="GET")
    with request.urlopen(get_request, timeout=timeout) as response:
        raw_bytes = response.read()
    # Undecodable bytes are replaced rather than raising.
    return json.loads(raw_bytes.decode("utf-8", errors="replace"))
|
||||
|
||||
|
||||
def http_get_text(
    url: str,
    timeout: float = 20.0,
    referer: str = "https://space.bilibili.com/",
) -> str:
    """GET ``url`` with browser-like headers and return the body as text.

    The cookie passed at runtime wins over the module-level one; when both
    are empty no ``Cookie`` header is sent.

    Args:
        url: Full request URL.
        timeout: Socket timeout in seconds handed to ``urlopen``.
        referer: Value of the ``Referer`` header.

    Returns:
        The response body decoded as UTF-8, with undecodable bytes replaced.
    """
    request_headers = {
        "User-Agent": DEFAULT_USER_AGENT,
        "Referer": referer,
        "Origin": "https://www.bilibili.com",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    }
    cookie_value = RUNTIME_BILIBILI_COOKIE.strip() or BILIBILI_COOKIE.strip()
    if cookie_value:
        request_headers["Cookie"] = cookie_value

    get_request = request.Request(url, headers=request_headers, method="GET")
    with request.urlopen(get_request, timeout=timeout) as response:
        raw_bytes = response.read()
    return raw_bytes.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def get_mixin_key(img_key: str, sub_key: str) -> str:
    """Derive the WBI mixin key from the two key halves.

    The halves are concatenated, the characters are reordered through
    ``MIXIN_KEY_ENC_TAB`` and the first 32 characters are kept.

    Args:
        img_key: Key extracted from the nav ``img_url``.
        sub_key: Key extracted from the nav ``sub_url``.

    Returns:
        The 32-character mixin key.
    """
    combined = f"{img_key}{sub_key}"
    shuffled = [combined[position] for position in MIXIN_KEY_ENC_TAB]
    return "".join(shuffled)[:32]
|
||||
|
||||
|
||||
def build_wbi_params(base_params: dict[str, Any], mixin_key: str) -> dict[str, Any]:
    """Return ``base_params`` extended with the WBI ``wts`` and ``w_rid`` fields.

    Values are stringified, a ``wts`` timestamp (current epoch seconds) is
    added, keys are sorted, the characters ``!'()*`` are removed from every
    value, and ``w_rid`` is the MD5 hex digest of the URL-encoded query
    followed by ``mixin_key``.

    Args:
        base_params: Query parameters to sign.
        mixin_key: Key produced by ``get_mixin_key``.

    Returns:
        A new dict in sorted key order with ``w_rid`` appended last.
    """
    unsafe_chars = re.compile(r"[!'()*]")

    stamped = {key: str(value) for key, value in base_params.items()}
    stamped["wts"] = str(int(time.time()))

    signed: dict[str, Any] = {}
    for key in sorted(stamped):
        signed[key] = unsafe_chars.sub("", stamped[key])

    # The digest covers the encoded query exactly as it will be sent.
    digest_input = parse.urlencode(signed) + mixin_key
    signed["w_rid"] = hashlib.md5(digest_input.encode("utf-8")).hexdigest()
    return signed
|
||||
|
||||
|
||||
def get_wbi_mixin_key() -> str:
    """Fetch the current WBI key material from the nav endpoint.

    The nav endpoint reports a non-zero ``code`` for anonymous sessions but
    still carries ``wbi_img`` in its payload, so the keys are used whenever
    they are present and the response code is consulted only to explain a
    missing payload.

    Returns:
        The mixin key derived from ``img_url`` and ``sub_url``.

    Raises:
        RuntimeError: If the response carries no usable ``img_url`` /
            ``sub_url`` pair.
    """
    data = http_get_json(BILIBILI_NAV_API, referer="https://www.bilibili.com/")

    # ``data`` may be null in error responses; treat that as "no keys".
    payload = data.get("data") or {}
    wbi_img = payload.get("wbi_img") or {} if isinstance(payload, dict) else {}
    img_url = wbi_img.get("img_url", "") if isinstance(wbi_img, dict) else ""
    sub_url = wbi_img.get("sub_url", "") if isinstance(wbi_img, dict) else ""

    if not img_url or not sub_url:
        if data.get("code") != 0:
            raise RuntimeError(
                f"获取wbi密钥失败 code={data.get('code')}, message={data.get('message')}"
            )
        raise RuntimeError("获取wbi密钥失败: nav接口缺少img_url/sub_url")

    # The key is the file name of each URL without its extension.
    img_key = img_url.rsplit("/", 1)[-1].split(".")[0]
    sub_key = sub_url.rsplit("/", 1)[-1].split(".")[0]
    return get_mixin_key(img_key, sub_key)
|
||||
|
||||
|
||||
def parse_titles_from_data(data: dict[str, Any]) -> list[str]:
    """Extract the video titles from a space-search API response.

    Reads ``data.list.vlist`` and keeps the non-blank string ``title`` of
    every object in it, with markup removed by ``clean_html``. Missing or
    null intermediate levels yield an empty list instead of raising.

    Args:
        data: Decoded JSON response of the search endpoint.

    Returns:
        The cleaned titles in response order; empty if none are present.
    """
    # Error payloads may carry "data": null or "list": null, which
    # ``.get(key, {})`` does not protect against.
    payload = data.get("data")
    listing = payload.get("list") if isinstance(payload, dict) else None
    vlist = listing.get("vlist") if isinstance(listing, dict) else None
    if not isinstance(vlist, list):
        return []

    titles: list[str] = []
    for item in vlist:
        if not isinstance(item, dict):
            continue
        title = item.get("title", "")
        if isinstance(title, str) and title.strip():
            titles.append(clean_html(title.strip()))
    return titles
|
||||
|
||||
|
||||
def fetch_titles_from_space_html(mid: int, titles_per_up: int, debug: bool = False) -> list[str]:
    """Scrape video titles from the uploader's space page HTML.

    Titles are taken from the ``alt`` attribute of ``<img>`` tags whose
    class contains ``b-img__inner``. Duplicates are dropped and collection
    stops once ``titles_per_up`` titles have been gathered.

    Args:
        mid: Uploader member id.
        titles_per_up: Number of titles at which collection stops.
        debug: Print how many titles were extracted.

    Returns:
        The de-duplicated titles in page order.
    """
    page = http_get_text(
        f"https://space.bilibili.com/{mid}/video",
        referer="https://www.bilibili.com/",
    )

    # Cover images usually carry the video title in their alt attribute.
    cover_alt_re = re.compile(
        r'<img[^>]*class="[^"]*b-img__inner[^"]*"[^>]*alt="([^"]+)"',
        re.IGNORECASE,
    )

    collected: list[str] = []
    already_seen: set[str] = set()
    for found in cover_alt_re.finditer(page):
        candidate = clean_html(html.unescape(found.group(1))).strip()
        if candidate and candidate not in already_seen:
            already_seen.add(candidate)
            collected.append(candidate)
            if len(collected) >= titles_per_up:
                break

    if debug:
        print(f"[debug] HTML模式提取到 {len(collected)} 条标题")
    return collected
|
||||
|
||||
|
||||
def fetch_titles(
    mid: int,
    titles_per_up: int,
    retry_times: int = 3,
    debug: bool = False,
    fetch_mode: str = "auto",
) -> list[str]:
    """Fetch the most recent video titles of one uploader.

    In ``api`` / ``auto`` mode the search API is queried (WBI-signed when a
    mixin key could be obtained, unsigned otherwise) with randomized
    backoff between attempts. In ``html`` / ``auto`` mode the space page is
    scraped as a fallback.

    Args:
        mid: Uploader member id.
        titles_per_up: Page size requested from the API and cap for HTML mode.
        retry_times: Number of API attempts; values below 1 count as 1.
        debug: Print diagnostic messages.
        fetch_mode: ``"auto"``, ``"api"`` or ``"html"``.

    Returns:
        A non-empty list of titles.

    Raises:
        RuntimeError: If no strategy produced titles; the message joins the
            last three recorded errors and adds a cookie hint for 412/-799.
    """
    base_params = {
        "mid": str(mid),
        "pn": "1",
        "ps": str(titles_per_up),
        "order": "pubdate",
        "index": "1",
        "jsonp": "json",
    }

    errors: list[str] = []
    if fetch_mode in ("auto", "api"):
        # Prefer the WBI-signed endpoint; it is usually more stable.
        mixin_key = ""
        try:
            mixin_key = get_wbi_mixin_key()
        except Exception as exc:  # noqa: BLE001 - fall back to the unsigned API
            if debug:
                print(f"[debug] 获取wbi密钥失败: {exc}")

        total_attempts = max(retry_times, 1)
        for attempt in range(1, total_attempts + 1):
            try:
                if mixin_key:
                    signed = build_wbi_params(base_params, mixin_key)
                    url = f"{BILIBILI_WBI_API}?{parse.urlencode(signed)}"
                else:
                    url = f"{BILIBILI_API}?{parse.urlencode(base_params)}"
                data = http_get_json(url, referer=f"https://space.bilibili.com/{mid}/video")
                code = data.get("code", -1)
                if code == 0:
                    titles = parse_titles_from_data(data)
                    if titles:
                        return titles
                    errors.append("接口返回成功但标题为空")
                else:
                    errors.append(f"code={code}, message={data.get('message', 'unknown')} ")
            except error.HTTPError as exc:
                errors.append(f"HTTP {exc.code} {exc.reason}")
            except Exception as exc:  # noqa: BLE001 - recorded and retried
                errors.append(str(exc))

            if attempt >= total_attempts:
                # No retry follows the last attempt, so do not waste a full
                # backoff delay (up to 12s per uploader) before moving on.
                if debug:
                    print(f"[debug] mid={mid} API第{attempt}次失败: {errors[-1]}")
                break

            sleep_for = min(12.0, (1.8 ** attempt) + random.uniform(0.2, 1.0))
            if debug:
                print(f"[debug] mid={mid} API第{attempt}次失败: {errors[-1]},{sleep_for:.1f}s后重试")
            time.sleep(sleep_for)

    if fetch_mode in ("auto", "html"):
        try:
            html_titles = fetch_titles_from_space_html(mid, titles_per_up, debug=debug)
            if html_titles:
                return html_titles
            errors.append("HTML模式未提取到标题")
        except Exception as exc:  # noqa: BLE001 - reported through RuntimeError below
            errors.append(f"HTML模式失败: {exc}")

    if not errors:
        # Neither strategy ran: the caller passed an unknown fetch_mode.
        errors.append(f"不支持的抓取模式: {fetch_mode}")

    joined = "; ".join(errors[-3:])
    if ("412" in joined) or ("-799" in joined):
        hint = "提示: 请在脚本里填写BILIBILI_COOKIE,或运行时加 --bili-cookie \"SESSDATA=...; buvid3=...\""
        raise RuntimeError(f"{joined}; {hint}")
    raise RuntimeError(joined)
|
||||
|
||||
|
||||
def clean_html(text: str) -> str:
    """Return ``text`` with every ``<...>`` tag-like span removed.

    Only the markup is dropped; the text between tags is kept and HTML
    entities are left untouched.
    """
    tag_pattern = re.compile(r"<[^>]+>")
    return tag_pattern.sub("", text)
|
||||
|
||||
|
||||
def call_volcengine_chat(system_prompt: str, user_prompt: str) -> str:
    """Send one chat-completion request to Volcengine Ark and return the reply.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.

    Returns:
        The stripped text of the first choice.

    Raises:
        RuntimeError: If the API key, model or base URL is not configured,
            or the response holds no non-empty text in the first choice.
    """
    api_key = VOLCENGINE_API_KEY.strip()
    base_url = VOLCENGINE_BASE_URL.strip()
    model = VOLCENGINE_MODEL.strip()

    # "在这里填" is the placeholder text of an unconfigured template value.
    if (not api_key) or ("在这里填" in api_key):
        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_API_KEY")
    if (not model) or ("在这里填" in model):
        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_MODEL")
    if not base_url:
        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_BASE_URL")

    url = f"{base_url.rstrip('/')}/chat/completions"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.4,
    }
    data = json.dumps(payload, ensure_ascii=False).encode("utf-8")

    req = request.Request(
        url,
        data=data,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )

    with request.urlopen(req, timeout=60) as resp:
        body = resp.read().decode("utf-8", errors="replace")

    result = json.loads(body)
    # Walk the response defensively: an empty ``choices`` list or a null
    # ``message`` must surface as the structured error below rather than
    # as a bare IndexError/AttributeError.
    choices = result.get("choices") if isinstance(result, dict) else None
    first = choices[0] if isinstance(choices, list) and choices else None
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content", "") if isinstance(message, dict) else ""
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError(f"火山引擎返回结构异常: {body[:500]}")
    return content.strip()
|
||||
|
||||
|
||||
def analyze_titles(up_name: str, up_url: str, titles: list[str]) -> str:
    """Ask the chat model to characterise an uploader from recent titles.

    Args:
        up_name: Display name of the uploader.
        up_url: Home-page URL included in the prompt.
        titles: Recent video titles, rendered as a bullet list.

    Returns:
        The model's reply as returned by ``call_volcengine_chat``.
    """
    system_prompt = (
        "你是一个内容分析助手。根据视频标题判断UP主内容方向,并给出是否建议取关。"
        "输出必须是简体中文,且严格按照用户给定的Markdown格式。"
    )
    title_bullets = "\n".join(f"- {t}" for t in titles)
    # NOTE(review): the prompt lines are assumed to start at column 0 in the
    # original triple-quoted string; the flattened source cannot confirm it.
    prompt_lines = [
        "请分析以下UP主最近视频标题:",
        "",
        f"UP主:{up_name}",
        f"主页:{up_url}",
        "标题:",
        title_bullets,
        "",
        "请按以下格式输出(不要增加其它段落):",
        "1) 内容定位:一句话",
        "2) 受众画像:一句话",
        "3) 近期内容倾向:2-3点,使用-开头",
        "4) 质量评价:80-120字",
        "5) 取关建议:保留关注/可以取关(二选一)",
        "6) 建议理由:50-100字",
    ]
    user_prompt = "\n".join(prompt_lines).strip()
    return call_volcengine_chat(system_prompt, user_prompt)
|
||||
|
||||
|
||||
def build_report(results: list[dict[str, Any]]) -> str:
    """Render the analysis results as a Markdown report.

    The report starts with a header carrying the generation time and item
    count, followed by one numbered section per item: home page, tags,
    title list, and, when non-empty, the AI analysis and the error.

    Args:
        results: Item dicts with ``name``, ``mid`` and ``url``, and
            optionally ``tag``, ``titles``, ``analysis`` and ``error``.

    Returns:
        The Markdown text, ending with exactly one newline.
    """
    generated_at = time.strftime("%Y-%m-%d %H:%M:%S")
    out: list[str] = [
        "# UP主内容分析报告",
        "",
        f"- 生成时间: {generated_at}",
        f"- 分析数量: {len(results)}",
        "",
    ]

    for number, entry in enumerate(results, start=1):
        out += [
            f"## {number}. {entry['name']} (mid: {entry['mid']})",
            "",
            f"- 主页: {entry['url']}",
        ]
        tag_names = entry.get("tag", [])
        tag_text = ", ".join(tag_names) if tag_names else "无"
        out += [f"- 标签: {tag_text}", "", "### 最近10条标题", ""]

        title_list = entry.get("titles", [])
        if title_list:
            out.extend(f"- {t}" for t in title_list)
        else:
            # Placeholder recognised (and skipped) when a report is re-parsed.
            out.append("- (未抓取到标题)")
        out.append("")

        analysis_text = entry.get("analysis", "")
        if analysis_text:
            out += ["### AI分析", "", analysis_text, ""]

        error_text = entry.get("error", "")
        if error_text:
            out += ["### 异常", "", f"- {error_text}", ""]

    return "\n".join(out).rstrip() + "\n"
|
||||
|
||||
|
||||
def main() -> int:
    """Script entry point: fetch titles per uploader and write the report.

    With ``--analyze-from-report`` the work is delegated to
    ``run_batch_analysis_from_report``. Otherwise the uploaders come from
    ``--test-mid`` or from the input JSON, titles are fetched for each,
    optionally analysed, and a Markdown report is written.

    Returns:
        Process exit code: ``0`` on success, ``1`` when the input is missing,
        unreadable, or yields no uploaders.
    """
    global RUNTIME_BILIBILI_COOKIE

    args = parse_args()
    RUNTIME_BILIBILI_COOKIE = (args.bili_cookie or "").strip()
    source_file = Path(args.input)
    report_file = Path(args.output)

    if args.analyze_from_report:
        return run_batch_analysis_from_report(args, report_file)

    if args.test_mid > 0:
        ups = [UpItem(mid=args.test_mid, name=args.test_name, tag=["测试模式"])]
        print(f"测试模式: 仅处理 mid={args.test_mid}")
    else:
        if not source_file.exists():
            print(f"输入文件不存在: {source_file}", file=sys.stderr)
            return 1

        try:
            ups = load_up_items(source_file)
        except Exception as exc:  # noqa: BLE001 - reported and turned into exit code 1
            print(f"加载输入文件失败: {exc}", file=sys.stderr)
            return 1

        # NOTE(review): the flattened source does not show whether these two
        # filters sit inside this branch or after it; they are kept with the
        # file-input path here. Confirm against the original indentation.
        if args.only_tag:
            ups = [up for up in ups if args.only_tag in up.tag]

        if args.max_ups and args.max_ups > 0:
            ups = ups[: args.max_ups]

    if not ups:
        print("没有可处理的 UP 数据", file=sys.stderr)
        return 1

    total = len(ups)
    print(f"开始处理 {total} 个 UP...")
    if args.skip_ai:
        print("已启用 --skip-ai,仅测试抓取标题")
    if args.debug:
        print(f"[debug] 当前抓取模式: {args.fetch_mode}")

    rows: list[dict[str, Any]] = []
    for position, up in enumerate(ups, start=1):
        home_url = f"https://space.bilibili.com/{up.mid}/video"
        record: dict[str, Any] = {
            "mid": up.mid,
            "name": up.name or f"mid_{up.mid}",
            "tag": up.tag,
            "url": home_url,
            "titles": [],
            "analysis": "",
            "error": "",
        }

        print(f"[{position}/{total}] 抓取: {record['name']} ({up.mid})")
        try:
            fetched = fetch_titles(
                up.mid,
                args.titles_per_up,
                retry_times=args.retry_times,
                debug=args.debug,
                fetch_mode=args.fetch_mode,
            )
            record["titles"] = fetched
            if not fetched:
                record["error"] = "未抓取到标题,可能是接口限制或UP无公开视频"
            elif args.skip_ai:
                record["analysis"] = "测试模式已跳过AI分析"
            else:
                record["analysis"] = analyze_titles(record["name"], home_url, fetched)
        except error.HTTPError as exc:
            record["error"] = f"HTTP错误: {exc.code} {exc.reason}"
        except error.URLError as exc:
            record["error"] = f"网络错误: {exc.reason}"
        except Exception as exc:  # noqa: BLE001 - one failure must not stop the run
            record["error"] = str(exc)

        if args.debug and record["titles"]:
            preview = record["titles"][:3]
            print(f"[debug] mid={up.mid} 成功抓取 {len(record['titles'])} 条,样例: {preview}")

        rows.append(record)
        # Pause between uploaders; negative values are clamped to zero.
        time.sleep(max(args.sleep_seconds, 0))

    rendered = build_report(rows)
    report_file.parent.mkdir(parents=True, exist_ok=True)
    report_file.write_text(rendered, encoding="utf-8")
    print(f"报告已生成: {report_file}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
598
source/scripts/batch_ai_summary_from_report.py
Normal file
598
source/scripts/batch_ai_summary_from_report.py
Normal file
@@ -0,0 +1,598 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Batch AI summary from existing UP markdown report.
|
||||
|
||||
Read an existing report (e.g. source/up_analysis_report.md),
|
||||
extract each UP's title list, and generate AI summaries in batches.
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import argparse
import json
import math
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any
from urllib import request
|
||||
|
||||
# Volcengine Ark settings. Fill the literals below, or override any of them with
# an environment variable of the same name.
# NOTE(review): the API key literal below looks like a real credential that has
# been committed to version control. Rotate it, replace the literal with a
# placeholder, and supply the real value through the environment instead.
VOLCENGINE_API_KEY = "586d443c-5034-4810-9760-50ce77394e8a"
VOLCENGINE_MODEL = "deepseek-v3-1-terminus"
VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"

# Environment variables take precedence over the literals above.
VOLCENGINE_API_KEY = os.environ.get("VOLCENGINE_API_KEY", VOLCENGINE_API_KEY)
VOLCENGINE_MODEL = os.environ.get("VOLCENGINE_MODEL", VOLCENGINE_MODEL)
VOLCENGINE_BASE_URL = os.environ.get("VOLCENGINE_BASE_URL", VOLCENGINE_BASE_URL)
|
||||
|
||||
# Analysis texts that do not count as a real AI summary.
SKIP_MARKERS = {"", "测试模式已跳过AI分析", "(待分析)"}

# Preset groups and their keyword rules (extend as needed).
PRESET_GROUPS: dict[str, list[str]] = {
    "AAA_核心每日必读": ["编程", "算法", "工程", "干货", "新闻", "趋势"],
    "AA_编程信息干货必留": [
        "编程", "算法", "工程", "教程", "实战", "课程",
        "新技术", "开源", "工具", "效率", "技术", "架构",
    ],
    "A_硬核知识保留": ["科普", "数学", "物理", "编程", "算法", "工程", "历史", "新闻", "深度"],
    "B_技能学习保留": ["英语", "四六级", "考研", "面试", "教程", "实战", "学习", "课程", "写作"],
    "C_资讯快餐观察": ["热点", "速览", "信息差", "快报", "盘点", "吐槽", "观点", "趋势"],
    "D_娱乐消遣可取关": ["搞笑", "整活", "抽象", "乐子", "娱乐", "段子", "鬼畜", "日常", "情侣"],
    "E_营销带货谨慎": ["好物", "测评", "种草", "直播", "带货", "优惠", "开箱", "广告", "激活"],
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the batch AI summary script.

    Returns:
        The namespace produced by ``argparse`` from ``sys.argv``.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = [
        (
            "--input-report",
            {"default": "source/output/reports/1_up_titles_report.md", "help": "已有标题报告路径"},
        ),
        (
            "--output-report",
            {"default": "source/output/reports/2_up_analysis_full_auto.md", "help": "输出报告路径"},
        ),
        ("--batch-size", {"type": int, "default": 20, "help": "每批处理数量,默认: 20"}),
        ("--batch-index", {"type": int, "default": 1, "help": "批次序号(从1开始),默认: 1"}),
        (
            "--sleep-seconds",
            {"type": float, "default": 0.0, "help": "提交任务间隔秒数,默认: 0(并发模式建议0)"},
        ),
        ("--workers", {"type": int, "default": 4, "help": "并发请求数,默认: 4"}),
        ("--max-retries", {"type": int, "default": 2, "help": "单个UP分析最大重试次数,默认: 2"}),
        (
            "--request-timeout",
            {"type": float, "default": 60.0, "help": "单次AI请求超时秒数,默认: 60"},
        ),
        ("--force", {"action": "store_true", "help": "强制覆盖已有AI分析(默认只处理待分析项)"}),
        ("--debug", {"action": "store_true", "help": "输出调试信息"}),
        (
            "--config-from",
            {"default": "source/scripts/analyze_up_content.py", "help": "自动读取API配置的脚本路径"},
        ),
        (
            "--run-all-batches",
            {"action": "store_true", "help": "自动连续跑完所有批次(忽略batch-index)"},
        ),
    ]

    cli = argparse.ArgumentParser(description="基于现有报告分批做AI总结")
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_api_config_from_script(path: Path) -> dict[str, str]:
    """Read Volcengine settings out of another script's source text.

    The file is scanned for top-level ``NAME = "value"`` assignments (double
    quotes only) of the three ``VOLCENGINE_*`` settings.

    Args:
        path: Script to scan.

    Returns:
        A mapping of the settings that were found, values stripped; empty when
        the file does not exist.
    """
    if not path.exists():
        return {}

    source = path.read_text(encoding="utf-8", errors="replace")
    found: dict[str, str] = {}
    for key in ("VOLCENGINE_API_KEY", "VOLCENGINE_MODEL", "VOLCENGINE_BASE_URL"):
        hit = re.search(rf"^{key}\s*=\s*\"([^\"]*)\"", source, flags=re.MULTILINE)
        if hit is None:
            continue
        found[key] = hit.group(1).strip()
    return found
|
||||
|
||||
|
||||
def parse_report(path: Path) -> list[dict[str, Any]]:
    """Parse a Markdown UP report into one dict per UP entry.

    An entry starts at a ``## N. name (mid: 123)`` heading; lines before the
    first such heading are ignored. Each dict carries the keys ``mid``,
    ``name``, ``tag``, ``url``, ``titles``, ``analysis``, ``group``,
    ``action``, ``reason`` and ``error``.

    Args:
        path: Report file to read (UTF-8).

    Returns:
        The parsed entries in file order.
    """
    heading_re = re.compile(r"^##\s+\d+\.\s+(.*?)\s+\(mid:\s*(\d+)\)")
    # Known "###" sub-headings; any other "### " heading switches collection off.
    section_by_heading = {
        "### 最近10条标题": "titles",
        "### AI分析": "analysis",
        "### 分组建议": "group",
        "### 异常": "error",
    }
    group_fields = (
        ("- 预设分组: ", "group"),
        ("- 建议动作: ", "action"),
        ("- 判断依据: ", "reason"),
    )

    entries: list[dict[str, Any]] = []
    entry: dict[str, Any] | None = None
    mode = ""

    for raw in path.read_text(encoding="utf-8").splitlines():
        heading = heading_re.match(raw)
        if heading:
            if entry is not None:
                entries.append(entry)
            mid = int(heading.group(2))
            entry = {
                "mid": mid,
                "name": heading.group(1).strip(),
                "tag": [],
                "url": f"https://space.bilibili.com/{mid}/video",
                "titles": [],
                "analysis": "",
                "group": "",
                "action": "",
                "reason": "",
                "error": "",
            }
            mode = ""
            continue

        if entry is None:
            continue

        # Entry metadata lines are recognised in any section.
        if raw.startswith("- 主页: "):
            entry["url"] = raw[len("- 主页: "):].strip()
            continue
        if raw.startswith("- 标签: "):
            tag_text = raw[len("- 标签: "):].strip()
            if tag_text in ("", "无"):
                entry["tag"] = []
            else:
                entry["tag"] = [part.strip() for part in tag_text.split(",") if part.strip()]
            continue

        if raw.startswith("### "):
            mode = section_by_heading.get(raw, "")
            continue

        if mode == "titles":
            if raw.startswith("- "):
                title = raw[2:].strip()
                if title and title != "(未抓取到标题)":
                    entry["titles"].append(title)
        elif mode == "analysis":
            stripped = raw.strip()
            if stripped:
                entry["analysis"] = (entry["analysis"] + "\n" + stripped).strip()
        elif mode == "group":
            for prefix, field in group_fields:
                if raw.startswith(prefix):
                    entry[field] = raw[len(prefix):].strip()
                    break
            else:
                if raw.strip() == "(待分组)":
                    entry["group"] = ""
                    entry["action"] = ""
                    entry["reason"] = ""
        elif mode == "error":
            if raw.startswith("- "):
                entry["error"] = raw[2:].strip()

    if entry is not None:
        entries.append(entry)
    return entries
|
||||
|
||||
|
||||
def call_volcengine_chat(
    system_prompt: str,
    user_prompt: str,
    cfg: dict[str, str],
    timeout: float,
) -> str:
    """Send one chat-completion request and return the reply text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        cfg: Settings with the keys ``VOLCENGINE_API_KEY``,
            ``VOLCENGINE_MODEL`` and ``VOLCENGINE_BASE_URL``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The stripped ``choices[0].message.content`` of the response.

    Raises:
        RuntimeError: If a setting is missing or still a placeholder, or if
            the response carries no usable text content.
    """
    api_key = cfg.get("VOLCENGINE_API_KEY", "").strip()
    model = cfg.get("VOLCENGINE_MODEL", "").strip()
    base_url = cfg.get("VOLCENGINE_BASE_URL", "").strip()

    if (not api_key) or ("在这里填" in api_key):
        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_API_KEY")
    if (not model) or ("在这里填" in model):
        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_MODEL")
    if not base_url:
        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_BASE_URL")

    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.4,
    }

    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = request.Request(
        f"{base_url.rstrip('/')}/chat/completions",
        data=body,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )

    with request.urlopen(req, timeout=timeout) as resp:
        text = resp.read().decode("utf-8", errors="replace")

    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"AI响应异常: {text[:500]}") from exc

    # Walk the payload defensively: an empty "choices" list, a null "message"
    # or a non-object body must end in the descriptive error below rather than
    # an IndexError/AttributeError.
    choices = data.get("choices") if isinstance(data, dict) else None
    first = choices[0] if isinstance(choices, list) and choices else None
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    if not isinstance(content, str) or not content.strip():
        raise RuntimeError(f"AI响应异常: {text[:500]}")
    return content.strip()
|
||||
|
||||
|
||||
def summarize_one_up(
    name: str,
    mid: int,
    titles: list[str],
    tags: list[str],
    cfg: dict[str, str],
    timeout: float,
) -> dict[str, str]:
    """Ask the AI to summarise and classify one UP.

    Builds a prompt from the UP's name, mid, tags and recent titles plus the
    preset group names and the keyword-based hint, sends it through
    ``call_volcengine_chat`` and parses the reply with ``parse_ai_json``.

    Returns:
        The dict produced by ``parse_ai_json``.
    """
    system_prompt = "你是内容定位与订阅决策助手。" + "你必须输出合法JSON,不要输出其它文本。"

    title_block = "\n".join(f"- {t}" for t in titles)
    tag_text = "、".join(tags) if tags else "无"
    hint = heuristic_group_hint(titles, tags)
    group_block = "\n".join(f"- {k}" for k in PRESET_GROUPS)

    prompt_lines = [
        "请基于以下信息完成分组与总结。",
        "",
        f"UP主: {name}",
        f"mid: {mid}",
        f"标签: {tag_text}",
        "最近标题:",
        title_block,
        "",
        "预设分组:",
        group_block,
        "",
        "代码规则初判:",
        hint,
        "",
        "要求:",
        "1) 输出JSON对象,字段严格为: summary, group, action, reason。",
        "2) summary: 一段中文总结,50-100字。",
        "3) group: 必须从预设分组里选一个。给出详细的分组类别和命中分组中的规则词。",
        '4) action: 只能是"保留关注"或"可以取关"。敏感一点,只保留真正核心优质的up,其他都建议取关。',
        "5) reason: 30-60字,解释为什么分到该组并给出该动作。",
    ]
    user_prompt = "\n".join(prompt_lines)

    reply = call_volcengine_chat(system_prompt, user_prompt, cfg, timeout=timeout)
    return parse_ai_json(reply)
|
||||
|
||||
|
||||
def parse_ai_json(content: str) -> dict[str, str]:
    """Validate the AI reply and return its four fields.

    A surrounding Markdown code fence is removed and the outermost ``{...}``
    span is taken as the JSON document.

    Args:
        content: Raw reply text from the model.

    Returns:
        A dict with the keys ``summary``, ``group``, ``action`` and ``reason``.

    Raises:
        RuntimeError: If the reply is not a JSON object, ``summary`` is
            missing, ``group`` cannot be resolved to a preset group, or
            ``action`` is not one of the two allowed values.
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    text = content.strip()
    if text.startswith("```"):
        text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
        text = re.sub(r"\n?```$", "", text).strip()
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if m:
        text = m.group(0)

    data = json.loads(text)
    if not isinstance(data, dict):
        raise RuntimeError(f"AI返回不是JSON对象: {text[:200]}")

    def field(key: str) -> str:
        # ``or ""`` keeps a JSON null from turning into the string "None".
        return str(data.get(key) or "").strip()

    summary = field("summary")
    group = field("group")
    action = field("action")
    reason = field("reason")

    if not summary:
        raise RuntimeError("AI返回缺少summary")
    if group not in PRESET_GROUPS:
        # The prompt invites extra detail in "group" (e.g. the name followed by
        # matched keywords). Accept it when exactly one preset name is embedded.
        embedded = [name for name in PRESET_GROUPS if name in group]
        if len(embedded) != 1:
            raise RuntimeError(f"AI返回未知group: {group}")
        group = embedded[0]
    if action not in ("保留关注", "可以取关"):
        raise RuntimeError(f"AI返回未知action: {action}")
    if not reason:
        reason = "基于标题内容与更新风格综合判断。"
    return {
        "summary": summary,
        "group": group,
        "action": action,
        "reason": reason,
    }
|
||||
|
||||
|
||||
def heuristic_group_hint(titles: list[str], tags: list[str]) -> str:
    """Build a keyword-based grouping hint for the AI prompt.

    Each preset group scores one point per keyword that occurs
    (case-insensitively) in the combined titles and tags.

    Returns:
        A sentence naming the best-scoring group and the top three scores, or
        a fallback sentence when no keyword matched.
    """
    haystack = ("\n".join(titles) + "\n" + " ".join(tags)).lower()
    hits = {
        group: sum(1 for word in words if word.lower() in haystack)
        for group, words in PRESET_GROUPS.items()
    }
    # The stable sort keeps PRESET_GROUPS order among equal scores.
    ordered = sorted(hits.items(), key=lambda pair: pair[1], reverse=True)
    top_group, top_hits = ordered[0]
    if top_hits <= 0:
        return "未命中关键词,倾向按内容专业度与稳定性判断。"
    distribution = ", ".join(f"{g}:{s}" for g, s in ordered[:3])
    return f"关键词命中最高组={top_group}(score={top_hits}),参考分布: {distribution}"
|
||||
|
||||
|
||||
def summarize_one_up_with_retry(
    item: dict[str, Any],
    cfg: dict[str, str],
    max_retries: int,
    timeout: float,
    debug: bool,
) -> dict[str, str]:
    """Call ``summarize_one_up`` for one item, retrying on any exception.

    Args:
        item: Parsed UP entry; ``name`` and ``mid`` are required, ``titles``
            and ``tag`` default to empty lists.
        cfg: API settings forwarded to ``summarize_one_up``.
        max_retries: Total number of attempts; values below 1 count as 1.
        timeout: Per-request timeout in seconds.
        debug: Print each failure when true.

    Returns:
        The result of the first successful attempt.

    Raises:
        RuntimeError: With the text of the last failure once all attempts
            are used up.
    """
    attempts = max(1, max_retries)
    failure: Exception | None = None
    attempt = 0
    while attempt < attempts:
        attempt += 1
        try:
            return summarize_one_up(
                item["name"],
                item["mid"],
                item.get("titles", []),
                item.get("tag", []),
                cfg,
                timeout=timeout,
            )
        except Exception as exc:  # noqa: BLE001
            failure = exc
            if debug:
                print(f"[debug] {item['name']} 第{attempt}次失败: {exc}")
            if attempt < attempts:
                # Back-off grows by 0.5s per attempt, capped at 2s.
                time.sleep(min(2.0, 0.5 * attempt))
    raise RuntimeError(str(failure) if failure else "未知错误")
|
||||
|
||||
|
||||
def build_report(items: list[dict[str, Any]], batch_note: str) -> str:
    """Render the UP entries as a Markdown analysis report.

    The report has a header, a statistics section (count per preset group and
    per action) and one ``## N. name (mid: ...)`` section per entry.

    Args:
        items: UP entries as produced by ``parse_report`` and updated by the
            AI run.
        batch_note: Free text written into the header's processing note.

    Returns:
        The report text, ending with exactly one newline.
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    lines: list[str] = [
        "# UP主内容分析报告(分批AI总结)",
        "",
        f"- 生成时间: {now}",
        f"- 分析数量: {len(items)}",
        f"- 处理说明: {batch_note}",
        "",
    ]

    group_stats: dict[str, int] = {k: 0 for k in PRESET_GROUPS}
    action_stats: dict[str, int] = {"保留关注": 0, "可以取关": 0}
    for item in items:
        g = item.get("group", "")
        a = item.get("action", "")
        if g in group_stats:
            group_stats[g] += 1
        if a in action_stats:
            action_stats[a] += 1

    lines.append("## 分组统计")
    lines.append("")
    lines.extend(f"- {g}: {c}" for g, c in group_stats.items())
    lines.append(f"- 保留关注: {action_stats['保留关注']}")
    lines.append(f"- 可以取关: {action_stats['可以取关']}")
    lines.append("")

    for idx, item in enumerate(items, start=1):
        lines.append(f"## {idx}. {item['name']} (mid: {item['mid']})")
        lines.append("")
        lines.append(f"- 主页: {item['url']}")
        tags = item.get("tag", [])
        lines.append(f"- 标签: {', '.join(tags) if tags else '无'}")
        lines.append("")

        lines.append("### 最近10条标题")
        lines.append("")
        titles = item.get("titles", [])
        if titles:
            lines.extend(f"- {t}" for t in titles)
        else:
            lines.append("- (未抓取到标题)")
        lines.append("")

        lines.append("### AI分析")
        lines.append("")
        analysis = item.get("analysis", "")
        lines.append(analysis if analysis else "(待分析)")
        lines.append("")

        lines.append("### 分组建议")
        lines.append("")
        group = item.get("group", "")
        action = item.get("action", "")
        # A fresh AI result is stored under "group_reason", while entries read
        # back from an existing report carry it under "reason"; honour both so
        # previously written reasons survive a re-render.
        reason = item.get("group_reason") or item.get("reason", "")
        if group and action:
            lines.append(f"- 预设分组: {group}")
            lines.append(f"- 建议动作: {action}")
            lines.append(f"- 判断依据: {reason if reason else '基于标题与更新风格综合判断。'}")
        else:
            lines.append("- (待分组)")
        lines.append("")

        error = item.get("error", "")
        if error:
            lines.append("### 异常")
            lines.append("")
            lines.append(f"- {error}")
            lines.append("")

    return "\n".join(lines).rstrip() + "\n"
|
||||
|
||||
|
||||
def main() -> int:
    """Run the batch AI summary over an existing titles report.

    Parses the input report, selects the entries that still need analysis
    (all entries with titles when ``--force`` is given), processes them in
    batches on a thread pool and rewrites the output report after each batch
    and once more at the end.

    Returns:
        Process exit code: 0 on completion, 1 when the input report is
        missing or contains no UP entries.
    """
    args = parse_args()
    input_report = Path(args.input_report)
    output_report = Path(args.output_report)

    if not input_report.exists():
        print(f"输入报告不存在: {input_report}", file=sys.stderr)
        return 1

    items = parse_report(input_report)
    if not items:
        print("输入报告未解析出任何UP条目", file=sys.stderr)
        return 1

    config = {
        "VOLCENGINE_API_KEY": VOLCENGINE_API_KEY,
        "VOLCENGINE_MODEL": VOLCENGINE_MODEL,
        "VOLCENGINE_BASE_URL": VOLCENGINE_BASE_URL,
    }
    # Placeholders left in this script: borrow the settings of --config-from.
    if ("在这里填" in config["VOLCENGINE_API_KEY"]) or ("在这里填" in config["VOLCENGINE_MODEL"]):
        inherited = load_api_config_from_script(Path(args.config_from))
        if inherited:
            config.update(inherited)

    if args.force:
        pending = [it for it in items if it.get("titles")]
    else:
        pending = [
            it for it in items
            if it.get("titles") and (
                it.get("analysis", "").strip() in SKIP_MARKERS
                or not it.get("group")  # entries without a group are re-run too
            )
        ]

    if not pending:
        print("没有待分析条目,直接输出当前报告")
        # Create the output directory here as well, like every other write below.
        output_report.parent.mkdir(parents=True, exist_ok=True)
        output_report.write_text(build_report(items, "无待分析条目"), encoding="utf-8")
        return 0

    index_map = {f"{it['mid']}::{it['name']}": idx for idx, it in enumerate(items)}
    success_total = 0
    failed_total = 0

    batch_size = max(1, args.batch_size)
    if args.run_all_batches:
        total_batches = math.ceil(len(pending) / batch_size)
        batch_indexes = list(range(1, total_batches + 1))
        print(f"自动连续模式: 共{total_batches}批, 待分析总数{len(pending)}")
    else:
        batch_indexes = [max(1, args.batch_index)]

    workers = max(1, args.workers)
    max_retries = max(1, args.max_retries)
    print(f"并发配置: workers={workers}, retries={max_retries}, timeout={args.request_timeout}s")

    for batch_index in batch_indexes:
        start = (batch_index - 1) * batch_size
        end = start + batch_size
        batch = pending[start:end]
        if not batch:
            continue

        print(
            f"开始分批AI总结: 第{batch_index}批, 每批{batch_size}条, "
            f"本批{len(batch)}条, 待分析总数{len(pending)}"
        )

        success = 0
        failed = 0
        future_to_item: dict[Any, dict[str, Any]] = {}
        with ThreadPoolExecutor(max_workers=workers) as executor:
            for i, it in enumerate(batch, start=1):
                print(f"[submit {i}/{len(batch)}] {it['name']} ({it['mid']})")
                future = executor.submit(
                    summarize_one_up_with_retry,
                    it,
                    config,
                    max_retries,
                    float(args.request_timeout),
                    args.debug,
                )
                future_to_item[future] = it
                if args.sleep_seconds > 0:
                    time.sleep(args.sleep_seconds)

            done_count = 0
            for future in as_completed(future_to_item):
                done_count += 1
                it = future_to_item[future]
                idx = index_map.get(f"{it['mid']}::{it['name']}")
                try:
                    ai_res = future.result()
                    if idx is not None:
                        items[idx]["analysis"] = ai_res["summary"]
                        items[idx]["group"] = ai_res["group"]
                        items[idx]["action"] = ai_res["action"]
                        items[idx]["group_reason"] = ai_res["reason"]
                        items[idx]["error"] = ""
                    success += 1
                    print(f"[done {done_count}/{len(batch)}] 成功: {it['name']} ({it['mid']})")
                except Exception as exc:  # noqa: BLE001
                    if idx is not None:
                        items[idx]["error"] = str(exc)
                    failed += 1
                    print(f"[done {done_count}/{len(batch)}] 失败: {it['name']} ({it['mid']})")
                    if args.debug:
                        print(f"[debug] 失败详情: {exc}")

        success_total += success
        failed_total += failed

        # Persist after every batch so an interrupted run keeps its progress.
        step_note = (
            f"第{batch_index}批完成: 成功{success}, 失败{failed}, "
            f"本批{len(batch)}, 待分析总数{len(pending)}"
        )
        output_report.parent.mkdir(parents=True, exist_ok=True)
        output_report.write_text(build_report(items, step_note), encoding="utf-8")
        print(f"第{batch_index}批写入完成: {output_report}")

    mode_text = "自动连续" if args.run_all_batches else "单批"
    note = (
        f"{mode_text}模式完成: 成功{success_total}, 失败{failed_total}, "
        f"处理批次数={len(batch_indexes)}, 待分析总数={len(pending)}"
    )
    output_report.parent.mkdir(parents=True, exist_ok=True)
    output_report.write_text(build_report(items, note), encoding="utf-8")
    print(f"输出完成: {output_report}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
101
source/scripts/extract_group_info.py
Normal file
101
source/scripts/extract_group_info.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import argparse
|
||||
import re
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the ``--input`` / ``--output`` options of the group extractor."""
    cli = argparse.ArgumentParser(description="提取UP分组信息")
    cli.add_argument("--input", default="./source/19_53_no_titles.md", help="输入报告路径")
    cli.add_argument("--output", help="输出报告路径(默认覆盖输入)")
    return cli.parse_args()
|
||||
|
||||
def main():
    """Reduce a UP report to each entry's group, action, reason and error.

    Reads the input report, keeps the text before the first ``## `` heading as
    the header, extracts the fields of every ``## N. name (mid: ...)`` section,
    sorts the entries by name (case-insensitive) and then mid, and writes the
    result to the output path (the input path when ``--output`` is omitted).

    Returns:
        0 on success, 1 when the report contains no UP section (nothing is
        written in that case).
    """
    args = parse_args()
    input_file = args.input
    output_file = args.output or input_file

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    section_starts = [i for i, line in enumerate(lines) if line.startswith('## ')]
    if not section_starts:
        print('No sections found')
        return 1

    header = '\n'.join(lines[:section_starts[0]])
    bounds = section_starts + [len(lines)]

    parsed = []
    for start, end in zip(bounds, bounds[1:]):
        sec = '\n'.join(lines[start:end])
        match = re.match(r'^## (\d+)\. (.+) \(mid: (\d+)\)', sec)
        if not match:
            # Not a UP entry (e.g. the statistics section): skip it. Filtering
            # by pattern rather than by position keeps the first UP entry of
            # reports that have no statistics section.
            continue

        group_m = re.search(r'- 预设分组: (.+)', sec)
        action_m = re.search(r'- 建议动作: (.+)', sec)
        reason_m = re.search(r'- 判断依据: (.+)', sec)
        error_m = re.search(r'AI返回未知group: (.+)', sec)

        parsed.append({
            'num': int(match.group(1)),
            'name': match.group(2),
            'mid': match.group(3),
            'group': group_m.group(1).strip() if group_m else "",
            'action': action_m.group(1).strip() if action_m else "",
            'reason': reason_m.group(1).strip() if reason_m else "",
            'error': error_m.group(1).strip() if error_m else "",
        })

    if not parsed:
        # Bail out before writing: the output may be the input file itself.
        print('No sections found')
        return 1

    parsed.sort(key=lambda x: (x['name'].casefold(), int(x['mid'])))

    lines_out = [header, ""]
    for p in parsed:
        lines_out.append(f"## {p['num']}. {p['name']} (mid: {p['mid']})")
        lines_out.append("")
        if p['group']:
            lines_out.append(f"- 预设分组: {p['group']}")
        if p['action']:
            lines_out.append(f"- 建议动作: {p['action']}")
        if p['reason']:
            lines_out.append(f"- 判断依据: {p['reason']}")
        if p['error']:
            lines_out.append(f"- 异常: {p['error']}")
        lines_out.append("")

    result = '\n'.join(lines_out)
    result = re.sub(r'\n{3,}', '\n\n', result)  # collapse runs of blank lines

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(result)

    print(f'Extracted {len(parsed)} sections')
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
104
source/scripts/extract_keep_follow_doc.py
Normal file
104
source/scripts/extract_keep_follow_doc.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the input/output report paths of the keep-follow extractor."""
    cli = argparse.ArgumentParser(description="提取非取关UP的AI分析与分组建议")
    options = (
        ("--input-report", "source/output/reports/2_up_analysis_full_auto.md", "输入分析报告路径"),
        ("--output-report", "source/output/reports/3_up_keep_follow_only.md", "输出保留关注报告路径"),
    )
    for flag, default, help_text in options:
        cli.add_argument(flag, default=default, help=help_text)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Write a report of every UP that is not marked "可以取关".

    For each kept entry the AI analysis, grouping advice and error sections
    are copied over; entries are sorted by name (case-insensitive), then mid,
    and renumbered.

    Returns:
        0 on success, 1 when the input report does not exist.
    """
    opts = parse_args()
    src = Path(opts.input_report)
    dst = Path(opts.output_report)

    if not src.exists():
        print(f"来源文件不存在: {src}")
        return 1

    report = src.read_text(encoding="utf-8")
    headings = list(
        re.finditer(r"^##\s+\d+\.\s+(.+?)\s+\(mid:\s*(\d+)\)\s*$", report, re.MULTILINE)
    )
    # An entry runs up to the next entry heading, the last one to end of file.
    stops = [h.start() for h in headings[1:]] + [len(report)]

    def subsection(title: str, chunk: str) -> str:
        """Return the stripped body of the ``### title`` part of *chunk*."""
        hit = re.search(rf"###\s*{title}\s*\n([\s\S]*?)(?=\n###\s|\Z)", chunk)
        return hit.group(1).strip() if hit else ""

    kept: list[tuple[str, str, str, str, str, str]] = []
    for heading, stop in zip(headings, stops):
        chunk = report[heading.start():stop]
        verdict = re.search(r"-\s*建议动作:\s*(.+)", chunk)
        action = verdict.group(1).strip() if verdict else ""
        # Inverted rule: anything not explicitly marked "可以取关" is kept.
        if action == "可以取关":
            continue
        kept.append((
            heading.group(1).strip(),
            heading.group(2).strip(),
            subsection("AI分析", chunk),
            subsection("分组建议", chunk),
            action,
            subsection("异常", chunk),
        ))

    # Sort by nickname (case-insensitive); equal names by ascending mid.
    kept.sort(key=lambda row: (row[0].casefold(), int(row[1])))

    out = [
        "# 保留关注UP主分析与分组建议",
        "",
        f"- 生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        f"- 来源文件: {src.name}",
        f"- 条目数: {len(kept)}",
        "",
    ]
    for number, (name, mid, ai_text, group_text, action, error_text) in enumerate(kept, 1):
        fallback_group = f"- 建议动作: {action if action else '(无)'}"
        out.extend([
            f"## {number}. {name} (mid: {mid})",
            "",
            "### AI分析",
            "",
            ai_text if ai_text else "(无)",
            "",
            "### 分组建议",
            "",
            group_text if group_text else fallback_group,
            "",
        ])
        if error_text:
            out.extend(["### 异常", "", error_text, ""])

    dst.parent.mkdir(parents=True, exist_ok=True)
    dst.write_text("\n".join(out), encoding="utf-8")
    print(f"已生成: {dst}")
    print(f"保留条目: {len(kept)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
174
source/scripts/extract_unfollow_list.py
Normal file
174
source/scripts/extract_unfollow_list.py
Normal file
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract UPs marked as "可以取关" and output their mids to CSV.
|
||||
|
||||
Read an UP analysis report and extract all UPs with action "可以取关",
|
||||
then output their mids to a CSV file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def parse_report(report_path: Path) -> list[dict[str, Any]]:
    """Parse a Markdown UP analysis report into a list of UP records.

    Every ``## N. name (mid: ...)`` heading starts one record, which runs up
    to the next such heading or the end of the file.

    Args:
        report_path: Report file to read (UTF-8).

    Returns:
        One dict per UP with the keys ``mid`` (int), ``name`` and ``action``
        (empty when the section has no "建议动作" line); an empty list when
        the file does not exist.
    """
    if not report_path.exists():
        return []

    body = report_path.read_text(encoding="utf-8")
    headings = list(re.finditer(r"^## \d+\. (.+?)\s+\(mid:\s*(\d+)\)", body, re.MULTILINE))
    stops = [h.start() for h in headings[1:]] + [len(body)]

    records: list[dict[str, Any]] = []
    for heading, stop in zip(headings, stops):
        chunk = body[heading.start():stop]
        # Pull the suggested action out of this UP's section.
        found = re.search(r"- 建议动作: (.+?)(?:\n|$)", chunk)
        records.append({
            "mid": int(heading.group(2)),
            "name": heading.group(1).strip(),
            "action": found.group(1).strip() if found else "",
        })
    return records
|
||||
|
||||
|
||||
def main() -> int:
    """Export the UPs marked "可以取关" from an analysis report.

    Supports three output formats: ``csv`` (mid,name rows), ``mid-only``
    (comma-separated mids, optionally ``mid:name`` pairs and optionally split
    into numbered part files) and ``json``. A short preview of the exported
    UPs is printed at the end.

    Returns:
        0 on success (including "nothing to unfollow"), 1 when the input is
        missing or no UP could be parsed from it.
    """
    parser = argparse.ArgumentParser(description="从UP分析报告中提取可以取关的UP")
    parser.add_argument(
        "--input-report",
        default="source/output/reports/2_up_analysis_full_auto.md",
        help="输入报告路径",
    )
    parser.add_argument(
        "--output-csv",
        default="source/output/uids/4_unfollow_mids_list.txt",
        help="输出文件路径",
    )
    parser.add_argument(
        "--format",
        choices=["csv", "mid-only", "json"],
        default="mid-only",
        help="输出格式:csv(mid,name), mid-only(仅mid逗号分隔), json(JSON格式)",
    )
    parser.add_argument(
        "--with-names",
        action="store_true",
        help="在mid后添加UP名称(仅mid-only格式生效)",
    )
    parser.add_argument(
        "--split-size",
        type=int,
        default=0,
        help="可选:将mid-only结果按N个一组拆分多个文件,例如100",
    )
    args = parser.parse_args()

    source = Path(args.input_report)
    target = Path(args.output_csv)

    if not source.exists():
        print(f"错误: 输入报告不存在: {source}", file=sys.stderr)
        return 1

    print(f"读取报告: {source}")
    all_ups = parse_report(source)
    if not all_ups:
        print("未能从报告中解析任何UP", file=sys.stderr)
        return 1

    # Keep only the UPs whose suggested action is "可以取关".
    selected = [up for up in all_ups if up.get("action") == "可以取关"]
    print(f"总 UP 数: {len(all_ups)}")
    print(f"可以取关: {len(selected)}")
    if not selected:
        print("没有可以取关的UP")
        return 0

    if args.format == "csv":
        # Standard CSV with a header row: mid, name.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=["mid", "name"])
            writer.writeheader()
            for up in selected:
                writer.writerow({"mid": up["mid"], "name": up["name"]})

        print(f"\n✓ 已输出CSV格式到: {target}")
        print(" 格式: mid,name")
        print(f" 行数: {len(selected)}")

    elif args.format == "mid-only":
        mid_texts = [str(up["mid"]) for up in selected]

        if args.with_names:
            payload = ",".join(f"{up['mid']}:{up['name']}" for up in selected)
            print(f"\n✓ 已输出mid:name列表到: {target}")
            print(" 格式: mid1:name1,mid2:name2,...")
        else:
            payload = ",".join(mid_texts)
            print(f"\n✓ 已输出mid列表到: {target}")
            print(" 格式: mid1,mid2,mid3,...")

        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(payload, encoding="utf-8")
        print(f" 数量: {len(mid_texts)}")

        chunk_size = max(0, int(args.split_size))
        if chunk_size > 0:
            # Part files always hold bare mids, named <stem>_<n><suffix>.
            chunks = [
                mid_texts[pos:pos + chunk_size]
                for pos in range(0, len(mid_texts), chunk_size)
            ]
            extension = target.suffix or ".txt"
            for part_no, chunk in enumerate(chunks, start=1):
                part_file = target.with_name(f"{target.stem}_{part_no}{extension}")
                part_file.write_text(",".join(chunk), encoding="utf-8")
            print(f" 已按每组{chunk_size}个拆分为{len(chunks)}个文件")

    elif args.format == "json":
        import json

        records = [{"mid": up["mid"], "name": up["name"]} for up in selected]
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(records, ensure_ascii=False, indent=2), encoding="utf-8")

        print(f"\n✓ 已输出JSON格式到: {target}")
        print(f" 数量: {len(records)}")

    # Preview of the first ten exported UPs.
    if selected:
        print("\n📋 示例(前10个):")
        for up in selected[:10]:
            print(f" - {up['mid']}: {up['name']}")
        remaining = len(selected) - 10
        if remaining > 0:
            print(f" ... 还有 {remaining} 个")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
67
source/scripts/remove_10content.py
Normal file
67
source/scripts/remove_10content.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import argparse
|
||||
import re
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the ``--input`` / ``--output`` options of the title remover."""
    cli = argparse.ArgumentParser(description="删除最近10条标题内容")
    cli.add_argument(
        "--input",
        help="输入报告路径",
        default="source/output/reports/2_up_analysis_full_auto.md",
    )
    cli.add_argument("--output", help="输出报告路径(默认覆盖输入)")
    return cli.parse_args()
|
||||
|
||||
def main():
    """Strip bullet lines and "最近10条标题" blocks from a Markdown report.

    Reads the report named by ``--input`` and writes the result to
    ``--output`` (or back to the input path when ``--output`` is omitted).

    Inside every ``## `` section, up to the first ``### `` heading that is
    not a "最近10条标题" heading:
      * lines starting with ``'- '`` or ``' - '`` are dropped;
      * a ``### `` heading containing "最近10条标题" is dropped together with
        the ``' - '`` lines that directly follow it.
    Everything else (text before the first ``## ``, and the remainder of a
    section from its first other ``### `` heading onward) is copied as is.
    Finally, runs of three or more newlines are collapsed to two.

    Returns:
        0 after the output file has been written.
    """
    args = parse_args()
    src_path = args.input
    dst_path = args.output or src_path

    with open(src_path, 'r', encoding='utf-8') as fh:
        source_lines = fh.read().split('\n')

    kept = []
    total = len(source_lines)
    pos = 0
    while pos < total:
        current = source_lines[pos]
        kept.append(current)
        pos += 1
        if not current.startswith('## '):
            continue

        # Filter the leading part of this section.
        while pos < total:
            entry = source_lines[pos]
            if entry.startswith('## '):
                # Next section: let the outer loop copy its heading.
                break
            if entry.startswith('### '):
                if '最近10条标题' not in entry:
                    # Any other sub-heading ends the filtered region; the
                    # outer loop copies it and what follows verbatim.
                    break
                # Drop the heading and the bullet lines right below it.
                pos += 1
                while pos < total and source_lines[pos].startswith(' - '):
                    pos += 1
                continue
            # NOTE(review): nested bullets are matched by the literal prefix
            # ' - '; confirm this matches the report's real indentation.
            if entry.startswith(('- ', ' - ')):
                pos += 1
                continue
            kept.append(entry)
            pos += 1

    # Removing lines can leave long runs of blank lines; squeeze them.
    cleaned = re.sub(r'\n{3,}', '\n\n', '\n'.join(kept))

    with open(dst_path, 'w', encoding='utf-8') as fh:
        fh.write(cleaned)

    print(f'Done: {dst_path}')
    return 0
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
208
source/scripts/run_pipeline.py
Normal file
208
source/scripts/run_pipeline.py
Normal file
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
"""One-command pipeline: fetch titles -> batch analyze -> extract outputs.

Pipeline outputs (default paths):
1) source/output/reports/1_up_titles_report.md
2) source/output/reports/2_up_analysis_full_auto.md
3) source/output/reports/3_up_keep_follow_only.md
4) source/output/uids/4_unfollow_mids_list.txt (+ split files)
5) source/output/reports/5_sorted_up_analysis.md
6) source/output/reports/6_group_info.md

Pipeline steps:
1) Fetch video titles (analyze_up_content.py)
2) Batched AI analysis (batch_ai_summary_from_report.py)
3) Build the keep-following report (extract_keep_follow_doc.py)
4) Build the unfollow UID list (extract_unfollow_list.py)
5) Sort by first letter (sort_up_main.py)
6) Extract group info (extract_group_info.py)
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the one-command pipeline.

    Options are registered in the same order as before so ``--help`` output
    is unchanged: file paths, numeric tuning knobs, fetch filters, the
    ``--skip-*`` switches, and finally the interpreter used for child steps.

    Returns:
        Namespace holding one attribute per option (hyphens become
        underscores, e.g. ``--titles-report`` -> ``titles_report``).
    """
    cli = argparse.ArgumentParser(description="一键运行完整功能链")

    # (flag, default, help) for every path-valued option.
    # NOTE(review): ``--group_info`` uses an underscore unlike the other
    # flags, and main() in this file never reads it -- confirm intent.
    path_options = (
        (
            "--input-json",
            "source/resources/export_uids.json",
            "UP资源文件路径,默认: source/resources/export_uids.json",
        ),
        (
            "--titles-report",
            "source/output/reports/1_up_titles_report.md",
            "标题抓取报告输出路径",
        ),
        (
            "--analysis-report",
            "source/output/reports/2_up_analysis_full_auto.md",
            "分批分析报告输出路径",
        ),
        (
            "--keep-report",
            "source/output/reports/3_up_keep_follow_only.md",
            "保留关注报告输出路径",
        ),
        (
            "--unfollow-uids",
            "source/output/uids/4_unfollow_mids_list.txt",
            "取关UID输出路径",
        ),
        (
            "--group_info",
            "source/output/uids/only_group_info.md",
            "分组信息输出路径",
        ),
    )
    for flag, default, help_text in path_options:
        cli.add_argument(flag, default=default, help=help_text)

    # (flag, type, default, help) for the numeric tuning options.
    numeric_options = (
        ("--titles-per-up", int, 10, "每个UP抓取标题数量"),
        ("--batch-size", int, 20, "分批分析每批条数"),
        ("--workers", int, 6, "并发请求数"),
        ("--max-retries", int, 2, "单条分析重试次数"),
        ("--request-timeout", float, 60.0, "单次请求超时"),
        ("--split-size", int, 100, "取关UID拆分分组大小"),
        ("--sleep-seconds", float, 0.0, "任务间隔秒数"),
        ("--retry-times", int, 3, "抓取重试次数"),
    )
    for flag, kind, default, help_text in numeric_options:
        cli.add_argument(flag, type=kind, default=default, help=help_text)

    cli.add_argument(
        "--fetch-mode",
        choices=["auto", "api", "html"],
        default="auto",
        help="标题抓取模式",
    )
    cli.add_argument("--only-tag", default="", help="可选:仅处理包含该标签的UP")
    cli.add_argument("--max-ups", type=int, default=0, help="可选:限制处理UP数量")
    cli.add_argument("--bili-cookie", default="", help="可选:运行时传入B站Cookie")

    # Boolean switches that disable individual pipeline stages.
    skip_switches = (
        ("--skip-fetch", "跳过抓取阶段,直接使用已有标题报告"),
        ("--skip-analyze", "跳过分析阶段,直接做产物提取"),
        ("--skip-sort", "跳过排序阶段"),
        ("--skip-group", "跳过提取分组阶段"),
    )
    for flag, help_text in skip_switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    cli.add_argument("--python", default=sys.executable, help="指定Python解释器")
    return cli.parse_args()
|
||||
|
||||
|
||||
def run_cmd(cmd: list[str], title: str) -> None:
    """Echo one pipeline step, then run it as a child process.

    Args:
        cmd: Argument vector passed to ``subprocess.run`` (no shell involved).
        title: Step label shown in the ``=== ... ===`` banner.

    Raises:
        subprocess.CalledProcessError: The child exited with a non-zero
            status (``check=True``).
    """
    import shlex  # local import: only needed for the echo below

    # Build a shell-quoted echo so arguments containing spaces or shell
    # metacharacters stay unambiguous. The value following ``--bili-cookie``
    # is a login cookie, so it is masked in the echo (the child still
    # receives the real value).
    shown = []
    mask_next = False
    for arg in cmd:
        shown.append("***" if mask_next else shlex.quote(arg))
        mask_next = arg == "--bili-cookie"

    # Flush before spawning: the child writes straight to the inherited
    # stdout, and without a flush our buffered banner would appear after the
    # child's output whenever stdout is a pipe or file.
    print(f"\n=== {title} ===", flush=True)
    print("$", " ".join(shown), flush=True)
    subprocess.run(cmd, check=True)
|
||||
|
||||
|
||||
def main() -> int:
    """Run the pipeline steps in order, each as a child Python process.

    Steps 1, 2, 5 and 6 can be skipped with the ``--skip-*`` switches; steps
    3 and 4 always run. Every step goes through ``run_cmd``, so a failing
    child raises ``subprocess.CalledProcessError`` and aborts the pipeline.

    Returns:
        0 once every requested step has completed.
    """
    args = parse_args()

    # Steps 5 and 6 write to fixed locations.
    # NOTE(review): ``--group_info`` is parsed but never read here; the group
    # report always goes to ``group_report`` below -- confirm which path is
    # intended before wiring the option in.
    sorted_report = "source/output/reports/5_sorted_up_analysis.md"
    group_report = "source/output/reports/6_group_info.md"

    # Create the parent directory of every file the pipeline will write. The
    # fixed step-5/6 paths are included so those steps still work when the
    # other reports have been redirected elsewhere on the command line.
    output_paths = [
        args.titles_report,
        args.analysis_report,
        args.keep_report,
        args.unfollow_uids,
    ]
    if not args.skip_sort:
        output_paths.append(sorted_report)
    if not args.skip_group:
        output_paths.append(group_report)
    for out_path in output_paths:
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    if not args.skip_fetch:
        fetch_cmd = [
            args.python,
            "source/scripts/analyze_up_content.py",
            "--input",
            args.input_json,
            "--output",
            args.titles_report,
            "--titles-per-up",
            str(max(1, args.titles_per_up)),
            "--retry-times",
            str(max(1, args.retry_times)),
            "--fetch-mode",
            args.fetch_mode,
            "--sleep-seconds",
            str(max(0.0, args.sleep_seconds)),
            "--skip-ai",
        ]
        # Optional filters are forwarded only when the user supplied them.
        if args.only_tag:
            fetch_cmd += ["--only-tag", args.only_tag]
        if args.max_ups > 0:
            fetch_cmd += ["--max-ups", str(args.max_ups)]
        if args.bili_cookie:
            fetch_cmd += ["--bili-cookie", args.bili_cookie]

        run_cmd(fetch_cmd, "步骤1/6 抓取视频标题")

    if not args.skip_analyze:
        analyze_cmd = [
            args.python,
            "source/scripts/batch_ai_summary_from_report.py",
            "--input-report",
            args.titles_report,
            "--output-report",
            args.analysis_report,
            "--batch-size",
            str(max(1, args.batch_size)),
            "--run-all-batches",
            "--workers",
            str(max(1, args.workers)),
            "--max-retries",
            str(max(1, args.max_retries)),
            "--request-timeout",
            str(max(1.0, args.request_timeout)),
            "--sleep-seconds",
            str(max(0.0, args.sleep_seconds)),
        ]
        run_cmd(analyze_cmd, "步骤2/6 分批AI分析")

    keep_cmd = [
        args.python,
        "source/scripts/extract_keep_follow_doc.py",
        "--input-report",
        args.analysis_report,
        "--output-report",
        args.keep_report,
    ]
    run_cmd(keep_cmd, "步骤3/6 生成保留关注报告")

    uid_cmd = [
        args.python,
        "source/scripts/extract_unfollow_list.py",
        "--input-report",
        args.analysis_report,
        "--output-csv",
        args.unfollow_uids,
        "--format",
        "mid-only",
        "--split-size",
        str(max(0, args.split_size)),
    ]
    run_cmd(uid_cmd, "步骤4/6 生成取关UID列表")

    if not args.skip_sort:
        sort_cmd = [
            args.python,
            "source/scripts/sort_up_main.py",
            "--input",
            args.analysis_report,
            "--output",
            sorted_report,
        ]
        run_cmd(sort_cmd, "步骤5/6 按首字母排序")

    if not args.skip_group:
        # Use the sorted report when step 5 ran, else the raw analysis report.
        input_for_group = sorted_report if not args.skip_sort else args.analysis_report
        group_cmd = [
            args.python,
            "source/scripts/extract_group_info.py",
            "--input",
            input_for_group,
            "--output",
            group_report,
        ]
        run_cmd(group_cmd, "步骤6/6 提取分组信息")

    print("\n流水线完成。")
    print(f"- 1 标题报告: {args.titles_report}")
    print(f"- 2 分析报告: {args.analysis_report}")
    print(f"- 3 保留报告: {args.keep_report}")
    print(f"- 4 取关UID: {args.unfollow_uids}")
    if not args.skip_sort:
        print(f"- 5 排序报告: {sorted_report}")
    if not args.skip_group:
        print(f"- 6 分组报告: {group_report}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
93
source/scripts/sort_up_main.py
Normal file
93
source/scripts/sort_up_main.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import argparse
|
||||
import re
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the section-sorting script.

    Returns:
        Namespace with ``input`` (path of the report to read) and ``output``
        (path of the report to write; ``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser(description="对UP主按首字母排序")
    option_specs = (
        (
            "--input",
            {
                "default": "source/output/reports/2_up_analysis_full_auto.md",
                "help": "输入报告路径",
            },
        ),
        ("--output", {"help": "输出报告路径(默认覆盖输入)"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
||||
|
||||
def main():
    """Reorder the ``## N. name (mid: ...)`` sections of a Markdown report.

    Reads the report named by ``--input``, splits it at every line starting
    with ``'## '``, sorts the matching sections by the first character of the
    name (digits first, then letters, then everything else; case-insensitive)
    and writes the text before the first ``## `` heading followed by the
    sorted sections to ``--output`` (or back to the input path). The sort is
    stable, so names sharing a first character keep their original order.

    Returns:
        0 on success, 1 when the report has fewer than two ``## `` headings.
    """
    args = parse_args()
    input_file = args.input
    output_file = args.output or input_file

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    section_starts = [idx for idx, line in enumerate(lines) if line.startswith('## ')]
    if len(section_starts) < 2:
        print('No sections found')
        return 1

    header = '\n'.join(lines[:section_starts[0]])

    # Each section runs from its heading up to the next heading (or EOF).
    section_ends = section_starts[1:] + [len(lines)]
    sections = [
        '\n'.join(lines[start:end])
        for start, end in zip(section_starts, section_ends)
    ]

    # NOTE(review): the first ``## `` section, and any section whose heading
    # does not match the pattern below, is left out of the output entirely.
    # This mirrors the existing behavior -- confirm it is intentional.
    heading = re.compile(r'^## (\d+)\. (.+) \(mid: (\d+)\)')
    parsed = []
    for text in sections[1:]:
        match = heading.match(text)
        if match:
            parsed.append({'name': match.group(2), 'content': text})

    def sort_key(item):
        # Bucket prefix '0'/'1'/'2' orders digits < letters < other symbols.
        name = item['name']
        first_char = name[0].lower() if name else ''
        if first_char.isdigit():
            return '0' + first_char
        elif first_char.isalpha():
            return '1' + first_char
        else:
            return '2' + first_char

    parsed.sort(key=sort_key)

    new_content = header + '\n' + ''.join(sec['content'] + '\n' for sec in parsed)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(new_content)

    print(f'Sorted {len(parsed)} sections')
    print('First 10:')
    for s in parsed[:10]:
        print(f' {s["name"]}')
    return 0
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user