Files
bili_follow_group/source/batch_ai_summary_from_report copy.py
digouyou b34239f5ea first_test
Co-authored-by: Copilot <copilot@github.com>
2026-04-26 19:26:17 +08:00

574 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Batch AI summary from existing UP markdown report.
Read an existing report (e.g. source/up_analysis_report.md),
extract each UP's title list, and generate AI summaries in batches.
"""
from __future__ import annotations
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import math
import re
import sys
import time
from pathlib import Path
from typing import Any
from urllib import request
# Fill your Volcengine Ark settings here.
VOLCENGINE_API_KEY = "586d443c-5034-4810-9760-50ce77394e8a"
VOLCENGINE_MODEL = "deepseek-v3-1-terminus"
VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
SKIP_MARKERS = {
"",
"测试模式已跳过AI分析",
"(待分析)",
}
# 预设分组及关键词规则(可自行扩展)。
PRESET_GROUPS: dict[str, list[str]] = {
"AAA_核心每日必读":[
"编程", "算法", "工程", "干货", "新闻", "趋势",
],
"AA_编程信息干货必留": [
"编程", "算法", "工程", "教程", "实战", "课程", "新技术", "开源", "工具", "效率", "技术", "架构",
],
"A_硬核知识保留": [
"科普", "数学", "物理", "编程", "算法", "工程", "历史", "新闻", "深度",
],
"B_技能学习保留": [
"英语", "四六级", "考研", "面试", "教程", "实战", "学习", "课程", "写作",
],
"C_资讯快餐观察": [
"热点", "速览", "信息差", "快报", "盘点", "吐槽", "观点", "趋势",
],
"D_娱乐消遣可取关": [
"搞笑", "整活", "抽象", "乐子", "娱乐", "段子", "鬼畜", "日常", "情侣",
],
"E_营销带货谨慎": [
"好物", "测评", "种草", "直播", "带货", "优惠", "开箱", "广告", "激活",
],
}
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="基于现有报告分批做AI总结")
parser.add_argument(
"--input-report",
default="source/output/reports/up_titles_report.md",
help="已有标题报告路径,默认: source/output/reports/up_titles_report.md",
)
parser.add_argument(
"--output-report",
default="source/output/reports/up_analysis_full_auto.md",
help="输出报告路径,默认: source/output/reports/up_analysis_full_auto.md",
)
parser.add_argument(
"--batch-size",
type=int,
default=20,
help="每批处理数量,默认: 20",
)
parser.add_argument(
"--batch-index",
type=int,
default=1,
help="批次序号从1开始默认: 1",
)
parser.add_argument(
"--sleep-seconds",
type=float,
default=0.0,
help="提交任务间隔秒数,默认: 0并发模式建议0",
)
parser.add_argument(
"--workers",
type=int,
default=4,
help="并发请求数,默认: 4",
)
parser.add_argument(
"--max-retries",
type=int,
default=2,
help="单个UP分析最大重试次数默认: 2",
)
parser.add_argument(
"--request-timeout",
type=float,
default=60.0,
help="单次AI请求超时秒数默认: 60",
)
parser.add_argument(
"--force",
action="store_true",
help="强制覆盖已有AI分析默认只处理待分析项",
)
parser.add_argument(
"--debug",
action="store_true",
help="输出调试信息",
)
parser.add_argument(
"--config-from",
default="source/analyze_up_content.py",
help="自动读取API配置的脚本路径默认: source/analyze_up_content.py",
)
parser.add_argument(
"--run-all-batches",
action="store_true",
help="自动连续跑完所有批次忽略batch-index",
)
return parser.parse_args()
def load_api_config_from_script(path: Path) -> dict[str, str]:
if not path.exists():
return {}
text = path.read_text(encoding="utf-8", errors="replace")
result: dict[str, str] = {}
for key in ("VOLCENGINE_API_KEY", "VOLCENGINE_MODEL", "VOLCENGINE_BASE_URL"):
m = re.search(rf"^{key}\s*=\s*\"([^\"]*)\"", text, flags=re.MULTILINE)
if m:
result[key] = m.group(1).strip()
return result
def parse_report(path: Path) -> list[dict[str, Any]]:
lines = path.read_text(encoding="utf-8").splitlines()
items: list[dict[str, Any]] = []
current: dict[str, Any] | None = None
section = ""
for line in lines:
m = re.match(r"^##\s+\d+\.\s+(.*?)\s+\(mid:\s*(\d+)\)", line)
if m:
if current is not None:
items.append(current)
mid = int(m.group(2))
current = {
"mid": mid,
"name": m.group(1).strip(),
"tag": [],
"url": f"https://space.bilibili.com/{mid}/video",
"titles": [],
"analysis": "",
"error": "",
}
section = ""
continue
if current is None:
continue
if line.startswith("- 主页: "):
current["url"] = line.replace("- 主页: ", "", 1).strip()
continue
if line.startswith("- 标签: "):
raw = line.replace("- 标签: ", "", 1).strip()
current["tag"] = [] if raw in ("", "") else [x.strip() for x in raw.split(",") if x.strip()]
continue
if line == "### 最近10条标题":
section = "titles"
continue
if line == "### AI分析":
section = "analysis"
continue
if line == "### 异常":
section = "error"
continue
if line.startswith("### "):
section = ""
continue
if section == "titles" and line.startswith("- "):
text = line[2:].strip()
if text and text != "(未抓取到标题)":
current["titles"].append(text)
elif section == "analysis" and line.strip():
current["analysis"] = (current["analysis"] + "\n" + line.strip()).strip()
elif section == "error" and line.startswith("- "):
current["error"] = line[2:].strip()
if current is not None:
items.append(current)
return items
def call_volcengine_chat(
system_prompt: str,
user_prompt: str,
cfg: dict[str, str],
timeout: float,
) -> str:
api_key = cfg.get("VOLCENGINE_API_KEY", "").strip()
model = cfg.get("VOLCENGINE_MODEL", "").strip()
base_url = cfg.get("VOLCENGINE_BASE_URL", "").strip()
if (not api_key) or ("在这里填" in api_key):
raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_API_KEY")
if (not model) or ("在这里填" in model):
raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_MODEL")
if not base_url:
raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_BASE_URL")
payload = {
"model": model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"temperature": 0.4,
}
body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
req = request.Request(
f"{base_url.rstrip('/')}/chat/completions",
data=body,
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}",
},
method="POST",
)
with request.urlopen(req, timeout=timeout) as resp:
text = resp.read().decode("utf-8", errors="replace")
data = json.loads(text)
content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
if not isinstance(content, str) or not content.strip():
raise RuntimeError(f"AI响应异常: {text[:500]}")
return content.strip()
def summarize_one_up(
name: str,
mid: int,
titles: list[str],
tags: list[str],
cfg: dict[str, str],
timeout: float,
) -> dict[str, str]:
system_prompt = (
"你是内容定位与订阅决策助手。"
"你必须输出合法JSON不要输出其它文本。"
)
joined_titles = "\n".join(f"- {t}" for t in titles)
joined_tags = "".join(tags) if tags else ""
rule_hint = heuristic_group_hint(titles, tags)
groups_desc = "\n".join(f"- {k}" for k in PRESET_GROUPS)
user_prompt = f"""
请基于以下信息完成分组与总结。
UP主: {name}
mid: {mid}
标签: {joined_tags}
最近标题:
{joined_titles}
预设分组:
{groups_desc}
代码规则初判:
{rule_hint}
要求:
1) 输出JSON对象字段严格为: summary, group, action, reason。
2) summary: 一段中文总结50-100字。
3) group: 必须从预设分组里选一个。给出详细的分组类别和命中分组中的规则词。
4) action: 只能是"保留关注""可以取关"。敏感一点只保留真正核心优质的up其他都建议取关。
5) reason: 30-60字解释为什么分到该组并给出该动作。
""".strip()
content = call_volcengine_chat(system_prompt, user_prompt, cfg, timeout=timeout)
return parse_ai_json(content)
def parse_ai_json(content: str) -> dict[str, str]:
text = content.strip()
if text.startswith("```"):
text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
text = re.sub(r"\n?```$", "", text).strip()
m = re.search(r"\{.*\}", text, flags=re.DOTALL)
if m:
text = m.group(0)
data = json.loads(text)
summary = str(data.get("summary", "")).strip()
group = str(data.get("group", "")).strip()
action = str(data.get("action", "")).strip()
reason = str(data.get("reason", "")).strip()
if not summary:
raise RuntimeError("AI返回缺少summary")
if group not in PRESET_GROUPS:
raise RuntimeError(f"AI返回未知group: {group}")
if action not in ("保留关注", "可以取关"):
raise RuntimeError(f"AI返回未知action: {action}")
if not reason:
reason = "基于标题内容与更新风格综合判断。"
return {
"summary": summary,
"group": group,
"action": action,
"reason": reason,
}
def heuristic_group_hint(titles: list[str], tags: list[str]) -> str:
text = "\n".join(titles) + "\n" + " ".join(tags)
score: dict[str, int] = {k: 0 for k in PRESET_GROUPS}
lower_text = text.lower()
for group, words in PRESET_GROUPS.items():
for w in words:
w_lower = w.lower()
if w_lower in lower_text:
score[group] += 1
ranked = sorted(score.items(), key=lambda x: x[1], reverse=True)
best_group, best_score = ranked[0]
if best_score <= 0:
return "未命中关键词,倾向按内容专业度与稳定性判断。"
top3 = ", ".join(f"{g}:{s}" for g, s in ranked[:3])
return f"关键词命中最高组={best_group}score={best_score}),参考分布: {top3}"
def summarize_one_up_with_retry(
item: dict[str, Any],
cfg: dict[str, str],
max_retries: int,
timeout: float,
debug: bool,
) -> dict[str, str]:
last_exc: Exception | None = None
total_try = max(1, max_retries)
for attempt in range(1, total_try + 1):
try:
return summarize_one_up(
item["name"],
item["mid"],
item.get("titles", []),
item.get("tag", []),
cfg,
timeout=timeout,
)
except Exception as exc: # noqa: BLE001
last_exc = exc
if debug:
print(f"[debug] {item['name']}{attempt}次失败: {exc}")
if attempt < total_try:
time.sleep(min(2.0, 0.5 * attempt))
raise RuntimeError(str(last_exc) if last_exc else "未知错误")
def build_report(items: list[dict[str, Any]], batch_note: str) -> str:
now = time.strftime("%Y-%m-%d %H:%M:%S")
lines: list[str] = [
"# UP主内容分析报告分批AI总结",
"",
f"- 生成时间: {now}",
f"- 分析数量: {len(items)}",
f"- 处理说明: {batch_note}",
"",
]
group_stats: dict[str, int] = {k: 0 for k in PRESET_GROUPS}
action_stats: dict[str, int] = {"保留关注": 0, "可以取关": 0}
for item in items:
g = item.get("group", "")
a = item.get("action", "")
if g in group_stats:
group_stats[g] += 1
if a in action_stats:
action_stats[a] += 1
lines.append("## 分组统计")
lines.append("")
for g, c in group_stats.items():
lines.append(f"- {g}: {c}")
lines.append(f"- 保留关注: {action_stats['保留关注']}")
lines.append(f"- 可以取关: {action_stats['可以取关']}")
lines.append("")
for idx, item in enumerate(items, start=1):
lines.append(f"## {idx}. {item['name']} (mid: {item['mid']})")
lines.append("")
lines.append(f"- 主页: {item['url']}")
tags = item.get("tag", [])
lines.append(f"- 标签: {', '.join(tags) if tags else ''}")
lines.append("")
lines.append("### 最近10条标题")
lines.append("")
titles = item.get("titles", [])
if titles:
for t in titles:
lines.append(f"- {t}")
else:
lines.append("- (未抓取到标题)")
lines.append("")
lines.append("### AI分析")
lines.append("")
analysis = item.get("analysis", "")
lines.append(analysis if analysis else "(待分析)")
lines.append("")
lines.append("### 分组建议")
lines.append("")
group = item.get("group", "")
action = item.get("action", "")
reason = item.get("group_reason", "")
if group and action:
lines.append(f"- 预设分组: {group}")
lines.append(f"- 建议动作: {action}")
lines.append(f"- 判断依据: {reason if reason else '基于标题与更新风格综合判断。'}")
else:
lines.append("- (待分组)")
lines.append("")
error = item.get("error", "")
if error:
lines.append("### 异常")
lines.append("")
lines.append(f"- {error}")
lines.append("")
return "\n".join(lines).rstrip() + "\n"
def main() -> int:
args = parse_args()
input_report = Path(args.input_report)
output_report = Path(args.output_report)
if not input_report.exists():
print(f"输入报告不存在: {input_report}", file=sys.stderr)
return 1
items = parse_report(input_report)
if not items:
print("输入报告未解析出任何UP条目", file=sys.stderr)
return 1
config = {
"VOLCENGINE_API_KEY": VOLCENGINE_API_KEY,
"VOLCENGINE_MODEL": VOLCENGINE_MODEL,
"VOLCENGINE_BASE_URL": VOLCENGINE_BASE_URL,
}
if ("在这里填" in config["VOLCENGINE_API_KEY"]) or ("在这里填" in config["VOLCENGINE_MODEL"]):
inherited = load_api_config_from_script(Path(args.config_from))
if inherited:
config.update(inherited)
if args.force:
pending = [it for it in items if it.get("titles")]
else:
pending = [
it for it in items
if it.get("titles") and it.get("analysis", "").strip() in SKIP_MARKERS
]
if not pending:
print("没有待分析条目,直接输出当前报告")
output_report.write_text(build_report(items, "无待分析条目"), encoding="utf-8")
return 0
index_map = {f"{it['mid']}::{it['name']}": idx for idx, it in enumerate(items)}
success_total = 0
failed_total = 0
batch_size = max(1, args.batch_size)
if args.run_all_batches:
total_batches = math.ceil(len(pending) / batch_size)
batch_indexes = list(range(1, total_batches + 1))
print(f"自动连续模式: 共{total_batches}批, 待分析总数{len(pending)}")
else:
batch_indexes = [max(1, args.batch_index)]
workers = max(1, args.workers)
print(f"并发配置: workers={workers}, retries={max(1, args.max_retries)}, timeout={args.request_timeout}s")
for batch_index in batch_indexes:
start = (batch_index - 1) * batch_size
end = start + batch_size
batch = pending[start:end]
if not batch:
continue
print(
f"开始分批AI总结: 第{batch_index}批, 每批{batch_size}条, "
f"本批{len(batch)}条, 待分析总数{len(pending)}"
)
success = 0
failed = 0
future_to_item: dict[Any, dict[str, Any]] = {}
with ThreadPoolExecutor(max_workers=workers) as executor:
for i, it in enumerate(batch, start=1):
print(f"[submit {i}/{len(batch)}] {it['name']} ({it['mid']})")
future = executor.submit(
summarize_one_up_with_retry,
it,
config,
max(1, args.max_retries),
float(args.request_timeout),
args.debug,
)
future_to_item[future] = it
if args.sleep_seconds > 0:
time.sleep(args.sleep_seconds)
done_count = 0
for future in as_completed(future_to_item):
done_count += 1
it = future_to_item[future]
idx = index_map.get(f"{it['mid']}::{it['name']}")
try:
ai_res = future.result()
if idx is not None:
items[idx]["analysis"] = ai_res["summary"]
items[idx]["group"] = ai_res["group"]
items[idx]["action"] = ai_res["action"]
items[idx]["group_reason"] = ai_res["reason"]
items[idx]["error"] = ""
success += 1
print(f"[done {done_count}/{len(batch)}] 成功: {it['name']} ({it['mid']})")
except Exception as exc: # noqa: BLE001
if idx is not None:
items[idx]["error"] = str(exc)
failed += 1
print(f"[done {done_count}/{len(batch)}] 失败: {it['name']} ({it['mid']})")
if args.debug:
print(f"[debug] 失败详情: {exc}")
success_total += success
failed_total += failed
step_note = (
f"{batch_index}批完成: 成功{success}, 失败{failed}, "
f"本批{len(batch)}, 待分析总数{len(pending)}"
)
output_report.parent.mkdir(parents=True, exist_ok=True)
output_report.write_text(build_report(items, step_note), encoding="utf-8")
print(f"{batch_index}批写入完成: {output_report}")
mode_text = "自动连续" if args.run_all_batches else "单批"
note = (
f"{mode_text}模式完成: 成功{success_total}, 失败{failed_total}, "
f"处理批次数={len(batch_indexes)}, 待分析总数={len(pending)}"
)
output_report.parent.mkdir(parents=True, exist_ok=True)
output_report.write_text(build_report(items, note), encoding="utf-8")
print(f"输出完成: {output_report}")
return 0
if __name__ == "__main__":
raise SystemExit(main())