first_test

Co-authored-by: Copilot <copilot@github.com>
2026-04-26 19:26:17 +08:00
commit b34239f5ea
29 changed files with 138300 additions and 0 deletions
--- a/source/batch_ai_summary_from_report
+++ b/source/batch_ai_summary_from_report
@@ -0,0 +1,573 @@
+#!/usr/bin/env python3
+"""Batch AI summary from existing UP markdown report.
+
+Read an existing report (e.g. source/up_analysis_report.md),
+extract each UP's title list, and generate AI summaries in batches.
+"""
+
+from __future__ import annotations
+
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import json
+import math
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from urllib import request
+
+# Fill your Volcengine Ark settings here.
+VOLCENGINE_API_KEY = "586d443c-5034-4810-9760-50ce77394e8a"
+VOLCENGINE_MODEL = "deepseek-v3-1-terminus"
+VOLCENGINE_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
+
+SKIP_MARKERS = {
+    "",
+    "测试模式已跳过AI分析",
+    "（待分析）",
+}
+
+# 预设分组及关键词规则（可自行扩展）。
+PRESET_GROUPS: dict[str, list[str]] = {
+    "AAA_核心每日必读":[
+        "编程", "算法", "工程", "干货", "新闻", "趋势", 
+    ],
+    "AA_编程信息干货必留": [
+        "编程", "算法", "工程", "教程", "实战", "课程", "新技术", "开源", "工具", "效率", "技术", "架构",
+    ],
+    "A_硬核知识保留": [
+        "科普", "数学", "物理", "编程", "算法", "工程", "历史", "新闻", "深度",
+    ],
+    "B_技能学习保留": [
+        "英语", "四六级", "考研", "面试", "教程", "实战", "学习", "课程", "写作",
+    ],
+    "C_资讯快餐观察": [
+        "热点", "速览", "信息差", "快报", "盘点", "吐槽", "观点", "趋势",
+    ],
+    "D_娱乐消遣可取关": [
+        "搞笑", "整活", "抽象", "乐子", "娱乐", "段子", "鬼畜", "日常", "情侣",
+    ],
+    "E_营销带货谨慎": [
+        "好物", "测评", "种草", "直播", "带货", "优惠", "开箱", "广告", "激活",
+    ],
+}
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="基于现有报告分批做AI总结")
+    parser.add_argument(
+        "--input-report",
+        default="source/output/reports/up_titles_report.md",
+        help="已有标题报告路径，默认: source/output/reports/up_titles_report.md",
+    )
+    parser.add_argument(
+        "--output-report",
+        default="source/output/reports/up_analysis_full_auto.md",
+        help="输出报告路径，默认: source/output/reports/up_analysis_full_auto.md",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=20,
+        help="每批处理数量，默认: 20",
+    )
+    parser.add_argument(
+        "--batch-index",
+        type=int,
+        default=1,
+        help="批次序号（从1开始），默认: 1",
+    )
+    parser.add_argument(
+        "--sleep-seconds",
+        type=float,
+        default=0.0,
+        help="提交任务间隔秒数，默认: 0（并发模式建议0）",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=4,
+        help="并发请求数，默认: 4",
+    )
+    parser.add_argument(
+        "--max-retries",
+        type=int,
+        default=2,
+        help="单个UP分析最大重试次数，默认: 2",
+    )
+    parser.add_argument(
+        "--request-timeout",
+        type=float,
+        default=60.0,
+        help="单次AI请求超时秒数，默认: 60",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="强制覆盖已有AI分析（默认只处理待分析项）",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="输出调试信息",
+    )
+    parser.add_argument(
+        "--config-from",
+        default="source/analyze_up_content.py",
+        help="自动读取API配置的脚本路径，默认: source/analyze_up_content.py",
+    )
+    parser.add_argument(
+        "--run-all-batches",
+        action="store_true",
+        help="自动连续跑完所有批次（忽略batch-index）",
+    )
+    return parser.parse_args()
+
+
+def load_api_config_from_script(path: Path) -> dict[str, str]:
+    if not path.exists():
+        return {}
+    text = path.read_text(encoding="utf-8", errors="replace")
+    result: dict[str, str] = {}
+    for key in ("VOLCENGINE_API_KEY", "VOLCENGINE_MODEL", "VOLCENGINE_BASE_URL"):
+        m = re.search(rf"^{key}\s*=\s*\"([^\"]*)\"", text, flags=re.MULTILINE)
+        if m:
+            result[key] = m.group(1).strip()
+    return result
+
+
+def parse_report(path: Path) -> list[dict[str, Any]]:
+    lines = path.read_text(encoding="utf-8").splitlines()
+
+    items: list[dict[str, Any]] = []
+    current: dict[str, Any] | None = None
+    section = ""
+
+    for line in lines:
+        m = re.match(r"^##\s+\d+\.\s+(.*?)\s+\(mid:\s*(\d+)\)", line)
+        if m:
+            if current is not None:
+                items.append(current)
+            mid = int(m.group(2))
+            current = {
+                "mid": mid,
+                "name": m.group(1).strip(),
+                "tag": [],
+                "url": f"https://space.bilibili.com/{mid}/video",
+                "titles": [],
+                "analysis": "",
+                "error": "",
+            }
+            section = ""
+            continue
+
+        if current is None:
+            continue
+
+        if line.startswith("- 主页: "):
+            current["url"] = line.replace("- 主页: ", "", 1).strip()
+            continue
+        if line.startswith("- 标签: "):
+            raw = line.replace("- 标签: ", "", 1).strip()
+            current["tag"] = [] if raw in ("", "无") else [x.strip() for x in raw.split(",") if x.strip()]
+            continue
+
+        if line == "### 最近10条标题":
+            section = "titles"
+            continue
+        if line == "### AI分析":
+            section = "analysis"
+            continue
+        if line == "### 异常":
+            section = "error"
+            continue
+        if line.startswith("### "):
+            section = ""
+            continue
+
+        if section == "titles" and line.startswith("- "):
+            text = line[2:].strip()
+            if text and text != "（未抓取到标题）":
+                current["titles"].append(text)
+        elif section == "analysis" and line.strip():
+            current["analysis"] = (current["analysis"] + "\n" + line.strip()).strip()
+        elif section == "error" and line.startswith("- "):
+            current["error"] = line[2:].strip()
+
+    if current is not None:
+        items.append(current)
+
+    return items
+
+
+def call_volcengine_chat(
+    system_prompt: str,
+    user_prompt: str,
+    cfg: dict[str, str],
+    timeout: float,
+) -> str:
+    api_key = cfg.get("VOLCENGINE_API_KEY", "").strip()
+    model = cfg.get("VOLCENGINE_MODEL", "").strip()
+    base_url = cfg.get("VOLCENGINE_BASE_URL", "").strip()
+
+    if (not api_key) or ("在这里填" in api_key):
+        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_API_KEY")
+    if (not model) or ("在这里填" in model):
+        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_MODEL")
+    if not base_url:
+        raise RuntimeError("请先在脚本顶部填写 VOLCENGINE_BASE_URL")
+
+    payload = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        "temperature": 0.4,
+    }
+
+    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
+    req = request.Request(
+        f"{base_url.rstrip('/')}/chat/completions",
+        data=body,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {api_key}",
+        },
+        method="POST",
+    )
+
+    with request.urlopen(req, timeout=timeout) as resp:
+        text = resp.read().decode("utf-8", errors="replace")
+
+    data = json.loads(text)
+    content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+    if not isinstance(content, str) or not content.strip():
+        raise RuntimeError(f"AI响应异常: {text[:500]}")
+    return content.strip()
+
+
+def summarize_one_up(
+    name: str,
+    mid: int,
+    titles: list[str],
+    tags: list[str],
+    cfg: dict[str, str],
+    timeout: float,
+) -> dict[str, str]:
+    system_prompt = (
+        "你是内容定位与订阅决策助手。"
+        "你必须输出合法JSON，不要输出其它文本。"
+    )
+    joined_titles = "\n".join(f"- {t}" for t in titles)
+    joined_tags = "、".join(tags) if tags else "无"
+    rule_hint = heuristic_group_hint(titles, tags)
+    groups_desc = "\n".join(f"- {k}" for k in PRESET_GROUPS)
+
+    user_prompt = f"""
+请基于以下信息完成分组与总结。
+
+UP主: {name}
+mid: {mid}
+标签: {joined_tags}
+最近标题:
+{joined_titles}
+
+预设分组:
+{groups_desc}
+
+代码规则初判:
+{rule_hint}
+
+要求:
+1) 输出JSON对象，字段严格为: summary, group, action, reason。
+2) summary: 一段中文总结，50-100字。
+3) group: 必须从预设分组里选一个。给出详细的分组类别和命中分组中的规则词。
+4) action: 只能是"保留关注"或"可以取关"。敏感一点，只保留真正核心优质的up，其他都建议取关。
+5) reason: 30-60字，解释为什么分到该组并给出该动作。
+""".strip()
+
+    content = call_volcengine_chat(system_prompt, user_prompt, cfg, timeout=timeout)
+    return parse_ai_json(content)
+
+
+def parse_ai_json(content: str) -> dict[str, str]:
+    text = content.strip()
+    if text.startswith("```"):
+        text = re.sub(r"^```[a-zA-Z]*\n?", "", text)
+        text = re.sub(r"\n?```$", "", text).strip()
+    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
+    if m:
+        text = m.group(0)
+    data = json.loads(text)
+    summary = str(data.get("summary", "")).strip()
+    group = str(data.get("group", "")).strip()
+    action = str(data.get("action", "")).strip()
+    reason = str(data.get("reason", "")).strip()
+    if not summary:
+        raise RuntimeError("AI返回缺少summary")
+    if group not in PRESET_GROUPS:
+        raise RuntimeError(f"AI返回未知group: {group}")
+    if action not in ("保留关注", "可以取关"):
+        raise RuntimeError(f"AI返回未知action: {action}")
+    if not reason:
+        reason = "基于标题内容与更新风格综合判断。"
+    return {
+        "summary": summary,
+        "group": group,
+        "action": action,
+        "reason": reason,
+    }
+
+
+def heuristic_group_hint(titles: list[str], tags: list[str]) -> str:
+    text = "\n".join(titles) + "\n" + " ".join(tags)
+    score: dict[str, int] = {k: 0 for k in PRESET_GROUPS}
+    lower_text = text.lower()
+    for group, words in PRESET_GROUPS.items():
+        for w in words:
+            w_lower = w.lower()
+            if w_lower in lower_text:
+                score[group] += 1
+    ranked = sorted(score.items(), key=lambda x: x[1], reverse=True)
+    best_group, best_score = ranked[0]
+    if best_score <= 0:
+        return "未命中关键词，倾向按内容专业度与稳定性判断。"
+    top3 = ", ".join(f"{g}:{s}" for g, s in ranked[:3])
+    return f"关键词命中最高组={best_group}（score={best_score}），参考分布: {top3}"
+
+
+def summarize_one_up_with_retry(
+    item: dict[str, Any],
+    cfg: dict[str, str],
+    max_retries: int,
+    timeout: float,
+    debug: bool,
+) -> dict[str, str]:
+    last_exc: Exception | None = None
+    total_try = max(1, max_retries)
+    for attempt in range(1, total_try + 1):
+        try:
+            return summarize_one_up(
+                item["name"],
+                item["mid"],
+                item.get("titles", []),
+                item.get("tag", []),
+                cfg,
+                timeout=timeout,
+            )
+        except Exception as exc:  # noqa: BLE001
+            last_exc = exc
+            if debug:
+                print(f"[debug] {item['name']} 第{attempt}次失败: {exc}")
+            if attempt < total_try:
+                time.sleep(min(2.0, 0.5 * attempt))
+    raise RuntimeError(str(last_exc) if last_exc else "未知错误")
+
+
+def build_report(items: list[dict[str, Any]], batch_note: str) -> str:
+    now = time.strftime("%Y-%m-%d %H:%M:%S")
+    lines: list[str] = [
+        "# UP主内容分析报告（分批AI总结）",
+        "",
+        f"- 生成时间: {now}",
+        f"- 分析数量: {len(items)}",
+        f"- 处理说明: {batch_note}",
+        "",
+    ]
+
+    group_stats: dict[str, int] = {k: 0 for k in PRESET_GROUPS}
+    action_stats: dict[str, int] = {"保留关注": 0, "可以取关": 0}
+    for item in items:
+        g = item.get("group", "")
+        a = item.get("action", "")
+        if g in group_stats:
+            group_stats[g] += 1
+        if a in action_stats:
+            action_stats[a] += 1
+
+    lines.append("## 分组统计")
+    lines.append("")
+    for g, c in group_stats.items():
+        lines.append(f"- {g}: {c}")
+    lines.append(f"- 保留关注: {action_stats['保留关注']}")
+    lines.append(f"- 可以取关: {action_stats['可以取关']}")
+    lines.append("")
+
+    for idx, item in enumerate(items, start=1):
+        lines.append(f"## {idx}. {item['name']} (mid: {item['mid']})")
+        lines.append("")
+        lines.append(f"- 主页: {item['url']}")
+        tags = item.get("tag", [])
+        lines.append(f"- 标签: {', '.join(tags) if tags else '无'}")
+        lines.append("")
+        lines.append("### 最近10条标题")
+        lines.append("")
+        titles = item.get("titles", [])
+        if titles:
+            for t in titles:
+                lines.append(f"- {t}")
+        else:
+            lines.append("- （未抓取到标题）")
+        lines.append("")
+
+        lines.append("### AI分析")
+        lines.append("")
+        analysis = item.get("analysis", "")
+        lines.append(analysis if analysis else "（待分析）")
+        lines.append("")
+
+        lines.append("### 分组建议")
+        lines.append("")
+        group = item.get("group", "")
+        action = item.get("action", "")
+        reason = item.get("group_reason", "")
+        if group and action:
+            lines.append(f"- 预设分组: {group}")
+            lines.append(f"- 建议动作: {action}")
+            lines.append(f"- 判断依据: {reason if reason else '基于标题与更新风格综合判断。'}")
+        else:
+            lines.append("- （待分组）")
+        lines.append("")
+
+        error = item.get("error", "")
+        if error:
+            lines.append("### 异常")
+            lines.append("")
+            lines.append(f"- {error}")
+            lines.append("")
+
+    return "\n".join(lines).rstrip() + "\n"
+
+
+def main() -> int:
+    args = parse_args()
+    input_report = Path(args.input_report)
+    output_report = Path(args.output_report)
+
+    if not input_report.exists():
+        print(f"输入报告不存在: {input_report}", file=sys.stderr)
+        return 1
+
+    items = parse_report(input_report)
+    if not items:
+        print("输入报告未解析出任何UP条目", file=sys.stderr)
+        return 1
+
+    config = {
+        "VOLCENGINE_API_KEY": VOLCENGINE_API_KEY,
+        "VOLCENGINE_MODEL": VOLCENGINE_MODEL,
+        "VOLCENGINE_BASE_URL": VOLCENGINE_BASE_URL,
+    }
+    if ("在这里填" in config["VOLCENGINE_API_KEY"]) or ("在这里填" in config["VOLCENGINE_MODEL"]):
+        inherited = load_api_config_from_script(Path(args.config_from))
+        if inherited:
+            config.update(inherited)
+
+    if args.force:
+        pending = [it for it in items if it.get("titles")]
+    else:
+        pending = [
+            it for it in items
+            if it.get("titles") and it.get("analysis", "").strip() in SKIP_MARKERS
+        ]
+
+    if not pending:
+        print("没有待分析条目，直接输出当前报告")
+        output_report.write_text(build_report(items, "无待分析条目"), encoding="utf-8")
+        return 0
+
+    index_map = {f"{it['mid']}::{it['name']}": idx for idx, it in enumerate(items)}
+    success_total = 0
+    failed_total = 0
+
+    batch_size = max(1, args.batch_size)
+    if args.run_all_batches:
+        total_batches = math.ceil(len(pending) / batch_size)
+        batch_indexes = list(range(1, total_batches + 1))
+        print(f"自动连续模式: 共{total_batches}批, 待分析总数{len(pending)}")
+    else:
+        batch_indexes = [max(1, args.batch_index)]
+
+    workers = max(1, args.workers)
+    print(f"并发配置: workers={workers}, retries={max(1, args.max_retries)}, timeout={args.request_timeout}s")
+
+    for batch_index in batch_indexes:
+        start = (batch_index - 1) * batch_size
+        end = start + batch_size
+        batch = pending[start:end]
+        if not batch:
+            continue
+
+        print(
+            f"开始分批AI总结: 第{batch_index}批, 每批{batch_size}条, "
+            f"本批{len(batch)}条, 待分析总数{len(pending)}"
+        )
+
+        success = 0
+        failed = 0
+        future_to_item: dict[Any, dict[str, Any]] = {}
+        with ThreadPoolExecutor(max_workers=workers) as executor:
+            for i, it in enumerate(batch, start=1):
+                print(f"[submit {i}/{len(batch)}] {it['name']} ({it['mid']})")
+                future = executor.submit(
+                    summarize_one_up_with_retry,
+                    it,
+                    config,
+                    max(1, args.max_retries),
+                    float(args.request_timeout),
+                    args.debug,
+                )
+                future_to_item[future] = it
+                if args.sleep_seconds > 0:
+                    time.sleep(args.sleep_seconds)
+
+            done_count = 0
+            for future in as_completed(future_to_item):
+                done_count += 1
+                it = future_to_item[future]
+                idx = index_map.get(f"{it['mid']}::{it['name']}")
+                try:
+                    ai_res = future.result()
+                    if idx is not None:
+                        items[idx]["analysis"] = ai_res["summary"]
+                        items[idx]["group"] = ai_res["group"]
+                        items[idx]["action"] = ai_res["action"]
+                        items[idx]["group_reason"] = ai_res["reason"]
+                        items[idx]["error"] = ""
+                    success += 1
+                    print(f"[done {done_count}/{len(batch)}] 成功: {it['name']} ({it['mid']})")
+                except Exception as exc:  # noqa: BLE001
+                    if idx is not None:
+                        items[idx]["error"] = str(exc)
+                    failed += 1
+                    print(f"[done {done_count}/{len(batch)}] 失败: {it['name']} ({it['mid']})")
+                    if args.debug:
+                        print(f"[debug] 失败详情: {exc}")
+
+        success_total += success
+        failed_total += failed
+
+        step_note = (
+            f"第{batch_index}批完成: 成功{success}, 失败{failed}, "
+            f"本批{len(batch)}, 待分析总数{len(pending)}"
+        )
+        output_report.parent.mkdir(parents=True, exist_ok=True)
+        output_report.write_text(build_report(items, step_note), encoding="utf-8")
+        print(f"第{batch_index}批写入完成: {output_report}")
+
+    mode_text = "自动连续" if args.run_all_batches else "单批"
+    note = (
+        f"{mode_text}模式完成: 成功{success_total}, 失败{failed_total}, "
+        f"处理批次数={len(batch_indexes)}, 待分析总数={len(pending)}"
+    )
+    output_report.parent.mkdir(parents=True, exist_ok=True)
+    output_report.write_text(build_report(items, note), encoding="utf-8")
+    print(f"输出完成: {output_report}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())