From 4f9e5e0d191794b2595ea00b58d0e9db7a8da869 Mon Sep 17 00:00:00 2001 From: HAHAJN <14188146+HAHAJN@user.noreply.gitee.com> Date: Sun, 31 May 2026 15:18:17 +0800 Subject: [PATCH] feat: add data-flow-skill --- README.md | 13 +- data/last_update.txt | 2 +- data/local_skills.json | 30 +- public/data/last_update.txt | 2 +- public/data/local_skills.json | 30 +- .../data-flow-skill/data-flow-skill/README.md | 53 + .../data-flow-skill/data-flow-skill/SKILL.md | 266 +++ .../data-flow-skill/references/data-types.md | 21 + .../data-flow-skill/references/reporting.md | 19 + .../data-flow-skill/references/slides.md | 23 + .../data-flow-skill/references/validation.md | 26 + .../references/visualization.md | 24 + .../data-flow-skill/references/workflow.md | 16 + .../data-flow-skill/requirements.txt | 6 + .../data-flow-skill/scripts/README.md | 34 + .../scripts/analysis/__init__.py | 2 + .../analysis/analyzer_data_understanding.py | 145 ++ .../analysis/analyzer_dataset_detection.py | 195 +++ .../analysis/analyzer_findings_generation.py | 1503 +++++++++++++++++ .../analysis/analyzer_preprocessing.py | 61 + .../analysis/analyzer_statistical_analysis.py | 36 + .../analysis/analyzer_strategy_registry.py | 81 + .../analysis/analyzer_time_series_analysis.py | 56 + .../scripts/image_gen/image_generator.py | 202 +++ .../scripts/mermaid/__init__.py | 5 + .../data-flow-skill/scripts/mermaid/cli.py | 117 ++ .../scripts/mermaid/echarts_export.py | 239 +++ .../scripts/mermaid/flowchart.py | 314 ++++ .../visualization/matplotlib/bar_memevolve.py | 113 ++ .../visualization/matplotlib/bar_spice.py | 169 ++ .../visualization/matplotlib/box_plot.py | 113 ++ .../visualization/matplotlib/bubble_chart.py | 149 ++ .../matplotlib/calendar_heatmap.py | 208 +++ .../visualization/matplotlib/line_aime.py | 98 ++ .../matplotlib/line_loss_inset.py | 161 ++ .../matplotlib/line_selfdistill.py | 173 ++ .../matplotlib/parallel_coordinates.py | 173 ++ .../visualization/matplotlib/radar_dora.py | 155 ++ 
.../visualization/matplotlib/scatter_break.py | 173 ++ .../visualization/matplotlib/scatter_tsne.py | 137 ++ .../visualization/matplotlib/stacked_bar.py | 121 ++ .../visualization/matplotlib/violin_plot.py | 139 ++ ...00\350\203\275\346\270\205\345\215\225.md" | 51 +- 43 files changed, 5607 insertions(+), 47 deletions(-) create mode 100644 skills/data-flow-skill/data-flow-skill/README.md create mode 100644 skills/data-flow-skill/data-flow-skill/SKILL.md create mode 100644 skills/data-flow-skill/data-flow-skill/references/data-types.md create mode 100644 skills/data-flow-skill/data-flow-skill/references/reporting.md create mode 100644 skills/data-flow-skill/data-flow-skill/references/slides.md create mode 100644 skills/data-flow-skill/data-flow-skill/references/validation.md create mode 100644 skills/data-flow-skill/data-flow-skill/references/visualization.md create mode 100644 skills/data-flow-skill/data-flow-skill/references/workflow.md create mode 100644 skills/data-flow-skill/data-flow-skill/requirements.txt create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/README.md create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/__init__.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_data_understanding.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_dataset_detection.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_findings_generation.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_preprocessing.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_statistical_analysis.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_strategy_registry.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_time_series_analysis.py create mode 100644 
skills/data-flow-skill/data-flow-skill/scripts/image_gen/image_generator.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/mermaid/__init__.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/mermaid/cli.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/mermaid/echarts_export.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/mermaid/flowchart.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_memevolve.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_spice.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/box_plot.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bubble_chart.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/calendar_heatmap.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_aime.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_loss_inset.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_selfdistill.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/parallel_coordinates.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/radar_dora.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_break.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_tsne.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/stacked_bar.py create mode 100644 skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/violin_plot.py diff --git a/README.md b/README.md index 8b20ab9..cf5ab33 100644 --- 
a/README.md +++ b/README.md @@ -3,15 +3,15 @@ 收录最全、更新最快的AI Agent技能库,涵盖**文档处理、内容创作、编程开发、机器学习、自动化工作流**等多个领域的精选技能包。 [![官方技能](https://img.shields.io/badge/官方技能-182-blue?style=flat-square)](https://github.com/anbeime/skill) -[![本地技能](https://img.shields.io/badge/本地技能-61-green?style=flat-square)](https://github.com/anbeime/skill) +[![本地技能](https://img.shields.io/badge/本地技能-62-green?style=flat-square)](https://github.com/anbeime/skill) [![备份覆盖](https://img.shields.io/badge/备份覆盖-100%25-success?style=flat-square)](https://github.com/anbeime/skill) [![自动更新](https://img.shields.io/badge/更新-每24小时-orange?style=flat-square)](https://github.com/anbeime/skill) ## 📊 统计数据 - **官方技能**: 182 个(来自 awesome-agent-skills,自动爬取) -- **本地技能**: 61 个(25核心 + 30子技能 + 6系统内置) -- **技能总数**: 243 个(官方 + 本地) +- **本地技能**: 62 个(26核心 + 30子技能 + 6系统内置) +- **技能总数**: 244 个(官方 + 本地) - **备份覆盖率**: 100%(71个压缩包,完整备份) - **自动更新**: 每24小时自动爬取最新技能 @@ -22,7 +22,7 @@ ### 📦 双重技能库 - **官方技能**:182个来自Anthropic、Vercel、Cloudflare、Google Labs、Hugging Face等顶级团队 -- **本地技能**:61个精选中文技能,涵盖内容创作、视频制作、电商营销等垂直领域 +- **本地技能**:62个精选中文技能,涵盖内容创作、视频制作、电商营销、数据分析等垂直领域 ### 🏷️ 智能分类 按照功能、来源、Star数量等多维度标签进行分类整理。 @@ -47,7 +47,7 @@ - **Better Auth** (3个) - best-practices, commands, create-auth - **其他团队** - Tinybird, Remotion, Inngest 等 -## 💾 本地技能库(61个) +## 💾 本地技能库(62个) ### 📝 内容创作与发布(10个) - **content-creation-publisher** ⭐⭐⭐⭐⭐ - 内容创作与发布全流程 @@ -99,7 +99,8 @@ - **dream-video-prompt-generator** ⭐⭐ - 即梦视频提示词 - **agentkit-multimedia-shopping** ⭐⭐ - 多媒体带货视频 -### 📄 文档与分析(4个) +### 📄 文档与分析(5个) +- **data-flow-skill** ⭐⭐⭐⭐⭐ - 数据分析全流程:数据类型检测、统计分析、可视化、报告与幻灯片生成 - **paper-analysis-assistant** ⭐⭐⭐⭐ - arXiv论文分析 - **contract-review** ⭐⭐⭐ - 合同审核 - **law-to-markdown** ⭐⭐ - 法律文档转换 diff --git a/data/last_update.txt b/data/last_update.txt index 6f1fcba..22bec20 100644 --- a/data/last_update.txt +++ b/data/last_update.txt @@ -1 +1 @@ -2026-02-02T17:07:33.826738 \ No newline at end of file +2026-05-31T00:00:00 diff --git a/data/local_skills.json b/data/local_skills.json index 
1714c8d..82baa23 100644 --- a/data/local_skills.json +++ b/data/local_skills.json @@ -1,10 +1,10 @@ { "metadata": { - "total_skills": 61, - "core_skills": 25, + "total_skills": 62, + "core_skills": 26, "sub_skills": 30, "system_builtin": 6, - "last_updated": "2026-02-11T15:30:00", + "last_updated": "2026-05-31T00:00:00", "location": "D:\\tool\\skills", "backup_location": "D:\\tool\\skills\\skill压缩文件", "backup_coverage": "100%", @@ -80,12 +80,13 @@ ] }, "文档与分析": { - "count": 4, + "count": 5, "skills": [ "paper-analysis-assistant", "contract-review", "law-to-markdown", - "stock-analysis" + "stock-analysis", + "data-flow-skill" ] }, "智能体协作": { @@ -148,10 +149,10 @@ "by_api_requirement": { "必需API": 15, "可选API": 20, - "完全免费": 26 + "完全免费": 27 }, "by_usage_frequency": { - "⭐⭐⭐⭐⭐": 10, + "⭐⭐⭐⭐⭐": 11, "⭐⭐⭐⭐": 18, "⭐⭐⭐": 21, "⭐⭐": 12 @@ -188,10 +189,25 @@ "category": "数字人与视频配音", "rating": 5, "description": "音频驱动的稀疏帧视频配音工具,支持音频驱动的Video-to-Video和Image-to-Video" + }, + { + "name": "data-flow-skill", + "category": "文档与分析", + "rating": 5, + "description": "数据分析全流程 Agent Skill,覆盖数据类型检测、预处理、统计分析、可视化、报告和幻灯片生成" } ] }, "update_log": [ + { + "date": "2026-05-31", + "version": "2.1", + "changes": [ + "新增 data-flow-skill:数据类型检测、统计分析、可视化、报告和幻灯片生成一体化技能", + "本地技能总数从61个增加到62个", + "文档与分析分类从4个增加到5个" + ] + }, { "date": "2026-02-11", "version": "2.0", diff --git a/public/data/last_update.txt b/public/data/last_update.txt index 6f1fcba..22bec20 100644 --- a/public/data/last_update.txt +++ b/public/data/last_update.txt @@ -1 +1 @@ -2026-02-02T17:07:33.826738 \ No newline at end of file +2026-05-31T00:00:00 diff --git a/public/data/local_skills.json b/public/data/local_skills.json index 1714c8d..82baa23 100644 --- a/public/data/local_skills.json +++ b/public/data/local_skills.json @@ -1,10 +1,10 @@ { "metadata": { - "total_skills": 61, - "core_skills": 25, + "total_skills": 62, + "core_skills": 26, "sub_skills": 30, "system_builtin": 6, - "last_updated": "2026-02-11T15:30:00", + "last_updated": 
"2026-05-31T00:00:00", "location": "D:\\tool\\skills", "backup_location": "D:\\tool\\skills\\skill压缩文件", "backup_coverage": "100%", @@ -80,12 +80,13 @@ ] }, "文档与分析": { - "count": 4, + "count": 5, "skills": [ "paper-analysis-assistant", "contract-review", "law-to-markdown", - "stock-analysis" + "stock-analysis", + "data-flow-skill" ] }, "智能体协作": { @@ -148,10 +149,10 @@ "by_api_requirement": { "必需API": 15, "可选API": 20, - "完全免费": 26 + "完全免费": 27 }, "by_usage_frequency": { - "⭐⭐⭐⭐⭐": 10, + "⭐⭐⭐⭐⭐": 11, "⭐⭐⭐⭐": 18, "⭐⭐⭐": 21, "⭐⭐": 12 @@ -188,10 +189,25 @@ "category": "数字人与视频配音", "rating": 5, "description": "音频驱动的稀疏帧视频配音工具,支持音频驱动的Video-to-Video和Image-to-Video" + }, + { + "name": "data-flow-skill", + "category": "文档与分析", + "rating": 5, + "description": "数据分析全流程 Agent Skill,覆盖数据类型检测、预处理、统计分析、可视化、报告和幻灯片生成" } ] }, "update_log": [ + { + "date": "2026-05-31", + "version": "2.1", + "changes": [ + "新增 data-flow-skill:数据类型检测、统计分析、可视化、报告和幻灯片生成一体化技能", + "本地技能总数从61个增加到62个", + "文档与分析分类从4个增加到5个" + ] + }, { "date": "2026-02-11", "version": "2.0", diff --git a/skills/data-flow-skill/data-flow-skill/README.md b/skills/data-flow-skill/data-flow-skill/README.md new file mode 100644 index 0000000..3fdabbd --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/README.md @@ -0,0 +1,53 @@ +# data-flow-skill + +面向数据分析任务的一站式 Agent Skill,覆盖“数据类型检测 → 数据理解 → 预处理 → 统计分析 → 可视化图表生成 → 正式报告生成 → 幻灯片生成”的端到端链路。 + +## 适用场景 + +- 课程作业/论文实验的可复现分析管线 +- CSV、Excel、JSON、文本语料等数据集的探索性分析 +- 问卷、量表、时间序列、文学语料等专项分析 +- SEO/GEO、内容表现、营销数据和业务指标报告 +- 报告与幻灯片的结构化生成 + +## 核心特性 + +- 数据类型检测与策略分派:`tabular_generic`、`questionnaire`、`time_series`、`literary` +- 结构化产物契约:检测、画像、预处理日志、图表计划、分析发现和报告上下文 +- 可视化规划:围绕问题选择趋势、分布、比较、关系、构成和异常类图表 +- 报告与幻灯片生成:从结构化发现中组织正式报告和演示材料 +- 质量约束:子任务拆分、源数据保护、预处理留痕、结论证据化 +- Python 脚本支持:`scripts/analysis/`、`scripts/visualization/`、`scripts/mermaid/`、`scripts/image_gen/` + +## 脚本与依赖 + +可复用脚本已迁移到 `scripts/` 目录。运行前建议安装依赖: + +```bash +pip install -r requirements.txt +``` + +详细说明见 `scripts/README.md`。 + +## 推荐输出结构 
+ +```text +output/ + figures/ + tables/ + report/ + slides/ + artifacts/ + dataset_detection.json + data_profile.json + preprocessing_log.json + visualization_plan.json + analysis_findings.json + report_context.json +``` + +## 使用方式 + +在 Agent 环境中,当用户提供数据文件并要求分析、绘图、报告或幻灯片时,调用本 Skill。正式分析前应先生成 `plan.md` 并等待用户确认。 + +详细流程见 `SKILL.md` 与 `references/` 目录。 \ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/SKILL.md b/skills/data-flow-skill/data-flow-skill/SKILL.md new file mode 100644 index 0000000..674c37e --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/SKILL.md @@ -0,0 +1,266 @@ +--- +name: data-flow-skill +description: 面向数据分析任务的一站式 Agent Skill,覆盖数据类型检测、数据理解、预处理、统计分析、可视化图表生成、正式报告和幻灯片输出。适用于课程作业、论文实验、业务分析、SEO/GEO 数据报告和可追溯数据分析工作流。 +dependency: + python: + - pandas>=2.0.0 + - numpy>=1.24.0 + - matplotlib>=3.7.0 + - scipy>=1.10.0 + optional: + - Node.js 与 @mermaid-js/mermaid-cli 用于 Mermaid 本地渲染 + - LaTeX 用于报告或幻灯片 PDF 编译 + - DASHSCOPE_API_KEY 用于主题示意图生成 +--- + +# Data Flow Skill + +## 任务目标 + +本 Skill 用于将用户提供的数据集转化为可追溯的数据分析产物,覆盖“数据类型检测 → 数据理解 → 预处理 → 统计分析 → 可视化图表 → 报告 → 幻灯片”的端到端流程。 + +适用场景包括: + +- 课程作业或论文实验中的可复现数据分析 +- CSV、Excel、JSON、文本语料等数据集的探索性分析 +- 问卷、量表、时间序列和文学语料的专项分析 +- SEO/GEO、内容表现、营销数据和业务指标报告 +- 将分析结果整理为正式报告、PPT 或演示材料 + +## 触发条件 + +当用户提出以下需求时使用本 Skill: + +- “分析这个数据集” +- “根据 CSV/Excel 生成图表和报告” +- “帮我判断数据类型并做统计分析” +- “把分析结果整理成论文/课程报告” +- “根据数据生成 PPT 或幻灯片” +- “分析 SEO/GEO 指标、Search Console 导出或业务表现数据” + +如果用户需要实时网页抓取、SERP 采集、API 拉取或站点爬虫,应先使用相应的数据收集技能,再使用本 Skill 进行分析。 + +## 核心能力 + +1. **数据类型检测**:识别 `tabular_generic`、`questionnaire`、`time_series`、`literary` 四类主要数据策略。 +2. **数据画像**:检查字段类型、缺失值、重复值、异常值、时间覆盖、类别分布和指标含义。 +3. **透明预处理**:保留源数据,不静默修改文件,所有清洗动作写入日志。 +4. **分任务分析**:将数据处理、统计分析、图表生成和发现提炼拆成可检查的小任务。 +5. **可视化生成**:根据问题选择趋势、分布、比较、关系、构成和异常类图表。 +6. **结构化发现**:为每条发现记录证据、来源、限制、置信度和建议动作。 +7. **报告与幻灯片**:从结构化产物生成正式报告和 slide-ready 输出。 + +## 标准工作流 + +### 1. 
确认任务上下文 + +先确认以下信息: + +- 数据文件或目录路径 +- 分析目标和要回答的问题 +- 受众:课程、论文、业务汇报、SEO 团队或管理层 +- 交付形式:探索性分析、图表包、正式报告、PPT 或全部产物 +- 语言、风格和格式要求 +- 是否允许自动预处理、是否有字段含义或指标公式说明 + +### 2. 检测数据类型 + +对输入数据进行策略识别,并将结果写入: + +```text +output/artifacts/dataset_detection.json +``` + +检测结果应包含: + +- `strategy`:主策略 +- `confidence`:置信度 +- `evidence`:判断依据 +- `alternatives`:备选策略 +- `assumptions`:假设 +- `fallback_plan`:回退方案 + +### 3. 制定并确认计划 + +正式分析前生成 `plan.md`,内容包括: + +- 分析目标 +- 数据摘要 +- 数据类型策略 +- 开放问题和假设 +- 预处理规则 +- 分析任务拆分 +- 可视化计划 +- 预期输出 +- 风险与校验点 + +在用户确认计划前,不进入正式分析、报告生成或幻灯片生成。 + +### 4. 数据理解与画像 + +读取数据后生成: + +```text +output/artifacts/data_profile.json +``` + +至少检查: + +- 文件类型、编码和解析问题 +- 行数、列数、字段类型 +- 缺失值、重复值、异常值 +- 数值范围和类别分布 +- 时间字段、时间粒度和覆盖范围 +- 指标定义、单位和方向 + +### 5. 透明预处理 + +不得直接覆盖原始数据。所有预处理动作写入: + +```text +output/artifacts/preprocessing_log.json +``` + +常见动作包括: + +- 字段名标准化 +- 日期解析 +- 数值格式转换 +- 缺失值处理 +- 重复记录处理 +- 类别归一化 +- 派生指标计算 + +### 6. 分策略分析 + +- `tabular_generic`:描述统计、分组比较、相关性、异常点、业务含义。 +- `questionnaire`:量表方向、选项分布、组间差异、开放题归纳、信度检查。 +- `time_series`:趋势、季节性、峰值、下降点、同比/环比、异常时段。 +- `literary`:篇章结构、人物/地点/主题、词频、共现关系、情绪和风格特征。 + +### 7. 可视化规划与生成 + +先写入: + +```text +output/artifacts/visualization_plan.json +``` + +每张图需要说明: + +- 图表标题 +- 回答的问题 +- 输入数据 +- 变量和筛选条件 +- 图表类型 +- 选择原因 +- 输出路径 +- 解读要点 + +图表输出到: + +```text +output/figures/ +``` + +### 8. 生成结构化发现 + +将结论写入: + +```text +output/artifacts/analysis_findings.json +``` + +每条发现应包含: + +- 结论 claim +- 证据 evidence +- 来源图表或表格路径 +- 适用范围 scope +- 限制 limitation +- 置信度 confidence +- 建议动作 recommendation + +### 9. 
生成报告和幻灯片 + +报告应基于结构化产物,而不是重新从原始数据开始分析。建议报告结构: + +- 执行摘要 +- 数据来源与质量说明 +- 方法与预处理说明 +- 关键指标与趋势 +- 分组/策略分析 +- 图表证据 +- 结论与建议 +- 局限性和附录 + +幻灯片应基于 `analysis_findings.json` 和 `report_context.json`,围绕受众、演示目标、叙事主线和关键图表组织。 + +## 输出目录约定 + +推荐输出结构: + +```text +output/ + figures/ + tables/ + report/ + slides/ + artifacts/ + dataset_detection.json + data_profile.json + preprocessing_log.json + visualization_plan.json + analysis_findings.json + report_context.json +``` + +## 质量检查 + +交付前确认: + +- 数据类型已检测并记录 +- `plan.md` 已经用户确认 +- 字段含义、时间窗口和指标方向已明确或列为假设 +- 源数据未被静默修改 +- 预处理动作已记录 +- 分析任务已拆分 +- 图表有明确问题和解读 +- 发现包含证据、限制和置信度 +- 报告和幻灯片基于结构化产物 +- 结论没有超出数据证据 + +## 资源索引 + +- 工作流说明:见 [references/workflow.md](references/workflow.md) +- 数据类型策略:见 [references/data-types.md](references/data-types.md) +- 可视化规范:见 [references/visualization.md](references/visualization.md) +- 报告生成规范:见 [references/reporting.md](references/reporting.md) +- 幻灯片生成规范:见 [references/slides.md](references/slides.md) +- 质量校验清单:见 [references/validation.md](references/validation.md) + +## 使用示例 + +用户请求: + +```text +请分析这个 Google Search Console 导出的 CSV,并生成 SEO 表现报告和几张关键图表。 +``` + +执行摘要: + +1. 确认文件路径、日期范围、站点、受众和报告格式。 +2. 检测数据类型为 `tabular_generic` 或 `time_series`。 +3. 创建 `plan.md` 并等待确认。 +4. 分析 query、page、country、device、clicks、impressions、CTR 和 average position。 +5. 检查缺失值、时间覆盖、重复记录和分组覆盖。 +6. 生成趋势图、页面贡献图、查询机会图和设备/国家对比图。 +7. 
保存结构化发现并生成正式报告。 + +## 注意事项 + +- 不要把描述性相关关系写成因果结论。 +- 不要在未说明的情况下修改源数据。 +- 不要用一个脚本完成读取、清洗、分析、绘图和报告的所有步骤。 +- 不要堆叠图表而不写解释。 +- 对 SEO/GEO 数据,应区分“数据观察”“可能解释”“优化建议”和“需要额外爬取/审计验证的事项”。 diff --git a/skills/data-flow-skill/data-flow-skill/references/data-types.md b/skills/data-flow-skill/data-flow-skill/references/data-types.md new file mode 100644 index 0000000..5229318 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/references/data-types.md @@ -0,0 +1,21 @@ +# Dataset Type Reference + +## `tabular_generic` + +Use for structured CSV, TSV, XLSX, JSON tables, analytics exports, keyword sheets, ranking tables, content inventories, and business metrics. + +## `questionnaire` + +Use for surveys, scales, feedback forms, and mixed closed/open-ended responses. Confirm scale direction, response coding, skip logic, and grouping variables. + +## `time_series` + +Use when date/time is central: daily metrics, ranking trends, traffic logs, revenue by period, or repeated observations. Confirm time zone, granularity, gaps, and period comparability. + +## `literary` + +Use for novels, poems, scripts, dialogues, essays, lyrics, and other text corpora. Confirm corpus boundaries, metadata, segmentation units, and interpretation scope. + +## Detection Artifact + +`dataset_detection.json` should record `strategy`, `confidence`, `evidence`, `alternatives`, `assumptions`, and `fallback_plan`. \ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/references/reporting.md b/skills/data-flow-skill/data-flow-skill/references/reporting.md new file mode 100644 index 0000000..4f11621 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/references/reporting.md @@ -0,0 +1,19 @@ +# Reporting Reference + +Reports must be assembled from validated artifacts, not from untracked memory. + +Recommended structure: + +1. Executive summary. +2. Objective, audience, and scope. +3. Data sources and quality notes. +4. Methodology and preprocessing summary. +5. 
Key metrics and descriptive statistics. +6. Segment, trend, comparison, or model results. +7. Visual evidence with interpretations. +8. Findings, recommendations, limitations, and next questions. +9. Appendix with artifact paths, formulas, and assumptions. + +Each finding should include claim, evidence, artifact path, scope, limitation, confidence level, and recommended action. + +For SEO/GEO reporting, separate observed data, plausible explanations, optimization opportunities, and follow-up work requiring crawl, SERP, rank, content, or authority review. \ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/references/slides.md b/skills/data-flow-skill/data-flow-skill/references/slides.md new file mode 100644 index 0000000..4210d88 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/references/slides.md @@ -0,0 +1,23 @@ +# Slides Reference + +Slides should summarize validated findings for a specific audience and decision moment. + +Before creating slides, derive or save `output/artifacts/report_context.json` with audience, presentation goal, narrative arc, key messages, figure/table references, known limitations, and speaker-note preference. + +Recommended deck shape: + +1. Title and decision question. +2. Executive takeaway. +3. Data scope and method in one slide. +4. Three to five evidence slides with clear chart references. +5. Recommendation or action-priority slide. +6. Risks, limitations, and next steps. +7. Appendix for detailed tables or definitions. + +Rules: + +- Do not restart analysis from raw data unless asked. +- Use `analysis_findings.json` as source of truth. +- Keep one primary message per slide. +- Pair charts with a takeaway and optional speaker notes. +- Do not hide caveats that affect interpretation. 
\ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/references/validation.md b/skills/data-flow-skill/data-flow-skill/references/validation.md new file mode 100644 index 0000000..bdaa0da --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/references/validation.md @@ -0,0 +1,26 @@ +# Validation Reference + +Before handoff, verify: + +- Dataset strategy selected and documented. +- User confirmed `plan.md` before formal analysis. +- Field semantics, metric definitions, time windows, and scale directions are confirmed or listed as assumptions. +- Source data preserved. +- Preprocessing actions logged. +- Analysis split into small, inspectable tasks. +- Visualization plan created before final chart generation. +- Key charts include interpretation and limitations. +- Findings include claim, evidence, artifact path, scope, limitation, confidence, and action. +- Reports and slides are built from structured artifacts. +- Claims do not exceed evidence. +- Output paths are predictable and included in the handoff summary. + +Common failure modes: + +- Treating descriptive correlations as causal proof. +- Cleaning data without recording changes. +- Running one monolithic script for the whole workflow. +- Generating chart galleries without narrative. +- Ignoring scale direction in questionnaire data. +- Comparing time periods with different coverage or leakage. +- Mixing live web collection into analysis without explicit permission. \ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/references/visualization.md b/skills/data-flow-skill/data-flow-skill/references/visualization.md new file mode 100644 index 0000000..d641b97 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/references/visualization.md @@ -0,0 +1,24 @@ +# Visualization Reference + +Create `output/artifacts/visualization_plan.json` before final chart generation. + +Each chart should define: + +- Title and question answered. 
+- Input artifact or table. +- Variables and filters. +- Chart type and reason for choosing it. +- Output path. +- Interpretation notes and limitations. + +Chart families: + +- Overview: KPI cards, summary tables, bars. +- Trend: line charts, rolling averages, annotated events. +- Distribution: histograms, box plots, violin plots. +- Comparison: grouped bars, dot plots, slope charts. +- Relationship: scatter plots, bubble charts, heatmaps. +- Composition: stacked bars, treemaps, area charts. +- Anomaly: highlighted outliers and before/after panels. + +Every key chart needs a plain-language takeaway. Avoid chart galleries without narrative. \ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/references/workflow.md b/skills/data-flow-skill/data-flow-skill/references/workflow.md new file mode 100644 index 0000000..20d186a --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/references/workflow.md @@ -0,0 +1,16 @@ +# Workflow Reference + +Data Flow Skill follows this stage order: + +1. Confirm dataset path, task goal, audience, deliverables, language, and style. +2. Detect dataset strategy and save `output/artifacts/dataset_detection.json`. +3. Draft `plan.md` and wait for user confirmation. +4. Profile raw data and save `output/artifacts/data_profile.json`. +5. Preprocess transparently and log changes. +6. Run analysis in small, inspectable task units. +7. Create `output/artifacts/visualization_plan.json` before final charts. +8. Save evidence-backed findings to `analysis_findings.json`. +9. Build reports and slides from validated artifacts. +10. End with a concise handoff summary. + +`plan.md` should include objective, audience, dataset summary, detected strategy, assumptions, preprocessing plan, analysis tasks, visualization outline, expected outputs, risks, and validation checkpoints. 
\ No newline at end of file diff --git a/skills/data-flow-skill/data-flow-skill/requirements.txt b/skills/data-flow-skill/data-flow-skill/requirements.txt new file mode 100644 index 0000000..9c95b6a --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/requirements.txt @@ -0,0 +1,6 @@ +pandas>=2.0.0 +numpy>=1.24.0 +matplotlib>=3.7.0 +scipy>=1.10.0 +scikit-learn>=1.3.0 +requests>=2.31.0 diff --git a/skills/data-flow-skill/data-flow-skill/scripts/README.md b/skills/data-flow-skill/data-flow-skill/scripts/README.md new file mode 100644 index 0000000..30bff1d --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/README.md @@ -0,0 +1,34 @@ +# Scripts + +本目录包含从原 dataflow 项目迁移来的可复用 Python 脚本。 + +## 目录结构 + +```text +scripts/ + analysis/ # 数据检测、画像、预处理、统计分析、发现生成 + visualization/matplotlib/ # Matplotlib 静态图模板 + mermaid/ # Mermaid 流程图生成与渲染辅助 + image_gen/ # 主题示意图生成 CLI +``` + +## 安装依赖 + +在 `data-flow-skill` 目录下运行: + +```bash +pip install -r requirements.txt +``` + +可选依赖: + +- Node.js 与 `@mermaid-js/mermaid-cli`:用于 Mermaid 本地渲染。 +- LaTeX:用于启用 `text.usetex` 的 Matplotlib 模板或 PDF 编译。 +- `DASHSCOPE_API_KEY`:用于 `image_gen/image_generator.py` 调用图片生成服务。 + +## 使用约束 + +- 不要用一个脚本完成读取、清洗、分析、绘图和报告的完整链路。 +- 每个脚本应服务一个明确子任务。 +- 运行脚本前确认输入路径和输出路径。 +- 输出建议写入 `output/` 下的对应子目录。 diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/__init__.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/__init__.py new file mode 100644 index 0000000..5896671 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/__init__.py @@ -0,0 +1,2 @@ +"""分析阶段脚本模块。""" + diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_data_understanding.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_data_understanding.py new file mode 100644 index 0000000..4e81811 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_data_understanding.py @@ -0,0 +1,145 @@ +"""Dataset profiling helpers.""" + 
+from __future__ import annotations + +from dataclasses import asdict, dataclass +from datetime import datetime +from typing import Any + + +TARGET_MARKERS = ("target", "label", "\u76ee\u6807") +CLUSTER_MARKERS = ("cluster", "\u805a\u7c7b") +DIMENSION_REDUCTION_MARKERS = ("dimension reduction", "\u964d\u7ef4") + + +@dataclass(slots=True) +class FieldSummary: + field_name: str + inferred_type: str + missing_rate: float + is_candidate_target: bool = False + is_candidate_group: bool = False + is_candidate_time: bool = False + is_high_cardinality: bool = False + + +def is_missing_value(value: Any) -> bool: + if value is None: + return True + if isinstance(value, str) and value.strip() == "": + return True + return False + + +def infer_type(sample_values: list[Any]) -> str: + valid_values = [value for value in sample_values if not is_missing_value(value)] + if not valid_values: + return "unknown" + + if all(isinstance(value, bool) for value in valid_values): + return "boolean" + + if all(isinstance(value, (int, float)) and not isinstance(value, bool) for value in valid_values): + return "numeric" + + normalized_values = [str(value).strip() for value in valid_values] + if all(text.lower() in {"true", "false", "yes", "no", "0", "1"} for text in normalized_values): + return "boolean" + + numeric_parse_count = 0 + datetime_parse_count = 0 + for text in normalized_values: + try: + float(text) + numeric_parse_count += 1 + except ValueError: + pass + for date_format in ("%Y-%m-%d", "%Y/%m/%d", "%Y-%m", "%Y/%m", "%d/%m/%Y"): + try: + datetime.strptime(text, date_format) + datetime_parse_count += 1 + break + except ValueError: + continue + + if numeric_parse_count == len(normalized_values): + return "numeric" + if datetime_parse_count >= max(1, len(normalized_values) // 2): + return "datetime" + if len(set(normalized_values)) <= max(12, len(normalized_values) // 3): + return "categorical" + return "text" + + +def build_data_profile( + sample_rows: list[dict[str, Any]] | None = 
None, + task_description: str = "", +) -> dict[str, Any]: + sample_rows = sample_rows or [] + if not sample_rows: + return { + "schema": [], + "data_profile": { + "row_count": 0, + "column_count": 0, + "field_summaries": [], + "missing_rate_overview": {}, + "constant_columns": [], + "high_missing_fields": [], + "analysis_opportunities": [], + }, + } + + field_names = list(sample_rows[0].keys()) + row_count = len(sample_rows) + schema: list[dict[str, Any]] = [] + constant_columns: list[str] = [] + high_missing_fields: list[str] = [] + + for field_name in field_names: + column_values = [row.get(field_name) for row in sample_rows] + missing_count = sum(1 for value in column_values if is_missing_value(value)) + non_missing_values = [value for value in column_values if not is_missing_value(value)] + inferred_type = infer_type(column_values) + unique_value_count = len({str(value) for value in non_missing_values}) + lower_field_name = field_name.lower() + + if non_missing_values and unique_value_count == 1: + constant_columns.append(field_name) + if row_count and missing_count / row_count >= 0.3: + high_missing_fields.append(field_name) + + summary = FieldSummary( + field_name=field_name, + inferred_type=inferred_type, + missing_rate=round(missing_count / row_count, 4) if row_count else 0.0, + is_candidate_target=any(marker in lower_field_name or marker in field_name for marker in TARGET_MARKERS), + is_candidate_group=inferred_type == "categorical", + is_candidate_time=inferred_type == "datetime", + is_high_cardinality=inferred_type == "categorical" and unique_value_count >= max(20, row_count // 5), + ) + schema.append(asdict(summary)) + + analysis_opportunities: list[str] = [] + if any(field["inferred_type"] == "datetime" for field in schema): + analysis_opportunities.append("Trend analysis is available.") + if sum(field["inferred_type"] == "numeric" for field in schema) >= 2: + analysis_opportunities.append("Correlation analysis is available.") + if any(marker in 
task_description.lower() for marker in CLUSTER_MARKERS) or any( + marker in task_description.lower() or marker in task_description + for marker in DIMENSION_REDUCTION_MARKERS + ): + analysis_opportunities.append("The task explicitly requests clustering or dimensionality reduction.") + + return { + "schema": schema, + "data_profile": { + "row_count": row_count, + "column_count": len(field_names), + "field_summaries": schema, + "missing_rate_overview": {field["field_name"]: field["missing_rate"] for field in schema}, + "constant_columns": constant_columns, + "high_missing_fields": high_missing_fields, + "analysis_opportunities": analysis_opportunities, + }, + } diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_dataset_detection.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_dataset_detection.py new file mode 100644 index 0000000..889b916 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_dataset_detection.py @@ -0,0 +1,195 @@ +"""Dataset type detection helpers.""" + +from __future__ import annotations + +import csv +import re +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + + +SURVEY_COLUMN_PATTERN = re.compile( + r"(\u6ee1\u610f|\u95ee\u5377|\u9898\u76ee|\u8bc4\u5206|\u5206\u503c|\u5efa\u8bae|\u53cd\u9988|likert|score|survey|question|q\d+|\u662f\u5426|\u5e74\u7ea7|\u73ed\u7ea7|\u90e8\u95e8)", + re.IGNORECASE, +) +TIME_COLUMN_PATTERN = re.compile( + r"(date|time|year|month|day|week|timestamp|\u65e5\u671f|\u65f6\u95f4|\u5e74\u6708|\u5b63\u5ea6|\u5468|\u65f6\u70b9)", + re.IGNORECASE, +) +LITERARY_TITLE_PATTERN = re.compile( + r"(\u7b2c[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u96f6\u3007\u4e24\d]+\u56de|\u7b2c[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u96f6\u3007\u4e24\d]+\u7ae0|\u5bf9\u4e0b\u8054|\u4e0a\u8054|\u4e0b\u8054)" +) +POETRY_PATTERN = re.compile( + 
r"(\u4e03\u5f8b|\u4e94\u5f8b|\u7edd\u53e5|\u8bcd\u724c|\u6d63\u6eaa\u6c99|\u6c34\u8c03\u6b4c\u5934|\u9e67\u9e2a\u5929|\u8776\u604b\u82b1)" +) +SURVEY_VALUE_MARKERS = ( + "\u6ee1\u610f", + "\u4e00\u822c", + "\u4e0d\u540c\u610f", + "\u540c\u610f", + "\u975e\u5e38", + "\u662f", + "\u5426", + "\u7537", + "\u5973", +) +COUPLET_MARKERS = ("\u5bf9\u4e0b\u8054", "\u4e0a\u8054", "\u4e0b\u8054") +SALES_MARKERS = ("\u9500\u91cf", "\u9500\u552e", "price", "amount") + + +@dataclass(slots=True) +class DetectionResult: + primary_type: str + subtype: str | None + confidence: float + fallback_type: str + signals: list[str] + file_extension: str + detected_strategy: str + + +def read_text_sample(data_path: Path, max_characters: int = 4000) -> str: + raw_bytes = data_path.read_bytes()[: max_characters * 4] + for encoding in ("utf-8", "gb18030", "gbk", "big5"): + try: + return raw_bytes.decode(encoding) + except UnicodeDecodeError: + continue + return raw_bytes.decode("utf-8", errors="ignore") + + +def read_table_sample(data_path: Path, max_rows: int = 30) -> tuple[list[str], list[dict[str, Any]]]: + if data_path.suffix.lower() != ".csv": + return [], [] + with data_path.open("r", encoding="utf-8", newline="") as file: + reader = csv.DictReader(file) + rows: list[dict[str, Any]] = [] + for row in reader: + rows.append(dict(row)) + if len(rows) >= max_rows: + break + return reader.fieldnames or [], rows + + +def is_low_cardinality_survey_column(sample_rows: list[dict[str, Any]], column_name: str) -> bool: + values = {str(row.get(column_name, "")).strip() for row in sample_rows if str(row.get(column_name, "")).strip()} + if not values: + return False + if all(value in {"1", "2", "3", "4", "5", "6", "7"} for value in values): + return True + if len(values) <= 7 and any(marker in value for value in values for marker in SURVEY_VALUE_MARKERS): + return True + return False + + +def detect_dataset(data_path: str, task_description: str = "") -> dict[str, Any]: + path = Path(data_path) + 
extension = path.suffix.lower() + signals: list[str] = [] + + if extension in {".txt", ".md", ".jsonl"}: + text_sample = read_text_sample(path) + if LITERARY_TITLE_PATTERN.search(text_sample): + subtype = "novel_or_couplet" + if any(marker in text_sample for marker in COUPLET_MARKERS): + subtype = "couplet" + signals.append("The text contains couplet markers.") + elif POETRY_PATTERN.search(text_sample): + subtype = "poetry" + signals.append("The text contains poetry-form clues.") + else: + signals.append("The text contains literary chapter markers.") + result = DetectionResult( + primary_type="literary", + subtype=subtype, + confidence=0.9, + fallback_type="unknown", + signals=signals, + file_extension=extension, + detected_strategy="literary", + ) + return asdict(result) + + result = DetectionResult( + primary_type="unknown", + subtype=None, + confidence=0.45, + fallback_type="tabular_generic", + signals=["The text file does not expose clear literary structure cues."], + file_extension=extension, + detected_strategy="tabular_generic", + ) + return asdict(result) + + if extension in {".csv", ".xlsx"}: + headers, sample_rows = read_table_sample(path) + survey_match_count = sum(bool(SURVEY_COLUMN_PATTERN.search(header)) for header in headers) + time_match_columns = [header for header in headers if TIME_COLUMN_PATTERN.search(header)] + low_cardinality_survey_columns = ( + sum(is_low_cardinality_survey_column(sample_rows, header) for header in headers) if sample_rows else 0 + ) + long_text_column_count = 0 + if sample_rows: + for header in headers: + max_length = max((len(str(row.get(header, "")).strip()) for row in sample_rows), default=0) + if max_length >= 24: + long_text_column_count += 1 + + if survey_match_count >= 2 or low_cardinality_survey_columns >= max(3, len(headers) // 3): + signals.extend( + [ + f"Matched {survey_match_count} survey-related header columns.", + f"Detected {low_cardinality_survey_columns} low-cardinality survey columns in samples.", + ] + ) 
+ if long_text_column_count: + signals.append(f"Detected {long_text_column_count} possible free-text response columns.") + result = DetectionResult( + primary_type="questionnaire", + subtype="mixed_questionnaire" if long_text_column_count else "structured_questionnaire", + confidence=0.88, + fallback_type="tabular_generic", + signals=signals, + file_extension=extension, + detected_strategy="questionnaire", + ) + return asdict(result) + + if time_match_columns: + signals.append(f"Detected candidate time columns: {time_match_columns}") + if any(marker in header.lower() or marker in header for header in headers for marker in SALES_MARKERS): + signals.append("Candidate time columns co-occur with numeric business metric columns.") + result = DetectionResult( + primary_type="time_series", + subtype="tabular_time_series", + confidence=0.82, + fallback_type="tabular_generic", + signals=signals, + file_extension=extension, + detected_strategy="time_series", + ) + return asdict(result) + + signals.append("Detected a standard table file without strong questionnaire or time-series signals.") + result = DetectionResult( + primary_type="tabular_generic", + subtype="generic_spreadsheet", + confidence=0.72, + fallback_type="tabular_generic", + signals=signals, + file_extension=extension, + detected_strategy="tabular_generic", + ) + return asdict(result) + + result = DetectionResult( + primary_type="unknown", + subtype=None, + confidence=0.3, + fallback_type="tabular_generic", + signals=[f"Unsupported extension encountered: {extension or 'no_extension'}"], + file_extension=extension, + detected_strategy="tabular_generic", + ) + return asdict(result) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_findings_generation.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_findings_generation.py new file mode 100644 index 0000000..d8d7357 --- /dev/null +++ 
b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_findings_generation.py @@ -0,0 +1,1503 @@ +import re +import sys +import copy +import types +import inspect +import keyword +import builtins +import functools +import itertools +import abc +import _thread +from types import FunctionType, GenericAlias + + +__all__ = ['dataclass', + 'field', + 'Field', + 'FrozenInstanceError', + 'InitVar', + 'KW_ONLY', + 'MISSING', + + # Helper functions. + 'fields', + 'asdict', + 'astuple', + 'make_dataclass', + 'replace', + 'is_dataclass', + ] + +# Conditions for adding methods. The boxes indicate what action the +# dataclass decorator takes. For all of these tables, when I talk +# about init=, repr=, eq=, order=, unsafe_hash=, or frozen=, I'm +# referring to the arguments to the @dataclass decorator. When +# checking if a dunder method already exists, I mean check for an +# entry in the class's __dict__. I never check to see if an attribute +# is defined in a base class. + +# Key: +# +=========+=========================================+ +# + Value | Meaning | +# +=========+=========================================+ +# | | No action: no method is added. | +# +---------+-----------------------------------------+ +# | add | Generated method is added. | +# +---------+-----------------------------------------+ +# | raise | TypeError is raised. | +# +---------+-----------------------------------------+ +# | None | Attribute is set to None. | +# +=========+=========================================+ + +# __init__ +# +# +--- init= parameter +# | +# v | | | +# | no | yes | <--- class has __init__ in __dict__? +# +=======+=======+=======+ +# | False | | | +# +-------+-------+-------+ +# | True | add | | <- the default +# +=======+=======+=======+ + +# __repr__ +# +# +--- repr= parameter +# | +# v | | | +# | no | yes | <--- class has __repr__ in __dict__? 
+# +=======+=======+=======+ +# | False | | | +# +-------+-------+-------+ +# | True | add | | <- the default +# +=======+=======+=======+ + + +# __setattr__ +# __delattr__ +# +# +--- frozen= parameter +# | +# v | | | +# | no | yes | <--- class has __setattr__ or __delattr__ in __dict__? +# +=======+=======+=======+ +# | False | | | <- the default +# +-------+-------+-------+ +# | True | add | raise | +# +=======+=======+=======+ +# Raise because not adding these methods would break the "frozen-ness" +# of the class. + +# __eq__ +# +# +--- eq= parameter +# | +# v | | | +# | no | yes | <--- class has __eq__ in __dict__? +# +=======+=======+=======+ +# | False | | | +# +-------+-------+-------+ +# | True | add | | <- the default +# +=======+=======+=======+ + +# __lt__ +# __le__ +# __gt__ +# __ge__ +# +# +--- order= parameter +# | +# v | | | +# | no | yes | <--- class has any comparison method in __dict__? +# +=======+=======+=======+ +# | False | | | <- the default +# +-------+-------+-------+ +# | True | add | raise | +# +=======+=======+=======+ +# Raise because to allow this case would interfere with using +# functools.total_ordering. 
+ +# __hash__ + +# +------------------- unsafe_hash= parameter +# | +----------- eq= parameter +# | | +--- frozen= parameter +# | | | +# v v v | | | +# | no | yes | <--- class has explicitly defined __hash__ +# +=======+=======+=======+========+========+ +# | False | False | False | | | No __eq__, use the base class __hash__ +# +-------+-------+-------+--------+--------+ +# | False | False | True | | | No __eq__, use the base class __hash__ +# +-------+-------+-------+--------+--------+ +# | False | True | False | None | | <-- the default, not hashable +# +-------+-------+-------+--------+--------+ +# | False | True | True | add | | Frozen, so hashable, allows override +# +-------+-------+-------+--------+--------+ +# | True | False | False | add | raise | Has no __eq__, but hashable +# +-------+-------+-------+--------+--------+ +# | True | False | True | add | raise | Has no __eq__, but hashable +# +-------+-------+-------+--------+--------+ +# | True | True | False | add | raise | Not frozen, but hashable +# +-------+-------+-------+--------+--------+ +# | True | True | True | add | raise | Frozen, so hashable +# +=======+=======+=======+========+========+ +# For boxes that are blank, __hash__ is untouched and therefore +# inherited from the base class. If the base is object, then +# id-based hashing is used. +# +# Note that a class may already have __hash__=None if it specified an +# __eq__ method in the class body (not one that was created by +# @dataclass). +# +# See _hash_action (below) for a coded version of this table. + +# __match_args__ +# +# +--- match_args= parameter +# | +# v | | | +# | no | yes | <--- class has __match_args__ in __dict__? +# +=======+=======+=======+ +# | False | | | +# +-------+-------+-------+ +# | True | add | | <- the default +# +=======+=======+=======+ +# __match_args__ is always added unless the class already defines it. It is a +# tuple of __init__ parameter names; non-init fields must be matched by keyword. 
class FrozenInstanceError(AttributeError):
    """Raised when an attempt is made to modify a frozen class."""


# A sentinel object for default values to signal that a default
# factory will be used.  This is given a nice repr() which will appear
# in the function signature of dataclasses' constructors.
class _HAS_DEFAULT_FACTORY_CLASS:
    def __repr__(self):
        # An empty string here would render generated signatures as "x=";
        # '<factory>' is the marker used by the standard library.
        return '<factory>'
_HAS_DEFAULT_FACTORY = _HAS_DEFAULT_FACTORY_CLASS()


# A sentinel object to detect if a parameter is supplied or not.  A
# dedicated class makes the sentinel identifiable by type.
class _MISSING_TYPE:
    pass
MISSING = _MISSING_TYPE()


# A sentinel object to indicate that following fields are keyword-only by
# default.  A dedicated class makes the sentinel identifiable by type.
class _KW_ONLY_TYPE:
    pass
KW_ONLY = _KW_ONLY_TYPE()

# Since most per-field metadata will be unused, create an empty
# read-only proxy that can be shared among all fields.
_EMPTY_METADATA = types.MappingProxyType({})


# Markers for the various kinds of fields and pseudo-fields.
class _FIELD_BASE:
    """Named marker object; its repr is the name it was created with."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name
_FIELD = _FIELD_BASE('_FIELD')
_FIELD_CLASSVAR = _FIELD_BASE('_FIELD_CLASSVAR')
_FIELD_INITVAR = _FIELD_BASE('_FIELD_INITVAR')

# The name of an attribute on the class where we store the Field
# objects.  Also used to check if a class is a Data Class.
_FIELDS = '__dataclass_fields__'

# The name of an attribute on the class that stores the parameters to
# @dataclass.
_PARAMS = '__dataclass_params__'

# The name of the function, that if it exists, is called at the end of
# __init__.
_POST_INIT_NAME = '__post_init__'

# String regex that string annotations for ClassVar or InitVar must match.
# Allows "identifier.identifier[" or "identifier[".
# https://bugs.python.org/issue33453 for details.
_MODULE_IDENTIFIER_RE = re.compile(r'^(?:\s*(\w+)\s*\.)?\s*(\w+)')


# Same approach as reprlib.recursive_repr, duplicated here so this module
# does not depend on reprlib.
def _recursive_repr(user_function):
    """Decorate a repr function so a re-entrant call returns '...'."""
    active_calls = set()

    @functools.wraps(user_function)
    def wrapper(self):
        # Keyed per object and per thread, so only true recursion is cut.
        call_key = (id(self), _thread.get_ident())
        if call_key in active_calls:
            return '...'
        active_calls.add(call_key)
        try:
            return user_function(self)
        finally:
            active_calls.discard(call_key)
    return wrapper


class InitVar:
    """Wrapper marking an annotation as an init-only pseudo-field."""

    __slots__ = ('type', )

    def __init__(self, type):
        self.type = type

    def __repr__(self):
        wrapped = self.type
        # Plain classes show their bare name; anything else (typing
        # objects such as List[int]) shows its own repr.
        type_name = wrapped.__name__ if isinstance(wrapped, type) else repr(wrapped)
        return f'dataclasses.InitVar[{type_name}]'

    def __class_getitem__(cls, type):
        return InitVar(type)


# Field objects are meant to be created only through field(), and are
# exposed to users as (conceptually) read-only objects.
#
# name and type are not known when the object is constructed; they are
# assigned afterwards, once the owning class is being processed.
class Field:
    __slots__ = ('name',
                 'type',
                 'default',
                 'default_factory',
                 'repr',
                 'hash',
                 'init',
                 'compare',
                 'metadata',
                 'kw_only',
                 '_field_type',  # Private: not to be used by user code.
                 )

    def __init__(self, default, default_factory, init, repr, hash, compare,
                 metadata, kw_only):
        self.name = None
        self.type = None
        self.default = default
        self.default_factory = default_factory
        self.init = init
        self.repr = repr
        self.hash = hash
        self.compare = compare
        # Expose metadata read-only; share one empty proxy when none given.
        if metadata is None:
            self.metadata = _EMPTY_METADATA
        else:
            self.metadata = types.MappingProxyType(metadata)
        self.kw_only = kw_only
        self._field_type = None

    @_recursive_repr
    def __repr__(self):
        shown = [
            f'{attr}={getattr(self, attr)!r}'
            for attr in ('name', 'type', 'default', 'default_factory',
                         'init', 'repr', 'hash', 'compare', 'metadata',
                         'kw_only')
        ]
        # The field-kind marker is formatted with str(), not repr().
        shown.append(f'_field_type={self._field_type}')
        return 'Field(' + ','.join(shown) + ')'

    # Supports the PEP 487 __set_name__ protocol when the default value is
    # itself a descriptor: the call is forwarded to the descriptor's type.
    # See https://peps.python.org/pep-0487/#implementation-details.
    def __set_name__(self, owner, name):
        hook = getattr(type(self.default), '__set_name__', None)
        if hook:
            hook(self.default, owner, name)

    __class_getitem__ = classmethod(GenericAlias)


class _DataclassParams:
    """Record of the arguments that were passed to @dataclass."""

    __slots__ = ('init',
                 'repr',
                 'eq',
                 'order',
                 'unsafe_hash',
                 'frozen',
                 )

    def __init__(self, init, repr, eq, order, unsafe_hash, frozen):
        self.init = init
        self.repr = repr
        self.eq = eq
        self.order = order
        self.unsafe_hash = unsafe_hash
        self.frozen = frozen

    def __repr__(self):
        shown = ','.join(
            f'{attr}={getattr(self, attr)!r}'
            for attr in ('init', 'repr', 'eq', 'order', 'unsafe_hash',
                         'frozen')
        )
        return f'_DataclassParams({shown})'
def _fields_in_init_order(fields):
    """Split init fields into (positional-or-keyword, keyword-only) tuples.

    Fields with init false are left out of both tuples; relative order is
    preserved.
    """
    return (tuple(f for f in fields if f.init and not f.kw_only),
            tuple(f for f in fields if f.init and f.kw_only)
            )


def _tuple_str(obj_name, fields):
    """Return source text for a tuple of ``obj_name.<field>`` attributes.

    For fields named x and y and obj_name "self" the result is
    "(self.x,self.y,)".
    """
    # Special case for the 0-tuple.
    if not fields:
        return '()'
    # Note the trailing comma, needed if this turns out to be a 1-tuple.
    return f'({",".join([f"{obj_name}.{f.name}" for f in fields])},)'


def _create_fn(name, args, body, *, globals=None, locals=None,
               return_type=MISSING):
    """Build and return a function from source fragments.

    name is the function name, args the parameter source strings, and body
    a sequence of statement lines.  The function is defined inside a
    generated ``__create_fn__`` wrapper whose parameters are the keys of
    *locals*, so those values are visible to the new function as closure
    variables.  When return_type is given it becomes the return annotation.

    Note that *locals* may be mutated.  The source is executed with exec(),
    so it must only ever be assembled from trusted, module-internal text.
    """
    if locals is None:
        locals = {}
    return_annotation = ''
    if return_type is not MISSING:
        locals['_return_type'] = return_type
        return_annotation = '->_return_type'
    args = ','.join(args)
    # Body lines must be indented deeper than the one-space 'def' line
    # below, otherwise the generated source is not valid Python.
    body = '\n'.join(f'  {b}' for b in body)

    # Compute the text of the entire function.
    txt = f' def {name}({args}){return_annotation}:\n{body}'

    local_vars = ', '.join(locals.keys())
    txt = f"def __create_fn__({local_vars}):\n{txt}\n return {name}"
    ns = {}
    exec(txt, globals, ns)
    return ns['__create_fn__'](**locals)
def _field_init(f, frozen, globals, self_name, slots):
    """Return the source line of __init__ that initializes field *f*.

    Returns None when no assignment is generated: for InitVar pseudo-fields,
    and for non-init fields that have no default factory and do not need
    a slot initialised.  Default values and factories are stored in
    *globals* under the name ``_dflt_<field name>`` as a side effect.
    """

    default_name = f'_dflt_{f.name}'
    if f.default_factory is not MISSING:
        if f.init:
            # This field has a default factory.  If a parameter is
            # given, use it.  If not, call the factory.
            globals[default_name] = f.default_factory
            value = (f'{default_name}() '
                     f'if {f.name} is _HAS_DEFAULT_FACTORY '
                     f'else {f.name}')
        else:
            # This is a field that's not in the __init__ params, but
            # has a default factory function.  It needs to be
            # initialized here by calling the factory function,
            # because there's no other way to initialize it.

            # For a field initialized with a default=defaultvalue, the
            # class dict just has the default value
            # (cls.fieldname=defaultvalue).  But that won't work for a
            # default factory, the factory must be called in __init__
            # and we must assign that to self.fieldname.  We can't
            # fall back to the class dict's value, both because it's
            # not set, and because it might be different per-class
            # (which, after all, is why we have a factory function!).

            globals[default_name] = f.default_factory
            value = f'{default_name}()'
    else:
        # No default factory.
        if f.init:
            if f.default is MISSING:
                # There's no default, just do an assignment.
                value = f.name
            elif f.default is not MISSING:
                # The default is stored so the parameter's default
                # (see _init_param) can refer to it by name.
                globals[default_name] = f.default
                value = f.name
        else:
            # If the class has slots, then initialize this field.
            if slots and f.default is not MISSING:
                globals[default_name] = f.default
                value = default_name
            else:
                # This field does not need initialization: reading from it will
                # just use the class attribute that contains the default.
                # Signify that to the caller by returning None.
                return None

    # Only test this now, so that we can create variables for the
    # default.  However, return None to signify that we're not going
    # to actually do the assignment statement for InitVars.
    if f._field_type is _FIELD_INITVAR:
        return None

    # Now, actually generate the field assignment.
    return _field_assign(frozen, f.name, value, self_name)


def _init_param(f):
    """Return the __init__ parameter source string for field *f*.

    This is the equivalent of 'x:int=3', except that the annotation and the
    default are references to generated names (``_type_x``, ``_dflt_x``)
    rather than literal values.
    """
    if f.default is MISSING and f.default_factory is MISSING:
        # There's no default, and no default_factory, just output the
        # variable name and type.
        default = ''
    elif f.default is not MISSING:
        # There's a default, this will be the name that's used to look
        # it up.
        default = f'=_dflt_{f.name}'
    elif f.default_factory is not MISSING:
        # There's a factory function.  Set a marker.
        default = '=_HAS_DEFAULT_FACTORY'
    return f'{f.name}:_type_{f.name}{default}'


def _init_fn(fields, std_fields, kw_only_fields, frozen, has_post_init,
             self_name, globals, slots):
    """Generate and return the __init__ function for a dataclass.

    fields contains both real fields and InitVar pseudo-fields; std_fields
    and kw_only_fields are the parameters in signature order.

    Raises:
        TypeError: if a non-keyword-only field without a default follows
            one that has a default.
    """

    # Make sure we don't have fields without defaults following fields
    # with defaults.  This actually would be caught when exec-ing the
    # function source code, but catching it here gives a better error
    # message, and future-proofs us in case we build up the function
    # using ast.

    seen_default = False
    for f in std_fields:
        # Only consider the non-kw-only fields in the __init__ call.
        if f.init:
            if not (f.default is MISSING and f.default_factory is MISSING):
                seen_default = True
            elif seen_default:
                raise TypeError(f'non-default argument {f.name!r} '
                                'follows default argument')

    locals = {f'_type_{f.name}': f.type for f in fields}
    locals.update({
        'MISSING': MISSING,
        '_HAS_DEFAULT_FACTORY': _HAS_DEFAULT_FACTORY,
        '__dataclass_builtins_object__': object,
    })

    body_lines = []
    for f in fields:
        # locals is passed as _field_init's "globals" argument, so the
        # _dflt_* names it records end up in the generated function's scope.
        line = _field_init(f, frozen, locals, self_name, slots)
        # line is None means that this field doesn't require
        # initialization (it's a pseudo-field).  Just skip it.
        if line:
            body_lines.append(line)

    # Does this class have a post-init function?
    if has_post_init:
        # __post_init__ receives the InitVar pseudo-fields, in order.
        params_str = ','.join(f.name for f in fields
                              if f._field_type is _FIELD_INITVAR)
        body_lines.append(f'{self_name}.{_POST_INIT_NAME}({params_str})')

    # If no body lines, use 'pass'.
    if not body_lines:
        body_lines = ['pass']

    _init_params = [_init_param(f) for f in std_fields]
    if kw_only_fields:
        # Add the keyword-only args.  Because the * can only be added if
        # there's at least one keyword-only arg, there needs to be a test here
        # (instead of just concatenating the lists together).
        _init_params += ['*']
        _init_params += [_init_param(f) for f in kw_only_fields]
    return _create_fn('__init__',
                      [self_name] + _init_params,
                      body_lines,
                      locals=locals,
                      globals=globals,
                      return_type=None)


def _repr_fn(fields, globals):
    """Generate __repr__ as ``QualName(field=value, ...)`` for *fields*.

    The generated function is wrapped with _recursive_repr.
    """
    fn = _create_fn('__repr__',
                    ('self',),
                    ['return self.__class__.__qualname__ + f"(' +
                     ', '.join([f"{f.name}={{self.{f.name}!r}}"
                                for f in fields]) +
                     ')"'],
                     globals=globals)
    return _recursive_repr(fn)
+ fields_str = '()' + return (_create_fn('__setattr__', + ('self', 'name', 'value'), + (f'if type(self) is cls or name in {fields_str}:', + ' raise FrozenInstanceError(f"cannot assign to field {name!r}")', + f'super(cls, self).__setattr__(name, value)'), + locals=locals, + globals=globals), + _create_fn('__delattr__', + ('self', 'name'), + (f'if type(self) is cls or name in {fields_str}:', + ' raise FrozenInstanceError(f"cannot delete field {name!r}")', + f'super(cls, self).__delattr__(name)'), + locals=locals, + globals=globals), + ) + + +def _cmp_fn(name, op, self_tuple, other_tuple, globals): + # Create a comparison function. If the fields in the object are + # named 'x' and 'y', then self_tuple is the string + # '(self.x,self.y)' and other_tuple is the string + # '(other.x,other.y)'. + + return _create_fn(name, + ('self', 'other'), + [ 'if other.__class__ is self.__class__:', + f' return {self_tuple}{op}{other_tuple}', + 'return NotImplemented'], + globals=globals) + + +def _hash_fn(fields, globals): + self_tuple = _tuple_str('self', fields) + return _create_fn('__hash__', + ('self',), + [f'return hash({self_tuple})'], + globals=globals) + + +def _is_classvar(a_type, typing): + # This test uses a typing internal class, but it's the best way to + # test if this is a ClassVar. + return (a_type is typing.ClassVar + or (type(a_type) is typing._GenericAlias + and a_type.__origin__ is typing.ClassVar)) + + +def _is_initvar(a_type, dataclasses): + # The module we're checking against is the module we're + # currently in (dataclasses.py). + return (a_type is dataclasses.InitVar + or type(a_type) is dataclasses.InitVar) + +def _is_kw_only(a_type, dataclasses): + return a_type is dataclasses.KW_ONLY + + +def _is_type(annotation, cls, a_module, a_type, is_type_predicate): + # Given a type annotation string, does it refer to a_type in + # a_module? For example, when checking that annotation denotes a + # ClassVar, then a_module is typing, and a_type is + # typing.ClassVar. 
+ + # It's possible to look up a_module given a_type, but it involves + # looking in sys.modules (again!), and seems like a waste since + # the caller already knows a_module. + + # - annotation is a string type annotation + # - cls is the class that this annotation was found in + # - a_module is the module we want to match + # - a_type is the type in that module we want to match + # - is_type_predicate is a function called with (obj, a_module) + # that determines if obj is of the desired type. + + # Since this test does not do a local namespace lookup (and + # instead only a module (global) lookup), there are some things it + # gets wrong. + + # With string annotations, cv0 will be detected as a ClassVar: + # CV = ClassVar + # @dataclass + # class C0: + # cv0: CV + + # But in this example cv1 will not be detected as a ClassVar: + # @dataclass + # class C1: + # CV = ClassVar + # cv1: CV + + # In C1, the code in this function (_is_type) will look up "CV" in + # the module and not find it, so it will not consider cv1 as a + # ClassVar. This is a fairly obscure corner case, and the best + # way to fix it would be to eval() the string "CV" with the + # correct global and local namespaces. However that would involve + # a eval() penalty for every single field of every dataclass + # that's defined. It was judged not worth it. + + match = _MODULE_IDENTIFIER_RE.match(annotation) + if match: + ns = None + module_name = match.group(1) + if not module_name: + # No module name, assume the class's module did + # "from dataclasses import InitVar". + ns = sys.modules.get(cls.__module__).__dict__ + else: + # Look up module_name in the class's module. 
+ module = sys.modules.get(cls.__module__) + if module and module.__dict__.get(module_name) is a_module: + ns = sys.modules.get(a_type.__module__).__dict__ + if ns and is_type_predicate(ns.get(match.group(2)), a_module): + return True + return False + + +def _get_field(cls, a_name, a_type, default_kw_only): + # Return a Field object for this field name and type. ClassVars and + # InitVars are also returned, but marked as such (see f._field_type). + # default_kw_only is the value of kw_only to use if there isn't a field() + # that defines it. + + # If the default value isn't derived from Field, then it's only a + # normal default value. Convert it to a Field(). + default = getattr(cls, a_name, MISSING) + if isinstance(default, Field): + f = default + else: + if isinstance(default, types.MemberDescriptorType): + # This is a field in __slots__, so it has no default value. + default = MISSING + f = field(default=default) + + # Only at this point do we know the name and the type. Set them. + f.name = a_name + f.type = a_type + + # Assume it's a normal field until proven otherwise. We're next + # going to decide if it's a ClassVar or InitVar, everything else + # is just a normal field. + f._field_type = _FIELD + + # In addition to checking for actual types here, also check for + # string annotations. get_type_hints() won't always work for us + # (see https://github.com/python/typing/issues/508 for example), + # plus it's expensive and would require an eval for every string + # annotation. So, make a best effort to see if this is a ClassVar + # or InitVar using regex's and checking that the thing referenced + # is actually of the correct type. + + # For the complete discussion, see https://bugs.python.org/issue33453 + + # If typing has not been imported, then it's impossible for any + # annotation to be a ClassVar. So, only look for ClassVar if + # typing has been imported by any module (not necessarily cls's + # module). 
+ typing = sys.modules.get('typing') + if typing: + if (_is_classvar(a_type, typing) + or (isinstance(f.type, str) + and _is_type(f.type, cls, typing, typing.ClassVar, + _is_classvar))): + f._field_type = _FIELD_CLASSVAR + + # If the type is InitVar, or if it's a matching string annotation, + # then it's an InitVar. + if f._field_type is _FIELD: + # The module we're checking against is the module we're + # currently in (dataclasses.py). + dataclasses = sys.modules[__name__] + if (_is_initvar(a_type, dataclasses) + or (isinstance(f.type, str) + and _is_type(f.type, cls, dataclasses, dataclasses.InitVar, + _is_initvar))): + f._field_type = _FIELD_INITVAR + + # Validations for individual fields. This is delayed until now, + # instead of in the Field() constructor, since only here do we + # know the field name, which allows for better error reporting. + + # Special restrictions for ClassVar and InitVar. + if f._field_type in (_FIELD_CLASSVAR, _FIELD_INITVAR): + if f.default_factory is not MISSING: + raise TypeError(f'field {f.name} cannot have a ' + 'default factory') + # Should I check for other field settings? default_factory + # seems the most serious to check for. Maybe add others. For + # example, how about init=False (or really, + # init=)? It makes no sense for + # ClassVar and InitVar to specify init=. + + # kw_only validation and assignment. + if f._field_type in (_FIELD, _FIELD_INITVAR): + # For real and InitVar fields, if kw_only wasn't specified use the + # default value. + if f.kw_only is MISSING: + f.kw_only = default_kw_only + else: + # Make sure kw_only isn't set for ClassVars + assert f._field_type is _FIELD_CLASSVAR + if f.kw_only is not MISSING: + raise TypeError(f'field {f.name} is a ClassVar but specifies ' + 'kw_only') + + # For real fields, disallow mutable defaults. Use unhashable as a proxy + # indicator for mutability. Read the __hash__ attribute from the class, + # not the instance. 
+ if f._field_type is _FIELD and f.default.__class__.__hash__ is None: + raise ValueError(f'mutable default {type(f.default)} for field ' + f'{f.name} is not allowed: use default_factory') + + return f + +def _set_qualname(cls, value): + # Ensure that the functions returned from _create_fn uses the proper + # __qualname__ (the class they belong to). + if isinstance(value, FunctionType): + value.__qualname__ = f"{cls.__qualname__}.{value.__name__}" + return value + +def _set_new_attribute(cls, name, value): + # Never overwrites an existing attribute. Returns True if the + # attribute already exists. + if name in cls.__dict__: + return True + _set_qualname(cls, value) + setattr(cls, name, value) + return False + + +# Decide if/how we're going to create a hash function. Key is +# (unsafe_hash, eq, frozen, does-hash-exist). Value is the action to +# take. The common case is to do nothing, so instead of providing a +# function that is a no-op, use None to signify that. + +def _hash_set_none(cls, fields, globals): + return None + +def _hash_add(cls, fields, globals): + flds = [f for f in fields if (f.compare if f.hash is None else f.hash)] + return _set_qualname(cls, _hash_fn(flds, globals)) + +def _hash_exception(cls, fields, globals): + # Raise an exception. + raise TypeError(f'Cannot overwrite attribute __hash__ ' + f'in class {cls.__name__}') + +# +# +-------------------------------------- unsafe_hash? +# | +------------------------------- eq? +# | | +------------------------ frozen? +# | | | +---------------- has-explicit-hash? 
+# | | | | +# | | | | +------- action +# | | | | | +# v v v v v +_hash_action = {(False, False, False, False): None, + (False, False, False, True ): None, + (False, False, True, False): None, + (False, False, True, True ): None, + (False, True, False, False): _hash_set_none, + (False, True, False, True ): None, + (False, True, True, False): _hash_add, + (False, True, True, True ): None, + (True, False, False, False): _hash_add, + (True, False, False, True ): _hash_exception, + (True, False, True, False): _hash_add, + (True, False, True, True ): _hash_exception, + (True, True, False, False): _hash_add, + (True, True, False, True ): _hash_exception, + (True, True, True, False): _hash_add, + (True, True, True, True ): _hash_exception, + } +# See https://bugs.python.org/issue32929#msg312829 for an if-statement +# version of this table. + + +def _process_class(cls, init, repr, eq, order, unsafe_hash, frozen, + match_args, kw_only, slots, weakref_slot): + # Now that dicts retain insertion order, there's no reason to use + # an ordered dict. I am leveraging that ordering here, because + # derived class fields overwrite base class fields, but the order + # is defined by the base class, which is found first. + fields = {} + + if cls.__module__ in sys.modules: + globals = sys.modules[cls.__module__].__dict__ + else: + # Theoretically this can happen if someone writes + # a custom string to cls.__module__. In which case + # such dataclass won't be fully introspectable + # (w.r.t. typing.get_type_hints) but will still function + # correctly. + globals = {} + + setattr(cls, _PARAMS, _DataclassParams(init, repr, eq, order, + unsafe_hash, frozen)) + + # Find our base classes in reverse MRO order, and exclude + # ourselves. In reversed order so that more derived classes + # override earlier field definitions in base classes. As long as + # we're iterating over them, see if any are frozen. 
+ any_frozen_base = False + has_dataclass_bases = False + for b in cls.__mro__[-1:0:-1]: + # Only process classes that have been processed by our + # decorator. That is, they have a _FIELDS attribute. + base_fields = getattr(b, _FIELDS, None) + if base_fields is not None: + has_dataclass_bases = True + for f in base_fields.values(): + fields[f.name] = f + if getattr(b, _PARAMS).frozen: + any_frozen_base = True + + # Annotations that are defined in this class (not in base + # classes). If __annotations__ isn't present, then this class + # adds no new annotations. We use this to compute fields that are + # added by this class. + # + # Fields are found from cls_annotations, which is guaranteed to be + # ordered. Default values are from class attributes, if a field + # has a default. If the default value is a Field(), then it + # contains additional info beyond (and possibly including) the + # actual default value. Pseudo-fields ClassVars and InitVars are + # included, despite the fact that they're not real fields. That's + # dealt with later. + cls_annotations = cls.__dict__.get('__annotations__', {}) + + # Now find fields in our class. While doing so, validate some + # things, and set the default values (as class attributes) where + # we can. + cls_fields = [] + # Get a reference to this module for the _is_kw_only() test. + KW_ONLY_seen = False + dataclasses = sys.modules[__name__] + for name, type in cls_annotations.items(): + # See if this is a marker to change the value of kw_only. + if (_is_kw_only(type, dataclasses) + or (isinstance(type, str) + and _is_type(type, cls, dataclasses, dataclasses.KW_ONLY, + _is_kw_only))): + # Switch the default to kw_only=True, and ignore this + # annotation: it's not a real field. + if KW_ONLY_seen: + raise TypeError(f'{name!r} is KW_ONLY, but KW_ONLY ' + 'has already been specified') + KW_ONLY_seen = True + kw_only = True + else: + # Otherwise it's a field of some type. 
+ cls_fields.append(_get_field(cls, name, type, kw_only)) + + for f in cls_fields: + fields[f.name] = f + + # If the class attribute (which is the default value for this + # field) exists and is of type 'Field', replace it with the + # real default. This is so that normal class introspection + # sees a real default value, not a Field. + if isinstance(getattr(cls, f.name, None), Field): + if f.default is MISSING: + # If there's no default, delete the class attribute. + # This happens if we specify field(repr=False), for + # example (that is, we specified a field object, but + # no default value). Also if we're using a default + # factory. The class attribute should not be set at + # all in the post-processed class. + delattr(cls, f.name) + else: + setattr(cls, f.name, f.default) + + # Do we have any Field members that don't also have annotations? + for name, value in cls.__dict__.items(): + if isinstance(value, Field) and not name in cls_annotations: + raise TypeError(f'{name!r} is a field but has no type annotation') + + # Check rules that apply if we are derived from any dataclasses. + if has_dataclass_bases: + # Raise an exception if any of our bases are frozen, but we're not. + if any_frozen_base and not frozen: + raise TypeError('cannot inherit non-frozen dataclass from a ' + 'frozen one') + + # Raise an exception if we're frozen, but none of our bases are. + if not any_frozen_base and frozen: + raise TypeError('cannot inherit frozen dataclass from a ' + 'non-frozen one') + + # Remember all of the fields on our class (including bases). This + # also marks this class as being a dataclass. + setattr(cls, _FIELDS, fields) + + # Was this class defined with an explicit __hash__? Note that if + # __eq__ is defined in this class, then python will automatically + # set __hash__ to None. This is a heuristic, as it's possible + # that such a __hash__ == None was not auto-generated, but it + # close enough. 
+ class_hash = cls.__dict__.get('__hash__', MISSING) + has_explicit_hash = not (class_hash is MISSING or + (class_hash is None and '__eq__' in cls.__dict__)) + + # If we're generating ordering methods, we must be generating the + # eq methods. + if order and not eq: + raise ValueError('eq must be true if order is true') + + # Include InitVars and regular fields (so, not ClassVars). This is + # initialized here, outside of the "if init:" test, because std_init_fields + # is used with match_args, below. + all_init_fields = [f for f in fields.values() + if f._field_type in (_FIELD, _FIELD_INITVAR)] + (std_init_fields, + kw_only_init_fields) = _fields_in_init_order(all_init_fields) + + if init: + # Does this class have a post-init function? + has_post_init = hasattr(cls, _POST_INIT_NAME) + + _set_new_attribute(cls, '__init__', + _init_fn(all_init_fields, + std_init_fields, + kw_only_init_fields, + frozen, + has_post_init, + # The name to use for the "self" + # param in __init__. Use "self" + # if possible. + '__dataclass_self__' if 'self' in fields + else 'self', + globals, + slots, + )) + + # Get the fields as a list, and include only real fields. This is + # used in all of the following methods. + field_list = [f for f in fields.values() if f._field_type is _FIELD] + + if repr: + flds = [f for f in field_list if f.repr] + _set_new_attribute(cls, '__repr__', _repr_fn(flds, globals)) + + if eq: + # Create __eq__ method. There's no need for a __ne__ method, + # since python will call __eq__ and negate it. + flds = [f for f in field_list if f.compare] + self_tuple = _tuple_str('self', flds) + other_tuple = _tuple_str('other', flds) + _set_new_attribute(cls, '__eq__', + _cmp_fn('__eq__', '==', + self_tuple, other_tuple, + globals=globals)) + + if order: + # Create and set the ordering methods. 
+ flds = [f for f in field_list if f.compare] + self_tuple = _tuple_str('self', flds) + other_tuple = _tuple_str('other', flds) + for name, op in [('__lt__', '<'), + ('__le__', '<='), + ('__gt__', '>'), + ('__ge__', '>='), + ]: + if _set_new_attribute(cls, name, + _cmp_fn(name, op, self_tuple, other_tuple, + globals=globals)): + raise TypeError(f'Cannot overwrite attribute {name} ' + f'in class {cls.__name__}. Consider using ' + 'functools.total_ordering') + + if frozen: + for fn in _frozen_get_del_attr(cls, field_list, globals): + if _set_new_attribute(cls, fn.__name__, fn): + raise TypeError(f'Cannot overwrite attribute {fn.__name__} ' + f'in class {cls.__name__}') + + # Decide if/how we're going to create a hash function. + hash_action = _hash_action[bool(unsafe_hash), + bool(eq), + bool(frozen), + has_explicit_hash] + if hash_action: + # No need to call _set_new_attribute here, since by the time + # we're here the overwriting is unconditional. + cls.__hash__ = hash_action(cls, field_list, globals) + + if not getattr(cls, '__doc__'): + # Create a class doc-string. + try: + # In some cases fetching a signature is not possible. + # But, we surely should not fail in this case. + text_sig = str(inspect.signature(cls)).replace(' -> None', '') + except (TypeError, ValueError): + text_sig = '' + cls.__doc__ = (cls.__name__ + text_sig) + + if match_args: + # I could probably compute this once + _set_new_attribute(cls, '__match_args__', + tuple(f.name for f in std_init_fields)) + + # It's an error to specify weakref_slot if slots is False. + if weakref_slot and not slots: + raise TypeError('weakref_slot is True but slots is False') + if slots: + cls = _add_slots(cls, frozen, weakref_slot) + + abc.update_abstractmethods(cls) + + return cls + + +# _dataclass_getstate and _dataclass_setstate are needed for pickling frozen +# classes with slots. These could be slightly more performant if we generated +# the code instead of iterating over fields. 
But that can be a project for +# another day, if performance becomes an issue. +def _dataclass_getstate(self): + return [getattr(self, f.name) for f in fields(self)] + + +def _dataclass_setstate(self, state): + for field, value in zip(fields(self), state): + # use setattr because dataclass may be frozen + object.__setattr__(self, field.name, value) + + +def _get_slots(cls): + match cls.__dict__.get('__slots__'): + # A class which does not define __slots__ at all is equivalent + # to a class defining __slots__ = ('__dict__', '__weakref__') + case None: + yield from ('__dict__', '__weakref__') + case str(slot): + yield slot + # Slots may be any iterable, but we cannot handle an iterator + # because it will already be (partially) consumed. + case iterable if not hasattr(iterable, '__next__'): + yield from iterable + case _: + raise TypeError(f"Slots of '{cls.__name__}' cannot be determined") + + +def _add_slots(cls, is_frozen, weakref_slot): + # Need to create a new class, since we can't set __slots__ + # after a class has been created. + + # Make sure __slots__ isn't already set. + if '__slots__' in cls.__dict__: + raise TypeError(f'{cls.__name__} already specifies __slots__') + + # Create a new dict for our new class. + cls_dict = dict(cls.__dict__) + field_names = tuple(f.name for f in fields(cls)) + # Make sure slots don't overlap with those in base classes. + inherited_slots = set( + itertools.chain.from_iterable(map(_get_slots, cls.__mro__[1:-1])) + ) + # The slots for our class. Remove slots from our base classes. Add + # '__weakref__' if weakref_slot was given, unless it is already present. + cls_dict["__slots__"] = tuple( + itertools.filterfalse( + inherited_slots.__contains__, + itertools.chain( + # gh-93521: '__weakref__' also needs to be filtered out if + # already present in inherited_slots + field_names, ('__weakref__',) if weakref_slot else () + ) + ), + ) + + for field_name in field_names: + # Remove our attributes, if present. 
They'll still be + # available in _MARKER. + cls_dict.pop(field_name, None) + + # Remove __dict__ itself. + cls_dict.pop('__dict__', None) + + # Clear existing `__weakref__` descriptor, it belongs to a previous type: + cls_dict.pop('__weakref__', None) # gh-102069 + + # And finally create the class. + qualname = getattr(cls, '__qualname__', None) + cls = type(cls)(cls.__name__, cls.__bases__, cls_dict) + if qualname is not None: + cls.__qualname__ = qualname + + if is_frozen: + # Need this for pickling frozen classes with slots. + if '__getstate__' not in cls_dict: + cls.__getstate__ = _dataclass_getstate + if '__setstate__' not in cls_dict: + cls.__setstate__ = _dataclass_setstate + + return cls + + +def dataclass(cls=None, /, *, init=True, repr=True, eq=True, order=False, + unsafe_hash=False, frozen=False, match_args=True, + kw_only=False, slots=False, weakref_slot=False): + """Add dunder methods based on the fields defined in the class. + + Examines PEP 526 __annotations__ to determine fields. + + If init is true, an __init__() method is added to the class. If repr + is true, a __repr__() method is added. If order is true, rich + comparison dunder methods are added. If unsafe_hash is true, a + __hash__() method is added. If frozen is true, fields may not be + assigned to after instance creation. If match_args is true, the + __match_args__ tuple is added. If kw_only is true, then by default + all fields are keyword-only. If slots is true, a new class with a + __slots__ attribute is returned. + """ + + def wrap(cls): + return _process_class(cls, init, repr, eq, order, unsafe_hash, + frozen, match_args, kw_only, slots, + weakref_slot) + + # See if we're being called as @dataclass or @dataclass(). + if cls is None: + # We're called with parens. + return wrap + + # We're called as @dataclass without parens. + return wrap(cls) + + +def fields(class_or_instance): + """Return a tuple describing the fields of this dataclass. + + Accepts a dataclass or an instance of one. 
def asdict(obj, *, dict_factory=dict):
    """Return the fields of a dataclass instance as a new dictionary mapping
    field names to field values.

    Example usage::

      @dataclass
      class C:
          x: int
          y: int

      c = C(1, 2)
      assert asdict(c) == {'x': 1, 'y': 2}

    If given, 'dict_factory' will be used instead of built-in dict.
    The function applies recursively to field values that are
    dataclass instances. This will also look into built-in containers:
    tuples, lists, and dicts.  Dict subclasses that expose a
    'default_factory' attribute (such as collections.defaultdict) are
    rebuilt with the same factory.

    Raises TypeError if 'obj' is not a dataclass instance.
    """
    if not _is_dataclass_instance(obj):
        raise TypeError("asdict() should be called on dataclass instances")
    return _asdict_inner(obj, dict_factory)


def _asdict_inner(obj, dict_factory):
    # Recursive worker for asdict(): converts nested dataclass instances
    # with dict_factory and rebuilds containers with their own type.
    if _is_dataclass_instance(obj):
        # dict_factory receives a list of (name, value) pairs.
        result = []
        for f in fields(obj):
            value = _asdict_inner(getattr(obj, f.name), dict_factory)
            result.append((f.name, value))
        return dict_factory(result)
    elif isinstance(obj, tuple) and hasattr(obj, '_fields'):
        # obj is a namedtuple.  Recurse into it, but the returned
        # object is another namedtuple of the same type.  This is
        # similar to how other list- or tuple-derived classes are
        # treated (see below), but we just need to create them
        # differently because a namedtuple's __init__ needs to be
        # called differently (see bpo-34363).

        # I'm not using namedtuple's _asdict()
        # method, because:
        # - it does not recurse in to the namedtuple fields and
        #   convert them to dicts (using dict_factory).
        # - I don't actually want to return a dict here.  The main
        #   use case here is json.dumps, and it handles converting
        #   namedtuples to lists.  Admittedly we're losing some
        #   information here when we produce a json list instead of a
        #   dict.  Note that if we returned dicts here instead of
        #   namedtuples, we could no longer call asdict() on a data
        #   structure where a namedtuple was used as a dict key.

        return type(obj)(*[_asdict_inner(v, dict_factory) for v in obj])
    elif isinstance(obj, (list, tuple)):
        # Assume we can create an object of this type by passing in a
        # generator (which is not true for namedtuples, handled
        # above).
        return type(obj)(_asdict_inner(v, dict_factory) for v in obj)
    elif isinstance(obj, dict):
        if hasattr(type(obj), 'default_factory'):
            # obj is a defaultdict (or look-alike).  Its constructor takes
            # the default_factory as first argument, not the items, so
            # passing a generator of pairs would raise TypeError.  Build
            # an empty copy with the same factory and fill it instead.
            result = type(obj)(getattr(obj, 'default_factory'))
            for k, v in obj.items():
                result[_asdict_inner(k, dict_factory)] = \
                    _asdict_inner(v, dict_factory)
            return result
        return type(obj)((_asdict_inner(k, dict_factory),
                          _asdict_inner(v, dict_factory))
                         for k, v in obj.items())
    else:
        return copy.deepcopy(obj)
+ """ + + if not _is_dataclass_instance(obj): + raise TypeError("astuple() should be called on dataclass instances") + return _astuple_inner(obj, tuple_factory) + + +def _astuple_inner(obj, tuple_factory): + if _is_dataclass_instance(obj): + result = [] + for f in fields(obj): + value = _astuple_inner(getattr(obj, f.name), tuple_factory) + result.append(value) + return tuple_factory(result) + elif isinstance(obj, tuple) and hasattr(obj, '_fields'): + # obj is a namedtuple. Recurse into it, but the returned + # object is another namedtuple of the same type. This is + # similar to how other list- or tuple-derived classes are + # treated (see below), but we just need to create them + # differently because a namedtuple's __init__ needs to be + # called differently (see bpo-34363). + return type(obj)(*[_astuple_inner(v, tuple_factory) for v in obj]) + elif isinstance(obj, (list, tuple)): + # Assume we can create an object of this type by passing in a + # generator (which is not true for namedtuples, handled + # above). + return type(obj)(_astuple_inner(v, tuple_factory) for v in obj) + elif isinstance(obj, dict): + return type(obj)((_astuple_inner(k, tuple_factory), _astuple_inner(v, tuple_factory)) + for k, v in obj.items()) + else: + return copy.deepcopy(obj) + + +def make_dataclass(cls_name, fields, *, bases=(), namespace=None, init=True, + repr=True, eq=True, order=False, unsafe_hash=False, + frozen=False, match_args=True, kw_only=False, slots=False, + weakref_slot=False): + """Return a new dynamically created dataclass. + + The dataclass name will be 'cls_name'. 'fields' is an iterable + of either (name), (name, type) or (name, type, Field) objects. If type is + omitted, use the string 'typing.Any'. 
Field objects are created by + the equivalent of calling 'field(name, type [, Field-info])'.:: + + C = make_dataclass('C', ['x', ('y', int), ('z', int, field(init=False))], bases=(Base,)) + + is equivalent to:: + + @dataclass + class C(Base): + x: 'typing.Any' + y: int + z: int = field(init=False) + + For the bases and namespace parameters, see the builtin type() function. + + The parameters init, repr, eq, order, unsafe_hash, and frozen are passed to + dataclass(). + """ + + if namespace is None: + namespace = {} + + # While we're looking through the field names, validate that they + # are identifiers, are not keywords, and not duplicates. + seen = set() + annotations = {} + defaults = {} + for item in fields: + if isinstance(item, str): + name = item + tp = 'typing.Any' + elif len(item) == 2: + name, tp, = item + elif len(item) == 3: + name, tp, spec = item + defaults[name] = spec + else: + raise TypeError(f'Invalid field: {item!r}') + + if not isinstance(name, str) or not name.isidentifier(): + raise TypeError(f'Field names must be valid identifiers: {name!r}') + if keyword.iskeyword(name): + raise TypeError(f'Field names must not be keywords: {name!r}') + if name in seen: + raise TypeError(f'Field name duplicated: {name!r}') + + seen.add(name) + annotations[name] = tp + + # Update 'ns' with the user-supplied namespace plus our calculated values. + def exec_body_callback(ns): + ns.update(namespace) + ns.update(defaults) + ns['__annotations__'] = annotations + + # We use `types.new_class()` instead of simply `type()` to allow dynamic creation + # of generic dataclasses. + cls = types.new_class(cls_name, bases, {}, exec_body_callback) + + # Apply the normal decorator. + return dataclass(cls, init=init, repr=repr, eq=eq, order=order, + unsafe_hash=unsafe_hash, frozen=frozen, + match_args=match_args, kw_only=kw_only, slots=slots, + weakref_slot=weakref_slot) + + +def replace(obj, /, **changes): + """Return a new object replacing specified fields with new values. 
+ + This is especially useful for frozen classes. Example usage:: + + @dataclass(frozen=True) + class C: + x: int + y: int + + c = C(1, 2) + c1 = replace(c, x=3) + assert c1.x == 3 and c1.y == 2 + """ + + # We're going to mutate 'changes', but that's okay because it's a + # new dict, even if called with 'replace(obj, **my_changes)'. + + if not _is_dataclass_instance(obj): + raise TypeError("replace() should be called on dataclass instances") + + # It's an error to have init=False fields in 'changes'. + # If a field is not in 'changes', read its value from the provided obj. + + for f in getattr(obj, _FIELDS).values(): + # Only consider normal fields or InitVars. + if f._field_type is _FIELD_CLASSVAR: + continue + + if not f.init: + # Error if this field is specified in changes. + if f.name in changes: + raise ValueError(f'field {f.name} is declared with ' + 'init=False, it cannot be specified with ' + 'replace()') + continue + + if f.name not in changes: + if f._field_type is _FIELD_INITVAR and f.default is MISSING: + raise ValueError(f"InitVar {f.name!r} " + 'must be specified with replace()') + changes[f.name] = getattr(obj, f.name) + + # Create the new object, which calls __init__() and + # __post_init__() (if defined), using all of the init fields we've + # added and/or left in 'changes'. If there are values supplied in + # changes that aren't fields, this will correctly raise a + # TypeError. 
+ return obj.__class__(**changes) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_preprocessing.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_preprocessing.py new file mode 100644 index 0000000..265af57 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_preprocessing.py @@ -0,0 +1,61 @@ +"""Preprocessing recommendation helpers.""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass(slots=True) +class PreprocessingAction: + action_name: str + target_fields: list[str] + reason: str + result: str + + +def build_preprocessing_log( + should_run: bool, + actions: list[PreprocessingAction] | None = None, + raw_snapshot_path: str = "", + processed_snapshot_path: str = "", +) -> dict[str, Any]: + return { + "should_run_preprocessing": should_run, + "raw_snapshot_path": raw_snapshot_path, + "processed_snapshot_path": processed_snapshot_path, + "actions": [asdict(action) for action in (actions or [])], + } + + +def generate_preprocessing_recommendations( + data_profile: dict[str, Any], + preprocessing_preference: str, +) -> list[PreprocessingAction]: + actions: list[PreprocessingAction] = [] + if preprocessing_preference == "no preprocessing": + return actions + + high_missing_fields = data_profile.get("data_profile", {}).get("high_missing_fields", []) + if high_missing_fields: + actions.append( + PreprocessingAction( + action_name="Review missing-value treatment", + target_fields=list(high_missing_fields), + reason="High-missing fields may distort statistics and figures.", + result="Wait for user confirmation before dropping or imputing values.", + ) + ) + + constant_columns = data_profile.get("data_profile", {}).get("constant_columns", []) + if constant_columns: + actions.append( + PreprocessingAction( + action_name="Remove constant columns", + target_fields=list(constant_columns), + reason="Constant columns do not 
provide useful analytical signal.", + result="Recommended for removal.", + ) + ) + + return actions diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_statistical_analysis.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_statistical_analysis.py new file mode 100644 index 0000000..93c311b --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_statistical_analysis.py @@ -0,0 +1,36 @@ +"""Statistical summary helpers.""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass(slots=True) +class NumericSummary: + field_name: str + sample_size: int + minimum: float | None + maximum: float | None + mean_value: float | None + median_value: float | None + + +@dataclass(slots=True) +class CategoricalSummary: + field_name: str + unique_value_count: int + most_frequent_category: str | None + most_frequent_count: int + + +def generate_statistical_summary( + numeric_fields: list[NumericSummary] | None = None, + categorical_fields: list[CategoricalSummary] | None = None, + notes: list[str] | None = None, +) -> dict[str, Any]: + return { + "numeric_summary": [asdict(field) for field in (numeric_fields or [])], + "categorical_summary": [asdict(field) for field in (categorical_fields or [])], + "notes": notes or [], + } diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_strategy_registry.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_strategy_registry.py new file mode 100644 index 0000000..5a4fca5 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_strategy_registry.py @@ -0,0 +1,81 @@ +"""Strategy registry for dataset analysis.""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass(slots=True) +class StrategyDefinition: + strategy_id: str + display_name: str + supported_types: list[str] + 
core_tasks: list[str] + recommended_artifacts: list[str] + risk_notes: list[str] + + +STRATEGY_REGISTRY: dict[str, StrategyDefinition] = { + "tabular_generic": StrategyDefinition( + strategy_id="tabular_generic", + display_name="Generic tabular analysis", + supported_types=["tabular_generic", "unknown"], + core_tasks=["schema/profile", "missing and duplicate checks", "distribution analysis", "group comparison", "correlation analysis"], + recommended_artifacts=["schema.json", "data_profile.json", "visualization_plan.json", "analysis_findings.json"], + risk_notes=["Do not invent business semantics when field meaning is unclear."], + ), + "questionnaire": StrategyDefinition( + strategy_id="questionnaire", + display_name="Questionnaire analysis", + supported_types=["questionnaire"], + core_tasks=["question-type identification", "scale normalization", "invalid response detection", "group comparison", "open-response analysis"], + recommended_artifacts=[ + "questionnaire_profile.json", + "questionnaire_scoring.json", + "group_comparison.csv", + "open_response_keywords.csv", + ], + risk_notes=["Confirm reverse-coded items, dimension grouping, and scale direction before execution."], + ), + "literary": StrategyDefinition( + strategy_id="literary", + display_name="Literary corpus analysis", + supported_types=["literary"], + core_tasks=["genre refinement", "token statistics", "character and imagery analysis", "structural pattern mining", "corpus limitation summary"], + recommended_artifacts=[ + "character_frequency.csv", + "imagery_frequency.csv", + "character_cooccurrence.csv", + "analysis_findings.json", + ], + risk_notes=["Do not present rule-based extraction outputs as human-annotated ground truth."], + ), + "time_series": StrategyDefinition( + strategy_id="time_series", + display_name="Time-series analysis", + supported_types=["time_series"], + core_tasks=["time index identification", "frequency checks", "trend analysis", "anomaly detection", "rolling metrics and 
seasonality review"], + recommended_artifacts=[ + "time_series_profile.json", + "time_series_summary.csv", + "time_series_trend.png", + "rolling_summary.csv", + ], + risk_notes=["Confirm time columns, frequency, and aggregation granularity before execution."], + ), +} + + +def select_strategy(detection_result: dict[str, Any]) -> dict[str, Any]: + dataset_type = detection_result.get("primary_type", "unknown") + strategy_name = detection_result.get("detected_strategy", "tabular_generic") + if strategy_name not in STRATEGY_REGISTRY: + strategy_name = "tabular_generic" + + strategy = STRATEGY_REGISTRY[strategy_name] + if dataset_type not in strategy.supported_types and dataset_type != "unknown": + fallback_name = detection_result.get("fallback_type", "tabular_generic") + strategy = STRATEGY_REGISTRY.get(fallback_name, STRATEGY_REGISTRY["tabular_generic"]) + + return asdict(strategy) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_time_series_analysis.py b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_time_series_analysis.py new file mode 100644 index 0000000..7b15ec2 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/analysis/analyzer_time_series_analysis.py @@ -0,0 +1,56 @@ +"""Time-series profiling helpers.""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass +from statistics import mean +from typing import Any + + +@dataclass(slots=True) +class TimeSeriesSummary: + time_field: str + metric_field: str + sample_size: int + earliest_time: str | None + latest_time: str | None + gap_count: int + candidate_frequency: str + is_univariate: bool + + +def build_time_series_profile( + time_field: str, + metric_field: str, + time_values: list[str], + metric_values: list[float | int], +) -> dict[str, Any]: + non_empty_time_values = [str(value) for value in time_values if value not in (None, "")] + non_empty_metric_values = [float(value) for value in metric_values if value not in 
(None, "")] + sample_size = min(len(non_empty_time_values), len(non_empty_metric_values)) + + summary = TimeSeriesSummary( + time_field=time_field, + metric_field=metric_field, + sample_size=sample_size, + earliest_time=min(non_empty_time_values) if non_empty_time_values else None, + latest_time=max(non_empty_time_values) if non_empty_time_values else None, + gap_count=max(len(time_values) - len(non_empty_time_values), 0), + candidate_frequency="unknown", + is_univariate=True, + ) + + return { + "time_series_profile": asdict(summary), + "summary_statistics": { + "mean": round(mean(non_empty_metric_values), 4) if non_empty_metric_values else None, + "minimum": min(non_empty_metric_values) if non_empty_metric_values else None, + "maximum": max(non_empty_metric_values) if non_empty_metric_values else None, + }, + "recommended_checks": [ + "Check whether the time index is continuous.", + "Check whether daily, weekly, or monthly aggregation is needed.", + "Check anomaly spikes and missing intervals.", + "Add rolling, period-over-period, or seasonal views when appropriate.", + ], + } diff --git a/skills/data-flow-skill/data-flow-skill/scripts/image_gen/image_generator.py b/skills/data-flow-skill/data-flow-skill/scripts/image_gen/image_generator.py new file mode 100644 index 0000000..97e8b43 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/image_gen/image_generator.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""命令行图片生成脚本。 + +该脚本只负责: +1. 接收已经写好的图片生成 prompt +2. 调用图像模型生成图片 +3. 将图片和元数据保存到本地 + +不负责: +1. 根据任务内容猜测主题 +2. 根据数据类型硬编码匹配场景 +3. 
自动编写主题插画 prompt + +示例: + python image_gen/image_generator.py generate \ + --prompt "简约动画风格,三位运动员站在领奖台上,背景留白,无文字" \ + --output output/figures/theme_illustration.png +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import urllib.error +import urllib.request +from pathlib import Path +from typing import Any + + +def _load_env_key() -> str | None: + env_path = Path(__file__).parent.parent / ".env" + if env_path.exists(): + for line in env_path.read_text().splitlines(): + line = line.strip() + if line and not line.startswith("#") and "=" in line: + k, v = line.split("=", 1) + if k.strip() == "DASHSCOPE_API_KEY": + return v.strip() + return None + + +DASHSCOPE_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation" +DEFAULT_MODEL = "qwen-image-2.0-pro" +DEFAULT_SIZE = "1328*1328" +DEFAULT_NEGATIVE_PROMPT = ( + "低分辨率,低画质,肢体畸形,手指错误,画面过饱和,文字,水印,logo,AI感过强," + "构图混乱,背景杂乱,模糊,重影,写实照片风,恐怖风。" +) + + +def ensure_parent(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + + +def parse_image_url(response_data: dict[str, Any]) -> str: + try: + return response_data["output"]["choices"][0]["message"]["content"][0]["image"] + except (KeyError, IndexError, TypeError) as exc: + raise RuntimeError(f"未在接口响应中找到图片 URL: {response_data}") from exc + + +def request_image( + prompt: str, + api_key: str, + *, + model: str = DEFAULT_MODEL, + size: str = DEFAULT_SIZE, + negative_prompt: str = DEFAULT_NEGATIVE_PROMPT, + prompt_extend: bool = True, + watermark: bool = False, + timeout: int = 300, +) -> str: + payload = { + "model": model, + "input": { + "messages": [ + { + "role": "user", + "content": [{"text": prompt}], + } + ] + }, + "parameters": { + "negative_prompt": negative_prompt, + "prompt_extend": prompt_extend, + "watermark": watermark, + "size": size, + }, + } + body = json.dumps(payload, ensure_ascii=False).encode("utf-8") + request = urllib.request.Request( + DASHSCOPE_URL, + 
data=body, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + }, + method="POST", + ) + try: + with urllib.request.urlopen(request, timeout=timeout) as response: + data = json.loads(response.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="ignore") + raise RuntimeError(f"图片生成请求失败: HTTP {exc.code} {detail}") from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"图片生成请求失败: {exc.reason}") from exc + return parse_image_url(data) + + +def download_image(url: str, output_path: Path, *, timeout: int = 300) -> None: + ensure_parent(output_path) + try: + with urllib.request.urlopen(url, timeout=timeout) as response: + output_path.write_bytes(response.read()) + except urllib.error.URLError as exc: + raise RuntimeError(f"下载图片失败: {exc.reason}") from exc + + +def generate_image( + prompt: str, + output_path: str | Path, + *, + api_key=None, + model: str = DEFAULT_MODEL, + size: str = DEFAULT_SIZE, + negative_prompt: str = DEFAULT_NEGATIVE_PROMPT, + prompt_extend: bool = True, + watermark: bool = False, + timeout: int = 300, +) -> dict[str, Any]: + resolved_key = api_key or os.getenv("DASHSCOPE_API_KEY") or _load_env_key() + if not resolved_key: + raise RuntimeError("缺少 DASHSCOPE_API_KEY,无法生成图片。") + + output = Path(output_path) + image_url = request_image( + prompt=prompt, + api_key=resolved_key, + model=model, + size=size, + negative_prompt=negative_prompt, + prompt_extend=prompt_extend, + watermark=watermark, + timeout=timeout, + ) + download_image(image_url, output, timeout=timeout) + + metadata = { + "status": "success", + "prompt": prompt, + "model": model, + "size": size, + "image_url": image_url, + "output_path": str(output.resolve()), + } + metadata_path = output.with_suffix(output.suffix + ".json") + metadata_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8") + metadata["metadata_path"] = str(metadata_path.resolve()) + 
return metadata + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="图片生成脚本") + subparsers = parser.add_subparsers(dest="command", required=True) + + generate_parser = subparsers.add_parser("generate", help="按给定 prompt 生成图片") + generate_parser.add_argument("--prompt", required=True, help="图片生成 prompt") + generate_parser.add_argument("--output", required=True, help="输出图片路径") + generate_parser.add_argument("--size", default=DEFAULT_SIZE, help="图片尺寸,例如 1328*1328") + generate_parser.add_argument("--model", default=DEFAULT_MODEL, help="模型名") + generate_parser.add_argument("--api-key", default=None, help="可选,直接传入 API Key") + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + try: + if args.command != "generate": + raise RuntimeError(f"不支持的命令: {args.command}") + result = generate_image( + prompt=args.prompt, + output_path=args.output, + api_key=args.api_key, + model=args.model, + size=args.size, + ) + print(json.dumps(result, ensure_ascii=False, indent=2)) + return 0 + except Exception as exc: + print(json.dumps({"status": "failed", "error": str(exc)}, ensure_ascii=False, indent=2), file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/mermaid/__init__.py b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/__init__.py new file mode 100644 index 0000000..414a19e --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/__init__.py @@ -0,0 +1,5 @@ +"""Utilities for building and rendering Mermaid flowcharts.""" + +from .flowchart import FlowStep, MermaidFlowchart, render_mermaid_file + +__all__ = ["FlowStep", "MermaidFlowchart", "render_mermaid_file"] diff --git a/skills/data-flow-skill/data-flow-skill/scripts/mermaid/cli.py b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/cli.py new file mode 100644 index 0000000..7f15c82 --- 
/dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/cli.py @@ -0,0 +1,117 @@ +"""CLI for generating and exporting Mermaid flowcharts.""" + +from __future__ import annotations + +import argparse + +try: + from .flowchart import MermaidFlowchart, render_mermaid_file + from .echarts_export import render_echarts_file +except ImportError: + from flowchart import MermaidFlowchart, render_mermaid_file + from echarts_export import render_echarts_file + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Generate Mermaid flowcharts and export images.") + subparsers = parser.add_subparsers(dest="command", required=True) + + build_parser = subparsers.add_parser("build", help="Generate a Mermaid flowchart from ordered steps.") + build_parser.add_argument("--title", required=True, help="Chart title.") + build_parser.add_argument("--step", action="append", required=True, help="Ordered step text. Repeat this flag.") + build_parser.add_argument("--output", required=True, help="Output .mmd file path.") + build_parser.add_argument("--direction", default="TD", help="Mermaid direction, e.g. TD or LR.") + + render_parser = subparsers.add_parser("render", help="Render a Mermaid file to an image or PDF.") + render_parser.add_argument("--input", required=True, help="Input .mmd file path.") + render_parser.add_argument("--output", required=True, help="Output file path, e.g. 
.png/.svg/.pdf.") + render_parser.add_argument("--theme", default="default", help="Mermaid theme.") + render_parser.add_argument("--background", default="white", help="Background color.") + render_parser.add_argument("--scale", type=int, default=2, help="Render scale.") + render_parser.add_argument("--width", type=int, help="Optional canvas width.") + render_parser.add_argument("--height", type=int, help="Optional canvas height.") + + export_parser = subparsers.add_parser("export", help="Generate a Mermaid file and immediately render it.") + export_parser.add_argument("--title", required=True, help="Chart title.") + export_parser.add_argument("--step", action="append", required=True, help="Ordered step text. Repeat this flag.") + export_parser.add_argument("--mmd-output", required=True, help="Output .mmd file path.") + export_parser.add_argument("--image-output", required=True, help="Rendered output path.") + export_parser.add_argument("--direction", default="TD", help="Mermaid direction, e.g. TD or LR.") + export_parser.add_argument("--theme", default="default", help="Mermaid theme.") + export_parser.add_argument("--background", default="white", help="Background color.") + export_parser.add_argument("--scale", type=int, default=2, help="Render scale.") + export_parser.add_argument("--width", type=int, help="Optional canvas width.") + export_parser.add_argument("--height", type=int, help="Optional canvas height.") + + echarts_parser = subparsers.add_parser("echarts", help="Render an ECharts JSON config to an image.") + echarts_parser.add_argument("--input", required=True, help="Input ECharts JSON config file path.") + echarts_parser.add_argument("--output", required=True, help="Output image file path.") + echarts_parser.add_argument( + "--format", + choices=["png", "jpeg", "svg", "pdf"], + help="Output format. 
Inferred from output path if not specified.", + ) + echarts_parser.add_argument( + "--timeout", + type=int, + default=60, + help="Rendering timeout in seconds (default: 60).", + ) + + return parser + + +def main() -> int: + parser = build_parser() + args = parser.parse_args() + + if args.command == "build": + flowchart = MermaidFlowchart.from_steps(args.title, args.step, direction=args.direction) + path = flowchart.write(args.output) + print(path) + return 0 + + if args.command == "render": + path = render_mermaid_file( + args.input, + args.output, + theme=args.theme, + background_color=args.background, + scale=args.scale, + width=args.width, + height=args.height, + ) + print(path) + return 0 + + if args.command == "export": + flowchart = MermaidFlowchart.from_steps(args.title, args.step, direction=args.direction) + mmd_path = flowchart.write(args.mmd_output) + image_path = render_mermaid_file( + mmd_path, + args.image_output, + theme=args.theme, + background_color=args.background, + scale=args.scale, + width=args.width, + height=args.height, + ) + print(image_path) + return 0 + + if args.command == "echarts": + path = render_echarts_file( + args.input, + args.output, + format=args.format, + cli_timeout_seconds=args.timeout, + ) + print(path) + return 0 + + parser.error("Unsupported command.") + return 2 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/mermaid/echarts_export.py b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/echarts_export.py new file mode 100644 index 0000000..5ce6080 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/echarts_export.py @@ -0,0 +1,239 @@ +"""Export ECharts configurations to image formats.""" + +from __future__ import annotations + +import json +import shutil +import subprocess +import tempfile +from pathlib import Path + + +def is_echarts_cli_available() -> bool: + return shutil.which("npx") is not None + + +def 
render_echarts_file( + input_path: str | Path, + output_path: str | Path, + *, + format: str | None = None, + cli_timeout_seconds: int = 60, +) -> Path: + """Render an ECharts JSON config file to an image. + + Args: + input_path: Path to ECharts JSON config file. + output_path: Output image file path. + format: Output format (png, jpeg, svg, pdf). Inferred from output_path if None. + cli_timeout_seconds: Timeout for CLI rendering. + + Returns: + Path to the rendered output file. + """ + input_file = Path(input_path) + output_file = Path(output_path) + output_file.parent.mkdir(parents=True, exist_ok=True) + + if format is None: + format = output_file.suffix.lstrip(".").lower() + if not format: + format = "png" + + if format not in {"png", "jpeg", "jpg", "svg", "pdf"}: + raise ValueError(f"Unsupported output format: {format}") + + if format in {"jpeg", "jpg"}: + format = "jpeg" + + if not input_file.exists(): + raise FileNotFoundError(f"Input file not found: {input_file}") + + config = json.loads(input_file.read_text(encoding="utf-8")) + + html_content = _build_echarts_html(config, format) + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".html", delete=False, encoding="utf-8" + ) as f: + f.write(html_content) + temp_html = Path(f.name) + + try: + _render_via_puppeteer(temp_html, output_file, format=format, timeout=cli_timeout_seconds) + except FileNotFoundError: + if shutil.which("node") is None: + raise RuntimeError( + "Node.js is not installed. Please install Node.js to render ECharts images." 
+ ) from None + raise + except subprocess.TimeoutExpired: + raise RuntimeError(f"Rendering timed out after {cli_timeout_seconds} seconds.") + except subprocess.CalledProcessError as exc: + stderr = (exc.stderr or "").strip() + stdout = (exc.stdout or "").strip() + error_msg = stderr or stdout or "Unknown error" + raise RuntimeError(f"ECharts rendering failed: {error_msg}") from exc + finally: + temp_html.unlink(missing_ok=True) + + return output_file + + +def _build_echarts_html(config: dict, format: str) -> str: + width = config.get("width", 800) + height = config.get("height", 600) + + config_json = json.dumps(config, ensure_ascii=False, separators=(",", ":")) + + background_color = config.get("backgroundColor", "#ffffff") + + return f""" + + + + + + + +
+ + +""" + + +def _render_via_puppeteer( + html_path: Path, + output_path: Path, + *, + format: str, + timeout: int, +) -> None: + script = f""" +const puppeteer = require('puppeteer'); +const path = require('path'); + +(async () => {{ + const browser = await puppeteer.launch({{ + headless: 'new', + args: ['--no-sandbox', '--disable-setuid-sandbox'] + }}); + const page = await browser.newPage(); + + const fileUrl = 'file://' + path.resolve('{html_path}'); + await page.goto(fileUrl, {{ waitUntil: 'networkidle0', timeout: {timeout * 1000} }}); + + // Wait for chart to render + await new Promise(r => setTimeout(r, 1000)); + + const chart = await page.$('#chart'); + if (!chart) throw new Error('Chart element not found'); + + const boundingBox = await chart.boundingBox(); + + if ('{format}' === 'svg') {{ + const svg = await page.evaluate(() => {{ + const canvas = document.querySelector('#chart canvas'); + if (!canvas) return null; + const svgData = new XMLSerializer().serializeToString(canvas); + return svgData; + }}); + if (svg) {{ + require('fs').writeFileSync('{output_path}', svg); + }} else {{ + // Fallback: get SVG directly + const content = await page.content(); + const svgMatch = content.match(/]*>.*<\\/svg>/s); + if (svgMatch) {{ + require('fs').writeFileSync('{output_path}', svgMatch[0]); + }} else {{ + throw new Error('SVG export not supported for this chart type'); + }} + }} + }} else {{ + await page.setViewport({{ + width: Math.ceil(boundingBox.width), + height: Math.ceil(boundingBox.height) + }}); + + await page.evaluate(() => {{ + const chartInst = document.querySelector('#chart canvas'); + if (chartInst && chartInst.style) {{ + chartInst.style.background = 'white'; + }} + }}); + + const screenshotOptions = {{ + path: '{output_path}', + type: '{format}', + fullPage: false, + clip: {{ + x: boundingBox.x, + y: boundingBox.y, + width: boundingBox.width, + height: boundingBox.height + }} + }}; + + if ('{format}' === 'jpeg' || '{format}' === 'jpg') {{ + 
screenshotOptions.type = 'jpeg'; + screenshotOptions.quality = 95; + }} + + await page.screenshot(screenshotOptions); + }} + + await browser.close(); + process.exit(0); +}})().catch(err => {{ + console.error(err); + process.exit(1); +}}); +""" + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".js", delete=False, encoding="utf-8" + ) as f: + f.write(script) + script_path = Path(f.name) + + try: + result = subprocess.run( + ["node", str(script_path)], + capture_output=True, + text=True, + timeout=timeout, + ) + if result.returncode != 0: + stderr = result.stderr.strip() + stdout = result.stdout.strip() + error_msg = stderr or stdout or "Unknown error" + raise RuntimeError(f"Puppeteer rendering failed: {error_msg}") + finally: + script_path.unlink(missing_ok=True) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/mermaid/flowchart.py b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/flowchart.py new file mode 100644 index 0000000..16938f1 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/mermaid/flowchart.py @@ -0,0 +1,314 @@ +"""Build and render Mermaid flowcharts.""" + +from __future__ import annotations + +import base64 +import json +import re +import shutil +import subprocess +import zlib +import urllib.error +import urllib.parse +import urllib.request +from dataclasses import dataclass, field +from pathlib import Path +from typing import Iterable + + +def _slugify(value: str) -> str: + normalized = re.sub(r"[^0-9A-Za-z\u4e00-\u9fff]+", "_", value.strip()) + normalized = normalized.strip("_") + return normalized or "node" + + +def _escape_mermaid_text(value: str) -> str: + return value.replace('"', "'").replace("\n", "
") + + +@dataclass(slots=True) +class FlowStep: + """Single step in a Mermaid flowchart.""" + + text: str + node_id: str | None = None + shape: str = "rect" + metadata: dict[str, str] = field(default_factory=dict) + + def resolved_node_id(self, index: int) -> str: + if self.node_id: + return self.node_id + return f"step_{index}_{_slugify(self.text)}" + + +class MermaidFlowchart: + """Composable Mermaid flowchart builder.""" + + _SHAPE_TEMPLATES = { + "rect": '[{label}]', + "round": '({label})', + "stadium": '([{label}])', + "subroutine": '[[{label}]]', + "cylindrical": '[( {label} )]', + "circle": '(({label}))', + "diamond": '{{{label}}}', + "hexagon": '{{{{{label}}}}}', + "parallelogram": '[/ {label} /]', + } + + def __init__( + self, + title: str, + direction: str = "TD", + class_definitions: Iterable[str] | None = None, + ) -> None: + self.title = title + self.direction = direction + self.class_definitions = list(class_definitions or []) + self.steps: list[FlowStep] = [] + self.links: list[tuple[str, str, str | None]] = [] + + def add_step(self, text: str, *, node_id: str | None = None, shape: str = "rect") -> FlowStep: + step = FlowStep(text=text, node_id=node_id, shape=shape) + self.steps.append(step) + return step + + def add_link(self, source_id: str, target_id: str, label: str | None = None) -> None: + self.links.append((source_id, target_id, label)) + + @classmethod + def from_steps( + cls, + title: str, + steps: Iterable[str], + *, + direction: str = "TD", + first_shape: str = "round", + last_shape: str = "stadium", + ) -> "MermaidFlowchart": + flowchart = cls(title=title, direction=direction) + step_items = [step for step in steps if step.strip()] + for index, step_text in enumerate(step_items): + if index == 0: + shape = first_shape + elif index == len(step_items) - 1: + shape = last_shape + else: + shape = "rect" + flowchart.add_step(step_text, shape=shape) + return flowchart + + def to_mermaid(self) -> str: + lines = [ + "---", + f"title: 
{self.title}", + "---", + f"flowchart {self.direction}", + ] + + resolved_steps: list[tuple[str, FlowStep]] = [] + for index, step in enumerate(self.steps, start=1): + node_id = step.resolved_node_id(index) + label = _escape_mermaid_text(step.text) + template = self._SHAPE_TEMPLATES.get(step.shape, self._SHAPE_TEMPLATES["rect"]) + lines.append(f" {node_id}{template.format(label=label)}") + resolved_steps.append((node_id, step)) + + if self.links: + for source_id, target_id, label in self.links: + connector = f' -->|{_escape_mermaid_text(label)}| ' if label else " --> " + lines.append(f" {source_id}{connector}{target_id}") + else: + for (source_id, _), (target_id, _) in zip(resolved_steps, resolved_steps[1:]): + lines.append(f" {source_id} --> {target_id}") + + lines.extend(f" {class_definition}" for class_definition in self.class_definitions) + return "\n".join(lines) + "\n" + + def write(self, output_path: str | Path) -> Path: + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(self.to_mermaid(), encoding="utf-8") + return path + + +def render_mermaid_file( + input_path: str | Path, + output_path: str | Path, + *, + theme: str = "default", + background_color: str = "white", + scale: int = 2, + width: int | None = None, + height: int | None = None, + cli_timeout_seconds: int = 45, + prefer_remote: bool = False, +) -> Path: + input_file = Path(input_path) + output_file = Path(output_path) + output_file.parent.mkdir(parents=True, exist_ok=True) + diagram_text = input_file.read_text(encoding="utf-8") + + if prefer_remote: + _render_via_mermaid_ink( + diagram_text, + output_file, + theme=theme, + background_color=background_color, + scale=scale, + width=width, + height=height, + ) + return output_file + + if shutil.which("npx") is None: + _render_via_mermaid_ink( + diagram_text, + output_file, + theme=theme, + background_color=background_color, + scale=scale, + width=width, + height=height, + ) + return output_file + + config = 
{ + "theme": theme, + "flowchart": {"curve": "basis"}, + "fontFamily": "PingFang SC, Hiragino Sans GB, Microsoft YaHei, sans-serif", + } + config_path = output_file.with_suffix(".mermaid-config.json") + config_path.write_text(json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8") + + command = [ + "npx", + "-y", + "@mermaid-js/mermaid-cli", + "-i", + str(input_file), + "-o", + str(output_file), + "-t", + theme, + "-b", + background_color, + "-s", + str(scale), + "-c", + str(config_path), + ] + if width is not None: + command.extend(["-w", str(width)]) + if height is not None: + command.extend(["-H", str(height)]) + + try: + subprocess.run( + command, + check=True, + capture_output=True, + text=True, + timeout=cli_timeout_seconds, + ) + except FileNotFoundError as exc: + _render_via_mermaid_ink( + diagram_text, + output_file, + theme=theme, + background_color=background_color, + scale=scale, + width=width, + height=height, + ) + return output_file + except subprocess.TimeoutExpired: + _render_via_mermaid_ink( + diagram_text, + output_file, + theme=theme, + background_color=background_color, + scale=scale, + width=width, + height=height, + ) + return output_file + except subprocess.CalledProcessError as exc: + stderr = (exc.stderr or "").strip() + stdout = (exc.stdout or "").strip() + local_error = stderr or stdout or "未知错误" + try: + _render_via_mermaid_ink( + diagram_text, + output_file, + theme=theme, + background_color=background_color, + scale=scale, + width=width, + height=height, + ) + return output_file + except RuntimeError as remote_exc: + raise RuntimeError(f"Mermaid 导出失败。本地错误: {local_error};远程错误: {remote_exc}") from exc + finally: + config_path.unlink(missing_ok=True) + + return output_file + + +def _render_via_mermaid_ink( + diagram_text: str, + output_file: Path, + *, + theme: str, + background_color: str, + scale: int, + width: int | None, + height: int | None, +) -> None: + extension = output_file.suffix.lower().lstrip(".") + if extension 
not in {"png", "svg", "pdf"}: + raise RuntimeError(f"远程渲染暂不支持输出格式: {output_file.suffix or ''}") + + payload = json.dumps( + {"code": diagram_text, "mermaid": {"theme": theme}}, + ensure_ascii=False, + separators=(",", ":"), + ).encode("utf-8") + encoded = "pako:" + urllib.parse.quote( + base64.urlsafe_b64encode(zlib.compress(payload, 9)).decode("ascii").rstrip("="), + safe=":_-", + ) + + if extension == "svg": + endpoint = "svg" + query_params: dict[str, str] = {} + elif extension == "png": + endpoint = "img" + query_params = {"type": "png"} + else: + endpoint = "pdf" + query_params = {"fit": ""} + + if background_color: + query_params["bgColor"] = f"!{background_color}" if background_color.isalpha() else background_color + if width is not None: + query_params["width"] = str(width) + if height is not None: + query_params["height"] = str(height) + if width is not None or height is not None: + query_params["scale"] = str(max(1, min(scale, 3))) + + query_string = urllib.parse.urlencode(query_params, doseq=False) + url = f"https://mermaid.ink/{endpoint}/{encoded}" + if query_string: + url = f"{url}?{query_string}" + + request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) + try: + with urllib.request.urlopen(request, timeout=30) as response: + payload = response.read() + except urllib.error.URLError as exc: + raise RuntimeError(f"远程渲染服务不可用: {exc}") from exc + + output_file.write_bytes(payload) diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_memevolve.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_memevolve.py new file mode 100644 index 0000000..6ea069c --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_memevolve.py @@ -0,0 +1,113 @@ +""" +复现 image1: MemEvolve 论文风格柱状图 +特征:配对柱(baseline vs method)+ 箭头 + 红色百分比标注 + 虚线参考线 +来源:MemEvolve: Meta-Evolution of Agent Memory Systems +""" + +import matplotlib.pyplot as plt +import 
matplotlib.patches as mpatches +import matplotlib.ticker as ticker +import numpy as np + +# ── 预分析结论 ───────────────────────────────────────────── +# 字体:serif,双层 'a',衬线精细 → Computer Modern 风格 +# matplotlib 中最接近的可用字体:STIXGeneral / DejaVu Serif +# 加粗:标题(bold) | 增益标注(bold) | 轴标签/刻度(normal) +# 间距:两柱紧贴,gap≈0,柱宽约占 group 的 28% +# 分辨率:300 dpi +plt.rcParams.update({ + 'font.family': 'serif', + 'font.serif': ['STIXGeneral', 'DejaVu Serif', 'Times New Roman'], + 'mathtext.fontset': 'stix', +}) + +# ── 颜色系统 ────────────────────────────────────────────── +COLOR_BASELINE = '#A8C8E8' # 浅钢蓝,baseline 柱 +COLOR_METHOD = '#1B3D6E' # 深海军蓝,method 柱 +COLOR_DELTA = '#CC2200' # 红色,标注增益 + +# ── 数据 ───────────────────────────────────────────────── +panels = [ + { + 'title': 'OWL-Workforce', # emoji 字体不支持,用文字替代 + 'groups': ['Web', 'xBench', 'TaskCraft', 'GAIA'], + 'baseline': [58.1, 55.2, 58.7, 59.3], + 'method': [62.3, 61.2, 65.5, 61.0], + 'delta': ['+7.1%', '+10.9%', '+11.9%', '+2.7%'], + 'ylim': (40, 71), # 原图左图 Y 轴 40-70 + }, + { + 'title': 'CK-Pro', + 'groups': ['Web', 'xBench', 'TaskCraft', 'GAIA'], + 'baseline': [61.2, 55.8, 63.8, 58.1], + 'method': [63.8, 64.8, 67.8, 63.1], + 'delta': ['+4.2%', '+16.1%', '+4.8%', '+8.4%'], + 'ylim': (40, 76), # 原图右图 Y 轴 40-75 + }, +] + +# ── 画布 ───────────────────────────────────────────────── +fig, axes = plt.subplots(1, 2, figsize=(10, 4.5), sharey=False) +fig.subplots_adjust(wspace=0.35) + +BAR_W = 0.28 +GAP = 0.01 # 两柱几乎紧贴(原图约为 0) +ARROW_KW = dict(arrowstyle='->', color='black', lw=1.2) + +for ax, panel in zip(axes, panels): + groups = panel['groups'] + baseline = np.array(panel['baseline']) + method = np.array(panel['method']) + delta = panel['delta'] + n = len(groups) + x = np.arange(n) + + # 柱体 + bars_b = ax.bar(x - (BAR_W + GAP) / 2, baseline, width=BAR_W, + color=COLOR_BASELINE, zorder=3) + bars_m = ax.bar(x + (BAR_W + GAP) / 2, method, width=BAR_W, + color=COLOR_METHOD, zorder=3) + + # 虚线参考线(baseline 高度) + for i, (bl, me) in 
enumerate(zip(baseline, method)): + # 水平虚线从 baseline 柱顶延伸到 method 柱顶 + ax.plot([x[i] - BAR_W, x[i] + BAR_W + GAP / 2], + [bl, bl], color='black', lw=0.9, ls='--', zorder=4) + + # 箭头(baseline 顶 → method 顶) + ax.annotate('', xy=(x[i] + (BAR_W + GAP) / 2, me - 0.3), + xytext=(x[i] + (BAR_W + GAP) / 2, bl + 0.3), + arrowprops=ARROW_KW, zorder=5) + + # 红色百分比标注 + ax.text(x[i] + (BAR_W + GAP) / 2, me + 0.6, + delta[i], color=COLOR_DELTA, + ha='center', va='bottom', fontsize=9.5, fontweight='bold') + + # 轴装饰:原图轴标签明显加粗 + ax.set_xticks(x) + ax.set_xticklabels(groups, fontsize=10.5, fontweight='bold') + ax.set_ylabel('Accuracy (Pass@1)', fontsize=10.5, fontweight='bold') + ax.set_ylim(*panel['ylim']) + ax.yaxis.set_major_locator(plt.MultipleLocator(5)) + + # 四边框全显,线宽偏厚(原图有明显框线) + for spine in ax.spines.values(): + spine.set_linewidth(1.5) + spine.set_color('black') + ax.tick_params(length=0) + ax.set_axisbelow(True) + + # 标题(左上角):粗体 serif,颜色深蓝对应原图 + ax.text(0.04, 0.97, panel['title'], transform=ax.transAxes, + fontsize=12, fontweight='bold', va='top', ha='left', + color='#003F6C', fontfamily='serif') + +from pathlib import Path + +output_path = Path('output/figures/bar_memevolve_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +plt.savefig(output_path, + dpi=300, bbox_inches='tight', facecolor='white') +plt.close() +print('saved: bar_memevolve_repro.png') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_spice.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_spice.py new file mode 100644 index 0000000..f4432b2 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bar_spice.py @@ -0,0 +1,169 @@ +""" +复现 image5: SPICE 论文风格柱状图 +特征:分组柱 + 斜线填充(主方法)+ 柱顶数值标注(最优加粗)+ 灰色网格 +来源:SPICE: Self-play in corpus environments improves reasoning +""" + +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np + +# ── 预分析结论 
───────────────────────────────────────────── +# 字体:原图更接近 LaTeX/Computer Modern,而不是 Times +# 这里直接启用 usetex,优先还原论文图常见的 TeX 字体气质 +# 加粗:面板标题(normal) | 图例 SPICE 条目(bold) | 其他图例(normal) +# 主方法数值(bold+深红) | 其他数值(normal+黑) +# 间距:三柱较细,组间留白明显,子图整体更扁,接近原图长宽比 +# 边框:四边框都保留,且柱子层级低于边框 +# 分辨率:300 dpi +plt.rcParams.update({ + 'text.usetex': True, + 'font.family': 'serif', + 'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'], + 'axes.unicode_minus': False, + 'hatch.color': 'white', # 原图是白色斜线刻在红底上 + 'hatch.linewidth': 1.4, +}) + +# ── 颜色 & 填充 ─────────────────────────────────────────── +# 左图 (ablation) +COLORS_ABL = ['#FFB695', '#FF7F5E', '#D00000'] # 浅橙/中橙/正红(对齐原图) +HATCHES_ABL = ['', '', '//'] +LABELS_ABL = ['SPICE (Fixed Challenger)', 'SPICE (No Corpus)', 'SPICE'] + +# 右图 (comparison) +COLORS_CMP = ['#D3D3D3', '#A9A9A9', '#D00000'] # 浅灰/中灰/正红(对齐原图) +HATCHES_CMP = ['', '', '//'] +LABELS_CMP = ['R-Zero', 'Absolute Zero', 'SPICE'] + +# ── 数据 ───────────────────────────────────────────────── +benchmarks = ['MATH500', "AIME'25", 'GPQA-Diamond', 'MMLU-Pro'] + +data_abl = { + 'SPICE (Fixed Challenger)': [68.2, 6.7, 26.3, 51.6], + 'SPICE (No Corpus)': [72.6, 12.3, 31.8, 53.7], + 'SPICE': [78.0, 19.1, 39.4, 58.1], +} +data_cmp = { + 'R-Zero': [72.6, 5.2, 27.8, 53.7], + 'Absolute Zero': [76.2, 13.4, 35.3, 52.6], + 'SPICE': [78.0, 19.1, 39.4, 58.1], +} + +BEST_METHOD = 'SPICE' # 柱顶数字加粗的方法 + + +def draw_panel( + ax, + data_dict, + colors, + hatches, + labels, + title, + x_positions, + total_w, + xlim, + legend_anchor, +): + n_groups = len(benchmarks) + n_methods = len(labels) + x = np.array(x_positions) + bar_w = total_w / n_methods + + for i, (label, color, hatch) in enumerate(zip(labels, colors, hatches)): + vals = data_dict[label] + offset = (i - n_methods / 2 + 0.5) * bar_w + bars = ax.bar(x + offset, vals, width=bar_w, + color=color, hatch=hatch, + edgecolor='white', + linewidth=0.8, zorder=2, label=label) + + is_best = (label == BEST_METHOD) + for bar, v in 
zip(bars, vals): + ax.text(bar.get_x() + bar.get_width() / 2, + v + 0.5, f'{v}', + ha='center', va='bottom', + fontsize=8.7, + fontweight='bold' if is_best else 'normal', + color='black', # 原图数值全部黑色,仅粗细区分 + zorder=3) + + # 轴 + ax.set_xticks(x) + ax.set_xticklabels(benchmarks, fontsize=10.8) + ax.set_xlabel('Benchmark', fontsize=11.2) + ax.set_ylabel(r'Accuracy (\%)', fontsize=11.2) + ax.set_ylim(0, 85) + ax.set_xlim(*xlim) + ax.set_title(title, fontsize=13.2, pad=5) + + # 网格(仅 y 轴浅灰实线) + ax.yaxis.grid(True, color='#EBEBEB', linewidth=0.7, linestyle='--', zorder=0) + ax.set_axisbelow(True) + + # 原图为开口式坐标轴:只保留左/下 spine + for side, spine in ax.spines.items(): + if side in ('top', 'right'): + spine.set_visible(False) + else: + spine.set_linewidth(0.9) + spine.set_color('#333333') + spine.set_zorder(4) + + ax.tick_params(length=0, labelsize=10.2) + + # 图例 + handles = [mpatches.Patch(facecolor=c, hatch=h, + edgecolor='white', linewidth=0.8, + label=l) + for l, c, h in zip(labels, colors, hatches)] + leg = ax.legend(handles=handles, fontsize=8.9, loc='upper right', + bbox_to_anchor=legend_anchor, + framealpha=1.0, facecolor='white', + edgecolor='#C8C8C8', fancybox=False, + borderpad=0.28, labelspacing=0.26, + handlelength=1.7, handletextpad=0.45, + borderaxespad=0.28) + # 只将主方法(BEST_METHOD)的图例文字加粗 + for text in leg.get_texts(): + if text.get_text() == BEST_METHOD: + text.set_fontweight('bold') + + +# ── 画布 ───────────────────────────────────────────────── +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12.8, 4.35)) +fig.subplots_adjust(left=0.05, right=0.985, bottom=0.15, top=0.86, wspace=0.16) + +draw_panel( + ax1, + data_abl, + COLORS_ABL, + HATCHES_ABL, + LABELS_ABL, + '(a) SPICE Ablations', + x_positions=[0.00, 1.00, 2.00, 3.00], + total_w=0.78, + xlim=(-0.56, 3.56), + legend_anchor=(0.992, 0.986), +) +draw_panel( + ax2, + data_cmp, + COLORS_CMP, + HATCHES_CMP, + LABELS_CMP, + '(b) SPICE vs Baselines', + x_positions=[0.00, 1.00, 2.00, 3.00], + total_w=0.78, + 
xlim=(-0.56, 3.56), + legend_anchor=(0.992, 0.986), +) + +from pathlib import Path + +output_path = Path('output/figures/bar_spice_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +plt.savefig(output_path, + dpi=300, facecolor='white') +plt.close() +print('saved: bar_spice_repro.png') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/box_plot.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/box_plot.py new file mode 100644 index 0000000..9b4eef4 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/box_plot.py @@ -0,0 +1,113 @@ +""" +箱线图(box plot) +特征:多组数据分布对比,4边可见箱体,y轴浅灰网格,中位线红色加粗 +来源:学术统计图表风格,复用 bar_grouped_hatch 配色体系 +""" + +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + +# ── 全局样式(复用 bar_grouped_hatch 的 serif + usetex 风格)───────────── +plt.rcParams.update({ + 'text.usetex': True, + 'font.family': 'serif', + 'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'], + 'axes.unicode_minus': False, +}) + +# ── 颜色 ────────────────────────────────────────────────────── +C_BOX = '#5499C7' # 箱体主色(蓝) +C_MED = '#CC2200' # 中位线(红,与 bar_paired_delta delta 色一致) +C_OUTL = '#D651A0' # 异常值(粉,与 scatter_tsne 一致) +C_WHISK = '#333333' # 须线颜色 + +# ── 数据(请替换为你的数据)─────────────────────────────────── +# 每组数据可以是 list 或 np.array +data = { + 'Method A': [23.5, 25.1, 24.8, 26.2, 27.0, 25.5, 24.9, 26.8, 25.0, 24.3], + 'Method B': [28.3, 29.1, 27.8, 30.2, 29.5, 28.9, 30.1, 29.0, 28.7, 29.3], + 'Ours': [31.2, 32.5, 31.8, 33.1, 32.0, 31.5, 32.8, 33.4, 31.9, 32.2], +} + +labels = list(data.keys()) +values = [np.array(v) for v in data.values()] + +# ── 参数配置 ───────────────────────────────────────────────── +TITLE = r'\textbf{Distribution Comparison}' +XLABEL = r'\textbf{Method}' +YLABEL = r'\textit{Accuracy (\%)}' +YLIM = (15, 40) +BOX_W = 0.35 + +# ── 画布 ───────────────────────────────────────────────────── +fig, ax = 
plt.subplots(figsize=(7.5, 5.0)) + +x_pos = np.arange(len(labels)) + +# 画箱线图 +bp = ax.boxplot( + values, + positions=x_pos, + widths=BOX_W, + patch_artist=True, # 允许填充颜色 + showmeans=True, # 显示均值 + meanline=False, # 均值用 marker,不用线 + whiskerprops=dict(color=C_WHISK, linewidth=1.2), + capprops=dict(color=C_WHISK, linewidth=1.2), + flierprops=dict(marker='o', markerfacecolor=C_OUTL, markersize=5, + markeredgecolor=C_WHISK, markeredgewidth=0.5), + medianprops=dict(color=C_MED, linewidth=2.0), + meanprops=dict(marker='D', markerfacecolor='white', markeredgecolor=C_WHISK, + markersize=5, markeredgewidth=0.8), +) + +# 填充箱体颜色 +for patch in bp['boxes']: + patch.set_facecolor(C_BOX) + patch.set_alpha(0.75) + patch.set_edgecolor(C_WHISK) + patch.set_linewidth(1.0) + +# 隐藏 top/right spine(学术风格) +ax.spines['top'].set_visible(False) +ax.spines['right'].set_visible(False) +ax.spines['left'].set_color('#333333') +ax.spines['bottom'].set_color('#333333') +ax.spines['left'].set_linewidth(1.2) +ax.spines['bottom'].set_linewidth(1.2) + +# y 轴网格(与 bar_grouped_hatch 一致) +ax.yaxis.grid(True, color='#EBEBEB', linewidth=0.7, linestyle='--', zorder=0) +ax.set_axisbelow(True) + +# 刻度 +ax.tick_params(length=4, direction='in', labelsize=10) + +# 标签 +ax.set_xticks(x_pos) +ax.set_xticklabels(labels, fontsize=11) +ax.set_xlabel(XLABEL, fontsize=11) +ax.set_ylabel(YLABEL, fontsize=11) +ax.set_ylim(*YLIM) + +# 标题 +ax.set_title(TITLE, fontsize=13, pad=8) + +# 图例(均值 marker 说明) +from matplotlib.lines import Line2D +legend_elements = [ + Line2D([0], [0], marker='D', color='white', markerfacecolor='white', + markeredgecolor=C_WHISK, markersize=5, markeredgewidth=0.8, + label=r'\textit{Mean}'), + Line2D([0], [0], color=C_MED, linewidth=2.0, label=r'\textit{Median}'), +] +ax.legend(handles=legend_elements, fontsize=9, loc='upper right', + frameon=True, facecolor='white', edgecolor='#CCCCCC') + +# ── 保存 ───────────────────────────────────────────────────── +output_path = 
Path('output/figures/box_plot_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig(output_path, dpi=300, facecolor='white', bbox_inches='tight') +plt.close(fig) +print(f'✅ saved: {output_path}') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bubble_chart.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bubble_chart.py new file mode 100644 index 0000000..1858a01 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/bubble_chart.py @@ -0,0 +1,149 @@ +""" +气泡图(bubble chart) +特征:x/y 平面 + size 编码第三维,圆形气泡,serif + usetex 风格 +来源:继承 scatter_tsne_cluster 的聚类表达,加入第三维 size +""" + +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + +# ── 全局样式(与 scatter_tsne_cluster 一致)─────────────────── +plt.rcParams.update({ + 'text.usetex': True, + 'font.family': 'serif', + 'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'], + 'axes.unicode_minus': False, +}) + +# ── 颜色(沿用 scatter_tsne 的语义色阶)────────────────────── +C_MAIN = '#1B3D6E' # 主类(深蓝) +C_CLASS = ['#6A4C93', '#D651A0', '#FF8A65', '#FFB74D', '#C888E8'] + +# ── 数据(请替换为你的数据)─────────────────────────────────── +# x, y = 两个维度(位置),s = size(第三维),label = 图例标签,color = 颜色 +bubbles = [ + {'x': 10, 'y': 20, 's': 80, 'label': 'Method A'}, + {'x': 15, 'y': 35, 's': 120, 'label': 'Method B'}, + {'x': 25, 'y': 15, 's': 200, 'label': 'Method C'}, + {'x': 30, 'y': 45, 's': 60, 'label': 'Method D'}, + {'x': 40, 'y': 30, 's': 150, 'label': 'Method E'}, + {'x': 35, 'y': 22, 's': 90, 'label': 'Method F'}, +] + +# 主方法名称(图例会加粗) +MAIN_METHOD = 'Method C' + +# ── 参数配置 ───────────────────────────────────────────────── +TITLE = r'\textbf{Bubble Chart: Accuracy vs Efficiency}' +XLABEL = r'\textit{Accuracy (\%)}' +YLABEL = r'\textit{Efficiency (speedup)}' +XLIM = (5, 50) +YLIM = (10, 55) +SIZE_MIN = 60 +SIZE_MAX = 600 +SIZE_SCALE = 'sqrt' # 推荐 sqrt 避免大值主导 + +# ── size 映射函数 
def scale_size(s, vmin, vmax, smin, smax, mode='sqrt'):
    """Map a data value *s* to a marker size between *smin* and *smax*.

    Args:
        s: Raw value to encode as bubble size.
        vmin: Smallest raw value of the data set (maps to *smin*).
        vmax: Largest raw value of the data set (maps to *smax*).
        smin: Marker size produced for ``s == vmin``.
        smax: Marker size produced for ``s == vmax``.
        mode: ``'sqrt'`` applies a square root to the normalised fraction
            before scaling; any other value leaves the mapping linear.

    Returns:
        The marker size. When ``vmin == vmax`` the normalised fraction is
        taken as 0.5 (before the optional square root).
    """
    frac = (s - vmin) / (vmax - vmin) if vmax != vmin else 0.5
    # Clamp so a value outside [vmin, vmax] cannot yield sqrt(negative) = NaN
    # or a size outside [smin, smax]; in-range values are unaffected.
    frac = np.clip(frac, 0.0, 1.0)
    if mode == 'sqrt':
        frac = np.sqrt(frac)
    return smin + frac * (smax - smin)
color = C_MAIN if b['label'] == MAIN_METHOD else C_CLASS[list(bubbles).index(b) % len(C_CLASS)] + is_main = b['label'] == MAIN_METHOD + size = scale_size(b['s'], vmin, vmax, SIZE_MIN, SIZE_MAX, SIZE_SCALE) + legend_elements.append( + Line2D([0], [0], marker='o', color='white', + markerfacecolor=color, markersize=np.sqrt(size) / 3.5, + markeredgecolor='white', markeredgewidth=0.5, + label=b['label'], + linewidth=0) + ) + +leg = ax.legend( + handles=legend_elements, + fontsize=9, + loc='upper left', + bbox_to_anchor=(1.01, 1.0), + frameon=True, + facecolor='white', + edgecolor='#CCCCCC', + labelspacing=0.4, + handlelength=1.0, + borderaxespad=0.3, +) +for text in leg.get_texts(): + if text.get_text() == MAIN_METHOD: + text.set_fontweight('bold') + +# ── 保存 ───────────────────────────────────────────────────── +output_path = Path('output/figures/bubble_chart_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig(output_path, dpi=300, facecolor='white', bbox_inches='tight') +plt.close(fig) +print(f'✅ saved: {output_path}') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/calendar_heatmap.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/calendar_heatmap.py new file mode 100644 index 0000000..2213768 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/calendar_heatmap.py @@ -0,0 +1,208 @@ +""" +日历热力图(calendar heatmap) +特征:六边形网格(hexbin),按日/周展示时序活跃度,蓝色冷色阶 +来源:参考 GitHub contribution graph 风格,适合展示每日活跃度、提交量等 +""" + +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np +import pandas as pd +from matplotlib.colors import BoundaryNorm +from matplotlib.cm import ScalarMappable +import matplotlib.colorbar as cbar + +# ── 全局样式 ───────────────────────────────────────────────────── +plt.rcParams.update({ + 'font.family': 'sans-serif', + 'font.sans-serif': ['DejaVu Sans', 'Arial', 'Helvetica'], + 'text.usetex': False, 
+}) + +# ── 颜色 ──────────────────────────────────────────────────────── +C_LOW = '#EBF5FB' # 最浅蓝(接近 0) +C_MID = '#5499C7' # 中蓝 +C_HIGH = '#1B3D6E' # 深蓝(最高值) +C_WEEKEND = '#F5F5F5' # 周末列底色 + +# ── 数据(请替换为你的数据)─────────────────────────────────── +# 格式:{ 'YYYY-MM-DD': value } 或 pd.DataFrame +raw_data = { + '2024-01-02': 3, '2024-01-03': 7, '2024-01-04': 12, '2024-01-05': 5, + '2024-01-08': 8, '2024-01-09': 14, '2024-01-10': 6, '2024-01-11': 9, + '2024-01-12': 4, '2024-01-15': 11, '2024-01-16': 18, '2024-01-17': 7, + '2024-01-18': 5, '2024-01-19': 3, '2024-01-22': 9, '2024-01-23': 15, + '2024-01-24': 22, '2024-01-25': 11, '2024-01-26': 6, + '2024-01-29': 8, '2024-01-30': 13, '2024-01-31': 7, + '2024-02-01': 5, '2024-02-02': 9, '2024-02-05': 14, '2024-02-06': 18, + '2024-02-07': 11, '2024-02-08': 6, '2024-02-09': 4, '2024-02-12': 8, + '2024-02-13': 16, '2024-02-14': 21, '2024-02-15': 9, '2024-02-16': 5, + '2024-02-19': 7, '2024-02-20': 12, '2024-02-21': 19, '2024-02-22': 14, + '2024-02-23': 8, '2024-02-26': 6, '2024-02-27': 11, '2024-02-28': 17, + '2024-02-29': 9, + '2024-03-01': 4, '2024-03-04': 8, '2024-03-05': 13, '2024-03-06': 16, + '2024-03-07': 10, '2024-03-08': 5, '2024-03-11': 7, '2024-03-12': 14, + '2024-03-13': 20, '2024-03-14': 12, '2024-03-15': 6, '2024-03-18': 9, + '2024-03-19': 15, '2024-03-20': 18, '2024-03-21': 11, '2024-03-22': 7, + '2024-03-25': 5, '2024-03-26': 10, '2024-03-27': 14, '2024-03-28': 19, + '2024-03-29': 8, +} + +# ── 参数配置 ────────────────────────────────────────────────── +START_DATE = '2024-01-01' +END_DATE = '2024-03-31' +TITLE = 'Daily Activity' +CBAR_LABEL = 'Contributions' +C_MAP = 'Blues' # matplotlib 内置蓝白渐变 + +# Hexbin 参数:每个格子代表一天 +# x = 该日期属于第几周(从 START_DATE 起算) +# y = 星期几(0=Mon, 1=Tue, ..., 6=Sun) +# 这样每列是一周,每天一个六边形格子 +CELL_SIZE = 0.65 # inches + +# ── 数据解析 ────────────────────────────────────────────────── +if isinstance(raw_data, dict): + df = pd.DataFrame([ + {'date': pd.to_datetime(d, errors='coerce'), 'value': v} + for d, v 
in raw_data.items() + ]).dropna() +else: + df = raw_data.copy() + df['date'] = pd.to_datetime(df['date'], errors='coerce') + +# 过滤日期范围 +start = pd.to_datetime(START_DATE) +end = pd.to_datetime(END_DATE) +df = df[(df['date'] >= start) & (df['date'] <= end)].copy() + +# 补全缺失日期(value=0) +full_index = pd.date_range(start, end, freq='D') +df_full = pd.DataFrame({'date': full_index}) +df_full['value'] = df_full['date'].map( + df.set_index('date')['value'] +).fillna(0).values + +# 计算 x=周序号, y=星期几(0=Mon) +df_full['weekday'] = df_full['date'].dt.weekday # Mon=0, Sun=6 +df_full['days_since_start'] = (df_full['date'] - start).dt.days +df_full['week'] = df_full['days_since_start'] // 7 + +# 取出坐标和值 +x = df_full['week'].values.astype(float) +y = df_full['weekday'].values.astype(float) +C = df_full['value'].values.astype(float) + +# 计算色阶边界 +vmin = 0 +vmax = max(C.max() * 1.05, 1) +N_LEVELS = 5 +levels = np.linspace(vmin, vmax, N_LEVELS + 1) +norm = BoundaryNorm(levels, N_LEVELS) +cmap = plt.get_cmap(C_MAP) + +# 计算画布尺寸 +num_weeks = int(df_full['week'].max()) + 1 +fig_w = num_weeks * CELL_SIZE + 1.2 # 左侧周标签 + 色轴 +fig_h = 7 * CELL_SIZE + 0.8 # 7天 + 顶部月份标签 + +# ── 画布 ───────────────────────────────────────────────────── +fig, ax = plt.subplots(figsize=(fig_w, fig_h)) + +# 画六边形热力图 +hb = ax.hexbin( + x, y, C, + gridsize=int(num_weeks), + cmap=cmap, + norm=norm, + linewidths=0.3, + edgecolors='white', + mincnt=0, + zorder=2, +) + +# 周末列淡灰底色(周六=5, 周日=6) +for week_i in range(num_weeks): + for dy, day_label in enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']): + y_pos = dy + if y_pos in (5, 6): # Sat or Sun + col_color = C_WEEKEND + rect = mpatches.FancyBboxPatch( + (week_i - 0.5, y_pos - 0.5), + 1, 1, + boxstyle='square,pad=0', + facecolor=col_color, + edgecolor='none', + zorder=1 + ) + ax.add_patch(rect) + +# ── 样式 ───────────────────────────────────────────────────── +# 月份标签(放在顶部) +months_shown = {} +for _, row in df_full.iterrows(): + month = row['date'].month + week_i = 
row['week'] + if month not in months_shown: + months_shown[month] = week_i + +month_names = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] +for month_num, week_start in sorted(months_shown.items(), key=lambda x: x[1]): + ax.text( + week_start, 7.1, + month_names[month_num], + fontsize=9.5, ha='left', va='bottom', + color='#333333', fontweight='normal', + ) + ax.axvline(week_start, color='#CCCCCC', lw=0.5, zorder=0) + +# 左侧周标签 +ax.text(-0.7, 3, 'Week', fontsize=8, ha='right', va='center', color='#666666') + +# 左侧天数标签(每隔一天显示) +DAY_LABELS = ['Mon', '', 'Wed', '', 'Fri', '', ''] +for i, lbl in enumerate(DAY_LABELS): + if lbl: + ax.text(-0.02, i, lbl, fontsize=8, ha='right', va='center', color='#666666') + +# 隐藏 top/right spine,L 形 +ax.spines['top'].set_visible(False) +ax.spines['right'].set_visible(False) +ax.spines['left'].set_color('#333333') +ax.spines['bottom'].set_color('#333333') +ax.spines['left'].set_linewidth(0.9) +ax.spines['bottom'].set_linewidth(0.9) + +# 刻度 +ax.tick_params(length=3, direction='out', labelsize=8) +ax.set_xticks([]) +ax.set_yticks(range(7)) +ax.set_yticklabels([]) +ax.set_xlim(-0.5, num_weeks + 0.5) +ax.set_ylim(-0.5, 7.0) +ax.set_aspect('equal') +ax.grid(False) + +# ── Colorbar ───────────────────────────────────────────────── +cbar_ax = fig.add_axes([0.93, 0.15, 0.025, 0.7]) +cb = fig.colorbar( + ScalarMappable(norm=norm, cmap=cmap), + cax=cbar_ax, + orientation='vertical', + shrink=0.7, +) +cb.set_label(CBAR_LABEL, fontsize=9) +cb.ax.tick_params(labelsize=8) + +# ── 标题 ───────────────────────────────────────────────────── +ax.set_title(TITLE, fontsize=12, fontweight='bold', color='#333333', pad=6) + +# ── 保存 ───────────────────────────────────────────────────── +from pathlib import Path + +output_path = Path('output/figures/calendar_heatmap_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig(output_path, dpi=300, facecolor='white', bbox_inches='tight') +plt.close(fig) 
+print(f'✅ saved: {output_path}') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_aime.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_aime.py new file mode 100644 index 0000000..1d94921 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_aime.py @@ -0,0 +1,98 @@ +""" +Reproduce: image6.png — AIME avg@32 training curve +Two lines with vertical breakpoint markers + horizontal reference line. +Style: sans-serif, 4-spine box, no grid, right-bottom legend. +""" + +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.lines as mlines + +plt.rcParams.update({ + 'font.family': 'sans-serif', + 'font.sans-serif': ['DejaVu Sans', 'Arial', 'Helvetica'], + 'text.usetex': False, +}) + +rng = np.random.default_rng(42) + +# ---- 模拟数据 ---- +# w/ Dynamic Sampling (purple): 0-2200 steps, rises fast, ends ~0.42 +steps_dyn_a = np.arange(0, 2300, 50) +y_dyn_a = 0.43 * (1 - np.exp(-steps_dyn_a / 600)) +y_dyn_a += rng.normal(0, 0.012, len(steps_dyn_a)) +y_dyn_a = np.clip(y_dyn_a, 0, 0.45) +y_dyn_a[:2] = [0.03, 0.03] # cold start + +# w/o Dynamic Sampling (cyan): 0-9000, slower rise, peaks ~0.42 at step 6000, then drops +steps_nodyn = np.arange(0, 9100, 100) +y_nodyn = 0.38 * (1 - np.exp(-steps_nodyn / 1200)) +y_nodyn += rng.normal(0, 0.012, len(steps_nodyn)) +y_nodyn = np.clip(y_nodyn, 0, 0.44) +y_nodyn[:2] = [0.01, 0.02] +# After step 6000, add gradual decline +mask = steps_nodyn > 6000 +y_nodyn[mask] -= 0.06 * (steps_nodyn[mask] - 6000) / 3000 + +C_DYN = '#5B0DAD' # 更深紫,接近原图 +C_NODYN = '#5BBCCA' # 柔和青绿 +C_REF = '#3D78C2' # 独立蓝色(参考线,与曲线色区分) +STEP_DYN = 2200 # 紫色垂直线 +STEP_NODYN = 6050 # 青色垂直线 +REF_Y = 0.43 # 水平参考线 + +fig, ax = plt.subplots(figsize=(9.0, 4.8)) + +# ---- 两条主线 ---- +ax.plot(steps_dyn_a, y_dyn_a, color=C_DYN, lw=1.4, zorder=3, label='w/ Dynamic Sampling') +ax.plot(steps_nodyn, y_nodyn, color=C_NODYN, lw=1.4, zorder=3, label='w/o Dynamic 
Sampling') + +# ---- 水平参考线 ---- +ax.axhline(REF_Y, color=C_REF, lw=1.5, linestyle='--', zorder=2) + +# ---- 两条垂直虚线 ---- +ax.axvline(STEP_DYN, color=C_DYN, lw=1.5, linestyle='--', alpha=0.85, zorder=2) +ax.axvline(STEP_NODYN, color=C_NODYN, lw=1.5, linestyle='--', alpha=0.85, zorder=2) + +# ---- Axes 样式 ---- +ax.set_xlim(-100, 9200) +ax.set_ylim(-0.01, 0.47) +ax.set_xticks([0, 2000, 4000, 6000, 8000]) +ax.set_xticklabels(['0', '2000', '4000', '6000', '8000'], fontsize=10) +ax.set_yticks([0.0, 0.1, 0.2, 0.3, 0.4]) +ax.tick_params(labelsize=10, direction='out', length=4, width=0.8) +ax.set_xlabel('Step', fontsize=12) +ax.set_ylabel('AIME avg@32', fontsize=12) + +# 四边框(all spines visible) +for sp in ax.spines.values(): + sp.set_visible(True) + sp.set_linewidth(1.0) + +ax.grid(False) + +# ---- 图例 ---- +leg = ax.legend( + loc='lower right', + fontsize=9.5, + frameon=True, + facecolor='white', + edgecolor='#AAAAAA', + framealpha=1.0, + borderpad=0.5, + labelspacing=0.3, + handlelength=2.0, + handletextpad=0.5, +) + +from pathlib import Path + +output_path = Path('output/figures/line_aime_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.tight_layout(pad=0.8) +fig.savefig( + output_path, + dpi=300, facecolor='white', +) +plt.close(fig) +print('saved: line_aime_repro.png') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_loss_inset.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_loss_inset.py new file mode 100644 index 0000000..2a875b9 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_loss_inset.py @@ -0,0 +1,161 @@ +""" +Reproduce: image10.png — Loss curve with zoom inset (SiameseNorm paper style) +Main plot: L-shaped spine (left+bottom) + axis arrows, 3 lines. +Inset: zoomed blue+green in right panel. +Style: serif, tab10 colors, black dashed connection lines. 
+""" + +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +from matplotlib.patches import ConnectionPatch, FancyArrowPatch +from mpl_toolkits.axes_grid1.inset_locator import mark_inset, inset_axes + +plt.rcParams.update({ + 'text.usetex': True, + 'font.family': 'serif', + 'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'], + 'axes.unicode_minus': False, +}) + +rng = np.random.default_rng(7) + +# ---- 模拟数据 ---- +steps = np.arange(0, 5600, 20) + +# HybridNorm (orange): exponential decay to ~8, spike at ~1900, then flat ~8 +y_hybrid = 7.5 * np.exp(-steps / 450) + 2.8 +y_hybrid += rng.normal(0, 0.06, len(steps)) +spike_idx = np.searchsorted(steps, 1880) +spike_end = np.searchsorted(steps, 1940) +y_hybrid[spike_idx:spike_end] = np.linspace(y_hybrid[spike_idx - 1], 15.5, spike_end - spike_idx) +after_spike = np.searchsorted(steps, 1940) +y_hybrid[after_spike:] = 7.8 + rng.normal(0, 0.07, len(steps[after_spike:])) + +# HybridNorm-ResiDual (blue): rapid decay + noisy with prominent spikes +y_blue = 7.8 * np.exp(-steps / 380) + 2.3 +y_blue += rng.normal(0, 0.18, len(steps)) +mask_noisy = steps > 2300 +# 模拟蓝线在 2300+ 之后有明显峰值(与原图一致) +noise_large = rng.normal(0, 1.5, mask_noisy.sum()) +# 少量极值峰 +for idx_offset in rng.integers(10, mask_noisy.sum() - 10, size=8): + noise_large[idx_offset] += rng.uniform(4, 9) +y_blue[mask_noisy] += noise_large +y_blue = np.clip(y_blue, 1.8, 13.5) + +# SiameseNorm/Ours (green): smooth rapid decay +y_green = 7.2 * np.exp(-steps / 360) + 2.1 +y_green += rng.normal(0, 0.05, len(steps)) +y_green = np.clip(y_green, 1.8, 9.0) + +# tab10 colors +C_ORANGE = '#FF7F0E' +C_BLUE = '#1F77B4' +C_GREEN = '#2CA02C' + +# ---- 主图 ---- +# 原图 952×368 → 宽高比 2.59;复现目标 10.5×4.05" +fig = plt.figure(figsize=(10.5, 4.05)) +ax_main = fig.add_axes([0.08, 0.16, 0.50, 0.78]) + +ax_main.plot(steps, y_hybrid, color=C_ORANGE, lw=1.3, label='HybridNorm', zorder=3) +ax_main.plot(steps, y_blue, color=C_BLUE, lw=1.0, 
label='HybridNorm-ResiDual', zorder=3) +ax_main.plot(steps, y_green, color=C_GREEN, lw=1.3, label='SiameseNorm (Ours)', zorder=4) + +ax_main.set_xlim(-50, 5600) +ax_main.set_ylim(1.5, 14.5) # 与原图 ~2-14 对齐 +ax_main.set_xlabel(r'Step', fontsize=10) +ax_main.set_ylabel(r'Loss', fontsize=10) +ax_main.set_xticks([0, 1000, 2000, 3000, 4000, 5000]) +ax_main.tick_params(labelsize=9.0, direction='out', length=3.5, width=0.8) + +# L 形 spine(左+下),无上右 +ax_main.spines['top'].set_visible(False) +ax_main.spines['right'].set_visible(False) +ax_main.spines['left'].set_linewidth(1.0) +ax_main.spines['bottom'].set_linewidth(1.0) + +# 轴端箭头(模拟原图的箭头轴) +ax_main.plot(1, 0, '>k', transform=ax_main.get_yaxis_transform(), + clip_on=False, markersize=5) +ax_main.plot(0, 1, '^k', transform=ax_main.get_xaxis_transform(), + clip_on=False, markersize=5) + +ax_main.grid(True, color='#E0E0E0', linewidth=0.5, linestyle=':') +ax_main.set_axisbelow(True) + +leg = ax_main.legend( + loc='upper right', + fontsize=9.0, + frameon=True, + facecolor='white', + edgecolor='#DDDDDD', + borderpad=0.4, + labelspacing=0.25, + handlelength=1.8, + framealpha=1.0, +) + +# ---- Zoom 区域(虚线矩形)---- +zoom_x1, zoom_x2 = 2400, 5500 +zoom_y1, zoom_y2 = 1.8, 4.5 +rect = mpatches.FancyBboxPatch( + (zoom_x1, zoom_y1), + zoom_x2 - zoom_x1, zoom_y2 - zoom_y1, + boxstyle='square,pad=0', + linewidth=1.0, edgecolor='#333333', + facecolor='none', linestyle='--', + zorder=5, +) +ax_main.add_patch(rect) + +# ---- Inset(右侧独立子图,原图约占总宽 40%,紧凑)---- +ax_inset = fig.add_axes([0.61, 0.10, 0.36, 0.86]) + +mask_z = (steps >= zoom_x1) & (steps <= zoom_x2) +steps_z = steps[mask_z] + +ax_inset.plot(steps_z, y_blue[mask_z], color=C_BLUE, lw=1.0, zorder=3) +ax_inset.plot(steps_z, y_green[mask_z], color=C_GREEN, lw=1.2, zorder=4) + +ax_inset.set_xlim(zoom_x1 - 50, zoom_x2 + 50) +ax_inset.set_ylim(zoom_y1 - 0.1, zoom_y2 + 4.0) # 原图 inset y: ~1.8~8.5 +ax_inset.set_xticks([3000, 4000, 5000]) +ax_inset.tick_params(labelsize=8.5, direction='out', 
length=3.5, width=0.8) + +for sp in ax_inset.spines.values(): + sp.set_visible(True) + sp.set_linewidth(1.5) + sp.set_color('#2A6073') # 原图 inset 边框为深蓝灰色 + +ax_inset.grid(False) + +# ---- 黑色虚线连接线(从 zoom 框的两个角到 inset 边缘)---- +# 右上角 → inset 左上角 +con1 = ConnectionPatch( + xyA=(zoom_x2, zoom_y2), coordsA=ax_main.transData, + xyB=(ax_inset.get_xlim()[0], ax_inset.get_ylim()[1]), + coordsB=ax_inset.transData, + color='#333333', lw=0.8, linestyle='--', +) +# 右下角 → inset 左下角 +con2 = ConnectionPatch( + xyA=(zoom_x2, zoom_y1), coordsA=ax_main.transData, + xyB=(ax_inset.get_xlim()[0], ax_inset.get_ylim()[0]), + coordsB=ax_inset.transData, + color='#333333', lw=0.8, linestyle='--', +) +fig.add_artist(con1) +fig.add_artist(con2) + +from pathlib import Path + +output_path = Path('output/figures/line_loss_inset_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig( + output_path, + dpi=300, facecolor='white', +) +plt.close(fig) +print('saved: line_loss_inset_repro.png') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_selfdistill.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_selfdistill.py new file mode 100644 index 0000000..3b220fb --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/line_selfdistill.py @@ -0,0 +1,173 @@ +""" +复现 image2 & image3: Self-distillation 论文折线图 +image2: 连续训练曲线 + 置信区间阴影 + 水平参考线 +image3: 离散点折线 + 置信区间阴影(模型规模 scaling) +来源:Reinforcement learning via self-distillation +""" + +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import numpy as np + +# ── 预分析结论 ───────────────────────────────────────────── +# 字体:serif,接近 LaTeX Computer Modern,启用 usetex +# 加粗:标题 normal | 图例 SDPO bold | 其他 normal +# Spine:只保留左/下(开口式) +# Grid:无 +# 颜色:绿 #3A8B3A (SDPO) | 蓝 #3B6BB5 (GRPO) | 灰 #999999 (base) +# 阴影:主线颜色 alpha=0.15 的半透明填充 +plt.rcParams.update({ + 'text.usetex': True, + 'font.family': 'serif', + 'font.serif': 
def raw_curve(start, end, steps, noise=0.008):
    """Return a noisy saturating curve rising from *start* towards *end*.

    Args:
        start: Value of the noise-free curve at the first step.
        end: Asymptote of the noise-free curve.
        steps: 1-D sequence of step values; the last element is used to
            normalise the steps (it must be non-zero).
        noise: Standard deviation of the Gaussian noise before attenuation.

    Returns:
        A float array with one value per step. Noise is drawn from NumPy's
        global random state (``np.random.normal``), so results depend on
        the current seed.
    """
    steps = np.asarray(steps, dtype=float)
    t = steps / steps[-1]
    curve = start + (end - start) * (1 - np.exp(-4 * t))
    # Noise amplitude shrinks linearly to 30 % of its initial level at the end.
    curve += np.random.normal(0, noise, len(steps)) * (1 - t * 0.7)
    return curve


def ema(arr, alpha=0.96):
    """Return the exponential moving average of *arr*.

    ``out[0] = arr[0]`` and
    ``out[i] = alpha * out[i - 1] + (1 - alpha) * arr[i]`` for ``i >= 1``.

    Args:
        arr: 1-D sequence of values (list or array, any numeric dtype).
        alpha: Weight given to the previous smoothed value.

    Returns:
        A float array of the same length as *arr*; empty input yields an
        empty array.
    """
    # Work in float: an integer output buffer would truncate every
    # smoothed value on assignment.
    arr = np.asarray(arr, dtype=float)
    out = np.empty_like(arr)
    if arr.size == 0:
        return out
    out[0] = arr[0]
    for i in range(1, len(arr)):
        out[i] = alpha * out[i - 1] + (1 - alpha) * arr[i]
    return out
+ax2.xaxis.set_major_locator(ticker.MultipleLocator(5000)) +ax2.yaxis.set_major_locator(ticker.MultipleLocator(0.05)) + +leg2 = ax2.legend(fontsize=11, loc='lower right', + framealpha=0, edgecolor='none', + handlelength=2.2, borderaxespad=0.5, labelspacing=0.3) +for text in leg2.get_texts(): + if 'SDPO' in text.get_text(): + text.set_fontweight('bold') + +# 四边框 + 向内刻度(与原图一致) +for sp in ax2.spines.values(): + sp.set_visible(True) + sp.set_linewidth(1.5) +ax2.tick_params(direction='in', length=5, width=1.2, labelsize=11) +ax2.grid(False) + +from pathlib import Path + +OUTPUT_DIR = Path("output/figures") +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +fig2.tight_layout(pad=0.9) +fig2.savefig(OUTPUT_DIR / 'line_selfdistill_v6_repro.png', + dpi=300, facecolor='white') +plt.close(fig2) +print('saved: line_selfdistill_v6_repro.png') + +# ══════════════════════════════════════════════════════════ +# 图 3:模型 scaling 折线(Model scaling Qwen3) +# ══════════════════════════════════════════════════════════ +param_labels = ['0.6', '1.7', '4', '8'] +param_x = [0.6, 1.7, 4, 8] +x_pos = [0, 1, 2, 3] # 等间距排列,x 轴用 param_labels + +sdpo_pts = [0.215, 0.333, 0.450, 0.490] +grpo_pts = [0.212, 0.295, 0.400, 0.414] +base_pts = [0.095, 0.150, 0.233, 0.284] + +sdpo_std3 = [0.005, 0.006, 0.008, 0.006] +grpo_std3 = [0.005, 0.006, 0.007, 0.006] + +fig3, ax3 = plt.subplots(figsize=(10, 5)) # 2:1 宽高比,与原图一致 + +ax3.fill_between(x_pos, + [v - s for v, s in zip(sdpo_pts, sdpo_std3)], + [v + s for v, s in zip(sdpo_pts, sdpo_std3)], + color=C_SDPO, alpha=0.18) +ax3.fill_between(x_pos, + [v - s for v, s in zip(grpo_pts, grpo_std3)], + [v + s for v, s in zip(grpo_pts, grpo_std3)], + color=C_GRPO, alpha=0.18) + +MEC = 'black' # 原图标记点有黑色描边 +ax3.plot(x_pos, sdpo_pts, color=C_SDPO, lw=2.5, + marker='o', ms=7, mfc=C_SDPO, + markeredgecolor=MEC, markeredgewidth=1.0, + label=r'\textbf{SDPO}') +ax3.plot(x_pos, grpo_pts, color=C_GRPO, lw=2.5, + marker='o', ms=7, mfc=C_GRPO, + markeredgecolor=MEC, 
markeredgewidth=1.0, + label='GRPO') +ax3.plot(x_pos, base_pts, color=C_BASE, lw=2.5, # 与主线同粗 + marker='o', ms=7, mfc=C_BASE, + markeredgecolor=MEC, markeredgewidth=1.0, + label='base model') + +ax3.set_xticks(x_pos) +ax3.set_xticklabels(param_labels, fontsize=11) +ax3.set_xlim(-0.35, 3.35) +ax3.set_ylim(0.08, 0.51) # 与原图 0.1~0.5 刻度对齐,留极小顶部空 +ax3.set_xlabel('Model parameters (B)', fontsize=13) +ax3.set_ylabel(r'\textit{Accuracy}', fontsize=13) +ax3.set_title('Model scaling (Qwen3)', fontsize=15, pad=7) +ax3.yaxis.set_major_locator(ticker.MultipleLocator(0.1)) + +# 图例移右下(原图位置) +leg3 = ax3.legend(fontsize=11, loc='lower right', + bbox_to_anchor=(0.98, 0.02), + framealpha=0, edgecolor='none', + handlelength=2.2, borderaxespad=0.5, labelspacing=0.3) +for text in leg3.get_texts(): + if 'SDPO' in text.get_text(): + text.set_fontweight('bold') + +# 四边框 + 向内刻度 +for sp in ax3.spines.values(): + sp.set_visible(True) + sp.set_linewidth(1.5) +ax3.tick_params(direction='in', length=5, width=1.2, labelsize=11) +ax3.grid(False) + +fig3.tight_layout(pad=0.9) +fig3.savefig(OUTPUT_DIR / 'line_selfdistill_scaling_repro.png', + dpi=300, facecolor='white') +plt.close(fig3) +print('saved: line_selfdistill_scaling_repro.png') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/parallel_coordinates.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/parallel_coordinates.py new file mode 100644 index 0000000..e3a30a1 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/parallel_coordinates.py @@ -0,0 +1,173 @@ +""" +平行坐标图(parallel coordinates) +特征:多垂直轴,每条线代表一个方法在多个维度上的表现 +来源:经典多维方法对比图表,学术和工程场景通用 +""" + +import matplotlib.pyplot as plt +import matplotlib.lines as mlines +import numpy as np + +# ── 全局样式 ───────────────────────────────────────────────────── +plt.rcParams.update({ + 'font.family': 'sans-serif', + 'font.sans-serif': ['DejaVu Sans', 'Arial', 'Helvetica'], + 'text.usetex': False, +}) 
def get_color(method_name, idx):
    """Pick the line colour for a method.

    The method named by ``MAIN_METHOD`` gets ``C_MAIN``, the method called
    'Baseline' gets ``C_BASE``, and every other method cycles through
    ``C_OTHERS`` by its position *idx*.
    """
    # Highlighted method takes precedence over every other rule.
    if method_name == MAIN_METHOD:
        return C_MAIN
    if method_name == 'Baseline':
        return C_BASE
    # Remaining methods share the auxiliary palette, wrapping around by index.
    palette_slot = idx % len(C_OTHERS)
    return C_OTHERS[palette_slot]
ax.axhline(y_ref, color=GRID_COL, lw=GRID_LW, linestyle='--', zorder=0) + +# ── 画每条线 ───────────────────────────────────────────────── +colors = [] +for idx, (method, vals) in enumerate(data.items()): + color = get_color(method, idx) + colors.append(color) + lw = LINE_LW if method == MAIN_METHOD else LINE_LW - 0.4 + ax.plot( + x_pos, vals, + color=color, + lw=lw, + alpha=ALPHA if method != MAIN_METHOD else 1.0, + zorder=3, + label=method, + ) + # 数据点 + ax.scatter( + x_pos, vals, + color=color, + s=30 if method == MAIN_METHOD else 18, + zorder=4, + edgecolors='white', + linewidths=0.5, + ) + +# ── 坐标轴标签 ─────────────────────────────────────────────── +for xi, dim in zip(x_pos, dimensions): + ax.text(xi, -0.06, dim, fontsize=10, ha='center', va='top', color='#333333') + +ax.set_ylabel(YLABEL, fontsize=10, color='#333333') +ax.set_title(TITLE, fontsize=12, fontweight='bold', color='#333333', pad=10) + +# ── 样式 ───────────────────────────────────────────────────── +ax.spines['top'].set_visible(False) +ax.spines['right'].set_visible(False) +ax.spines['left'].set_visible(False) +ax.spines['bottom'].set_visible(False) + +ax.tick_params(length=TICK_LEN, direction='out', labelsize=8) +ax.set_xticks([]) +ax.set_yticks(np.arange(0.2, 1.0, 0.2)) +ax.set_yticklabels([f'{int(v*100)}%' for v in np.arange(0.2, 1.0, 0.2)], fontsize=8) +ax.grid(False) + +# ── 图例 ───────────────────────────────────────────────────── +legend_elements = [] +for method, vals in data.items(): + color = get_color(method, list(data.keys()).index(method)) + is_main = (method == MAIN_METHOD) + lw = LINE_LW if is_main else LINE_LW - 0.4 + legend_elements.append( + mlines.Line2D([0], [0], color=color, lw=lw, + alpha=ALPHA if not is_main else 1.0, + label=method, + marker='o', markersize=4, + markerfacecolor=color, markeredgecolor='white', + markeredgewidth=0.3) + ) + +leg = ax.legend( + handles=legend_elements, + fontsize=9, + loc='upper right', + bbox_to_anchor=(1.01, 1.0), + frameon=True, + 
facecolor='white', + edgecolor='#CCCCCC', + labelspacing=0.4, + handlelength=2.0, +) +for text in leg.get_texts(): + if text.get_text() == MAIN_METHOD: + text.set_fontweight('bold') + +# ── 保存 ───────────────────────────────────────────────────── +from pathlib import Path + +output_path = Path('output/figures/parallel_coordinates_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig(output_path, dpi=300, facecolor='white', bbox_inches='tight') +plt.close(fig) +print(f'✅ saved: {output_path}') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/radar_dora.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/radar_dora.py new file mode 100644 index 0000000..2cf7264 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/radar_dora.py @@ -0,0 +1,155 @@ +""" +Reproduce: image8.png — DoRA vs LoRA Radar Chart +Style: sans-serif, dashed octagonal grid, white-bg value annotations, + semi-transparent fill, legend: black text + colored line segment. 
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.lines as mlines
+
+plt.rcParams.update({
+    'font.family': 'sans-serif',
+    'font.sans-serif': ['DejaVu Sans', 'Arial', 'Helvetica'],
+    'text.usetex': False,
+})
+
+# ---- Data (clockwise, starting from the top) ----
+CATEGORIES = [
+    'CommonSense\n(LLaMA)',
+    'MT-bench\n(LLaMA2)',
+    'MT-bench\n(LLaMA)',
+    'visual instruction\ntuning (LLaVA-1.5)',
+    'video-text\n(VL-BART)',
+    'image-text\n(VL-BART)',
+    'CommonSense\n(LLaMA3)',
+    'CommonSense\n(LLaMA2)',
+]
+
+DORA_raw = np.array([78.40, 6.00, 5.50, 67.60, 85.40, 77.40, 85.20, 79.70])
+LORA_raw = np.array([76.30, 5.70, 5.10, 66.90, 83.50, 76.50, 80.80, 77.60])
+
+N = len(CATEGORIES)
+
+# Per-category (min, max) used to rescale that category's raw score; same order as CATEGORIES
+RANGES = [
+    (74.0, 80.0),
+    (5.4, 6.2),
+    (4.8, 5.7),
+    (65.0, 70.0),
+    (81.0, 87.0),
+    (74.0, 79.0),
+    (78.0, 87.0),
+    (75.0, 81.0),
+]
+RMIN, RMAX = 0.35, 1.0   # radius assigned to vmin / vmax by nrm()
+
+def nrm(v, vmin, vmax):   # linear map: vmin -> RMIN, vmax -> RMAX
+    return RMIN + (RMAX - RMIN) * (v - vmin) / (vmax - vmin)
+
+dora_r = np.array([nrm(v, r[0], r[1]) for v, r in zip(DORA_raw, RANGES)])
+lora_r = np.array([nrm(v, r[0], r[1]) for v, r in zip(LORA_raw, RANGES)])
+
+angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
+
+def close(arr):   # append the first element so a plotted polygon ends where it started
+    return np.concatenate([arr, [arr[0]]])
+
+# Original image is 1032x850 -> aspect ratio 1.21
+fig, ax = plt.subplots(figsize=(7.0, 5.8),
+                       subplot_kw=dict(projection='polar'))
+
+ax.set_theta_zero_location('N')
+ax.set_theta_direction(-1)
+ax.set_yticks([])
+ax.set_xticks([])
+
+# ---- Concentric regular-octagon grid (dashed), not circular ----
+for r in [0.4, 0.55, 0.7, 0.85, 1.0]:
+    ax.plot(close(angles), close(np.full(N, r)),
+            color='#CCCCCC', lw=0.8, linestyle='--', zorder=1)
+
+# Spokes
+for ang in angles:
+    ax.plot([ang, ang], [0, 1.0],
+            color='#CCCCCC', lw=0.8, linestyle='--', zorder=1)
+
+C_DORA = '#5A8A5A'   # dark green, matches the original figure
+C_LORA = '#4169E1'   # royal blue; LoRA is a true blue in the original figure
+
+# ---- Fill (same alpha for both series) ----
+ax.fill(close(angles), close(dora_r), color=C_DORA, alpha=0.18, zorder=3)
+ax.fill(close(angles), close(lora_r), color=C_LORA, alpha=0.18, zorder=3)
+
+# ---- Outlines (DoRA clearly thicker than LoRA) ----
+ax.plot(close(angles), close(dora_r),
+        color=C_DORA, lw=3.0, solid_capstyle='round', zorder=4)
+ax.plot(close(angles), close(lora_r),
+        color=C_LORA, lw=1.5, solid_capstyle='round', zorder=4)
+
+# ---- Value annotations (white background box for readability) ----
+def fmt(v):
+    # The original figure shows two decimal places
+    return f'{v:.2f}'
+
+for i, ang in enumerate(angles):
+    # DoRA value (outside its outline: radius + 0.08)
+    r_d = dora_r[i] + 0.08
+    ax.text(ang, r_d, fmt(DORA_raw[i]),
+            ha='center', va='center',
+            fontsize=7.8, color=C_DORA, zorder=6,
+            bbox=dict(boxstyle='round,pad=0.12',
+                      facecolor='white', edgecolor='none', alpha=0.85))
+    # LoRA value (inside its outline: radius - 0.09)
+    r_l = lora_r[i] - 0.09
+    ax.text(ang, r_l, fmt(LORA_raw[i]),
+            ha='center', va='center',
+            fontsize=7.8, color=C_LORA, zorder=6,
+            bbox=dict(boxstyle='round,pad=0.12',
+                      facecolor='white', edgecolor='none', alpha=0.85))
+
+# ---- Axis labels ----
+# In the original figure the labels hug the outer polygon (about 1.13) and are small relative to the figure
+label_r = 1.13
+for i, (ang, cat) in enumerate(zip(angles, CATEGORIES)):
+    # Horizontal alignment is chosen from sin(angle): near zero -> centered, positive -> left, negative -> right
+    if abs(np.sin(ang)) < 0.15:
+        ha = 'center'
+    elif np.sin(ang) > 0:
+        ha = 'left'
+    else:
+        ha = 'right'
+    ax.text(ang, label_r, cat,
+            ha=ha, va='center',
+            fontsize=8.5, color='#333333',
+            multialignment='center')
+
+# ---- Legend: black series name + colored line segment (drawn as figure text) ----
+# DoRA legend row (thick green line + bold black text)
+fig.text(0.09, 0.91,
+         '──── ', color=C_DORA, fontsize=11,
+         fontweight='bold', va='center', ha='left')
+fig.text(0.155, 0.91,
+         'DoRA', color='black', fontsize=10,
+         fontweight='bold', va='center', ha='left')
+
+# LoRA legend row (thin blue line + regular black text)
+fig.text(0.09, 0.875,
+         '─────', color=C_LORA, fontsize=8.5,
+         va='center', ha='left')
+fig.text(0.155, 0.875,
+         'LoRA', color='black', fontsize=10,
+         va='center', ha='left')
+
+ax.set_ylim(0, 1.32)   # trim top/bottom padding so the polygon fills the axes
+ax.set_frame_on(False)
+
+fig.subplots_adjust(left=0.10, right=0.90, top=0.86, bottom=0.06)
+from pathlib import Path
+
+output_path = Path('output/figures/radar_dora_repro.png')
+output_path.parent.mkdir(parents=True, exist_ok=True)
+fig.savefig(
+    output_path,
+    dpi=300, facecolor='white',
+)
+plt.close(fig)
+print('saved: radar_dora_repro.png')
diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_break.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_break.py
new file mode 100644
index 0000000..8119504
--- /dev/null
+++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_break.py
@@ -0,0 +1,173 @@
+"""
+Reproduce: image9.png — Broken-axis scatter plot (Meta-Harness style)
+X-axis has a break between ~50k and 115k.
+Uses two side-by-side axes that are given the same y-limits (no sharey).
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import matplotlib.lines as mlines
+from scipy.interpolate import make_interp_spline
+
+plt.rcParams.update({
+    'font.family': 'sans-serif',
+    'font.sans-serif': ['DejaVu Sans', 'Arial', 'Helvetica'],
+    'text.usetex': False,
+})
+
+# ---- Simulated data ----
+rng = np.random.default_rng(42)
+
+# Ours (Pareto) — red stars + dashed pink line
+pareto_x = np.array([0, 5000, 20000, 30000, 35000, 40000, 45000, 48000, 50000])
+pareto_y = np.array([40.3, 40.3, 40.7, 44.3, 45.0, 47.3, 48.3, 48.8, 49.1])
+
+# Ours (non-Pareto) — scattered light-pink circles
+np_x = rng.uniform(25000, 50000, 32)
+np_y = 35 + 14 * (np_x - 25000) / 25000 + rng.normal(0, 1.8, 32)
+np_y = np.clip(np_y, 34, 50)
+
+# Few-shot — purple circles joined by a smooth cubic-spline curve (drawn on ax1 below)
+few_x = np.array([4000, 8000, 15000, 25000, 38000, 48000])
+few_y = np.array([32.5, 34.2, 34.0, 35.7, 40.5, 41.0])
+
+# Zero-shot — single purple X at x = 0
+zs_x, zs_y = 0, 27.0
+
+# MCE — orange triangle (right panel)
+mce_x, mce_y = 115000, 39.6
+
+# ACE — blue diamond (right panel)
+ace_x, ace_y = 200000, 41.0
+
+
+# ---- Colors ----
+C_PARETO = '#E53935'   # bright red (matches the original figure)
+C_NONPARETO= '#F4B8B8'   # paler misty pink
+C_FEW = '#6B4FA0'   # deep purple
+C_FEW_LINE = '#B8A8D8'   # light purple (curve)
+C_MCE = '#E69B00'   # orange
+C_ACE = '#2E86C1'   # blue
+C_ZS = '#5B2D8E'   # deep purple (zero-shot)
+C_DASH = '#F0A0A0'   # pink dashed line
+
+# ---- Layout: wide left panel (0-50k), narrow right panel (115k, 200k) ----
+fig, (ax1, ax2) = plt.subplots(
+    1, 2,
+    figsize=(9.5, 5.5),
+    gridspec_kw={'width_ratios': [5, 1.3], 'wspace': 0.05},
+)
+fig.subplots_adjust(left=0.09, right=0.97, top=0.93, bottom=0.13)
+
+YLIM = (25, 51)
+
+# Both panels get identical y-limits so points line up across the break
+for ax in [ax1, ax2]:
+    ax.set_ylim(*YLIM)
+
+# ---- Left axes (ax1: 0 - 50k) ----
+ax1.set_xlim(-3000, 53000)
+
+# Few-shot spline curve (the original figure clearly shows a smooth S-shaped curve)
+spl = make_interp_spline(few_x, few_y, k=3)
+spl_x = np.linspace(few_x[0], few_x[-1], 300)
+spl_y = spl(spl_x)
+ax1.plot(spl_x, spl_y, color=C_FEW_LINE, lw=1.8, zorder=2)
+ax1.scatter(few_x, few_y,
+            marker='o', s=70, color=C_FEW,
+            zorder=4, linewidths=0.8, edgecolors='black')
+
+# Zero-shot X
+ax1.scatter([zs_x], [zs_y], marker='X', s=120, color=C_ZS, zorder=5,
+            linewidths=0.8, edgecolors='black')
+
+# non-Pareto circles (pale pink, no outline)
+ax1.scatter(np_x, np_y, marker='o', s=28, color=C_NONPARETO, alpha=0.85,
+            zorder=3, linewidths=0)
+
+# Pareto dashed line + stars
+ax1.plot(pareto_x, pareto_y, color=C_DASH, lw=1.8, linestyle='--', zorder=3)
+ax1.scatter(pareto_x, pareto_y,
+            marker='*', s=200, color=C_PARETO,
+            zorder=5, linewidths=0.8, edgecolors='black')
+
+ax1.set_xlabel('Additional context (chars)', fontsize=13, fontweight='bold', labelpad=4)
+ax1.set_ylabel('Test accuracy', fontsize=13, fontweight='bold')
+ax1.set_xticks([0, 10000, 20000, 30000, 40000, 50000])
+ax1.set_xticklabels(['0', '10k', '20k', '30k', '40k', '50k'], fontsize=10)
+ax1.tick_params(labelsize=10)
+
+# spines: keep left / bottom only
+ax1.spines['top'].set_visible(False)
+ax1.spines['right'].set_visible(False)
+ax1.spines['left'].set_linewidth(1.0)
+ax1.spines['bottom'].set_linewidth(1.0)
+
+# ---- Right axes (ax2: 115k, 200k) ----
+ax2.set_xlim(95000, 220000)
+
+ax2.scatter([mce_x], [mce_y], marker='^', s=130, color=C_MCE, zorder=5,
+            linewidths=0.8, edgecolors='black')
+ax2.scatter([ace_x], [ace_y], marker='D', s=90, color=C_ACE, zorder=5,
+            linewidths=0.8, edgecolors='black')
+
+ax2.set_xticks([115000, 200000])
+ax2.set_xticklabels(['115k', '200k'], fontsize=10)
+ax2.tick_params(labelsize=10)
+ax2.set_yticks([])
+
+# spines: keep the bottom one only
+ax2.spines['top'].set_visible(False)
+ax2.spines['right'].set_visible(False)
+ax2.spines['left'].set_visible(False)
+ax2.spines['bottom'].set_linewidth(1.0)
+
+# ---- Break marks (on the bottom x-axis only, not at the top) ----
+d = 0.015
+kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False, lw=1.2)
+ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)   # slash at ax1's bottom-right corner
+
+kwargs2 = dict(transform=ax2.transAxes, color='k', clip_on=False, lw=1.2)
+ax2.plot((-d, +d), (-d, +d), **kwargs2)   # slash at ax2's bottom-left corner
+
+# ---- Legend (lower right, light-gray frame) ----
+# The Pareto handle uses C_DASH for its dashed segment: the previous 'w' drew it white on the white legend box
+legend_elements = [
+    mlines.Line2D([], [], marker='*', color=C_DASH, markerfacecolor=C_PARETO,
+                  markersize=11, label='Ours (Pareto)',
+                  linestyle='--', linewidth=1.2,
+                  markeredgewidth=0.3, markeredgecolor='white'),
+    mlines.Line2D([], [], marker='o', color='w', markerfacecolor=C_NONPARETO,
+                  markersize=7, label='Ours (non-Pareto)', linestyle='None'),
+    mlines.Line2D([], [], marker='^', color='w', markerfacecolor=C_MCE,
+                  markersize=9, label='MCE', linestyle='None'),
+    mlines.Line2D([], [], marker='D', color='w', markerfacecolor=C_ACE,
+                  markersize=8, label='ACE', linestyle='None'),
+    mlines.Line2D([], [], marker='X', color='w', markerfacecolor=C_ZS,
+                  markersize=9, label='Zero-shot', linestyle='None'),
+    mlines.Line2D([], [], marker='o', color='w', markerfacecolor=C_FEW,
+                  markersize=8, label='Few-shot', linestyle='None'),
+]
+
+leg = ax1.legend(
+    handles=legend_elements,
+    loc='lower right',
+    fontsize=9.0,
+    frameon=True,
+    facecolor='white',
+    edgecolor='#CCCCCC',
+    framealpha=1.0,
+    borderpad=0.5,
+    labelspacing=0.3,
+    handletextpad=0.4,
+)
+
+from pathlib import Path
+
+output_path = Path('output/figures/scatter_break_repro.png')
+output_path.parent.mkdir(parents=True, exist_ok=True)
+fig.savefig(
+    output_path,
+    dpi=300, facecolor='white',
+)
+plt.close(fig)
+print('saved: scatter_break_repro.png')
diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_tsne.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_tsne.py
new file mode 100644
index 0000000..3369121
--- /dev/null
+++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/scatter_tsne.py
@@ -0,0 +1,137 @@
+"""
+Reproduce: image7.png — t-SNE Latent Memory Visualization
+Style: serif (Computer Modern via usetex), light gray grid,
+       4-spine box, annotation boxes with cluster color edges.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+
+plt.rcParams.update({
+    'text.usetex': True,
+    'font.family': 'serif',
+    'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'],
+    'axes.unicode_minus': False,
+})
+
+rng = np.random.default_rng(42)
+
+def cluster(cx, cy, n, rx=8, ry=8, shape='round'):
+    """Sample n points around (cx, cy) with spreads rx, ry; shape='round'|'elongated'."""
+    if shape == 'elongated':
+        # Rayleigh-distributed radius at a uniform random angle, scaled per axis
+        angles = rng.uniform(0, 2 * np.pi, n)
+        r = rng.rayleigh(1.0, n)
+        x = cx + rx * r * np.cos(angles)
+        y = cy + ry * r * np.sin(angles)
+    else:
+        # Independent normal draws per axis
+        x = rng.normal(cx, rx, n)
+        y = rng.normal(cy, ry, n)
+    return x, y
+
+
+# ---- Per-dataset color and cluster parameters (colors follow the original figure strictly) ----
+DS = {
+    'GSM8K': {'color': '#6A4C93', 'n': 900, 'cx': 10, 'cy': 12, 'rx': 9, 'ry': 12},
+    'MATH': {'color': '#D651A0', 'n': 700, 'cx': 8, 'cy': 32, 'rx': 7, 'ry': 8},
+    'GPQA': {'color': '#F06292', 'n': 300, 'cx': 18, 'cy': 50, 'rx': 5, 'ry': 6},
+    'KodCode': {'color': '#FF8A65', 'n': 500, 'cx': 38, 'cy': -10, 'rx': 9, 'ry': 10},
+    'BCB': {'color': '#FFB74D', 'n': 600, 'cx': 18, 'cy': -30, 'rx': 10, 'ry': 9},
+    'ALFWorld': {'color': '#FFF176', 'n': 280, 'cx': -10, 'cy': -42, 'rx': 12, 'ry': 10},   # yellow!
+    'TriviaQA': {'color': '#C888E8', 'n': 700, 'cx': -42, 'cy': 5, 'rx': 14, 'ry': 22},
+}
+
+# ---- Annotation boxes (one shared dark edge color; GPQA gets a box too) ----
+ANNOTS = [
+    {'name': 'MATH', 'xy': (8, 32), 'xytext': (8, 32)},
+    {'name': 'GSM8K', 'xy': (10, 10), 'xytext': (10, 10)},
+    {'name': 'GPQA', 'xy': (18, 52), 'xytext': (18, 52)},
+    {'name': 'KodCode', 'xy': (38,-10), 'xytext': (38,-10)},
+    {'name': 'BCB', 'xy': (18,-30), 'xytext': (18,-30)},
+    {'name': 'ALFWorld', 'xy': (-10,-42), 'xytext': (-10,-42)},
+    {'name': 'TriviaQA', 'xy': (-42, 5), 'xytext': (-42, 5)},
+]
+BBOX_EDGECOLOR = '#2C3E50'   # shared dark blue-gray
+
+fig, ax = plt.subplots(figsize=(7.5, 6.2))
+
+for name, cfg in DS.items():
+    x, y = cluster(cfg['cx'], cfg['cy'], cfg['n'], cfg['rx'], cfg['ry'])
+    ax.scatter(x, y, c=cfg['color'], s=14, alpha=0.55,
+               linewidths=0, rasterized=True, label=name, zorder=2)
+
+# ---- Annotation boxes ----
+import matplotlib.colors as mcolors   # imported once here instead of on every loop iteration
+for ann in ANNOTS:
+    color = DS[ann['name']]['color']
+    # Box fill: the cluster's own color at alpha 0.28 (light translucent tint, as in the original figure)
+    face = mcolors.to_rgba(color, alpha=0.28)
+    # xy == xytext for every entry, so this only places a boxed label (no arrowprops are passed)
+    ax.annotate(
+        r'\textbf{' + ann['name'] + r'}',
+        xy=ann['xy'], xytext=ann['xytext'],
+        fontsize=10.0,
+        bbox=dict(
+            boxstyle='round,pad=0.30',
+            facecolor=face,
+            edgecolor=BBOX_EDGECOLOR,
+            linewidth=0.9,
+        ),
+        ha='center', va='center', zorder=5,
+    )
+
+# ---- Axes styling ----
+ax.set_xlabel(r'\textbf{t-SNE Component 1}', fontsize=12)
+ax.set_ylabel(r'\textbf{t-SNE Component 2}', fontsize=12)
+ax.set_title(
+    r'\textbf{Latent Memory Visualization}' + '\n' + r'\textbf{(across all benchmarks)}',
+    fontsize=13.5, pad=8, linespacing=1.4,
+)
+
+ax.set_xlim(-88, 70)
+ax.set_ylim(-75, 80)
+ax.xaxis.set_major_locator(plt.MultipleLocator(20))
+ax.yaxis.set_major_locator(plt.MultipleLocator(20))
+
+# All four spines, dark gray close to the original figure
+for sp in ax.spines.values():
+    sp.set_visible(True)
+    sp.set_linewidth(0.9)
+    sp.set_color('#333333')
+
+ax.tick_params(direction='in', length=4, width=0.8, labelsize=10,
+               color='#333333')
+
+# Light-gray dotted grid (style of the original figure)
+ax.grid(True, color='#E0E0E0', linewidth=0.6, linestyle=':', zorder=0) +ax.set_axisbelow(True) + +# ---- 图例(原图有白底浅灰框) ---- +leg = ax.legend( + loc='upper right', + fontsize=9.5, + frameon=True, + facecolor='white', + edgecolor='#CCCCCC', + framealpha=1.0, + markerscale=1.0, + handlelength=0.8, + handleheight=0.8, + handletextpad=0.5, + labelspacing=0.25, + borderpad=0.5, + borderaxespad=0.5, +) + +from pathlib import Path + +output_path = Path('output/figures/scatter_tsne_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.tight_layout(pad=0.9) +fig.savefig( + output_path, + dpi=300, facecolor='white', +) +plt.close(fig) +print('saved: scatter_tsne_repro.png') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/stacked_bar.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/stacked_bar.py new file mode 100644 index 0000000..c586e56 --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/stacked_bar.py @@ -0,0 +1,121 @@ +""" +堆叠柱状图(stacked bar chart) +特征:每列堆叠多个组分,展示结构占比,蓝灰递进色阶 +来源:学术论文中展示组成结构或任务分解 +""" + +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + +# ── 全局样式 ───────────────────────────────────────────────────── +plt.rcParams.update({ + 'font.family': 'sans-serif', + 'font.sans-serif': ['DejaVu Sans', 'Arial', 'Helvetica'], + 'text.usetex': False, +}) + +# ── 颜色(蓝灰递进色阶)────────────────────────────────────── +C_LAYERS = [ + '#D3D3D3', # 底层(最浅灰) + '#A8C8E8', # 第二层(浅钢蓝) + '#5499C7', # 第三层(中蓝) + '#1B3D6E', # 顶层(深蓝,主组分) +] + +# ── 数据(请替换为你的数据)─────────────────────────────────── +categories = ['Task A', 'Task B', 'Task C', 'Task D', 'Task E'] + +# 每层一个 dict,key = 组分名,value = 该组分在每个 category 的数值 +components = { + 'Base Model': [20, 25, 15, 30, 22], + 'Feature Ex': [35, 30, 40, 25, 33], + 'Fusion': [45, 45, 45, 45, 45], +} + +# ── 参数配置 ───────────────────────────────────────────────── +TITLE = r'Performance Breakdown by Task' 
+YLABEL = r'Score'
+YLIM = (0, 110)
+BAR_W = 0.5
+BOTTOM_GAP = 0.05
+
+# ── Data parsing ──────────────────────────────────────────────
+component_names = list(components.keys())
+n_groups = len(categories)
+x_center = np.arange(n_groups)
+
+vals_list = [np.array(components[name]) for name in component_names]   # one array per layer
+
+# Check: a column's stacked total must fit under YLIM[1]; if not, warn and really extend YLIM
+col_sums = np.sum(vals_list, axis=0)
+if col_sums.max() > YLIM[1]:
+    import warnings
+    warnings.warn(f'堆叠总量 {col_sums.max()} 超过 YLIM[1]={YLIM[1]},已自动扩展 YLIM')
+    YLIM = (YLIM[0], float(col_sums.max()) * 1.1)   # 10% headroom above the tallest stack
+
+# ── Canvas ────────────────────────────────────────────────────
+fig, ax = plt.subplots(figsize=(8.0, 5.0))
+
+# ── Stacked bars ──────────────────────────────────────────────
+# bottom starts at 0 and accumulates layer by layer
+bottom = np.zeros(n_groups)
+
+# Cycle through the palette so more layers than colors no longer raises IndexError
+colors = [C_LAYERS[i % len(C_LAYERS)] for i in range(len(component_names))]
+
+for i, (name, vals) in enumerate(zip(component_names, vals_list)):
+    ax.bar(
+        x_center, vals,
+        width=BAR_W,
+        bottom=bottom,
+        color=colors[i],
+        edgecolor='white',
+        linewidth=0.5,
+        label=name,
+        zorder=2,
+    )
+    # The next layer sits on top of this one
+    bottom += vals
+
+# ── Styling ───────────────────────────────────────────────────
+ax.spines['top'].set_visible(False)
+ax.spines['right'].set_visible(False)
+ax.spines['left'].set_color('#333333')
+ax.spines['bottom'].set_color('#333333')
+ax.spines['left'].set_linewidth(0.9)
+ax.spines['bottom'].set_linewidth(0.9)
+
+# y-axis grid (same as bar_grouped_hatch)
+ax.yaxis.grid(True, color='#EBEBEB', linewidth=0.7, linestyle='--', zorder=0)
+ax.set_axisbelow(True)
+
+# Ticks
+ax.tick_params(length=3, direction='out', labelsize=9)
+
+# Labels
+ax.set_xticks(x_center)
+ax.set_xticklabels(categories, fontsize=10)
+ax.set_ylabel(YLABEL, fontsize=10)
+ax.set_ylim(*YLIM)
+ax.set_title(TITLE, fontsize=12, fontweight='bold', color='#333333', pad=8)
+
+# Legend
+leg = ax.legend(
+    fontsize=9,
+    loc='upper right',
+    bbox_to_anchor=(1.01, 1.0),
+    frameon=True,
+    facecolor='white',
+    edgecolor='#CCCCCC',
+    labelspacing=0.35,
+    handlelength=1.5,
+    handletextpad=0.5,
+    borderaxespad=0.3,
+)
+
+# ── Save 
───────────────────────────────────────────────────── +output_path = Path('output/figures/stacked_bar_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig(output_path, dpi=300, facecolor='white', bbox_inches='tight') +plt.close(fig) +print(f'✅ saved: {output_path}') diff --git a/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/violin_plot.py b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/violin_plot.py new file mode 100644 index 0000000..c1e17ed --- /dev/null +++ b/skills/data-flow-skill/data-flow-skill/scripts/visualization/matplotlib/violin_plot.py @@ -0,0 +1,139 @@ +""" +小提琴图(violin plot) +特征:数据密度分布可视化,内部叠加 mini box plot(中位线+四分位线) +来源:学术统计图表风格,与 box_plot 共用配色体系 +""" + +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np +from pathlib import Path + +# ── 全局样式(与 box_plot 完全一致)──────────────────────────── +plt.rcParams.update({ + 'text.usetex': True, + 'font.family': 'serif', + 'font.serif': ['Computer Modern Roman', 'STIX Two Text', 'DejaVu Serif'], + 'axes.unicode_minus': False, +}) + +# ── 颜色(与 box_plot 一致)────────────────────────────────── +C_VIOLIN = '#5499C7' # 小提琴主体(蓝) +C_BOX = '#1B3D6E' # 内部叠加箱体(深蓝) +C_MED = '#CC2200' # 中位线(红) +C_OUTL = '#D651A0' # 异常值(粉) + +# ── 数据(请替换为你的数据)─────────────────────────────────── +data = { + 'Method A': [23.5, 25.1, 24.8, 26.2, 27.0, 25.5, 24.9, 26.8, 25.0, 24.3], + 'Method B': [28.3, 29.1, 27.8, 30.2, 29.5, 28.9, 30.1, 29.0, 28.7, 29.3], + 'Ours': [31.2, 32.5, 31.8, 33.1, 32.0, 31.5, 32.8, 33.4, 31.9, 32.2], +} + +labels = list(data.keys()) +values = [np.array(v) for v in data.values()] + +# ── 参数配置 ───────────────────────────────────────────────── +TITLE = r'\textbf{Distribution Comparison (Violin)}' +XLABEL = r'\textbf{Method}' +YLABEL = r'\textit{Accuracy (\%)}' +YLIM = (15, 40) +VIOLIN_ALPHA = 0.6 + +# ── 画布 ───────────────────────────────────────────────────── +fig, ax = plt.subplots(figsize=(7.5, 5.0)) + 
+x_pos = np.arange(len(labels))
+
+# Draw the violins
+vp = ax.violinplot(
+    values,
+    positions=x_pos,
+    widths=0.5,
+    showmeans=False,
+    showmedians=False,
+)
+
+# Violin color and transparency
+for body in vp['bodies']:
+    body.set_facecolor(C_VIOLIN)
+    body.set_alpha(VIOLIN_ALPHA)
+    body.set_edgecolor('#333333')
+    body.set_linewidth(1.0)
+
+# Hide violinplot's own center bar and caps (keep only the violin shape)
+for partname in ('cbars', 'cmins', 'cmaxes'):
+    parts = vp.get(partname)
+    if parts is not None:
+        parts.set_visible(False)
+
+# ── Mini box plot overlaid inside each violin ─────────────────
+# Quartiles, median and whisker ends are computed by hand
+def get_stats(arr):
+    """Return (lo, q1, med, q3, hi); whiskers stop at the last data point inside the 1.5*IQR fences."""
+    arr = np.asarray(arr, dtype=float)
+    q1, med, q3 = np.percentile(arr, [25, 50, 75])
+    iqr = q3 - q1
+    inside = arr[(arr >= q1 - 1.5 * iqr) & (arr <= q3 + 1.5 * iqr)]
+    lo = min(inside.min(), q1) if inside.size else q1   # never let a whisker start inside the box
+    hi = max(inside.max(), q3) if inside.size else q3
+    return lo, q1, med, q3, hi
+
+for xi, vals in zip(x_pos, values):
+    lo, q1, med, q3, hi = get_stats(vals)
+    bw = 0.12   # half-width of the mini box
+
+    # Box (translucent dark blue) from q1 to q3
+    ax.fill_between(
+        [xi - bw, xi + bw], [q1, q1], [q3, q3],
+        color=C_BOX, alpha=0.6, zorder=4
+    )
+    # Median line (red, thick)
+    ax.plot([xi - bw, xi + bw], [med, med],
+            color=C_MED, linewidth=2.0, zorder=5)
+    # Whiskers (from the box edges out to lo / hi)
+    ax.plot([xi, xi], [lo, q1], color='#333333', linewidth=1.2, zorder=4)
+    ax.plot([xi, xi], [q3, hi], color='#333333', linewidth=1.2, zorder=4)
+    # Whisker end caps
+    ax.plot([xi - bw * 0.6, xi + bw * 0.6], [lo, lo], color='#333333', lw=1.2, zorder=4)
+    ax.plot([xi - bw * 0.6, xi + bw * 0.6], [hi, hi], color='#333333', lw=1.2, zorder=4)
+
+# ── Styling ───────────────────────────────────────────────────
+ax.spines['top'].set_visible(False)
+ax.spines['right'].set_visible(False)
+ax.spines['left'].set_color('#333333')
+ax.spines['bottom'].set_color('#333333')
+ax.spines['left'].set_linewidth(1.2)
+ax.spines['bottom'].set_linewidth(1.2)
+
+# y-axis grid (same as bar_grouped_hatch / box_plot)
+ax.yaxis.grid(True, color='#EBEBEB', linewidth=0.7, linestyle='--', zorder=0)
+ax.set_axisbelow(True)
+
+# Ticks
+ax.tick_params(length=4, direction='in', labelsize=10)
+
+# 标签 +ax.set_xticks(x_pos) +ax.set_xticklabels(labels, fontsize=11) +ax.set_xlabel(XLABEL, fontsize=11) +ax.set_ylabel(YLABEL, fontsize=11) +ax.set_ylim(*YLIM) +ax.set_title(TITLE, fontsize=13, pad=8) + +# 图例 +from matplotlib.lines import Line2D +legend_elements = [ + Line2D([0], [0], color=C_MED, linewidth=2.0, label=r'\textit{Median}'), + mpatches.Patch(facecolor=C_VIOLIN, alpha=VIOLIN_ALPHA, edgecolor='#333333', + label=r'\textit{Density}'), +] +ax.legend(handles=legend_elements, fontsize=9, loc='upper right', + frameon=True, facecolor='white', edgecolor='#CCCCCC') + +# ── 保存 ───────────────────────────────────────────────────── +output_path = Path('output/figures/violin_plot_repro.png') +output_path.parent.mkdir(parents=True, exist_ok=True) +fig.savefig(output_path, dpi=300, facecolor='white', bbox_inches='tight') +plt.close(fig) +print(f'✅ saved: {output_path}') diff --git "a/\346\212\200\350\203\275\346\270\205\345\215\225.md" "b/\346\212\200\350\203\275\346\270\205\345\215\225.md" index cca0f7c..626eea4 100644 --- "a/\346\212\200\350\203\275\346\270\205\345\215\225.md" +++ "b/\346\212\200\350\203\275\346\270\205\345\215\225.md" @@ -54,43 +54,44 @@ 37. dream-video-prompt-generator ⭐⭐ 38. agentkit-multimedia-shopping ⭐⭐ -### 文档与分析(4个) -39. paper-analysis-assistant ⭐⭐⭐⭐ -40. contract-review ⭐⭐⭐ -41. law-to-markdown ⭐⭐ -42. stock-analysis ⭐⭐⭐ +### 文档与分析(5个) +39. data-flow-skill ⭐⭐⭐⭐⭐ +40. paper-analysis-assistant ⭐⭐⭐⭐ +41. contract-review ⭐⭐⭐ +42. law-to-markdown ⭐⭐ +43. stock-analysis ⭐⭐⭐ ### 智能体协作(3个) -43. agent-team ⭐⭐⭐ -44. multi-agent-meeting ⭐⭐ -45. peers-advisory-group ⭐⭐ +44. agent-team ⭐⭐⭐ +45. multi-agent-meeting ⭐⭐ +46. peers-advisory-group ⭐⭐ ### 产品与项目管理(2个) -46. product-manager-toolkit ⭐⭐⭐ -47. sales-ai-assistant ⭐⭐ +47. product-manager-toolkit ⭐⭐⭐ +48. sales-ai-assistant ⭐⭐ ### 设计与可视化(5个) -48. frontend-design ⭐⭐⭐ -49. ai-drawio ⭐⭐⭐⭐ -50. pop-up-book-illustration ⭐⭐ -51. web-to-app ⭐⭐ -52. web-design-analyzer (待补充) +49. frontend-design ⭐⭐⭐ +50. 
ai-drawio ⭐⭐⭐⭐ +51. pop-up-book-illustration ⭐⭐ +52. web-to-app ⭐⭐ +53. web-design-analyzer (待补充) ### 文档处理(4个 - 系统内置) -53. pptx ⭐⭐⭐⭐⭐ -54. xlsx ⭐⭐⭐⭐⭐ -55. pdf ⭐⭐⭐⭐⭐ -56. docx ⭐⭐⭐⭐⭐ +54. pptx ⭐⭐⭐⭐⭐ +55. xlsx ⭐⭐⭐⭐⭐ +56. pdf ⭐⭐⭐⭐⭐ +57. docx ⭐⭐⭐⭐⭐ ### 技能管理(2个 - 系统内置) -57. find-skill ⭐⭐⭐ -58. skill-creator ⭐⭐⭐ +58. find-skill ⭐⭐⭐ +59. skill-creator ⭐⭐⭐ ### 财务分析(2个) -59. creating-financial-models ⭐⭐⭐⭐ -60. market-research-reports ⭐⭐⭐⭐ +60. creating-financial-models ⭐⭐⭐⭐ +61. market-research-reports ⭐⭐⭐⭐ ### 文化创作(1个) -61. poetry-music-visual ⭐⭐ +62. poetry-music-visual ⭐⭐ -总计:61个技能 +总计:62个技能