diff --git a/.gitignore b/.gitignore index 2a83f0af..efc032af 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ workspace/log_ngp.txt models/ *.log dist +*.pem +*.sh diff --git a/README-EN.md b/README-EN.md deleted file mode 100644 index 508192a6..00000000 --- a/README-EN.md +++ /dev/null @@ -1,217 +0,0 @@ -

- -

- -English | [中文版](./README.md) - -

- - - - - -

-

-lipku%2FLiveTalking | Trendshift -

- -A real-time interactive streaming digital human engine enabling synchronized audio-video conversation, widely adopted in commercial applications. - -**Demos**: [wav2lip](https://youtu.be/-ss0H8qLr7E) | [ernerf](https://www.bilibili.com/video/BV1G1421z73r/) | [musetalk](https://youtu.be/vzUMruoZlxc/) - -Domestic Mirror: - ---- - -## Features -1. Supports multiple digital human models: ernerf, musetalk, wav2lip, Ultralight-Digital-Human -2. Supports voice cloning -3. Supports interrupting the digital human while speaking -4. Supports full-body video stitching -5. Supports WebRTC, RTMP, and virtual camera output -6. Supports action choreography: plays custom videos when not speaking -7. Supports multi-concurrency -8. Supports custom digital human avatars -9. Provides frontend API integration - ---- - -## Usage Scenarios - -LiveTalking leverages real-time streaming digital human technology to drive virtual avatars via text or voice, combined with LLM for intelligent conversation. Suitable for the following scenarios: - -| Scenario | Description | -|----------|-------------| -| **Virtual Streamer / Live Commerce** | 24/7 unmanned live streaming with LLM-generated sales scripts and action choreography for natural performance | -| **AI Digital Human Customer Service** | Integrate enterprise knowledge bases for real-time voice Q&A with interruption support | -| **Online Education / Training** | Digital teacher分身 for course recording, or API-driven digital instructor for real-time lectures | -| **Intelligent Voice Assistant** | Pair with smart speakers or apps, calling the `/human` API to drive digital human voice interactions | -| **Large Screen Presentation** | Digital human presenter for exhibition halls, event venues, and other content narration scenarios | -| **Batch Short Video Creation** | Submit scripts in batch via API to generate digital human videos without real-person filming, using `/human` + `/record` APIs | - -**Core Flow**: User input (text/audio) → LLM 
response (optional) → TTS speech synthesis → Real-time lip-sync → Audio/video streaming output - ---- - -## 1. Installation - -Tested on Ubuntu 24.04, Python 3.12, PyTorch 2.9.1, CUDA 13.0. - -### 1.1 Install Dependencies - -```bash -git clone https://github.com/lipku/LiveTalking.git -conda create -n livetalking python=3.12 -conda activate livetalking -# If CUDA version is not 13.0 (check via nvidia-smi), install the corresponding PyTorch version(https://pytorch.org/get-started/previous-versions) -pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130 -cd LiveTalking -pip install -r requirements.txt -``` - -Installation FAQ: - -Linux CUDA environment setup: - ---- - -## 2. Quick Start - -### 2.1 Download Models - -| Source | Link | -|--------|------| -| Quark Cloud | | -| Google Drive | | - -1. Copy `wav2lip256.pth` to the project's `models/` directory and rename it to `wav2lip.pth` -2. Extract `wav2lip256_avatar1.tar.gz` and copy the entire extracted folder to `data/avatars/` - -### 2.2 Start the Server - -```bash -python app.py --transport webrtc --model wav2lip --avatar_id wav2lip256_avatar1 -``` - -> **Note**: The server must open ports TCP:8010, UDP:1-65536 - -### 2.3 Client Access - -| Method | Description | -|--------|-------------| -| Browser | Open `http://serverip:8010/index.html`, click "Start Connection" to play the digital human video, then enter text and submit | -| API | See [API Docs](docs/api.md) for HTTP-based integration | -| Desktop App | Download: | - -### 2.4 Web Pages - -| Page | URL | Description | -|------|-----|-------------| -| Home | `/index.html` | WebRTC connection + text/audio driver + recording control | -| Avatar Creator | `/avatar.html` | Upload video to auto-generate digital human avatars | -| Admin Console | `/admin.html` | Real-time session monitoring & global configuration | - - - -### 2.5 Quick Experience - -Create an instance with a cloud image to run instantly: - -- 
[UCloud Image](https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking) - -### 2.6 Documentation - - ---- - -## 3. Architecture - -### Dataflow Diagram - - - -### Layer Overview - -**API Layer** -- `/human`: Accepts text, supporting echo (direct playback) and chat (LLM conversation) modes -- `/humanaudio`: Accepts audio files for direct playback -- Each connection is assigned a unique `sessionid`, supporting multi-user concurrency - -**Logic Layer** -- **LLM Engine**: Integrates with models like Qwen to generate conversational responses -- **TTS Engine**: Modular design supporting EdgeTTS, GPT-SoVITS, CosyVoice, Tencent Cloud, and more -- **Feature Extraction**: Synchronously extracts acoustic features (e.g., Mel spectrograms) for lip-sync inference - -**Rendering Layer** -- **Model Inference**: Uses deep learning models (Wav2Lip, MuseTalk, etc.) to generate lip-sync frames from audio features -- **Post-Processing**: Smoothly overlays the generated mouth region back onto the original high-definition video - -**Streaming Layer** -- **WebRTC**: Low-latency browser-based streaming -- **RTMP**: Standard live streaming protocol, supports pushing to platforms like Bilibili/YouTube -- **Virtual Camera**: Outputs as a system camera device - -**Plugin System** -- Decentralized registration mechanism based on [registry.py](registry.py), allowing developers to extend TTS, Avatar, and Output modules - ---- - -## 4. API Documentation - -| Document | Description | -|----------|-------------| -| [docs/api.md](docs/api.md) | General API — WebRTC, text/audio driver, recording, action choreography | -| [docs/avatar_api.md](docs/avatar_api.md) | Avatar Generation API — create tasks, query progress, delete tasks | -| [docs/admin_api.md](docs/admin_api.md) | Admin API — global config, session monitoring, force stop | - ---- - -## 5. 
Docker - -Available images: -- **AutoDL**: — [Tutorial](https://doc.livetalking.ai/en/docs/autodl/) -- **UCloud**: — Supports opening any port, no additional SRS deployment required — [Tutorial](https://doc.livetalking.ai/en/docs/ucloud/) - -> AutoDL cannot open UDP ports, so you need to deploy SRS or TURN relay service separately. - ---- - -## 6. Performance - -- Each video stream compression consumes CPU; higher resolution means greater CPU usage. Each lip-sync inference consumes GPU -- Concurrent sessions when not speaking depend on CPU; concurrent speaking sessions depend on GPU -- In backend logs: `inferfps` = GPU inference frame rate, `finalfps` = final streaming frame rate. Both must be >= 25 for real-time performance - -### Real-Time Inference Performance - -| Model | GPU | FPS | -|:------|:----|:----| -| wav2lip256 | RTX 3060 | 60 | -| wav2lip256 | RTX 3080Ti | 120 | -| musetalk | RTX 3080Ti | 42 | -| musetalk | RTX 3090 | 45 | -| musetalk | RTX 4090 | 72 | - -- wav2lip256: RTX 3060 or higher recommended -- musetalk: RTX 3080Ti or higher recommended - ---- - -## 7. Statement - -Videos developed based on this project and published on platforms such as Bilibili, WeChat Channels, and Douyin must include the LiveTalking watermark and logo. - ---- - -If this project is helpful to you, please give it a Star. Contributors interested in improving this project are also welcome. - -| Community | Link | -|-----------|------| -| Knowledge Planet | | -| WeChat | wxwubug (mention for group invite) | -| Telegram | | -| Discord | | -| Email | lipku@foxmail.com | -| WeChat Official | 数字人技术 | - - diff --git a/README.md b/README.md index fa37432d..508192a6 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,7 @@

-中文版 | [English](./README-EN.md) - +English | [中文版](./README.md)

@@ -16,205 +15,203 @@ lipku%2FLiveTalking | Trendshift

-实时交互流式数字人引擎,实现音视频同步对话,已在业内获得广泛商用 +A real-time interactive streaming digital human engine enabling synchronized audio-video conversation, widely adopted in commercial applications. -**效果演示**: [wav2lip](https://www.bilibili.com/video/BV1scwBeyELA/) | [ernerf](https://www.bilibili.com/video/BV1G1421z73r/) | [musetalk](https://www.bilibili.com/video/BV1bUwezvEnG/) +**Demos**: [wav2lip](https://youtu.be/-ss0H8qLr7E) | [ernerf](https://www.bilibili.com/video/BV1G1421z73r/) | [musetalk](https://youtu.be/vzUMruoZlxc/) -国内镜像: +Domestic Mirror: --- ## Features -1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human -2. 支持声音克隆 -3. 支持数字人说话被打断 -4. 支持全身视频拼接 -5. 支持 WebRTC、RTMP、虚拟摄像头输出 -6. 支持动作编排:不说话时播放自定义视频 -7. 支持多并发 -8. 支持自定义数字人形象 -9. 提供前端API接口对接 +1. Supports multiple digital human models: ernerf, musetalk, wav2lip, Ultralight-Digital-Human +2. Supports voice cloning +3. Supports interrupting the digital human while speaking +4. Supports full-body video stitching +5. Supports WebRTC, RTMP, and virtual camera output +6. Supports action choreography: plays custom videos when not speaking +7. Supports multi-concurrency +8. Supports custom digital human avatars +9. Provides frontend API integration --- -## 使用场景 +## Usage Scenarios -LiveTalking 基于实时流式数字人技术,通过文本或语音驱动虚拟形象说话,结合 LLM 实现智能对话。适用于以下场景: +LiveTalking leverages real-time streaming digital human technology to drive virtual avatars via text or voice, combined with LLM for intelligent conversation. 
Suitable for the following scenarios: -| 场景 | 说明 | -|------|------| -| **虚拟主播/直播带货** | 24 小时无人直播,通过 LLM 自动生成带货话术,配合动作编排实现自然表现 | -| **AI 数字人客服** | 接入企业知识库,用户语音提问,数字人实时回答,支持打断重说 | -| **在线教育/培训** | 教师数字分身录制课程,或通过 API 驱动数字人讲师实时授课 | -| **智能语音助手** | 结合智能音箱或 APP,调用 `/human` 接口驱动数字人进行语音对话交互 | -| **大屏讲解** | 数字人讲解员在展厅大屏、活动现场等场景进行内容讲解和互动 | -| **短视频批量制作** | 通过 API 批量提交文案生成数字人出镜视频,无需真人拍摄,调用 `/human` + `/record` 接口 | +| Scenario | Description | +|----------|-------------| +| **Virtual Streamer / Live Commerce** | 24/7 unmanned live streaming with LLM-generated sales scripts and action choreography for natural performance | +| **AI Digital Human Customer Service** | Integrate enterprise knowledge bases for real-time voice Q&A with interruption support | +| **Online Education / Training** | A teacher's digital avatar for course recording, or an API-driven digital instructor for real-time lectures | +| **Intelligent Voice Assistant** | Pair with smart speakers or apps, calling the `/human` API to drive digital human voice interactions | +| **Large Screen Presentation** | Digital human presenter for exhibition halls, event venues, and other content narration scenarios | +| **Batch Short Video Creation** | Submit scripts in batch via API to generate digital human videos without real-person filming, using `/human` + `/record` APIs | -**核心流程**:用户输入文字/音频 → LLM 生成回复(可选)→ TTS 合成语音 → 数字人实时口型同步 → 音视频推流输出 +**Core Flow**: User input (text/audio) → LLM response (optional) → TTS speech synthesis → Real-time lip-sync → Audio/video streaming output --- -## 1. 安装 +## 1. Installation -已在 Ubuntu 22.04、Python 3.12、PyTorch 2.9.1、CUDA 13.0 测试通过。 +Tested on Ubuntu 24.04, Python 3.12, PyTorch 2.9.1, CUDA 13.0. 
-### 1.1 安装依赖 +### 1.1 Install Dependencies ```bash git clone https://github.com/lipku/LiveTalking.git conda create -n livetalking python=3.12 conda activate livetalking -# 如果 CUDA 版本不为 13.0 (运行 nvidia-smi 确认),请根据 PyTorch 官网(https://pytorch.org/get-started/previous-versions)安装对应版本 +# If CUDA version is not 13.0 (check via nvidia-smi), install the corresponding PyTorch version (https://pytorch.org/get-started/previous-versions) pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130 cd LiveTalking pip install -r requirements.txt ``` -安装常见问题:[FAQ](https://doc.livetalking.ai/docs/faq/) +Installation FAQ: [FAQ](https://doc.livetalking.ai/docs/faq/) -Linux CUDA 环境搭建参考: +Linux CUDA environment setup: --- -## 2. 快速开始 +## 2. Quick Start -### 2.1 下载模型 +### 2.1 Download Models -| 网盘 | 地址 | -|------|------| -| 夸克云盘 | | +| Source | Link | +|--------|------| +| Quark Cloud | | | Google Drive | | -1. 将 `wav2lip256.pth` 拷贝到项目的 `models/` 目录下,重命名为 `wav2lip.pth` -2. 将 `wav2lip256_avatar1.tar.gz` 解压后整个文件夹拷贝到 `data/avatars/` 目录下 +1. Copy `wav2lip256.pth` to the project's `models/` directory and rename it to `wav2lip.pth` +2. 
Extract `wav2lip256_avatar1.tar.gz` and copy the entire extracted folder to `data/avatars/` -### 2.2 启动服务 +### 2.2 Start the Server ```bash python app.py --transport webrtc --model wav2lip --avatar_id wav2lip256_avatar1 ``` +> **Note**: The server must open ports TCP:8010, UDP:1-65535 -> **注意**: 服务端需开放端口 TCP:8010, UDP:1-65536 - - -### 2.3 客户端接入 +### 2.3 Client Access -| 方式 | 说明 | -|------|------| -| 浏览器 | 打开 `http://serverip:8010/index.html`,点击"开始连接"播放数字人视频,在文本框输入文字提交即可 | -| API 调用 | 参考 [API 文档](docs/api.md) 通过 HTTP 接口驱动 | -| 桌面客户端 | 下载地址: | +| Method | Description | +|--------|-------------| +| Browser | Open `http://serverip:8010/index.html`, click "Start Connection" to play the digital human video, then enter text and submit | +| API | See [API Docs](docs/api.md) for HTTP-based integration | +| Desktop App | Download: | -### 2.4 Web 页面 +### 2.4 Web Pages -| 页面 | 地址 | 说明 | -|------|------|------| -| 首页 | `/index.html` | WebRTC 连接 + 文本/音频驱动 + 录制控制 | -| Avatar 生成 | `/avatar.html` | 上传视频自动生成数字人形象 | -| 管理后台 | `/admin.html` | 实时监控会话状态与全局配置 | +| Page | URL | Description | +|------|-----|-------------| +| Home | `/index.html` | WebRTC connection + text/audio driver + recording control | +| Avatar Creator | `/avatar.html` | Upload video to auto-generate digital human avatars | +| Admin Console | `/admin.html` | Real-time session monitoring & global configuration | -### 2.5 快速体验 +### 2.5 Quick Experience -使用在线镜像创建实例即可运行: +Create an instance with a cloud image to run instantly: -- [UCloud 镜像](https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking) +- [UCloud Image](https://www.compshare.cn/images/4458094e-a43d-45fe-9b57-de79253befe4?referral_code=3XW3852OBmnD089hMMrtuU&ytag=GPU_GitHub_livetalking) + +### 2.6 Documentation + -### 2.6 使用说明 - --- -## 3. 
Architecture -### 数据流图 +### Dataflow Diagram +### Layer Overview -### 各层说明 - -**API 层** -- `/human`: 接收文本,支持 echo(直接复读)和 chat(LLM 对话)模式 -- `/humanaudio`: 接收音频文件直接播放 -- 每个连接分配唯一 `sessionid`,支持多用户并发 +**API Layer** +- `/human`: Accepts text, supporting echo (direct playback) and chat (LLM conversation) modes +- `/humanaudio`: Accepts audio files for direct playback +- Each connection is assigned a unique `sessionid`, supporting multi-user concurrency -**逻辑层** -- **LLM 引擎**: 对接 Qwen 等大模型生成对话回复 -- **TTS 引擎**: 模块化设计,支持 EdgeTTS、GPT-SoVITS、CosyVoice、腾讯云等多种方案 -- **特征提取**: 同步提取音频的声学特征(如 Mel 频谱),用于口型推理 +**Logic Layer** +- **LLM Engine**: Integrates with models like Qwen to generate conversational responses +- **TTS Engine**: Modular design supporting EdgeTTS, GPT-SoVITS, CosyVoice, Tencent Cloud, and more +- **Feature Extraction**: Synchronously extracts acoustic features (e.g., Mel spectrograms) for lip-sync inference -**渲染层** -- **模型推理**: 使用深度学习模型 (Wav2Lip, MuseTalk 等) 根据音频特征生成口型画面 -- **后处理**: 将生成的口型区域平滑贴回原始高清视频 +**Rendering Layer** +- **Model Inference**: Uses deep learning models (Wav2Lip, MuseTalk, etc.) to generate lip-sync frames from audio features +- **Post-Processing**: Smoothly overlays the generated mouth region back onto the original high-definition video -**推流层** -- **WebRTC**: 低延迟浏览器端推流 -- **RTMP**: 标准直播协议,支持推流到 B站/YouTube 等平台 -- **虚拟摄像头**: 输出为系统摄像头设备 +**Streaming Layer** +- **WebRTC**: Low-latency browser-based streaming +- **RTMP**: Standard live streaming protocol, supports pushing to platforms like Bilibili/YouTube +- **Virtual Camera**: Outputs as a system camera device -**插件系统** -- 基于 [registry.py](registry.py) 的去中心化注册机制,开发者可自行扩展 TTS、Avatar、Output 模块 +**Plugin System** +- Decentralized registration mechanism based on [registry.py](registry.py), allowing developers to extend TTS, Avatar, and Output modules --- -## 4. API 接口 +## 4. 
API Documentation -| 文档 | 说明 | -|------|------| -| [docs/api.md](docs/api.md) | 通用业务 API — WebRTC、文本/音频驱动、录制、动作编排 | -| [docs/avatar_api.md](docs/avatar_api.md) | Avatar 生成 API — 创建任务、查询进度、删除任务 | -| [docs/admin_api.md](docs/admin_api.md) | Admin 管理 API — 全局配置、会话监控、强制停止 | +| Document | Description | +|----------|-------------| +| [docs/api.md](docs/api.md) | General API — WebRTC, text/audio driver, recording, action choreography | +| [docs/avatar_api.md](docs/avatar_api.md) | Avatar Generation API — create tasks, query progress, delete tasks | +| [docs/admin_api.md](docs/admin_api.md) | Admin API — global config, session monitoring, force stop | --- -## 5. Docker 运行 +## 5. Docker -镜像说明: -- **AutoDL**: — [教程](https://doc.livetalking.ai/docs/autodl/) -- **UCloud**: — 支持开放任意端口,无需额外部署 SRS — [教程](https://doc.livetalking.ai/docs/ucloud/) +Available images: +- **AutoDL**: — [Tutorial](https://doc.livetalking.ai/en/docs/autodl/) +- **UCloud**: — Supports opening any port, no additional SRS deployment required — [Tutorial](https://doc.livetalking.ai/en/docs/ucloud/) -> AutoDL 由于不能开放 UDP 端口,需自行部署 SRS 或 TURN 转发服务。 +> AutoDL cannot open UDP ports, so you need to deploy SRS or TURN relay service separately. --- -## 6. 性能指标 +## 6. Performance -- 每路视频压缩消耗 CPU,分辨率越高 CPU 消耗越大;每路口型推理消耗 GPU -- 不说话时并发数取决于 CPU,同时说话并发数取决于 GPU -- 后端日志 `inferfps` = GPU 推理帧率, `finalfps` = 最终推流帧率,两者均需 >=25 才算实时 +- Each video stream compression consumes CPU; higher resolution means greater CPU usage. Each lip-sync inference consumes GPU +- Concurrent sessions when not speaking depend on CPU; concurrent speaking sessions depend on GPU +- In backend logs: `inferfps` = GPU inference frame rate, `finalfps` = final streaming frame rate. 
Both must be >= 25 for real-time performance -### 实时推理性能 +### Real-Time Inference Performance -| 模型 | 显卡 | FPS | -|:------|:------|:----| +| Model | GPU | FPS | +|:------|:----|:----| | wav2lip256 | RTX 3060 | 60 | | wav2lip256 | RTX 3080Ti | 120 | | musetalk | RTX 3080Ti | 42 | | musetalk | RTX 3090 | 45 | | musetalk | RTX 4090 | 72 | -- wav2lip256 推荐 RTX 3060 及以上 -- musetalk 推荐 RTX 3080Ti 及以上 +- wav2lip256: RTX 3060 or higher recommended +- musetalk: RTX 3080Ti or higher recommended --- -## 7. 声明 +## 7. Statement -基于本项目开发并发布在B站、视频号、抖音等平台上的视频需带上 LiveTalking 水印和标识。 +Videos developed based on this project and published on platforms such as Bilibili, WeChat Channels, and Douyin must include the LiveTalking watermark and logo. --- -如果本项目对你有帮助,帮忙点个 Star。也欢迎感兴趣的朋友一起来完善该项目。 +If this project is helpful to you, please give it a Star. Contributors interested in improving this project are also welcome. -| 社区 | 链接 | -|------|------| -| 知识星球 | | -| 微信 | wxwubug (加群请备注) | +| Community | Link | +|-----------|------| +| Knowledge Planet | | +| WeChat | wxwubug (mention for group invite) | | Telegram | | | Discord | | | Email | lipku@foxmail.com | -| 微信公众号 | 数字人技术 | +| WeChat Official | 数字人技术 | diff --git a/app.py b/app.py index ce496206..3a287cff 100644 --- a/app.py +++ b/app.py @@ -194,7 +194,7 @@ def main(): elif opt.transport=='rtcpush': pagename='rtcpushapi.html' logger.info('start http server; http://:'+str(opt.listenport)+'/'+pagename) - # logger.info('如果使用webrtc,推荐访问webrtc集成前端: http://:'+str(opt.listenport)+'/dashboard.html') + # logger.info('If using WebRTC, it is recommended to access the WebRTC integrated frontend: http://:'+str(opt.listenport)+'/dashboard.html') def run_server(runner): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) diff --git a/llm.py b/llm.py index 9be5e19f..4f5f2569 100644 --- a/llm.py +++ b/llm.py @@ -8,51 +8,57 @@ def llm_response(message,avatar_session:'BaseAvatar',datainfo:dict={}): try: opt = avatar_session.opt - start = 
time.perf_counter() - from openai import OpenAI - client = OpenAI( - # 如果您没有配置环境变量,请在此处用您的API Key进行替换 - api_key=os.getenv("DASHSCOPE_API_KEY"), - # 填写DashScope SDK的base_url - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - ) - end = time.perf_counter() - logger.info(f"llm Time init: {end-start}s,{message}") - completion = client.chat.completions.create( - model="qwen-plus", - messages=[{'role': 'system', 'content': '你是一个知识助手,尽量以简短、口语化的方式输出'}, - {'role': 'user', 'content': message}], - stream=True, - # 通过以下设置,在流式输出的最后一行展示token使用信息 - stream_options={"include_usage": True} - ) - result="" - first = True - for chunk in completion: - if len(chunk.choices)>0: - #print(chunk.choices[0].delta.content) - if first: - end = time.perf_counter() - logger.info(f"llm Time to first chunk: {end-start}s") - first = False - msg = chunk.choices[0].delta.content - if msg is None: - continue - lastpos=0 - #msglist = re.split('[,.!;:,。!?]',msg) - for i, char in enumerate(msg): - if char in ",.!;:,。!?:;" : - result = result+msg[lastpos:i+1] - lastpos = i+1 - if len(result)>10: - logger.info(result) - avatar_session.put_msg_txt(result,datainfo) - result="" - result = result+msg[lastpos:] - end = time.perf_counter() - logger.info(f"llm Time to last chunk: {end-start}s") - if result: - avatar_session.put_msg_txt(result,datainfo) + # Static response to avoid using paid third-party services + static_response = f"收到,这是本地静态测试回复。你发送的消息是:{message}" + logger.info(f"Static LLM response: {static_response}") + avatar_session.put_msg_txt(static_response, datainfo) + return + + # start = time.perf_counter() + # from openai import OpenAI + # client = OpenAI( + # # 如果您没有配置环境变量,请在此处用您的API Key进行替换 + # api_key=os.getenv("DASHSCOPE_API_KEY"), + # # 填写DashScope SDK的base_url + # base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + # ) + # end = time.perf_counter() + # logger.info(f"llm Time init: {end-start}s,{message}") + # completion = client.chat.completions.create( + # 
model="qwen-plus", + # messages=[{'role': 'system', 'content': '你是一个知识助手,尽量以简短、口语化的方式输出'}, + # {'role': 'user', 'content': message}], + # stream=True, + # # 通过以下设置,在流式输出的最后一行展示token使用信息 + # stream_options={"include_usage": True} + # ) + # result="" + # first = True + # for chunk in completion: + # if len(chunk.choices)>0: + # #print(chunk.choices[0].delta.content) + # if first: + # end = time.perf_counter() + # logger.info(f"llm Time to first chunk: {end-start}s") + # first = False + # msg = chunk.choices[0].delta.content + # if msg is None: + # continue + # lastpos=0 + # #msglist = re.split('[,.!;:,。!?]',msg) + # for i, char in enumerate(msg): + # if char in ",.!;:,。!?:;" : + # result = result+msg[lastpos:i+1] + # lastpos = i+1 + # if len(result)>10: + # logger.info(result) + # avatar_session.put_msg_txt(result,datainfo) + # result="" + # result = result+msg[lastpos:] + # end = time.perf_counter() + # logger.info(f"llm Time to last chunk: {end-start}s") + # if result: + # avatar_session.put_msg_txt(result,datainfo) except Exception as e: logger.exception('llm exceptiopn:') diff --git a/tts/doubao.py b/tts/doubao.py index 6c1b84e5..a5305878 100644 --- a/tts/doubao.py +++ b/tts/doubao.py @@ -49,54 +49,59 @@ def __init__(self, opt, parent): } async def doubao_voice(self, text, ref_file): # -> Iterator[bytes]: - start = time.perf_counter() - voice_type = ref_file #self.opt.REF_FILE + # Mock/static return to avoid using paid Doubao service + logger.info(f"Mock Doubao TTS voice synthesis for text: {text}") + yield b'\x00' * 51200 + return - try: - # 创建请求对象 - default_header = bytearray(b'\x11\x10\x11\x00') - submit_request_json = copy.deepcopy(self.request_json) - submit_request_json["user"]["uid"] = self.parent.sessionid - submit_request_json["audio"]["voice_type"] = voice_type - submit_request_json["request"]["text"] = text - submit_request_json["request"]["reqid"] = str(uuid.uuid4()) - submit_request_json["request"]["operation"] = "submit" - payload_bytes = 
str.encode(json.dumps(submit_request_json)) - payload_bytes = gzip.compress(payload_bytes) # if no compression, comment this line - full_client_request = bytearray(default_header) - full_client_request.extend((len(payload_bytes)).to_bytes(4, 'big')) # payload size(4 bytes) - full_client_request.extend(payload_bytes) # payload - - header = {"Authorization": f"Bearer; {self.token}"} - first = True - async with websockets.connect(self.api_url, extra_headers=header, ping_interval=None) as ws: - await ws.send(full_client_request) - while True: - res = await ws.recv() - header_size = res[0] & 0x0f - message_type = res[1] >> 4 - message_type_specific_flags = res[1] & 0x0f - payload = res[header_size*4:] - - if message_type == 0xb: # audio-only server response - if message_type_specific_flags == 0: # no sequence number as ACK - #print(" Payload size: 0") - continue - else: - if first: - end = time.perf_counter() - logger.info(f"doubao tts Time to first chunk: {end-start}s") - first = False - sequence_number = int.from_bytes(payload[:4], "big", signed=True) - payload_size = int.from_bytes(payload[4:8], "big", signed=False) - payload = payload[8:] - yield payload - if sequence_number < 0: - break - else: - break - except Exception as e: - logger.exception('doubao') + # start = time.perf_counter() + # voice_type = ref_file #self.opt.REF_FILE + # + # try: + # # 创建请求对象 + # default_header = bytearray(b'\x11\x10\x11\x00') + # submit_request_json = copy.deepcopy(self.request_json) + # submit_request_json["user"]["uid"] = self.parent.sessionid + # submit_request_json["audio"]["voice_type"] = voice_type + # submit_request_json["request"]["text"] = text + # submit_request_json["request"]["reqid"] = str(uuid.uuid4()) + # submit_request_json["request"]["operation"] = "submit" + # payload_bytes = str.encode(json.dumps(submit_request_json)) + # payload_bytes = gzip.compress(payload_bytes) # if no compression, comment this line + # full_client_request = bytearray(default_header) + # 
full_client_request.extend((len(payload_bytes)).to_bytes(4, 'big')) # payload size(4 bytes) + # full_client_request.extend(payload_bytes) # payload + # + # header = {"Authorization": f"Bearer; {self.token}"} + # first = True + # async with websockets.connect(self.api_url, extra_headers=header, ping_interval=None) as ws: + # await ws.send(full_client_request) + # while True: + # res = await ws.recv() + # header_size = res[0] & 0x0f + # message_type = res[1] >> 4 + # message_type_specific_flags = res[1] & 0x0f + # payload = res[header_size*4:] + # + # if message_type == 0xb: # audio-only server response + # if message_type_specific_flags == 0: # no sequence number as ACK + # #print(" Payload size: 0") + # continue + # else: + # if first: + # end = time.perf_counter() + # logger.info(f"doubao tts Time to first chunk: {end-start}s") + # first = False + # sequence_number = int.from_bytes(payload[:4], "big", signed=True) + # payload_size = int.from_bytes(payload[4:8], "big", signed=False) + # payload = payload[8:] + # yield payload + # if sequence_number < 0: + # break + # else: + # break + # except Exception as e: + # logger.exception('doubao') # # 检查响应状态码 # if response.status_code == 200: # # 处理响应数据 diff --git a/tts/qwentts.py b/tts/qwentts.py index 1b69edaa..25cb00f6 100644 --- a/tts/qwentts.py +++ b/tts/qwentts.py @@ -45,16 +45,6 @@ def __init__(self, opt, parent): self.voice = opt.REF_FILE if opt.REF_FILE else 'Cherry' # 模型名 self.model = getattr(opt, 'qwen_tts_model', 'qwen3-tts-flash-realtime') - # WebSocket URL - self.ws_url = getattr(opt, 'qwen_tts_url', - 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime') - - # 设置 DashScope API Key - api_key = getattr(opt, 'dashscope_api_key', None) or os.environ.get('DASHSCOPE_API_KEY') - if api_key: - dashscope.api_key = api_key - else: - logger.warning("QwenTTS: DASHSCOPE_API_KEY 未设置,请设置环境变量或通过参数传入") # ---------- 内部状态 ---------- self._remainder = np.array([], dtype=np.float32) # 上次重采样后不足一 chunk 的 16kHz 样本 @@ -63,94 +53,35 @@ 
def __init__(self, opt, parent): self._current_text = '' self._current_textevent = {} - # ---------- 回调类 ---------- - tts_ref = self - - class _Callback(QwenTtsRealtimeCallback): - def on_open(self) -> None: - logger.info("QwenTTS WebSocket 连接已建立") - - def on_close(self, close_status_code, close_msg) -> None: - logger.info(f"QwenTTS WebSocket 关闭: code={close_status_code}, msg={close_msg}") - tts_ref._response_event.set() - - def on_event(self, response: dict) -> None: - try: - event_type = response.get('type', '') - - if event_type == 'session.created': - logger.info(f"QwenTTS session: {response.get('session', {}).get('id', '')}") - - elif event_type == 'response.audio.delta': - audio_b64 = response.get('delta', '') - if audio_b64: - pcm_data = base64.b64decode(audio_b64) - tts_ref._on_audio_data(pcm_data) - - elif event_type == 'response.done': - logger.info("QwenTTS response done") - tts_ref._flush_remainder() - tts_ref._response_event.set() - - elif event_type == 'error': - logger.error(f"QwenTTS 错误: {response}") - tts_ref._response_event.set() - - except Exception as e: - logger.exception(f"QwenTTS 回调处理异常: {e}") - - # ---------- 建立唯一连接 ---------- - self._callback = _Callback() - self._tts_client = QwenTtsRealtime( - model=self.model, - callback=self._callback, - url=self.ws_url, - ) - self._tts_client.connect() - self._tts_client.update_session( - voice=self.voice, - response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, # Qwen TTS 只支持 24kHz 输出 - sample_rate=16000, - mode='commit', - ) - logger.info(f"QwenTTS 初始化完成: model={self.model}, voice={self.voice}") + logger.info("Mock QwenTTS initialized (no remote API connection established)") # ========================== 核心方法 ========================== def txt_to_audio(self, msg: tuple[str, dict]): - text, textevent = msg - t_start = time.perf_counter() - - ref_file = textevent.get('tts', {}).get('ref_file',self.opt.REF_FILE) - - # 重置状态 - self._remainder = np.array([], dtype=np.float32) - self._first_chunk = True - 
self._current_text = text - self._current_textevent = textevent - self._response_event.clear() - try: - #logger.info(f"QwenTTS 发送文本: {text[:80]}...") - if ref_file != self.voice: - logger.info(f'ref_file:{ref_file},self.voice:{self.voice}') - self.voice=ref_file - self._tts_client.close() - self._tts_client.connect() - self._tts_client.update_session( - voice=self.voice, - response_format=AudioFormat.PCM_24000HZ_MONO_16BIT, # Qwen TTS 只支持 24kHz 输出 - sample_rate=16000, - mode='commit', - ) - self._tts_client.append_text(text) - self._tts_client.commit() - - # 等待 response.done(音频在回调中流式处理) - self._response_event.wait(timeout=60) + text, textevent = msg + t_start = time.perf_counter() + + logger.info(f"Mock QwenTTS synthesis for text: {text}") + + # Output start frame + eventpoint_start = {'status': 'start', 'text': text} + eventpoint_start.update(**textevent) + self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint_start) + + # Output mock silence + for _ in range(10): + if self.state != State.RUNNING: + break + self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), textevent) + + # Output end frame + eventpoint_end = {'status': 'end', 'text': text} + eventpoint_end.update(**textevent) + self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint_end) t_end = time.perf_counter() - logger.info(f"QwenTTS 合成完成,耗时: {t_end - t_start:.2f}s") + logger.info(f"Mock QwenTTS synthesis completed, time: {t_end - t_start:.2f}s") except Exception as e: logger.exception(f"QwenTTS txt_to_audio 异常: {e}") diff --git a/tts/tencent.py b/tts/tencent.py index b283596f..4584cd1c 100644 --- a/tts/tencent.py +++ b/tts/tencent.py @@ -78,41 +78,46 @@ def txt_to_audio(self,msg:tuple[str, dict]): ) def tencent_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]: - start = time.perf_counter() - session_id = str(uuid.uuid1()) - params = self.__gen_params(session_id, text, reffile) - signature = self.__gen_signature(params) - headers = { - 
"Content-Type": "application/json", - "Authorization": str(signature) - } - url = _PROTOCOL + _HOST + _PATH - try: - res = requests.post(url, headers=headers, - data=json.dumps(params), stream=True) - - end = time.perf_counter() - logger.info(f"tencent Time to make POST: {end-start}s") - - first = True - - for chunk in res.iter_content(chunk_size=6400): # 640 16K*20ms*2 - #logger.info('chunk len:%d',len(chunk)) - if first: - try: - rsp = json.loads(chunk) - #response["Code"] = rsp["Response"]["Error"]["Code"] - #response["Message"] = rsp["Response"]["Error"]["Message"] - logger.error("tencent tts:%s",rsp["Response"]["Error"]["Message"]) - return - except: - end = time.perf_counter() - logger.info(f"tencent Time to first chunk: {end-start}s") - first = False - if chunk and self.state==State.RUNNING: - yield chunk - except Exception as e: - logger.exception('tencent') + # Mock/static return to avoid using paid Tencent service + logger.info(f"Mock Tencent TTS voice synthesis for text: {text}") + yield b'\x00' * 51200 + return + + # start = time.perf_counter() + # session_id = str(uuid.uuid1()) + # params = self.__gen_params(session_id, text, reffile) + # signature = self.__gen_signature(params) + # headers = { + # "Content-Type": "application/json", + # "Authorization": str(signature) + # } + # url = _PROTOCOL + _HOST + _PATH + # try: + # res = requests.post(url, headers=headers, + # data=json.dumps(params), stream=True) + # + # end = time.perf_counter() + # logger.info(f"tencent Time to make POST: {end-start}s") + # + # first = True + # + # for chunk in res.iter_content(chunk_size=6400): # 640 16K*20ms*2 + # #logger.info('chunk len:%d',len(chunk)) + # if first: + # try: + # rsp = json.loads(chunk) + # #response["Code"] = rsp["Response"]["Error"]["Code"] + # #response["Message"] = rsp["Response"]["Error"]["Message"] + # logger.error("tencent tts:%s",rsp["Response"]["Error"]["Message"]) + # return + # except: + # end = time.perf_counter() + # logger.info(f"tencent Time 
to first chunk: {end-start}s") + # first = False + # if chunk and self.state==State.RUNNING: + # yield chunk + # except Exception as e: + # logger.exception('tencent') def stream_tts(self,audio_stream,msg:tuple[str, dict]): text,textevent = msg diff --git a/web/admin.html b/web/admin.html index 534ae389..6ebd8406 100644 --- a/web/admin.html +++ b/web/admin.html @@ -1,10 +1,10 @@ - + - 后台管理系统 - LiveTalking + Admin Console - LiveTalking @@ -119,17 +119,9 @@ } @keyframes pulse { - 0% { - box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.4); - } - - 70% { - box-shadow: 0 0 0 6px rgba(239, 68, 68, 0); - } - - 100% { - box-shadow: 0 0 0 0 rgba(239, 68, 68, 0); - } + 0% { box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.4); } + 70% { box-shadow: 0 0 0 6px rgba(239, 68, 68, 0); } + 100% { box-shadow: 0 0 0 0 rgba(239, 68, 68, 0); } } .session-detail-item { @@ -159,7 +151,7 @@ } .skeleton { - background: linear-gradient(90deg, #f0f0f0 25%, #e0e0e0 50%, #f0f0f0 75%); + background: linear-gradient(90deg, #f0f0f0 25%, #e0eafc 50%, #f0f0f0 75%); background-size: 200% 100%; animation: loading 1.5s infinite; border-radius: 4px; @@ -167,13 +159,8 @@ } @keyframes loading { - 0% { - background-position: 200% 0; - } - - 100% { - background-position: -200% 0; - } + 0% { background-position: 200% 0; } + 100% { background-position: -200% 0; } } .empty-state { @@ -194,9 +181,9 @@

- LiveTalking 控制台 + LiveTalking Console

@@ -205,22 +192,18 @@

- 全局配置 + Global Config

- - + + - - + +
加载中... -
-
Loading...
加载中... -
-
Loading...
@@ -232,7 +215,7 @@

- 活跃会话 + Active Sessions 0

@@ -253,15 +236,15 @@

diff --git a/web/asr/index.html b/web/asr/index.html index 902518c8..c68e50d2 100644 --- a/web/asr/index.html +++ b/web/asr/index.html @@ -1,89 +1,89 @@ - - - - - - 语音识别 - - - - - - - - -
- -
- asr服务器地址(必填): -
- -
- -
-
-
- 选择录音模式:
- -    - - -
- -
-
- 选择asr模型模式:
- -    -    - - -
- - -
-
- 逆文本标准化(ITN):
-    - -
-
-
- 热词设置(一行一个关键字,空格隔开权重,如"阿里巴巴 20"): -
- - - -
- -
- 语音识别结果显示: -
- - -
-
请点击开始
-
- - - - -
- - -
-
- - - - - - - - - + + + + + + Speech Recognition (ASR) + + + + + + + + +
+ +
+ ASR Server Address (Required): +
+ +
+ +
+
+
+ Select Recording Mode:
+ +    + + +
+ +
+
+ Select ASR Model Mode:
+ +    +    + + +
+ + +
+
+ Inverse Text Normalization (ITN):
+    + +
+
+
+ Hotword Settings (one keyword per line, space separated weight, e.g., "Alibaba 20"): +
+ + + +
+ +
+ Speech Recognition Results: +
+ + +
+
Please click Start
+
+ + + + +
+ + +
+
+ + + + + + + + + diff --git a/web/asr/main.js b/web/asr/main.js index 3f312079..2e0ef180 100644 --- a/web/asr/main.js +++ b/web/asr/main.js @@ -1,625 +1,622 @@ -/** - * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights - * Reserved. MIT License (https://opensource.org/licenses/MIT) - */ -/* 2022-2023 by zhaoming,mali aihealthx.com */ - - -// 连接; 定义socket连接类对象与语音对象 -var wsconnecter = new WebSocketConnectMethod({msgHandle:getJsonMessage,stateHandle:getConnState}); -var audioBlob; - -// 录音; 定义录音对象,wav格式 -var rec = Recorder({ - type:"pcm", - bitRate:16, - sampleRate:16000, - onProcess:recProcess -}); - - - - -var sampleBuf=new Int16Array(); -// 定义按钮响应事件 -var btnStart = document.getElementById('btnStart'); -btnStart.onclick = record; -var btnStop = document.getElementById('btnStop'); -btnStop.onclick = stop; -btnStop.disabled = true; -btnStart.disabled = true; - -btnConnect= document.getElementById('btnConnect'); -btnConnect.onclick = start; - -var awsslink= document.getElementById('wsslink'); - - -var rec_text=""; // for online rec asr result -var offline_text=""; // for offline rec asr result -var info_div = document.getElementById('info_div'); - -var upfile = document.getElementById('upfile'); - - - -var isfilemode=false; // if it is in file mode -var file_ext=""; -var file_sample_rate=16000; //for wav file sample rate -var file_data_array; // array to save file data - -var totalsend=0; - - -// var now_ipaddress=window.location.href; -// now_ipaddress=now_ipaddress.replace("https://","wss://"); -// now_ipaddress=now_ipaddress.replace("static/index.html",""); -// var localport=window.location.port; -// now_ipaddress=now_ipaddress.replace(localport,"10095"); -// document.getElementById('wssip').value=now_ipaddress; -addresschange(); -function addresschange() -{ - - var Uri = document.getElementById('wssip').value; - document.getElementById('info_wslink').innerHTML="点此处手工授权(IOS手机)"; - Uri=Uri.replace(/wss/g,"https"); - 
console.log("addresschange uri=",Uri); - - awsslink.onclick=function(){ - window.open(Uri, '_blank'); - } - -} - -upfile.onclick=function() -{ - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=false; - -} - -// from https://github.com/xiangyuecn/Recorder/tree/master -var readWavInfo=function(bytes){ - //读取wav文件头,统一成44字节的头 - if(bytes.byteLength<44){ - return null; - }; - var wavView=bytes; - var eq=function(p,s){ - for(var i=0;i=chunk_size){ - - sendBuf=sampleBuf.slice(0,chunk_size); - totalsend=totalsend+sampleBuf.length; - sampleBuf=sampleBuf.slice(chunk_size,sampleBuf.length); - wsconnecter.wsSend(sendBuf); - - - } - - stop(); - - - -} - - -function on_recoder_mode_change() -{ - var item = null; - var obj = document.getElementsByName("recoder_mode"); - for (var i = 0; i < obj.length; i++) { //遍历Radio - if (obj[i].checked) { - item = obj[i].value; - break; - } - - - } - if(item=="mic") - { - document.getElementById("mic_mode_div").style.display = 'block'; - document.getElementById("rec_mode_div").style.display = 'none'; - - - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=false; - isfilemode=false; - } - else - { - document.getElementById("mic_mode_div").style.display = 'none'; - document.getElementById("rec_mode_div").style.display = 'block'; - - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=true; - isfilemode=true; - info_div.innerHTML='请点击选择文件'; - - - } -} - - -function getHotwords(){ - - var obj = document.getElementById("varHot"); - - if(typeof(obj) == 'undefined' || obj==null || obj.value.length<=0){ - return null; - } - let val = obj.value.toString(); - - console.log("hotwords="+val); - let items = val.split(/[(\r\n)\r\n]+/); //split by \r\n - var jsonresult = {}; - const regexNum = /^[0-9]*$/; // test number - for (item of items) { - - let result = item.split(" "); - if(result.length>=2 && regexNum.test(result[result.length-1])) - { - var wordstr=""; - for(var i=0;i new 
Promise((resolve) => setTimeout(resolve, delay)) -async function is_speaking() { - const response = await fetch('/is_speaking', { - body: JSON.stringify({ - sessionid: String(parent.document.getElementById('sessionid').value), - }), - headers: { - 'Content-Type': 'application/json' - }, - method: 'POST' - }); - const data = await response.json(); - console.log('is_speaking res:',data) - return data.data -} - -async function waitSpeakingEnd() { - rec.stop() //关闭录音 - for(let i=0;i<10;i++) { //等待数字人开始讲话,最长等待10s - bspeak = await is_speaking() - if(bspeak) { - break - } - await sleep(1000) - } - - while(true) { //等待数字人讲话结束 - bspeak = await is_speaking() - if(!bspeak) { - break - } - await sleep(1000) - } - await sleep(2000) - rec.start() -} -// 语音识别结果; 对jsonMsg数据解析,将识别结果附加到编辑框中 -function getJsonMessage( jsonMsg ) { - //console.log(jsonMsg); - console.log( "message: " + JSON.parse(jsonMsg.data)['text'] ); - var rectxt=""+JSON.parse(jsonMsg.data)['text']; - var asrmodel=JSON.parse(jsonMsg.data)['mode']; - var is_final=JSON.parse(jsonMsg.data)['is_final']; - var timestamp=JSON.parse(jsonMsg.data)['timestamp']; - if(asrmodel=="2pass-offline" || asrmodel=="offline") - { - offline_text=offline_text+rectxt.replace(/ +/g,"")+'\n'; //handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,""); - rec_text=offline_text; - fetch('/human', { - body: JSON.stringify({ - text: rectxt.replace(/ +/g,""), - type: 'chat', - sessionid: String(parent.document.getElementById('sessionid').value), - }), - headers: { - 'Content-Type': 'application/json' - }, - method: 'POST' - }); - - waitSpeakingEnd(); - } - else - { - rec_text=rec_text+rectxt; //.replace(/ +/g,""); - } - var varArea=document.getElementById('varArea'); - - varArea.value=rec_text; - console.log( "offline_text: " + asrmodel+","+offline_text); - console.log( "rec_text: " + rec_text); - if (isfilemode==true && is_final==true){ - console.log("call stop ws!"); - play_file(); - wsconnecter.wsStop(); - - 
info_div.innerHTML="请点击连接"; - - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=false; - } - - - -} - -// 连接状态响应 -function getConnState( connState ) { - if ( connState === 0 ) { //on open - - - info_div.innerHTML='连接成功!请点击开始'; - if (isfilemode==true){ - info_div.innerHTML='请耐心等待,大文件等待时间更长'; - start_file_send(); - } - else - { - btnStart.disabled = false; - btnStop.disabled = true; - btnConnect.disabled=true; - } - } else if ( connState === 1 ) { - //stop(); - } else if ( connState === 2 ) { - stop(); - console.log( 'connecttion error' ); - - alert("连接地址"+document.getElementById('wssip').value+"失败,请检查asr地址和端口。或试试界面上手动授权,再连接。"); - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=false; - - - info_div.innerHTML='请点击连接'; - } -} - -function record() -{ - - rec.open( function(){ - rec.start(); - console.log("开始"); - btnStart.disabled = true; - btnStop.disabled = false; - btnConnect.disabled=true; - }); - -} - - - -// 识别启动、停止、清空操作 -function start() { - - // 清除显示 - clear(); - //控件状态更新 - console.log("isfilemode"+isfilemode); - - //启动连接 - var ret=wsconnecter.wsStart(); - // 1 is ok, 0 is error - if(ret==1){ - info_div.innerHTML="正在连接asr服务器,请等待..."; - isRec = true; - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=true; - - return 1; - } - else - { - info_div.innerHTML="请点击开始"; - btnStart.disabled = true; - btnStop.disabled = true; - btnConnect.disabled=false; - - return 0; - } -} - - -function stop() { - var chunk_size = new Array( 5, 10, 5 ); - var request = { - "chunk_size": chunk_size, - "wav_name": "h5", - "is_speaking": false, - "chunk_interval":10, - "mode":getAsrMode(), - }; - console.log(request); - if(sampleBuf.length>0){ - wsconnecter.wsSend(sampleBuf); - console.log("sampleBuf.length"+sampleBuf.length); - sampleBuf=new Int16Array(); - } - wsconnecter.wsSend( JSON.stringify(request) ); - - - - - - - // 控件状态更新 - - isRec = false; - info_div.innerHTML="发送完数据,请等候,正在识别..."; - - 
if(isfilemode==false){ - btnStop.disabled = true; - btnStart.disabled = true; - btnConnect.disabled=true; - //wait 3s for asr result - setTimeout(function(){ - console.log("call stop ws!"); - wsconnecter.wsStop(); - btnConnect.disabled=false; - info_div.innerHTML="请点击连接";}, 3000 ); - - - - rec.stop(function(blob,duration){ - - console.log(blob); - var audioBlob = Recorder.pcm2wav(data = {sampleRate:16000, bitRate:16, blob:blob}, - function(theblob,duration){ - console.log(theblob); - var audio_record = document.getElementById('audio_record'); - audio_record.src = (window.URL||webkitURL).createObjectURL(theblob); - audio_record.controls=true; - //audio_record.play(); - - - } ,function(msg){ - console.log(msg); - } - ); - - - - },function(errMsg){ - console.log("errMsg: " + errMsg); - }); - } - // 停止连接 - - - -} - -function clear() { - - var varArea=document.getElementById('varArea'); - - varArea.value=""; - rec_text=""; - offline_text=""; - -} - - -function recProcess( buffer, powerLevel, bufferDuration, bufferSampleRate,newBufferIdx,asyncEnd ) { - if ( isRec === true ) { - var data_48k = buffer[buffer.length-1]; - - var array_48k = new Array(data_48k); - var data_16k=Recorder.SampleData(array_48k,bufferSampleRate,16000).data; - - sampleBuf = Int16Array.from([...sampleBuf, ...data_16k]); - var chunk_size=960; // for asr chunk_size [5, 10, 5] - info_div.innerHTML=""+bufferDuration/1000+"s"; - while(sampleBuf.length>=chunk_size){ - sendBuf=sampleBuf.slice(0,chunk_size); - sampleBuf=sampleBuf.slice(chunk_size,sampleBuf.length); - wsconnecter.wsSend(sendBuf); - - - - } - - - - } -} - -function getUseITN() { - var obj = document.getElementsByName("use_itn"); - for (var i = 0; i < obj.length; i++) { - if (obj[i].checked) { - return obj[i].value === "true"; - } - } - return false; -} +/** + * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights + * Reserved. 
MIT License (https://opensource.org/licenses/MIT) + */ +/* 2022-2023 by zhaoming,mali aihealthx.com */ + + +// Connection; Define socket connection and audio objects +var wsconnecter = new WebSocketConnectMethod({msgHandle:getJsonMessage,stateHandle:getConnState}); +var audioBlob; + +// Recording; Define recording object, wav format +var rec = Recorder({ + type:"pcm", + bitRate:16, + sampleRate:16000, + onProcess:recProcess +}); + + + + +var sampleBuf=new Int16Array(); +// Define button click handlers +var btnStart = document.getElementById('btnStart'); +btnStart.onclick = record; +var btnStop = document.getElementById('btnStop'); +btnStop.onclick = stop; +btnStop.disabled = true; +btnStart.disabled = true; + +btnConnect= document.getElementById('btnConnect'); +btnConnect.onclick = start; + +var awsslink= document.getElementById('wsslink'); + + +var rec_text=""; // for online rec asr result +var offline_text=""; // for offline rec asr result +var info_div = document.getElementById('info_div'); + +var upfile = document.getElementById('upfile'); + + + +var isfilemode=false; // if it is in file mode +var file_ext=""; +var file_sample_rate=16000; //for wav file sample rate +var file_data_array; // array to save file data + +var totalsend=0; + + +// var now_ipaddress=window.location.href; +// now_ipaddress=now_ipaddress.replace("https://","wss://"); +// now_ipaddress=now_ipaddress.replace("static/index.html",""); +// var localport=window.location.port; +// now_ipaddress=now_ipaddress.replace(localport,"10095"); +// document.getElementById('wssip').value=now_ipaddress; +addresschange(); +function addresschange() +{ + + var Uri = document.getElementById('wssip').value; + document.getElementById('info_wslink').innerHTML="Click here to manually authorize (iOS)"; + Uri=Uri.replace(/wss/g,"https"); + console.log("addresschange uri=",Uri); + + awsslink.onclick=function(){ + window.open(Uri, '_blank'); + } + +} + +upfile.onclick=function() +{ + btnStart.disabled = true; + 
btnStop.disabled = true; + btnConnect.disabled=false; + +} + +// from https://github.com/xiangyuecn/Recorder/tree/master +var readWavInfo=function(bytes){ + // Read wav header, uniform to 44-byte header + if(bytes.byteLength<44){ + return null; + }; + var wavView=bytes; + var eq=function(p,s){ + for(var i=0;i=chunk_size){ + + sendBuf=sampleBuf.slice(0,chunk_size); + totalsend=totalsend+sampleBuf.length; + sampleBuf=sampleBuf.slice(chunk_size,sampleBuf.length); + wsconnecter.wsSend(sendBuf); + + + } + + stop(); + + + +} + + +function on_recoder_mode_change() +{ + var item = null; + var obj = document.getElementsByName("recoder_mode"); + for (var i = 0; i < obj.length; i++) { // Iterate over radios + if (obj[i].checked) { + item = obj[i].value; + break; + } + + + } + if(item=="mic") + { + document.getElementById("mic_mode_div").style.display = 'block'; + document.getElementById("rec_mode_div").style.display = 'none'; + + + btnStart.disabled = true; + btnStop.disabled = true; + btnConnect.disabled=false; + isfilemode=false; + } + else + { + document.getElementById("mic_mode_div").style.display = 'none'; + document.getElementById("rec_mode_div").style.display = 'block'; + + btnStart.disabled = true; + btnStop.disabled = true; + btnConnect.disabled=true; + isfilemode=true; + info_div.innerHTML='Please click to select a file'; + + + } +} + + +function getHotwords(){ + + var obj = document.getElementById("varHot"); + + if(typeof(obj) == 'undefined' || obj==null || obj.value.length<=0){ + return null; + } + let val = obj.value.toString(); + + console.log("hotwords="+val); + let items = val.split(/[(\r\n)\r\n]+/); //split by \r\n + var jsonresult = {}; + const regexNum = /^[0-9]*$/; // test number + for (item of items) { + + let result = item.split(" "); + if(result.length>=2 && regexNum.test(result[result.length-1])) + { + var wordstr=""; + for(var i=0;i new Promise((resolve) => setTimeout(resolve, delay)) +async function is_speaking() { + const response = await 
fetch('/is_speaking', { + body: JSON.stringify({ + sessionid: String(parent.document.getElementById('sessionid').value), + }), + headers: { + 'Content-Type': 'application/json' + }, + method: 'POST' + }); + const data = await response.json(); + console.log('is_speaking res:',data) + return data.data +} + +async function waitSpeakingEnd() { + rec.stop() // Stop recording + for(let i=0;i<10;i++) { // Wait for avatar to start speaking, max 10s + bspeak = await is_speaking() + if(bspeak) { + break + } + await sleep(1000) + } + + while(true) { // Wait for avatar to finish speaking + bspeak = await is_speaking() + if(!bspeak) { + break + } + await sleep(1000) + } + await sleep(2000) + rec.start() +} +// ASR result; Parse jsonMsg data and append to text area +function getJsonMessage( jsonMsg ) { + //console.log(jsonMsg); + console.log( "message: " + JSON.parse(jsonMsg.data)['text'] ); + var rectxt=""+JSON.parse(jsonMsg.data)['text']; + var asrmodel=JSON.parse(jsonMsg.data)['mode']; + var is_final=JSON.parse(jsonMsg.data)['is_final']; + var timestamp=JSON.parse(jsonMsg.data)['timestamp']; + if(asrmodel=="2pass-offline" || asrmodel=="offline") + { + offline_text=offline_text+rectxt.replace(/ +/g,"")+'\n'; //handleWithTimestamp(rectxt,timestamp); //rectxt; //.replace(/ +/g,""); + rec_text=offline_text; + fetch('/human', { + body: JSON.stringify({ + text: rectxt.replace(/ +/g,""), + type: 'chat', + sessionid: String(parent.document.getElementById('sessionid').value), + }), + headers: { + 'Content-Type': 'application/json' + }, + method: 'POST' + }); + + waitSpeakingEnd(); + } + else + { + rec_text=rec_text+rectxt; //.replace(/ +/g,""); + } + var varArea=document.getElementById('varArea'); + + varArea.value=rec_text; + console.log( "offline_text: " + asrmodel+","+offline_text); + console.log( "rec_text: " + rec_text); + if (isfilemode==true && is_final==true){ + console.log("call stop ws!"); + play_file(); + wsconnecter.wsStop(); + + info_div.innerHTML="Please click Connect"; 
+ + btnStart.disabled = true; + btnStop.disabled = true; + btnConnect.disabled=false; + } + + + +} + +// Connection status handler +function getConnState( connState ) { + if ( connState === 0 ) { //on open + + + info_div.innerHTML='Connected successfully! Please click Start'; + if (isfilemode==true){ + info_div.innerHTML='Please wait patiently, larger files take longer'; + start_file_send(); + } + else + { + btnStart.disabled = false; + btnStop.disabled = true; + btnConnect.disabled=true; + } + } else if ( connState === 1 ) { + //stop(); + } else if ( connState === 2 ) { + stop(); + console.log( 'connecttion error' ); + + alert("Connection to " + document.getElementById('wssip').value + " failed. Please check the ASR address and port, or try manually authorizing first."); + btnStart.disabled = true; + btnStop.disabled = true; + btnConnect.disabled=false; + + + info_div.innerHTML='Please click Connect'; + } +} + +function record() +{ + + rec.open( function(){ + rec.start(); + console.log("Start"); + btnStart.disabled = true; + btnStop.disabled = false; + btnConnect.disabled=true; + }); + +} + + + +// Recognition start, stop, clear operations +function start() { + + // Clear display + clear(); + // Control state update + console.log("isfilemode"+isfilemode); + + // Start connection + var ret=wsconnecter.wsStart(); + // 1 is ok, 0 is error + if(ret==1){ + info_div.innerHTML="Connecting to ASR server, please wait..."; + isRec = true; + btnStart.disabled = true; + btnStop.disabled = true; + btnConnect.disabled=true; + + return 1; + } + else + { + info_div.innerHTML="Please click Start"; + btnStart.disabled = true; + btnStop.disabled = true; + btnConnect.disabled=false; + + return 0; + } +} + + +function stop() { + var chunk_size = new Array( 5, 10, 5 ); + var request = { + "chunk_size": chunk_size, + "wav_name": "h5", + "is_speaking": false, + "chunk_interval":10, + "mode":getAsrMode(), + }; + console.log(request); + if(sampleBuf.length>0){ + 
wsconnecter.wsSend(sampleBuf); + console.log("sampleBuf.length"+sampleBuf.length); + sampleBuf=new Int16Array(); + } + wsconnecter.wsSend( JSON.stringify(request) ); + + + + + + + // Control state update + + isRec = false; + info_div.innerHTML="Data sent, please wait, recognizing..."; + + if(isfilemode==false){ + btnStop.disabled = true; + btnStart.disabled = true; + btnConnect.disabled=true; + //wait 3s for asr result + setTimeout(function(){ + console.log("call stop ws!"); + wsconnecter.wsStop(); + btnConnect.disabled=false; + info_div.innerHTML="Please click Connect";}, 3000 ); + + + + rec.stop(function(blob,duration){ + + console.log(blob); + var audioBlob = Recorder.pcm2wav(data = {sampleRate:16000, bitRate:16, blob:blob}, + function(theblob,duration){ + console.log(theblob); + var audio_record = document.getElementById('audio_record'); + audio_record.src = (window.URL||webkitURL).createObjectURL(theblob); + audio_record.controls=true; + //audio_record.play(); + + + } ,function(msg){ + console.log(msg); + } + ); + + + + + },function(errMsg){ + console.log("errMsg: " + errMsg); + }); + } + // Stop connection + + + +} + +function clear() { + + var varArea=document.getElementById('varArea'); + + varArea.value=""; + rec_text=""; + offline_text=""; + +} + + +function recProcess( buffer, powerLevel, bufferDuration, bufferSampleRate,newBufferIdx,asyncEnd ) { + if ( isRec === true ) { + var data_48k = buffer[buffer.length-1]; + + var array_48k = new Array(data_48k); + var data_16k=Recorder.SampleData(array_48k,bufferSampleRate,16000).data; + + sampleBuf = Int16Array.from([...sampleBuf, ...data_16k]); + var chunk_size=960; // for asr chunk_size [5, 10, 5] + info_div.innerHTML=""+bufferDuration/1000+"s"; + while(sampleBuf.length>=chunk_size){ + sendBuf=sampleBuf.slice(0,chunk_size); + sampleBuf=sampleBuf.slice(chunk_size,sampleBuf.length); + wsconnecter.wsSend(sendBuf); + + + + } + + + + } +} + +function getUseITN() { + var obj = document.getElementsByName("use_itn"); + 
for (var i = 0; i < obj.length; i++) { + if (obj[i].checked) { + return obj[i].value === "true"; + } + } + return false; +} diff --git a/web/asr/wsconnecter.js b/web/asr/wsconnecter.js index db140efc..7e2ee995 100644 --- a/web/asr/wsconnecter.js +++ b/web/asr/wsconnecter.js @@ -1,119 +1,119 @@ -/** - * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights - * Reserved. MIT License (https://opensource.org/licenses/MIT) - */ -/* 2021-2023 by zhaoming,mali aihealthx.com */ - -function WebSocketConnectMethod( config ) { //定义socket连接方法类 - - - var speechSokt; - var connKeeperID; - - var msgHandle = config.msgHandle; - var stateHandle = config.stateHandle; - - this.wsStart = function () { - var Uri = document.getElementById('wssip').value; //"wss://111.205.137.58:5821/wss/" //设置wss asr online接口地址 如 wss://X.X.X.X:port/wss/ - if(Uri.match(/wss:\S*|ws:\S*/)) - { - console.log("Uri"+Uri); - } - else - { - alert("请检查wss地址正确性"); - return 0; - } - - if ( 'WebSocket' in window ) { - speechSokt = new WebSocket( Uri ); // 定义socket连接对象 - speechSokt.onopen = function(e){onOpen(e);}; // 定义响应函数 - speechSokt.onclose = function(e){ - console.log("onclose ws!"); - //speechSokt.close(); - onClose(e); - }; - speechSokt.onmessage = function(e){onMessage(e);}; - speechSokt.onerror = function(e){onError(e);}; - return 1; - } - else { - alert('当前浏览器不支持 WebSocket'); - return 0; - } - }; - - // 定义停止与发送函数 - this.wsStop = function () { - if(speechSokt != undefined) { - console.log("stop ws!"); - speechSokt.close(); - } - }; - - this.wsSend = function ( oneData ) { - - if(speechSokt == undefined) return; - if ( speechSokt.readyState === 1 ) { // 0:CONNECTING, 1:OPEN, 2:CLOSING, 3:CLOSED - - speechSokt.send( oneData ); - - - } - }; - - // SOCEKT连接中的消息与状态响应 - function onOpen( e ) { - // 发送json - var chunk_size = new Array( 5, 10, 5 ); - var request = { - "chunk_size": chunk_size, - "wav_name": "h5", - "is_speaking": true, - "chunk_interval":10, - "itn":getUseITN(), - 
"mode":getAsrMode(), - - }; - if(isfilemode) - { - request.wav_format=file_ext; - if(file_ext=="wav") - { - request.wav_format="PCM"; - request.audio_fs=file_sample_rate; - } - } - - var hotwords=getHotwords(); - - if(hotwords!=null ) - { - request.hotwords=hotwords; - } - console.log(JSON.stringify(request)); - speechSokt.send(JSON.stringify(request)); - console.log("连接成功"); - stateHandle(0); - - } - - function onClose( e ) { - stateHandle(1); - } - - function onMessage( e ) { - - msgHandle( e ); - } - - function onError( e ) { - - info_div.innerHTML="连接"+e; - console.log(e); - stateHandle(2); - - } - - +/** + * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights + * Reserved. MIT License (https://opensource.org/licenses/MIT) + */ +/* 2021-2023 by zhaoming,mali aihealthx.com */ + +function WebSocketConnectMethod( config ) { // Define socket connection method class + + + var speechSokt; + var connKeeperID; + + var msgHandle = config.msgHandle; + var stateHandle = config.stateHandle; + + this.wsStart = function () { + var Uri = document.getElementById('wssip').value; //"wss://111.205.137.58:5821/wss/" // Set wss asr online interface address e.g. 
wss://X.X.X.X:port/wss/ + if(Uri.match(/wss:\S*|ws:\S*/)) + { + console.log("Uri"+Uri); + } + else + { + alert("Please check that the WSS address is correct"); + return 0; + } + + if ( 'WebSocket' in window ) { + speechSokt = new WebSocket( Uri ); // Define socket connection object + speechSokt.onopen = function(e){onOpen(e);}; // Define handler functions + speechSokt.onclose = function(e){ + console.log("onclose ws!"); + //speechSokt.close(); + onClose(e); + }; + speechSokt.onmessage = function(e){onMessage(e);}; + speechSokt.onerror = function(e){onError(e);}; + return 1; + } + else { + alert('WebSocket is not supported by this browser'); + return 0; + } + }; + + // Define stop and send functions + this.wsStop = function () { + if(speechSokt != undefined) { + console.log("stop ws!"); + speechSokt.close(); + } + }; + + this.wsSend = function ( oneData ) { + + if(speechSokt == undefined) return; + if ( speechSokt.readyState === 1 ) { // 0:CONNECTING, 1:OPEN, 2:CLOSING, 3:CLOSED + + speechSokt.send( oneData ); + + + } + }; + + // Message and status handlers for socket connection + function onOpen( e ) { + // Send JSON + var chunk_size = new Array( 5, 10, 5 ); + var request = { + "chunk_size": chunk_size, + "wav_name": "h5", + "is_speaking": true, + "chunk_interval":10, + "itn":getUseITN(), + "mode":getAsrMode(), + + }; + if(isfilemode) + { + request.wav_format=file_ext; + if(file_ext=="wav") + { + request.wav_format="PCM"; + request.audio_fs=file_sample_rate; + } + } + + var hotwords=getHotwords(); + + if(hotwords!=null ) + { + request.hotwords=hotwords; + } + console.log(JSON.stringify(request)); + speechSokt.send(JSON.stringify(request)); + console.log("Connection successful"); + stateHandle(0); + + } + + function onClose( e ) { + stateHandle(1); + } + + function onMessage( e ) { + + msgHandle( e ); + } + + function onError( e ) { + + info_div.innerHTML="Connection " + e; + console.log(e); + stateHandle(2); + + } + + } \ No newline at end of file diff --git 
a/web/avatar.html b/web/avatar.html index 0932bb6d..4ea3be3b 100644 --- a/web/avatar.html +++ b/web/avatar.html @@ -1,9 +1,9 @@ - + - Avatar 生成 - LiveTalking + Avatar Creator - LiveTalking