Improve settings (#1174)

* more settings for local whisper * clean code * may open echogarden packages dir * upgrade deps * handle recognize unhandled rejection * fix * add tts settings * update ui * may open library path
ZuodaoTech · Nov 11, 2024 · 521ee76 · 521ee76
1 parent 6c8da30
commit 521ee76
Show file tree

Hide file tree

Showing 31 changed files with 846 additions and 321 deletions.
diff --git a/1000h-portal/package.json b/1000h-portal/package.json
@@ -18,7 +18,7 @@
   },
   "devDependencies": {
     "autoprefixer": "^10.4.20",
-    "postcss": "^8.4.47",
+    "postcss": "^8.4.48",
     "sass": "^1.80.6",
     "tailwindcss": "^3.4.14"
   }

diff --git a/enjoy/package.json b/enjoy/package.json
@@ -85,7 +85,7 @@
     "typescript": "^5.6.3",
     "vite": "^5.4.10",
     "vite-plugin-static-copy": "^2.1.0",
-    "zx": "^8.2.0"
+    "zx": "^8.2.1"
   },
   "dependencies": {
     "@andrkrn/ffprobe-static": "^5.2.0",
@@ -141,7 +141,7 @@
     "dayjs": "^1.11.13",
     "decamelize": "^6.0.0",
     "decamelize-keys": "^2.0.1",
-    "echogarden": "^1.8.7",
+    "echogarden": "^2.0.0",
     "electron-context-menu": "^4.0.4",
     "electron-log": "^5.2.2",
     "electron-settings": "^4.0.4",
@@ -159,23 +159,23 @@
     "langchain": "^0.3.5",
     "lodash": "^4.17.21",
     "lru-cache": "^11.0.2",
-    "lucide-react": "^0.455.0",
+    "lucide-react": "^0.456.0",
     "mark.js": "^8.11.1",
     "microsoft-cognitiveservices-speech-sdk": "^1.41.0",
     "mime-types": "^2.1.35",
     "mustache": "^4.2.0",
     "next-themes": "^0.4.3",
     "openai": "^4.71.1",
     "pitchfinder": "^2.3.2",
-    "postcss": "^8.4.47",
+    "postcss": "^8.4.48",
     "proxy-agent": "^6.4.0",
     "react": "^18.3.1",
     "react-activity-calendar": "^2.7.1",
     "react-audio-visualize": "^1.2.0",
     "react-audio-voice-recorder": "^2.2.0",
     "react-dom": "^18.3.1",
     "react-frame-component": "^5.2.7",
-    "react-hook-form": "^7.53.1",
+    "react-hook-form": "^7.53.2",
     "react-hotkeys-hook": "^4.6.1",
     "react-i18next": "^15.1.1",
     "react-markdown": "^9.0.1",

diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
@@ -907,5 +907,17 @@
   "failedToLoadLink": "Failed to load link",
   "refreshSpeech": "Refresh speech",
   "locateParagraph": "Locate paragraph",
-  "close": "Close"
+  "close": "Close",
+  "config": "Config",
+  "temperature": "Temperature",
+  "encoderProvider": "Encoder Provider",
+  "decoderProvider": "Decoder Provider",
+  "enableGPU": "Enable GPU",
+  "openPackagesDir": "Open models dir",
+  "whisperModelDescription": "Model will be downloaded when first used.",
+  "whisperEngineDescription": "OpenAI Whisper with inference done via the ONNX runtime.",
+  "whisperCppEngineDescription": "C++ port of the Whisper architecture.",
+  "ttsService": "Text to Speech Service",
+  "openaiTtsServiceDescription": "Use OpenAI TTS service from your own key.",
+  "enjoyTtsServiceDescription": "Use TTS service provided by Enjoy. OpenAI or Azure is supported."
 }
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
@@ -907,5 +907,17 @@
   "failedToLoadLink": "加载链接失败",
   "refreshSpeech": "刷新语音",
   "locateParagraph": "定位段落",
-  "close": "关闭"
+  "close": "关闭",
+  "config": "配置",
+  "temperature": "温度",
+  "encoderProvider": "编码器",
+  "decoderProvider": "解码器",
+  "enableGPU": "启用 GPU",
+  "openPackagesDir": "打开模型目录",
+  "whisperModelDescription": "模型首次使用时会下载。",
+  "whisperEngineDescription": "OpenAI Whisper 使用 ONNX 运行时进行推理。",
+  "whisperCppEngineDescription": "Whisper 的 C++ 实现。",
+  "ttsService": "文字转语音服务",
+  "openaiTtsServiceDescription": "使用您自己的 API key 来使用 OpenAI TTS 服务。",
+  "enjoyTtsServiceDescription": "使用 Enjoy 提供的 TTS 服务，支持 OpenAI 或 Azure。"
 }
diff --git a/enjoy/src/main/db/models/document.ts b/enjoy/src/main/db/models/document.ts
@@ -259,7 +259,9 @@ export class Document extends Model<Document> {
     }
 
     logger.debug("detected file type", filePath, mimeType, extension);
-    if (!DocumentFormats.includes(extension)) {
+    if (extension === "zip" && filePath.endsWith(".epub")) {
+      extension = "epub";
+    } else if (!DocumentFormats.includes(extension)) {
       logger.error("unsupported file type", filePath, extension);
       throw new Error(
         t("models.document.fileNotSupported", { file: filePath })

diff --git a/enjoy/src/main/echogarden.ts b/enjoy/src/main/echogarden.ts
@@ -15,6 +15,8 @@ import {
   type Timeline,
   type TimelineEntry,
 } from "echogarden/dist/utilities/Timeline.d.js";
+import { WhisperOptions } from "echogarden/dist/recognition/WhisperSTT.js";
+import { ensureAndGetPackagesDir } from "echogarden/dist/utilities/PackageManager.js";
 import path from "path";
 import log from "@main/logger";
 import url from "url";
@@ -25,7 +27,6 @@ import { enjoyUrlToPath, pathToEnjoyUrl } from "./utils";
 import { UserSetting } from "./db/models";
 import { UserSettingKeyEnum } from "@/types/enums";
 import { WHISPER_MODELS } from "@/constants";
-import { WhisperOptions } from "echogarden/dist/recognition/WhisperSTT.js";
 
 Echogarden.setGlobalOption(
   "ffmpegPath",
@@ -59,7 +60,27 @@ class EchogardenWrapper {
   public wordTimelineToSegmentSentenceTimeline: typeof wordTimelineToSegmentSentenceTimeline;
 
   constructor() {
-    this.recognize = Echogarden.recognize;
+    // Wrap Echogarden.recognize so that a failure surfacing only as a
+    // process-level "unhandledRejection" (presumably raised inside the
+    // library, outside the returned promise) still rejects for the caller.
+    this.recognize = (sampleFile: string, options: RecognitionOptions) => {
+      return new Promise((resolve, reject) => {
+        // Temporary listener; note it also fires for unrelated unhandled
+        // rejections raised while this recognition is in flight.
+        const handler = (reason: any) => {
+          process.removeListener("unhandledRejection", handler);
+          reject(reason);
+        };
+        process.on("unhandledRejection", handler);
+        Echogarden.recognize(sampleFile, options)
+          .then(resolve, reject)
+          // Always detach the listener, on success and on ordinary rejection
+          // alike, so failed calls do not leave handlers on `process`.
+          .finally(() => {
+            process.removeListener("unhandledRejection", handler);
+          });
+      });
+    };
     this.align = Echogarden.align;
     this.alignSegments = Echogarden.alignSegments;
     this.denoise = Echogarden.denoise;
@@ -78,23 +99,15 @@ class EchogardenWrapper {
       engine: "whisper",
       whisper: {
         model: "tiny.en",
-        language: "en",
-      } as WhisperOptions,
+      },
     }
   ) {
     const sampleFile = path.join(__dirname, "samples", "jfk.wav");
-    try {
-      const whisperModel = await UserSetting.get(UserSettingKeyEnum.WHISPER);
-      if (WHISPER_MODELS.includes(whisperModel)) {
-        options.whisper.model = whisperModel;
-      }
-    } catch (e) {
-      logger.error(e);
-    }
 
     try {
+      logger.info("check:", options);
       const result = await this.recognize(sampleFile, options);
-      logger.info(result);
+      logger.info(result?.transcript);
       fs.writeJsonSync(
         path.join(settings.cachePath(), "echogarden-check.json"),
         result,
@@ -225,6 +238,10 @@ class EchogardenWrapper {
     ipcMain.handle("echogarden-check", async (_event, options: any) => {
       return this.check(options);
     });
+
+    ipcMain.handle("echogarden-get-packages-dir", async (_event) => {
+      return ensureAndGetPackagesDir();
+    });
   }
 }
 

diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts
@@ -476,6 +476,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     },
   },
   echogarden: {
+    getPackagesDir: () => {
+      return ipcRenderer.invoke("echogarden-get-packages-dir");
+    },
     recognize: (input: string, options: RecognitionOptions) => {
       return ipcRenderer.invoke("echogarden-recognize", input, options);
     },
@@ -505,8 +508,8 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     transcode: (input: string) => {
       return ipcRenderer.invoke("echogarden-transcode", input);
     },
-    check: () => {
-      return ipcRenderer.invoke("echogarden-check");
+    check: (options: RecognitionOptions) => {
+      return ipcRenderer.invoke("echogarden-check", options);
     },
   },
   ffmpeg: {

diff --git a/enjoy/src/renderer/components/chats/chat-agent-form.tsx b/enjoy/src/renderer/components/chats/chat-agent-form.tsx
@@ -48,7 +48,7 @@ export const ChatAgentForm = (props: {
   const { EnjoyApp, learningLanguage, webApi } = useContext(
     AppSettingsProviderContext
   );
-  const { currentTtsEngine } = useContext(AISettingsProviderContext);
+  const { ttsConfig } = useContext(AISettingsProviderContext);
   const [selectedTemplate, setSelectedTemplate] = useState<string>("custom");
   const [templates, setTemplates] = useState<
     {
@@ -104,10 +104,10 @@ export const ChatAgentForm = (props: {
     const { type, name, description, config } = data;
     if (type === ChatAgentTypeEnum.TTS) {
       config.tts = {
-        engine: config.tts?.engine || currentTtsEngine.name,
-        model: config.tts?.model || currentTtsEngine.model,
-        language: config.tts?.language || learningLanguage,
-        voice: config.tts?.voice || currentTtsEngine.voice,
+        engine: config.tts?.engine || ttsConfig.engine,
+        model: config.tts?.model || ttsConfig.model,
+        language: config.tts?.language || ttsConfig.language,
+        voice: config.tts?.voice || ttsConfig.voice,
       };
     }
 

diff --git a/enjoy/src/renderer/components/chats/chat-list.tsx b/enjoy/src/renderer/components/chats/chat-list.tsx
@@ -30,7 +30,7 @@ export const ChatList = (props: {
   setCurrentChat: (chat: ChatType) => void;
 }) => {
   const { chats, chatAgent, currentChat, setCurrentChat } = props;
-  const { sttEngine, currentGptEngine, currentTtsEngine } = useContext(
+  const { sttEngine, currentGptEngine, ttsConfig } = useContext(
     AISettingsProviderContext
   );
   const { EnjoyApp, learningLanguage } = useContext(AppSettingsProviderContext);
@@ -78,10 +78,10 @@ export const ChatList = (props: {
       agent.type === ChatAgentTypeEnum.TTS
         ? {
             tts: {
-              engine: currentTtsEngine.name,
-              model: currentTtsEngine.model,
-              voice: currentTtsEngine.voice,
-              language: learningLanguage,
+              engine: ttsConfig.engine,
+              model: ttsConfig.model,
+              voice: ttsConfig.voice,
+              language: ttsConfig.language,
               ...agent.config.tts,
             },
           }
@@ -92,10 +92,10 @@ export const ChatList = (props: {
               model: currentGptEngine.models.default,
             },
             tts: {
-              engine: currentTtsEngine.name,
-              model: currentTtsEngine.model,
-              voice: currentTtsEngine.voice,
-              language: learningLanguage,
+              engine: ttsConfig.engine,
+              model: ttsConfig.model,
+              voice: ttsConfig.voice,
+              language: ttsConfig.language,
             },
           };
     return {

diff --git a/enjoy/src/renderer/components/chats/chat-settings.tsx b/enjoy/src/renderer/components/chats/chat-settings.tsx
@@ -68,10 +68,8 @@ const ChatMemberSetting = (props: {
   onFinish?: () => void;
 }) => {
   const { chat, agentMembers, onFinish } = props;
-  const { EnjoyApp, learningLanguage } = useContext(AppSettingsProviderContext);
-  const { currentGptEngine, currentTtsEngine } = useContext(
-    AISettingsProviderContext
-  );
+  const { EnjoyApp } = useContext(AppSettingsProviderContext);
+  const { currentGptEngine, ttsConfig } = useContext(AISettingsProviderContext);
   const [memberTab, setMemberTab] = useState(agentMembers[0]?.userId);
   const [query, setQuery] = useState("");
   const [chatAgents, setChatAgents] = useState<ChatAgentType[]>([]);
@@ -90,10 +88,10 @@ const ChatMemberSetting = (props: {
             model: currentGptEngine.models.default,
           },
           tts: {
-            engine: currentTtsEngine.name,
-            model: currentTtsEngine.model,
-            voice: currentTtsEngine.voice,
-            language: learningLanguage,
+            engine: ttsConfig.engine,
+            model: ttsConfig.model,
+            voice: ttsConfig.voice,
+            language: ttsConfig.language,
           },
         },
       })

diff --git a/enjoy/src/renderer/components/documents/document-config-form.tsx b/enjoy/src/renderer/components/documents/document-config-form.tsx
@@ -12,7 +12,8 @@ import {
 import { t } from "i18next";
 import { TTSForm } from "@renderer/components";
 import { LoaderIcon } from "lucide-react";
-import { useState } from "react";
+import { useContext, useState } from "react";
+import { AISettingsProviderContext } from "@renderer/context";
 
 const documentConfigSchema = z.object({
   config: z.object({
@@ -33,6 +34,7 @@ export const DocumentConfigForm = (props: {
 }) => {
   const { config, onSubmit } = props;
   const [submitting, setSubmitting] = useState<boolean>(false);
+  const { ttsConfig } = useContext(AISettingsProviderContext);
 
   const form = useForm<z.infer<typeof documentConfigSchema>>({
     resolver: zodResolver(documentConfigSchema),
@@ -42,12 +44,7 @@ export const DocumentConfigForm = (props: {
           config: {
             autoTranslate: true,
             autoNextSpeech: true,
-            tts: {
-              engine: "openai",
-              model: "openai/tts-1",
-              language: "en-US",
-              voice: "alloy",
-            },
+            tts: ttsConfig,
           },
         },
   });

diff --git a/enjoy/src/renderer/components/medias/media-left-panel/media-transcription-generate-button.tsx b/enjoy/src/renderer/components/medias/media-left-panel/media-transcription-generate-button.tsx
@@ -79,7 +79,6 @@ export const MediaTranscriptionGenerateButton = (props: {
                 generateTranscription({
                   originalText: data.text,
                   language: data.language,
-                  model: data.model,
                   service: data.service as SttEngineOptionEnum | "upload",
                   isolate: data.isolate,
                 })

diff --git a/enjoy/src/renderer/components/medias/media-loading-modal.tsx b/enjoy/src/renderer/components/medias/media-loading-modal.tsx
@@ -70,7 +70,6 @@ const LoadingContent = () => {
                 generateTranscription({
                   originalText: data.text,
                   language: data.language,
-                  model: data.model,
                   service: data.service as SttEngineOptionEnum | "upload",
                   isolate: data.isolate,
                 });