Yukaii · Yukaii · Feb 9, 2025 · Feb 1, 2025 · Feb 2, 2025 · Feb 2, 2025
diff --git a/README.md b/README.md
@@ -54,19 +54,19 @@ graph TD;
 
 ## Prerequisite
 
-* Setup Anki with AnkiConnect locally
-* ffplayer (installed along with ffmpeg)
+- Set up Anki with AnkiConnect locally
+- ffplay (installed along with ffmpeg)
 
 ## Installation
 
-
-
 ### Install gakuon
+
 ```bash
 npm install -g gakuon
 ```
 
-### Install ffmpeg (OSX/Linux) 
+### Install ffmpeg (OSX/Linux)
+
 ```
 brew install ffmpeg
 ```
@@ -231,14 +231,20 @@ Generate helpful learning content.
 example.description = "A natural example sentence using the word"
 example.required = true
 example.audio = true
+example.locale = "ja-JP"
+# Set ttsVoice when using edge-tts
+# You can find the list of available voices with tools like https://github.com/andresayac/edge-tts
+example.ttsVoice = "ja-JP-NanamiNeural"
 
 explanation_jp.description = "Simple explanation in Japanese"
 explanation_jp.required = true
 explanation_jp.audio = true
+explanation_jp.locale = "ja-JP"
 
 explanation_en.description = "Detailed explanation in English"
 explanation_en.required = true
 explanation_en.audio = true
+explanation_en.locale = "en-US"
 
 usage_notes.description = "Additional usage notes"
 usage_notes.required = false

diff --git a/bun.lock b/bun.lock
diff --git a/package.json b/package.json
@@ -9,6 +9,7 @@
   "scripts": {
     "test:integration": "jest --config jest.integration.config.js",
     "start": "bun run src/index.ts",
+    "dev": "bun run --hot src/index.ts serve",
     "build": "bun build src/index.ts --target=node --outfile dist/gakuon",
     "prepublishOnly": "rm -rf dist && bun run build && bun run build:client",
     "fmt": "bunx biome format --write ./src",
@@ -43,6 +44,7 @@
   },
   "dependencies": {
     "@iarna/toml": "^2.2.5",
+    "@lobehub/tts": "^1.28.0",
     "@tailwindcss/vite": "^4.0.0",
     "commander": "^13.1.0",
     "cors": "^2.8.5",
@@ -52,6 +54,7 @@
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
     "react-router-dom": "^7.1.3",
+    "ws": "^8.18.0",
     "zod": "^3.24.1"
   },
   "files": [
@@ -61,4 +64,4 @@
   "trustedDependencies": [
     "@biomejs/biome"
   ]
-}
+}
diff --git a/src/commands/init.ts b/src/commands/init.ts
@@ -70,10 +70,12 @@ example.audio = true
 explanation_jp.description = "Simple explanation in Japanese"
 explanation_jp.required = true
 explanation_jp.audio = true
+explanation_jp.locale = "ja-JP"
 
 explanation_en.description = "Detailed explanation in English"
 explanation_en.required = true
 explanation_en.audio = true
+explanation_en.locale = "en-US"
 
 Requirements:
 1. Output valid TOML without any markdown formatting or code blocks
@@ -102,6 +104,7 @@ export async function init(options: InitOptions = {}) {
       config.global.openai.baseUrl,
       config.global.openai.chatModel,
       config.global.openai.ttsModel,
+      config.global.ttsMethod,
       debug,
     );
 

diff --git a/src/commands/learn.ts b/src/commands/learn.ts
@@ -66,11 +66,13 @@ export async function learn(options: LearnOptions = {}) {
     config.global.openai.baseUrl,
     config.global.openai.chatModel,
     config.global.openai.ttsModel,
+    config.global.ttsMethod,
     options.debug,
   );
   const contentManager = new ContentManager(
     ankiService,
     openaiService,
+    config.global.ttsVoice,
     options.debug,
   );
   const audioPlayer = new AudioPlayer(ankiService, options.debug);

diff --git a/src/commands/serve.ts b/src/commands/serve.ts
@@ -26,9 +26,15 @@ export async function serve(options: ServeOptions = {}) {
     config.global.openai.baseUrl,
     config.global.openai.chatModel,
     config.global.openai.ttsModel,
+    config.global.ttsMethod,
+    debug,
+  );
+  const contentManager = new ContentManager(
+    ankiService,
+    openaiService,
+    config.global.ttsVoice,
     debug,
   );
-  const contentManager = new ContentManager(ankiService, openaiService, debug);
 
   // Create and start server
   const app = createServer({
@@ -57,6 +63,6 @@ export async function serve(options: ServeOptions = {}) {
   // Start listening
   server = app.listen(port, () => {
     console.log(`Gakuon server running at http://localhost:${port}`);
-    console.log(`Using anki connect server at ${config.global.ankiHost}`)
+    console.log(`Using anki connect server at ${config.global.ankiHost}`);
   });
 }
diff --git a/src/commands/test.ts b/src/commands/test.ts
@@ -25,6 +25,7 @@ export async function test(options: TestOptions = {}) {
       config.global.openai.baseUrl,
       config.global.openai.chatModel,
       config.global.openai.ttsModel,
+      config.global.ttsMethod,
       debug,
     );
 

diff --git a/src/config/loader.ts b/src/config/loader.ts
@@ -12,13 +12,15 @@ import {
   NewCardGatherOrder,
   QueueOrder,
   ReviewSortOrder,
+  TtsMethod,
 } from "./types";
 
 export const DEFAULT_CONFIG: GakuonConfig = {
   global: {
     ankiHost: "http://localhost:8765",
     openaiApiKey: "${OPENAI_API_KEY}",
     ttsVoice: "alloy",
+    ttsMethod: TtsMethod.OPENAI,
     openai: {
       baseUrl: "https://api.openai.com/v1",
       chatModel: "gpt-4o",
@@ -110,14 +112,15 @@ function processRawConfig(rawConfig: unknown): GakuonConfig {
     ...(processed.global?.openai || {}),
   };
 
-  const configObj = {
+  const configObj: GakuonConfig = {
     ...processed,
     decks: processed.decks || DEFAULT_CONFIG.decks,
     global: {
       ankiHost: processed.global?.ankiHost || DEFAULT_CONFIG.global.ankiHost,
       openaiApiKey:
         processed.global?.openaiApiKey || DEFAULT_CONFIG.global.openaiApiKey,
       ttsVoice: processed.global?.ttsVoice || DEFAULT_CONFIG.global.ttsVoice,
+      ttsMethod: processed.global?.ttsMethod || DEFAULT_CONFIG.global.ttsMethod,
       defaultDeck: processed.global?.defaultDeck,
       openai: openaiConfig,
       cardOrder: {
@@ -170,7 +173,6 @@ export function loadConfig(customPath?: string): GakuonConfig {
 
   // Fall back to file-based config
   const configPath = customPath || join(homedir(), ".gakuon", "config.toml");
-
   if (!existsSync(configPath)) {
     saveConfig(DEFAULT_CONFIG);
   }

diff --git a/src/config/types.ts b/src/config/types.ts
@@ -25,6 +25,16 @@ export class PromptError extends Error {
   }
 }
 
+export class AudioGenerationError extends Error {
+  constructor(
+    message: string,
+    public details: { messages: string[] },
+  ) {
+    super(message);
+    this.name = "AudioGenerationError";
+  }
+}
+
 export type OpenAIConfig = z.infer<typeof OpenAIConfigSchema>;
 
 export type GakuonConfig = z.infer<typeof GakuonConfigSchema>;
@@ -82,6 +92,11 @@ export enum QueueOrder {
   MIXED = "mixed",
 }
 
+export enum TtsMethod {
+  OPENAI = "openai",
+  EDGE_TTS = "edge-tts",
+}
+
 export const OpenAIConfigSchema = z.object({
   baseUrl: z.string(),
   chatModel: z.string(),
@@ -98,7 +113,8 @@ export const CardOrderSchema = z.object({
 export const GlobalConfigSchema = z.object({
   ankiHost: z.string(),
   openaiApiKey: z.string(),
-  ttsVoice: z.string(),
+  ttsMethod: z.nativeEnum(TtsMethod),
+  ttsVoice: z.string().optional(),
   defaultDeck: z.string().optional(),
   openai: OpenAIConfigSchema,
   cardOrder: CardOrderSchema,
@@ -109,16 +125,32 @@ export const DeckConfigSchema = z.object({
   pattern: z.string(),
   fields: z.record(z.string()),
   prompt: z.string(),
+  ttsVoice: z.string().optional(),
   responseFields: z.record(
     z.object({
       description: z.string(),
       required: z.boolean(),
       audio: z.boolean().optional(),
+      locale: z.string().optional(),
+      ttsVoice: z.string().optional(),
     }),
   ),
 });
 
-export const GakuonConfigSchema = z.object({
-  global: GlobalConfigSchema,
-  decks: z.array(DeckConfigSchema),
-});
+export const GakuonConfigSchema = z
+  .object({
+    global: GlobalConfigSchema,
+    decks: z.array(DeckConfigSchema),
+  })
+
+  .refine(
+    (data) =>
+      data.global.ttsMethod !== TtsMethod.EDGE_TTS ||
+      data.decks.every((deck) =>
+        Object.values(deck.responseFields).every((field) => !!field.ttsVoice),
+      ),
+    {
+      message: "responseFields.ttsVoice is required when ttsMethod is edge-tts",
+      path: ["decks", "responseFields", "ttsVoice"],
+    },
+  );
diff --git a/src/services/anki.ts b/src/services/anki.ts
@@ -369,7 +369,9 @@ export class AnkiService {
     try {
       await this.request("sync", {});
     } catch (e: unknown) {
-      if ((e as { message?: string })?.message?.includes("auth not configured")) {
+      if (
+        (e as { message?: string })?.message?.includes("auth not configured")
+      ) {
         // Skip syncing when auth is not configured
         return;
       }

diff --git a/src/services/content-manager.ts b/src/services/content-manager.ts
@@ -4,13 +4,14 @@ import type { Card, DeckConfig, DynamicContent } from "../config/types";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
 import { randomBytes } from "node:crypto";
-
+import { TtsMethod } from "../config/types";
 export class ContentManager {
   private tmpDir = tmpdir();
 
   constructor(
     private ankiService: AnkiService,
     private openaiService: OpenAIService,
+    private ttsVoice: string,
     private debug = false,
   ) {
     this.debugLog("Using tmpDir", this.tmpDir);
@@ -65,6 +66,32 @@ export class ContentManager {
     return { content, audioFiles, isNewContent: false, metadata };
   }
 
+  private getTtsVoice(
+    deckConfig: DeckConfig,
+    fieldConfig: DeckConfig["responseFields"][string],
+  ) {
+    const globalTtsVoice = this.ttsVoice;
+    const ttsMethod = this.openaiService.ttsMethod;
+
+    if (ttsMethod === TtsMethod.OPENAI) {
+      const selectedVoice =
+         fieldConfig.ttsVoice || deckConfig.ttsVoice || globalTtsVoice;
+
+      this.debugLog(
+        `Getting tts voice for ${TtsMethod.OPENAI}, using: ${selectedVoice}`,
+      );
+      // for the openai ttsMethod, any configured voice is accepted: field-level first, then deck-level, then global
+      return selectedVoice;
+    }
+    if (ttsMethod === TtsMethod.EDGE_TTS) {
+      this.debugLog(
+        `Getting tts voice for ${TtsMethod.EDGE_TTS}, using: ${fieldConfig.ttsVoice}`,
+      );
+      // for ollama (we use EdgeTTS), you have to set a voice with the same locale code on the responseField.
+      return fieldConfig.ttsVoice;
+    }
+  }
+
   private async generateAndStoreContent(card: Card, deckConfig: DeckConfig) {
     // Generate content
     const content = await this.openaiService.generateContent(card, deckConfig);
@@ -81,7 +108,8 @@ export class ContentManager {
         const audioPromise = this.openaiService.generateAudio(
           content[field],
           tempPath,
-          "alloy",
+          this.getTtsVoice(deckConfig, fieldConfig),
+          fieldConfig.locale,
         );
         audioPromises.push(audioPromise);