-
-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Embed on-instance Whisper model for audio/mp4 transcribing (#449)
* feat: Embed on-instance Whisper model for audio/mp4 transcribing (resolves #329) * Additional logging * Add placeholder for tmp folder in collector storage * Add cleanup of hotdir and tmp on collector boot to prevent hanging files * Split loading of model and file conversion so they run concurrently * Update README * Update model size * Update supported filetypes
- Loading branch information
1 parent
719521c
commit 61db981
Showing
12 changed files
with
636 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
const fs = require("fs"); | ||
const path = require("path"); | ||
const { v4 } = require("uuid"); | ||
const { | ||
createdDate, | ||
trashFile, | ||
writeToServerDocuments, | ||
} = require("../../utils/files"); | ||
const { tokenizeString } = require("../../utils/tokenizer"); | ||
const { default: slugify } = require("slugify"); | ||
const { LocalWhisper } = require("../../utils/WhisperProviders/localWhisper"); | ||
|
||
/**
 * Transcribes an uploaded audio file to text with the local Whisper model and
 * writes the transcript out as a server document.
 *
 * The model is loaded while the file is being converted to sample data so
 * neither step waits on the other. Every failure path after that point logs
 * the problem, removes the source file with `trashFile`, and resolves to a
 * `{ success: false, reason }` result instead of rejecting or hanging.
 *
 * @param {Object} params
 * @param {string} params.fullFilePath - Path of the file to transcribe.
 * @param {string} params.filename - Name used in logs and as the document title.
 * @returns {Promise<{success: boolean, reason: (string|null)}>}
 */
async function asAudio({ fullFilePath = "", filename = "" }) {
  const whisper = new LocalWhisper();

  console.log(`-- Working ${filename} --`);

  // Wait for BOTH tasks to settle before deciding anything: the source file
  // must not be trashed while the conversion may still be reading it, and a
  // rejected model load must surface as a failure rather than a hung promise.
  const [audioResult, transcriberResult] = await Promise.allSettled([
    convertToWavAudioData(fullFilePath),
    whisper.client(),
  ]);

  const audioData =
    audioResult.status === "fulfilled" ? audioResult.value : null;
  if (!audioData) {
    console.error(`Failed to parse content from ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `Failed to parse content from ${filename}.`,
    };
  }

  if (transcriberResult.status === "rejected") {
    console.error(
      `Failed to load the transcription model for ${filename}.`,
      transcriberResult.reason
    );
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `Failed to load the transcription model for ${filename}.`,
    };
  }
  const transcriber = transcriberResult.value;

  console.log(`[Model Working]: Transcribing audio data to text`);
  let content;
  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });
    content = result?.text;
  } catch (error) {
    console.error(`Failed to transcribe ${filename}.`, error);
    trashFile(fullFilePath);
    return { success: false, reason: `Failed to transcribe ${filename}.` };
  }

  if (!content?.length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return { success: false, reason: `No text content found in ${filename}.` };
  }

  // Declared with `const`: a bare assignment here would create an implicit
  // global (and throw in strict mode).
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
    title: filename,
    docAuthor: "no author found",
    description: "No description found.",
    // NOTE(review): this label says "pdf" although the handler processes
    // audio; left unchanged in case downstream consumers depend on the value.
    docSource: "pdf file uploaded by the user.",
    chunkSource: filename,
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content).length,
  };

  writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
  trashFile(fullFilePath);
  console.log(
    `[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n`
  );
  return { success: true, reason: null };
}
|
||
async function convertToWavAudioData(sourcePath) { | ||
try { | ||
let buffer; | ||
const wavefile = require("wavefile"); | ||
const ffmpeg = require("fluent-ffmpeg"); | ||
const outFolder = path.resolve(__dirname, `../../storage/tmp`); | ||
if (!fs.existsSync(outFolder)) fs.mkdirSync(outFolder, { recursive: true }); | ||
|
||
const fileExtension = path.extname(sourcePath).toLowerCase(); | ||
if (fileExtension !== ".wav") { | ||
console.log( | ||
`[Conversion Required] ${fileExtension} file detected - converting to .wav` | ||
); | ||
const outputFile = path.resolve(outFolder, `${v4()}.wav`); | ||
const convert = new Promise((resolve) => { | ||
ffmpeg(sourcePath) | ||
.toFormat("wav") | ||
.on("error", (error) => { | ||
console.error(`[Conversion Error] ${error.message}`); | ||
resolve(false); | ||
}) | ||
.on("progress", (progress) => | ||
console.log( | ||
`[Conversion Processing]: ${progress.targetSize}KB converted` | ||
) | ||
) | ||
.on("end", () => { | ||
console.log("[Conversion Complete]: File converted to .wav!"); | ||
resolve(true); | ||
}) | ||
.save(outputFile); | ||
}); | ||
const success = await convert; | ||
if (!success) | ||
throw new Error( | ||
"[Conversion Failed]: Could not convert file to .wav format!" | ||
); | ||
|
||
const chunks = []; | ||
const stream = fs.createReadStream(outputFile); | ||
for await (let chunk of stream) chunks.push(chunk); | ||
buffer = Buffer.concat(chunks); | ||
fs.rmSync(outputFile); | ||
} else { | ||
const chunks = []; | ||
const stream = fs.createReadStream(sourcePath); | ||
for await (let chunk of stream) chunks.push(chunk); | ||
buffer = Buffer.concat(chunks); | ||
} | ||
|
||
const wavFile = new wavefile.WaveFile(buffer); | ||
wavFile.toBitDepth("32f"); | ||
wavFile.toSampleRate(16000); | ||
|
||
let audioData = wavFile.getSamples(); | ||
if (Array.isArray(audioData)) { | ||
if (audioData.length > 1) { | ||
const SCALING_FACTOR = Math.sqrt(2); | ||
|
||
// Merge channels into first channel to save memory | ||
for (let i = 0; i < audioData[0].length; ++i) { | ||
audioData[0][i] = | ||
(SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2; | ||
} | ||
} | ||
audioData = audioData[0]; | ||
} | ||
|
||
return audioData; | ||
} catch (error) { | ||
console.error(`convertToWavAudioData`, error); | ||
return null; | ||
} | ||
} | ||
|
||
// Expose the audio handler as this module's only export.
module.exports = asAudio; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
tmp/* | ||
!tmp/.placeholder |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
const path = require("path"); | ||
const fs = require("fs"); | ||
|
||
/**
 * Provider for the Whisper speech-recognition model that runs on the local
 * instance. Model files are cached under `<STORAGE_DIR>/models` when the
 * `STORAGE_DIR` environment variable is set, otherwise under the server's
 * `storage/models` folder resolved relative to this file.
 */
class LocalWhisper {
  /**
   * Resolves the cache locations and creates the cache directory if missing.
   */
  constructor() {
    // Model Card: https://huggingface.co/Xenova/whisper-small
    this.model = "Xenova/whisper-small";

    const storageDir = process.env.STORAGE_DIR;
    const modelsDir = storageDir
      ? path.resolve(storageDir, `models`)
      : path.resolve(__dirname, `../../../server/storage/models`);
    this.cacheDir = path.resolve(modelsDir);
    this.modelPath = path.resolve(this.cacheDir, "Xenova", "whisper-small");

    // Existing installations may predate this folder, so create it on demand.
    const cacheExists = fs.existsSync(this.cacheDir);
    if (!cacheExists) {
      fs.mkdirSync(this.cacheDir, { recursive: true });
    }
  }

  /**
   * Builds the "automatic-speech-recognition" pipeline for the model.
   * When the model folder is not present yet, a notice is printed and a
   * progress callback is attached to the pipeline options.
   *
   * @returns {Promise<Function>} The pipeline created by `@xenova/transformers`.
   * @throws Re-throws whatever error loading the library or model raised,
   *   after logging it.
   */
  async client() {
    const modelMissing = !fs.existsSync(this.modelPath);
    if (modelMissing) {
      console.log(
        "\x1b[34m[INFO]\x1b[0m The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)\n\n"
      );
    }

    try {
      const options = { cache_dir: this.cacheDir };
      if (modelMissing) {
        // Show download progress if we need to download any files
        options.progress_callback = (data) => {
          if (!data.hasOwnProperty("progress")) return;
          console.log(
            `\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${
              data.file
            } ${~~data?.progress}%`
          );
        };
      }

      // The library is ESM-only, so it is pulled in with a dynamic import
      // from this CommonJS module.
      const { pipeline } = await import("@xenova/transformers");
      return await pipeline("automatic-speech-recognition", this.model, options);
    } catch (error) {
      console.error("Failed to load the native whisper model:", error);
      throw error;
    }
  }
}
|
||
// Export the provider class by name so callers can destructure it on require.
module.exports = { | ||
LocalWhisper, | ||
}; |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.