Skip to content

Commit

Permalink
feat: add audio to text pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
suhailkakar committed Oct 8, 2024
1 parent b9e9d55 commit 4565a42
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 8 deletions.
33 changes: 30 additions & 3 deletions packages/www/components/ModelGallery/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,17 @@ const imageToVideoInputs: Input[] = [
},
];

/** Descriptor for the one required file-upload field that carries the audio. */
const audioFileInput: Input = {
  id: "audio",
  name: "Audio",
  type: "file",
  required: true,
  description: "The audio to transcribe",
  group: "prompt",
};

/** Form inputs offered for audio-to-text models: a single required audio file. */
const audioToTextInputs: Input[] = [audioFileInput];

const availableModels: Model[] = [
{
id: "RealVisXL_V4.0_Lightning",
Expand Down Expand Up @@ -220,6 +231,15 @@ const availableModels: Model[] = [
modelId: "stabilityai/stable-diffusion-x4-upscaler",
inputs: upscalerInputs,
},
{
id: "whisper-large-v3",
title: "OpenAI Whisper",
description: " A large-v3 model trained by OpenAI for voice recognition ",
pipline: "Audio to Text",
image: "whisper-large-v3.png",
modelId: "openai/whisper-large-v3",
inputs: audioToTextInputs,
},
];

type Model = {
Expand All @@ -244,9 +264,16 @@ type Input = {
};

/**
 * One result item produced by a model run.
 *
 * A single shape is shared by every pipeline, so all members are optional:
 * media pipelines populate `url`, while the audio-to-text pipeline populates
 * `text` and `chunks`. Consumers must check for presence before use.
 */
type Output = {
  /** Source address of the generated image or video. */
  url?: string;
  /** NOTE(review): presumably the seed used for generation — confirm against the API. */
  seed?: number;
  /** NOTE(review): presumably a content-safety flag for the result — confirm against the API. */
  nsfw?: boolean;
  /** Full transcription text (audio-to-text results). */
  text?: string;
  /** Transcription split into timestamped segments (audio-to-text results). */
  chunks?: Chunk[];
};

/**
 * A segment of a transcription paired with a two-number timestamp.
 *
 * NOTE(review): the tuple is presumably the segment's start and end offsets
 * (likely in seconds) — confirm against the audio-to-text API response.
 */
type Chunk = {
  /** Transcribed text for this segment. */
  text: string;
  /** Pair of numbers locating the segment within the audio. */
  timestamp: [start: number, end: number];
};

export { availableModels };
Expand Down
2 changes: 1 addition & 1 deletion packages/www/components/ModelGallery/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ const CustomModelPopover = ({

return (
<AlertDialog open={open} onOpenChange={setOpen}>
<AlertDialogContent>
<AlertDialogContent className="max-w-xl">
<AlertDialogHeader>
<AlertDialogTitle>Request a custom model</AlertDialogTitle>
<AlertDialogDescription>
Expand Down
10 changes: 10 additions & 0 deletions packages/www/hooks/use-api/endpoints/ai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,13 @@ export const imageToImage = async (formData: any) => {

return image;
};

/**
 * Posts an audio payload to the audio-to-text generation endpoint.
 *
 * @param formData - Passed through untouched as the POST body
 *   (presumably multipart form data holding the audio file — confirm against the caller).
 * @returns A one-element array wrapping the second element of the tuple
 *   resolved by `context.fetch`.
 */
export const audioToText = async (formData: any) => {
  // Only the second tuple slot is used; the first is deliberately skipped.
  const [, transcription] = await context.fetch(
    "/beta/generate/audio-to-text",
    {
      method: "POST",
      body: formData,
    },
  );

  return [transcription];
};
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ export default function Form({
setOutput: (output: Output[]) => void;
setGenerationTime: (time: number) => void;
}) {
const { textToImage, upscale, imageToVideo, imageToImage } = useApi();
const { textToImage, upscale, imageToVideo, imageToImage, audioToText } =
useApi();
const [loading, setLoading] = useState<boolean>(false);
const startTimeRef = useRef<number | null>(null);
const timerRef = useRef<NodeJS.Timeout | null>(null);
Expand Down Expand Up @@ -49,6 +50,7 @@ export default function Form({
}
}, 100);

console.log(model?.pipline);
switch (model?.pipline) {
case "Text to Image":
const textToImageRes = await textToImage(formInputs);
Expand All @@ -66,6 +68,10 @@ export default function Form({
const imageToImageRes = await imageToImage(formData);
setOutput(imageToImageRes.images);
break;
case "Audio to Text":
const audioToTextRes = await audioToText(formData);
setOutput(audioToTextRes);
break;
case "image-to-image":
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,11 @@ export default function PlaygroundPage() {
/>
</div>
<div className="md:w-[70%]">
<Output output={output} generationTime={generationTime} />
<Output
model={model}
output={output}
generationTime={generationTime}
/>
</div>
</main>
</Box>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import { Badge } from "components/ui/badge";
import React from "react";
import type { Output as OutputT } from "components/ModelGallery/constants";
import type {
Model,
Output as OutputT,
} from "components/ModelGallery/constants";
import { Card } from "components/ui/card";
import { Label } from "components/ui/label";
import { ScrollArea } from "components/ui/scroll-area";

export default function Output({
output = [],
generationTime,
model,
}: {
output: OutputT[];
model: Model;
generationTime: number;
}) {
return (
Expand All @@ -22,7 +29,7 @@ export default function Output({
<div
key={index}
className="flex items-center justify-center h-[512px]">
{item.url.includes("mp4") ? (
{model.pipline == "Image to Video" ? (
<video
src={item.url}
className="max-w-full max-h-full object-contain rounded-lg"
Expand All @@ -31,6 +38,20 @@ export default function Output({
muted
controls
/>
) : model.pipline == "Audio to Text" ? (
<ScrollArea className="bg-card p-4 rounded-lg border border-input max-h-[512px] overflow-y-auto">
<Label>Chunks (with timestamps)</Label>
{item.chunks.map((chunk, index) => (
<div key={index}>
{chunk.timestamp[0]}-{chunk.timestamp[1]}: {chunk.text}
</div>
))}

<div className="mt-4">
<Label>Text</Label>
<p>{item.text}</p>
</div>
</ScrollArea>
) : (
<img
src={item.url}
Expand Down

0 comments on commit 4565a42

Please sign in to comment.