From 2a9f39124c310dd4c4553befc2f658e334354769 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 20 Jan 2025 15:59:10 +0900 Subject: [PATCH 1/4] Add image and audio prompting API Closes #40. Somewhat helps with #70. --- README.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0401f43..1d353c2 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,56 @@ console.log(await promptWithCalculator("What is 2 + 2?")); We'll likely explore more specific APIs for tool- and function-calling in the future; follow along in [issue #7](https://github.com/webmachinelearning/prompt-api/issues/7). +### Multimodal inputs + +All of the above examples have been of text prompts. Some language models also support other inputs. Our design initially includes the potential to support images and audio clips as inputs. This is done by using objects in the form `{ type: "image", data }` and `{ type: "audio", data }` instead of strings. The `data` values can be the following: + +* For image inputs: [`ImageBitmapSource`](https://html.spec.whatwg.org/#imagebitmapsource), i.e. `Blob`, `ImageData`, `ImageBitmap`, `VideoFrame`, `OffscreenCanvas`, `HTMLImageElement`, `SVGImageElement`, `HTMLCanvasElement`, or `HTMLVideoElement` (will get the current frame). Also raw bytes via `BufferSource` (i.e. `ArrayBuffer` or typed arrays). + +* For audio inputs: for now, `Blob`, `AudioBuffer`, `HTMLAudioElement`. Also raw bytes via `BufferSource`. Other possibilities we're investigating include `AudioData` and `MediaStream`, but we're not yet sure if those are suitable to represent "clips". + +Sessions that will include these inputs need to be created using the `expectedInputTypes` option, to ensure that any necessary downloads are done as part of session creation, and that if the model is not capable of such multimodal prompts, the session creation fails. + +A sample of using these APIs: + +```js +const session = await ai.languageModel.create({ + expectedInputTypes: ["audio", "image"] // "text" is always expected +}); + +const referenceImage = await (await fetch("/reference-image.jpeg")).blob(); +const userDrawnImage = document.querySelector("canvas"); + +const response1 = await session.prompt([ + "Give a helpful artistic critique of how well the second image matches the first:", + { type: "image", data: referenceImage }, + { type: "image", data: userDrawnImage } +]); + +console.log(response1); + +const audioBlob = await captureMicrophoneInput({ seconds: 10 }); + +const response2 = await session.prompt( + "My response to your critique:", + { type: "audio", data: audioBlob } +); +``` + +Future extensions may include more ambitious multimodal inputs, such as video clips, or realtime audio or video. (Realtime might require a different API design, more based around events or streams instead of messages.) + +Edge-case details: + +* `HTMLAudioElement` can also represent streaming audio data (e.g., when it is connected to a `MediaSource`). Such cases will throw a `"NotSupportedError"` `DOMException` for now. + +* `HTMLAudioElement` might be connected to an audio source (e.g., a URL) that is not totally downloaded when the prompt API is called. In such cases, calling into the prompt API will force the download to complete. + +* Text prompts can also be done via `{ type: "text", data: aString }`, instead of just `aString`. This can be useful for generic code. + +* Attempting to supply an invalid combination, e.g. 
`{ type: "audio", data: anImageBitmap }`, `{ type: "image", data: anAudioBuffer }`, or `{ type: "text", data: anArrayBuffer }`, will throw a `TypeError`.
+
+* Attempting to give an image or audio prompt with the `"assistant"` role will currently throw a `"NotSupportedError"` `DOMException`. (Although as we explore multimodal outputs, this restriction might be lifted in the future.)
+
### Configuration of per-session parameters

In addition to the `systemPrompt` and `initialPrompts` options shown above, the currently-configurable model parameters are [temperature](https://huggingface.co/blog/how-to-generate#sampling) and [top-K](https://huggingface.co/blog/how-to-generate#top-k-sampling). The `params()` API gives the default, minimum, and maximum values for these parameters.

@@ -355,7 +405,11 @@ The method will return a promise that fulfills with one of the following availab
An example usage is the following:

```js
-const options = { expectedInputLanguages: ["en", "es"], temperature: 2 };
+const options = {
+  expectedInputLanguages: ["en", "es"],
+  expectedInputTypes: ["audio"],
+  temperature: 2
+};

const supportsOurUseCase = await ai.languageModel.availability(options);

@@ -450,6 +504,7 @@ interface AILanguageModel : EventTarget {
  readonly attribute unsigned long topK;
  readonly attribute float temperature;
  readonly attribute FrozenArray<DOMString>? expectedInputLanguages;
+  readonly attribute FrozenArray<AILanguageModelPromptType> expectedInputTypes; // always contains at least "text"

  attribute EventHandler oncontextoverflow;

@@ -469,6 +524,7 @@ dictionary AILanguageModelCreateCoreOptions {
  [EnforceRange] unsigned long topK;
  float temperature;
  sequence<DOMString> expectedInputLanguages;
+  sequence<AILanguageModelPromptType> expectedInputTypes;
}

dictionary AILanguageModelCreateOptions : AILanguageModelCreateCoreOptions {

@@ -481,12 +537,17 @@ dictionary AILanguageModelCreateOptions : AILanguageModelCreateCoreOptions {
dictionary AILanguageModelInitialPrompt {
  required AILanguageModelInitialPromptRole role;
-  required DOMString content;
+  required AILanguageModelPromptContentInput content;
};

dictionary AILanguageModelPrompt {
  required AILanguageModelPromptRole role;
-  required DOMString content;
+  required AILanguageModelPromptContentInput content;
+};
+
+dictionary AILanguageModelPromptContent {
+  required AILanguageModelPromptType type;
+  required AILanguageModelPromptData data;
};

dictionary AILanguageModelPromptOptions {

@@ -497,10 +558,13 @@ dictionary AILanguageModelCloneOptions {
  AbortSignal signal;
};

+typedef (DOMString or AILanguageModelPromptContent) AILanguageModelPromptContentInput;
typedef (DOMString or AILanguageModelPrompt or sequence<AILanguageModelPrompt>) AILanguageModelPromptInput;
+typedef (ImageBitmapSource or BufferSource or AudioBuffer or HTMLAudioElement or DOMString) AILanguageModelPromptData;

enum AILanguageModelInitialPromptRole { "system", "user", "assistant" };
enum AILanguageModelPromptRole { "user", "assistant" };
+enum AILanguageModelPromptType { "text", "image", "audio" };
```

### Instruction-tuned versus base models

From e2e6752963598b9c6e53c86c15c88bf10de6da90 Mon Sep 17 00:00:00 2001
From: Domenic Denicola
Date: Wed, 22 Jan 2025 12:36:45 +0900
Subject: [PATCH 2/4] Respond to review feedback

---
 README.md | 66 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 1d353c2..13f3037 100644
--- a/README.md
+++ b/README.md
@@ -211,17 +211,25 @@ const response2 = await session.prompt(
Future extensions may include more ambitious multimodal inputs, such as video
clips, or realtime audio or video. (Realtime might require a different API design, more based around events or streams instead of messages.) -Edge-case details: +Details: -* `HTMLAudioElement` can also represent streaming audio data (e.g., when it is connected to a `MediaSource`). Such cases will throw a `"NotSupportedError"` `DOMException` for now. +* Cross-origin data that has not been exposed using the `Access-Control-Allow-Origin` header cannot be used with the prompt API, and will reject with a `"SecurityError"` `DOMException`. This applies to `HTMLImageElement`, `SVGImageElement`, `HTMLAudioElement`, `HTMLVideoElement`, `HTMLCanvasElement`, and `OffscreenCanvas`. Note that this is more strict than `createImageBitmap()`, which has a tainting mechanism which allows creating opaque image bitmaps from unexposed cross-origin resources. For the prompt API, such resources will just fail. This includes attempts to use cross-origin-tainted canvases. + +* Raw-bytes cases (`Blob` and `BufferSource`) will apply the appropriate sniffing rules ([for images](https://mimesniff.spec.whatwg.org/#rules-for-sniffing-images-specifically), [for audio](https://mimesniff.spec.whatwg.org/#rules-for-sniffing-audio-and-video-specifically)) and reject with a `"NotSupportedError"` `DOMException` if the format is not supported. This behavior is similar to that of `createImageBitmap()`. + +* Animated images will be required to snapshot the first frame (like `createImageBitmap()`). In the future, animated image input may be supported via some separate opt-in, similar to video clip input. But we don't want interoperability problems from some implementations supporting animated images and some not, in the initial version. + +* `HTMLAudioElement` can also represent streaming audio data (e.g., when it is connected to a `MediaSource`). Such cases will reject with a `"NotSupportedError"` `DOMException` for now. * `HTMLAudioElement` might be connected to an audio source (e.g., a URL) that is not totally downloaded when the prompt API is called. In such cases, calling into the prompt API will force the download to complete. +* Similarly for `HTMLVideoElement`, even a single frame might not yet be downloaded when the prompt API is called. In such cases, calling into the prompt API will force at least a single frame's worth of video to download. (The intent is to behave the same as `createImageBitmap(videoEl)`.) + * Text prompts can also be done via `{ type: "text", data: aString }`, instead of just `aString`. This can be useful for generic code. -* Attempting to supply an invalid combination, e.g. `{ type: "audio", data: anImageBitmap }`, `{ type: "image", data: anAudioBuffer }`, or `{ type: "text", data: anArrayBuffer }`, will throw a `TypeError`. +* Attempting to supply an invalid combination, e.g. `{ type: "audio", data: anImageBitmap }`, `{ type: "image", data: anAudioBuffer }`, or `{ type: "text", data: anArrayBuffer }`, will reject with a `TypeError`. -* Attempting to give an image or audio prompt with the `"assistant"` role will currently throw a `"NotSupportedError"` `DOMException`. (Although as we explore multimodal outputs, this restriction might be lifted in the future.) +* Attempting to give an image or audio prompt with the `"assistant"` role will currently reject with a `"NotSupportedError"` `DOMException`. (Although as we explore multimodal outputs, this restriction might be lifted in the future.) 
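Putting several of the cases above together, a page might prompt defensively along the following lines. This is only a sketch: the fetched URLs are placeholders, and the fallback handling is one possible strategy rather than part of the API.

```js
// Sketch only: combines the { type, data } forms described above with the
// error cases listed in the bullets. URLs and fallback behavior are illustrative.
const session = await ai.languageModel.create({
  expectedInputTypes: ["audio", "image"]
});

// Raw bytes are sniffed; an unsupported format rejects with "NotSupportedError".
const audioBytes = await (await fetch("/clip.ogg")).arrayBuffer();

// Cross-origin images must be CORS-exposed, or the prompt rejects with "SecurityError".
const image = new Image();
image.crossOrigin = "anonymous";
image.src = "https://example.com/photo.jpeg";
await image.decode();

try {
  const result = await session.prompt([
    { type: "text", data: "Describe how the audio clip relates to the image:" },
    { type: "audio", data: audioBytes },
    { type: "image", data: image }
  ]);
  console.log(result);
} catch (e) {
  if (e.name === "NotSupportedError") {
    // E.g., an audio format the implementation cannot decode.
  } else if (e.name === "SecurityError") {
    // E.g., a cross-origin resource that was not exposed via CORS.
  } else {
    throw e;
  }
}
```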
### Configuration of per-session parameters

@@ -525,46 +533,62 @@ dictionary AILanguageModelCreateCoreOptions {
  float temperature;
  sequence<DOMString> expectedInputLanguages;
  sequence<AILanguageModelPromptType> expectedInputTypes;
-}
+};

dictionary AILanguageModelCreateOptions : AILanguageModelCreateCoreOptions {
  AbortSignal signal;
  AICreateMonitorCallback monitor;

  DOMString systemPrompt;
-  sequence<AILanguageModelInitialPrompt> initialPrompts;
+  sequence<AILanguageModelInitialPromptLine> initialPrompts;
+};
+
+dictionary AILanguageModelPromptOptions {
+  AbortSignal signal;
+};
+
+dictionary AILanguageModelCloneOptions {
+  AbortSignal signal;
};

-dictionary AILanguageModelInitialPrompt {
+// The argument to the prompt() method and others like it
+
+typedef (AILanguageModelPromptLine or sequence<AILanguageModelPromptLine>) AILanguageModelPromptInput;
+
+// Initial prompt lines
+
+dictionary AILanguageModelInitialPromptLineDict {
  required AILanguageModelInitialPromptRole role;
-  required AILanguageModelPromptContentInput content;
+  required AILanguageModelPromptContent content;
};

-dictionary AILanguageModelPrompt {
+typedef (DOMString or AILanguageModelInitialPromptLineDict) AILanguageModelInitialPromptLine;
+
+// Prompt lines
+
+dictionary AILanguageModelPromptLineDict {
  required AILanguageModelPromptRole role;
-  required AILanguageModelPromptContentInput content;
+  required AILanguageModelPromptContent content;
};

-dictionary AILanguageModelPromptContent {
+typedef (DOMString or AILanguageModelPromptLineDict) AILanguageModelPromptLine;
+
+// Prompt content inside the lines
+
+dictionary AILanguageModelPromptContentDict {
  required AILanguageModelPromptType type;
  required AILanguageModelPromptData data;
};

-dictionary AILanguageModelPromptOptions {
-  AbortSignal signal;
-};
+typedef (DOMString or AILanguageModelPromptContentDict) AILanguageModelPromptContent;

-dictionary AILanguageModelCloneOptions {
-  AbortSignal signal;
-};
-
-typedef (DOMString or AILanguageModelPromptContent) AILanguageModelPromptContentInput;
-typedef (DOMString or AILanguageModelPrompt or sequence<AILanguageModelPrompt>) AILanguageModelPromptInput;
typedef (ImageBitmapSource or BufferSource or AudioBuffer or HTMLAudioElement or DOMString) AILanguageModelPromptData;
+enum AILanguageModelPromptType { "text", "image", "audio" };
+
+// Prompt roles inside the lines

enum AILanguageModelInitialPromptRole { "system", "user", "assistant" };
enum AILanguageModelPromptRole { "user", "assistant" };
-enum AILanguageModelPromptType { "text", "image", "audio" };
```

### Instruction-tuned versus base models

From 996364be4a2b2e4ee641fc30e7f8757dd2cd16d2 Mon Sep 17 00:00:00 2001
From: Domenic Denicola
Date: Wed, 22 Jan 2025 17:46:14 +0900
Subject: [PATCH 3/4] More complicated typedefs!!
--- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 13f3037..06eaa42 100644 --- a/README.md +++ b/README.md @@ -562,7 +562,11 @@ dictionary AILanguageModelInitialPromptLineDict { required AILanguageModelPromptContent content; }; -typedef (DOMString or AILanguageModelInitialPromptLineDict) AILanguageModelInitialPromptLine; +typedef ( + DOMString // interpreted as { role: "user", content: { type: "text", data: providedValue } } + or AILanguageModelPromptContent // interpreted as { role: "user", content: providedValue } + or AILanguageModelInitialPromptLineDict // canonical form +) AILanguageModelInitialPromptLine; // Prompt lines @@ -571,7 +575,11 @@ dictionary AILanguageModelPromptLineDict { required AILanguageModelPromptContent content; }; -typedef (DOMString or AILanguageModelPromptLineDict) AILanguageModelPromptLine; +typedef ( + DOMString // interpreted as { role: "user", content: { type: "text", data: providedValue } } + or AILanguageModelPromptContent // interpreted as { role: "user", content: providedValue } + or AILanguageModelPromptLineDict // canonical form +) AILanguageModelPromptLine; // Prompt content inside the lines From 6839d63181c336e3094fba3e68f6d0785ff92ac9 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Wed, 22 Jan 2025 17:46:34 +0900 Subject: [PATCH 4/4] Missing []s --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 06eaa42..e7ebaf2 100644 --- a/README.md +++ b/README.md @@ -203,10 +203,10 @@ console.log(response1); const audioBlob = await captureMicrophoneInput({ seconds: 10 }); -const response2 = await session.prompt( +const response2 = await session.prompt([ "My response to your critique:", { type: "audio", data: audioBlob } -); +]); ``` Future extensions may include more ambitious multimodal inputs, such as video clips, or realtime audio or video. (Realtime might require a different API design, more based around events or streams instead of messages.)
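As a worked illustration of the typedef shorthands introduced above, the following prompt lines are interpreted identically. This is a sketch only, reusing the `session` variable from the earlier examples:

```js
// Per the typedef comments in the IDL above, a DOMString line and a bare content
// object are both shorthand for the canonical { role, content } dictionary form.
const asString = "Tell me a joke.";
const asContent = { type: "text", data: "Tell me a joke." };
const asLineDict = { role: "user", content: { type: "text", data: "Tell me a joke." } };

// All three calls send the same single user line:
console.log(await session.prompt(asString));
console.log(await session.prompt([asContent]));
console.log(await session.prompt([asLineDict]));
```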