update to 4o-audio-preview, bugfixes, and commentary (#1593)
phundal-openai authored Dec 5, 2024
1 parent 5a96b99 commit 6123b8e
Showing 1 changed file with 68 additions and 29 deletions.
97 changes: 68 additions & 29 deletions examples/gpt4o/introduction_to_gpt4o.ipynb
@@ -16,9 +16,7 @@
"\n",
"### Current API Capabilities\n",
"\n",
"Currently, the API supports `{text, image}` inputs only, with `{text}` outputs, the same modalities as `gpt-4-turbo`.\n",
"\n",
"Additional modalities, including audio, will be introduced soon. This guide will help you get started with using GPT-4o mini for text, image, and video understanding."
"Currently, the `gpt-4o-mini` model supports `{text, image}`, with `{text}` outputs, the same modalities as `gpt-4-turbo`. As a preview, we will also be using the `gpt-4o-audio-preview` model to showcase transcription though the GPT4o model."
]
},
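{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick orientation, here is a minimal sketch of a `{text, image}` request to `gpt-4o-mini`. It assumes `client` is an initialized OpenAI client and `base64_image` holds a base64-encoded image; both are set up later in this guide."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal {text, image} -> {text} sketch (assumes `client` and `base64_image` are defined)\n",
"response = client.chat.completions.create(\n",
" model=\"gpt-4o-mini\",\n",
" messages=[\n",
" {\"role\": \"user\", \"content\": [\n",
" {\"type\": \"text\", \"text\": \"Describe this image.\"},\n",
" {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/jpeg;base64,{base64_image}\"}}\n",
" ]}\n",
" ],\n",
" temperature=0,\n",
")\n",
"print(response.choices[0].message.content)"
]
},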
{
@@ -313,7 +311,7 @@
"outputs": [],
"source": [
"import cv2\n",
"from moviepy.editor import VideoFileClip\n",
"from moviepy import *\n",
"import time\n",
"import base64\n",
"\n",
@@ -529,7 +527,56 @@
"#### Audio Summary\n",
"The audio summary is generated by sending the model the audio transcript. With just the audio, the model is likely to bias towards the audio content, and will miss the context provided by the presentations and visuals.\n",
"\n",
"`{audio}` input for GPT-4o isn't currently available but will be coming soon! For now, we use our existing `whisper-1` model to process the audio"
"`{audio}` input for GPT-4o is currenlty in preview, but will be incorporated into the base model in the near future. Because of this, we will use the `gpt-4o-audio-preview` model to process the audio."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#transcribe the audio\n",
"with open(audio_path, 'rb') as audio_file:\n",
" audio_content = base64.b64encode(audio_file.read()).decode('utf-8')\n",
"\n",
"response = client.chat.completions.create(\n",
" model='gpt-4o-audio-preview',\n",
" modalities=[\"text\"],\n",
" messages=[\n",
" { \"role\": \"system\", \n",
" \"content\":\"You are generating a transcript. Create a transcript of the provided audio.\"\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" { \n",
" \"type\": \"text\",\n",
" \"text\": \"this is the audio.\"\n",
" },\n",
" {\n",
" \"type\": \"input_audio\",\n",
" \"input_audio\": {\n",
" \"data\": audio_content,\n",
" \"format\": \"mp3\"\n",
" }\n",
" }\n",
" ]\n",
" },\n",
" ],\n",
" temperature=0,\n",
" )\n",
"\n",
"# Extract and return the transcription\n",
"transcription = response.choices[0].message.content\n",
"print (transcription)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looking good. Now let's summarize this and format in markdown."
]
},
{
@@ -574,26 +621,18 @@
}
],
"source": [
"# Transcribe the audio\n",
"transcription = client.audio.transcriptions.create(\n",
" model=\"whisper-1\",\n",
" file=open(audio_path, \"rb\"),\n",
")\n",
"## OPTIONAL: Uncomment the line below to print the transcription\n",
"#print(\"Transcript: \", transcription.text + \"\\n\\n\")\n",
"\n",
"#summarize the transcript\n",
"response = client.chat.completions.create(\n",
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\":\"\"\"You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown.\"\"\"},\n",
" {\"role\": \"user\", \"content\": [\n",
" {\"type\": \"text\", \"text\": f\"The audio transcription is: {transcription.text}\"}\n",
" ],\n",
" }\n",
" ],\n",
" temperature=0,\n",
")\n",
"print(response.choices[0].message.content)"
" model=MODEL,\n",
" modalities=[\"text\"],\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown.\"},\n",
" {\"role\": \"user\", \"content\": f\"Summarize this text: {transcription}\"},\n",
" ],\n",
" temperature=0,\n",
" )\n",
"transcription_summary = response.choices[0].message.content\n",
"print (transcription_summary)"
]
},
{
@@ -665,7 +704,7 @@
" \"These are the frames from the video.\",\n",
" *map(lambda x: {\"type\": \"image_url\", \n",
" \"image_url\": {\"url\": f'data:image/jpg;base64,{x}', \"detail\": \"low\"}}, base64Frames),\n",
" {\"type\": \"text\", \"text\": f\"The audio transcription is: {transcription.text}\"}\n",
" {\"type\": \"text\", \"text\": f\"The audio transcription is: {transcription}\"}\n",
" ],\n",
" }\n",
"],\n",
@@ -746,7 +785,7 @@
" model=MODEL,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\":\"\"\"Use the transcription to answer the provided question. Respond in Markdown.\"\"\"},\n",
" {\"role\": \"user\", \"content\": f\"The audio transcription is: {transcription.text}. \\n\\n {QUESTION}\"},\n",
" {\"role\": \"user\", \"content\": f\"The audio transcription is: {transcription}. \\n\\n {QUESTION}\"},\n",
" ],\n",
" temperature=0,\n",
")\n",
@@ -776,7 +815,7 @@
" \"These are the frames from the video.\",\n",
" *map(lambda x: {\"type\": \"image_url\", \n",
" \"image_url\": {\"url\": f'data:image/jpg;base64,{x}', \"detail\": \"low\"}}, base64Frames),\n",
" {\"type\": \"text\", \"text\": f\"The audio transcription is: {transcription.text}\"},\n",
" {\"type\": \"text\", \"text\": f\"The audio transcription is: {transcription}\"},\n",
" QUESTION\n",
" ],\n",
" }\n",
@@ -796,7 +835,7 @@
"\n",
"Integrating many input modalities such as audio, visual, and textual, significantly enhances the performance of the model on a diverse range of tasks. This multimodal approach allows for more comprehensive understanding and interaction, mirroring more closely how humans perceive and process information. \n",
"\n",
"Currently, GPT-4o and GPT-4o mini in the API support text and image inputs, with audio capabilities coming soon."
"Currently, GPT-4o and GPT-4o mini in the API support text and image inputs, with audio capabilities coming soon. For the time being, use the `gpt-4o-audio-preview` for audio inputs."
]
}
],
@@ -820,5 +859,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
