Added timestamp_granularities parameter to the Audio API

jamesrochabrun · Feb 20, 2024 · d7db8b4 · d7db8b4
1 parent a404763
commit d7db8b4
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 2 deletions.
diff --git a/Sources/OpenAI/Public/Parameters/Audio/AudioTranscriptionParameters.swift b/Sources/OpenAI/Public/Parameters/Audio/AudioTranscriptionParameters.swift
@@ -24,6 +24,9 @@ public struct AudioTranscriptionParameters: Encodable {
    let responseFormat: String?
    /// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. Defaults to 0
    let temperature: Double?
+   /// Defaults to segment
+   /// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` and `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+   let timestampGranularities: [String]?
 
    public enum Model: String {
       case whisperOne = "whisper-1"
@@ -36,6 +39,7 @@ public struct AudioTranscriptionParameters: Encodable {
       case responseFormat = "response_format"
       case temperature
       case language
+      case timestampGranularities = "timestamp_granularities[]"
    }
 
    public init(
@@ -45,7 +49,8 @@ public struct AudioTranscriptionParameters: Encodable {
       prompt: String? = nil,
       responseFormat: String? = nil,
       temperature: Double? = nil,
-      language: String? = nil)
+      language: String? = nil,
+      timestampGranularities: [String]? = nil)
    {
       self.fileName = fileName
       self.file = file
@@ -54,6 +59,7 @@ public struct AudioTranscriptionParameters: Encodable {
       self.responseFormat = responseFormat
       self.temperature = temperature
       self.language = language
+      self.timestampGranularities = timestampGranularities
    }
 }
 
@@ -68,7 +74,8 @@ extension AudioTranscriptionParameters: MultipartFormDataParameters {
          .string(paramName: Self.CodingKeys.language.rawValue, value: language),
          .string(paramName: Self.CodingKeys.prompt.rawValue, value: prompt),
          .string(paramName: Self.CodingKeys.responseFormat.rawValue, value: responseFormat),
-         .string(paramName: Self.CodingKeys.temperature.rawValue, value: temperature)
+         .string(paramName: Self.CodingKeys.temperature.rawValue, value: temperature),
+         .string(paramName: Self.CodingKeys.timestampGranularities.rawValue, value: timestampGranularities)
       ]).build()
    }
 }
diff --git a/Sources/OpenAI/Public/ResponseModels/Audio/AudioObject.swift b/Sources/OpenAI/Public/ResponseModels/Audio/AudioObject.swift
@@ -10,6 +10,60 @@ import Foundation
 /// The [audio](https://platform.openai.com/docs/api-reference/audio) response, decoded from the JSON body returned for a transcription or translation request. Only `text` is required; every other top-level field is optional.
 public struct AudioObject: Decodable {
 
+   /// The language of the input audio. Optional, so decoding still succeeds when the response omits this key.
+   public let language: String?
+   /// The duration of the input audio. NOTE(review): typed as `String`, so a response that carries the duration as a JSON number would fail to decode — confirm against a real `verbose_json` payload.
+   public let duration: String?
    /// The transcribed text if the request uses the `transcriptions` API, or the translated text if the request uses the `translations` endpoint.
    public let text: String
+   /// Extracted words and their corresponding timestamps. Optional; presumably only present when word-level timestamps were requested — confirm.
+   public let words: [Word]?
+   /// Segments of the transcribed text and their corresponding details. Optional; presumably only present in verbose responses — confirm.
+   public let segments: [Segment]?
+   /// A single entry of `words`: one word together with its start and end times.
+   public struct Word: Decodable {
+
+      /// The text content of the word.
+      public let word: String
+      /// Start time of the word in seconds.
+      public let start: Double
+      /// End time of the word in seconds.
+      public let end: Double
+   }
+   /// A single entry of `segments`: a span of text with its timing, token IDs and decoding statistics. All fields are required when a segment is present.
+   public struct Segment: Decodable {
+      /// Unique identifier of the segment.
+      public let id: Int
+      /// Seek offset of the segment.
+      public let seek: Int
+      /// Start time of the segment in seconds.
+      public let start: Double
+      /// End time of the segment in seconds.
+      public let end: Double
+      /// Text content of the segment.
+      public let text: String
+      /// Array of token IDs for the text content.
+      public let tokens: [Int]
+      /// Temperature parameter used for generating the segment.
+      public let temperature: Double
+      /// Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
+      public let avgLogprob: Double
+      /// Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
+      public let compressionRatio: Double
+      /// Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
+      public let noSpeechProb: Double
+      // Maps the camelCase property names to the snake_case keys used in the JSON payload; the remaining keys match their property names.
+      enum CodingKeys: String, CodingKey {
+         case id
+         case seek
+         case start
+         case end
+         case text
+         case tokens
+         case temperature
+         case avgLogprob = "avg_logprob"
+         case compressionRatio = "compression_ratio"
+         case noSpeechProb = "no_speech_prob"
+      }
+   }
 }