Skip to content

Commit

Permalink
Added timestamp_granularities parameter to the Audio API
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesrochabrun committed Feb 20, 2024
1 parent a404763 commit d7db8b4
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ public struct AudioTranscriptionParameters: Encodable {
let responseFormat: String?
/// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. Defaults to 0
let temperature: Double?
/// Defaults to segment
/// The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word` or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
let timestampGranularities: [String]?

public enum Model: String {
case whisperOne = "whisper-1"
Expand All @@ -36,6 +39,7 @@ public struct AudioTranscriptionParameters: Encodable {
case responseFormat = "response_format"
case temperature
case language
case timestampGranularities = "timestamp_granularities[]"
}

public init(
Expand All @@ -45,7 +49,8 @@ public struct AudioTranscriptionParameters: Encodable {
prompt: String? = nil,
responseFormat: String? = nil,
temperature: Double? = nil,
language: String? = nil)
language: String? = nil,
timestampGranularities: [String]? = nil)
{
self.fileName = fileName
self.file = file
Expand All @@ -54,6 +59,7 @@ public struct AudioTranscriptionParameters: Encodable {
self.responseFormat = responseFormat
self.temperature = temperature
self.language = language
self.timestampGranularities = timestampGranularities
}
}

Expand All @@ -68,7 +74,8 @@ extension AudioTranscriptionParameters: MultipartFormDataParameters {
.string(paramName: Self.CodingKeys.language.rawValue, value: language),
.string(paramName: Self.CodingKeys.prompt.rawValue, value: prompt),
.string(paramName: Self.CodingKeys.responseFormat.rawValue, value: responseFormat),
.string(paramName: Self.CodingKeys.temperature.rawValue, value: temperature)
.string(paramName: Self.CodingKeys.temperature.rawValue, value: temperature),
.string(paramName: Self.CodingKeys.timestampGranularities.rawValue, value: timestampGranularities)
]).build()
}
}
54 changes: 54 additions & 0 deletions Sources/OpenAI/Public/ResponseModels/Audio/AudioObject.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,60 @@ import Foundation
/// The [audio](https://platform.openai.com/docs/api-reference/audio) response.
///
/// Only `text` is required; the remaining fields are optional and are decoded
/// when present in the payload.
public struct AudioObject: Decodable {

  /// The language of the input audio.
  public let language: String?
  /// The duration of the input audio.
  ///
  /// Exposed as a string for source compatibility. When the payload carries a
  /// JSON number, it is converted with `String(_:)` during decoding.
  public let duration: String?
  /// The transcribed text if the request uses the `transcriptions` API, or the translated text if the request uses the `translations` endpoint.
  public let text: String
  /// Extracted words and their corresponding timestamps.
  public let words: [Word]?
  /// Segments of the transcribed text and their corresponding details.
  public let segments: [Segment]?

  /// A single word together with its start and end timestamps.
  public struct Word: Decodable {

    /// The text content of the word.
    public let word: String
    /// Start time of the word in seconds.
    public let start: Double
    /// End time of the word in seconds.
    public let end: Double
  }

  /// A segment of the transcribed text and its decoding details.
  public struct Segment: Decodable {
    /// Unique identifier of the segment.
    public let id: Int
    /// Seek offset of the segment.
    public let seek: Int
    /// Start time of the segment in seconds.
    public let start: Double
    /// End time of the segment in seconds.
    public let end: Double
    /// Text content of the segment.
    public let text: String
    /// Array of token IDs for the text content.
    public let tokens: [Int]
    /// Temperature parameter used for generating the segment.
    public let temperature: Double
    /// Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
    public let avgLogprob: Double
    /// Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
    public let compressionRatio: Double
    /// Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
    public let noSpeechProb: Double

    /// Maps the snake_case JSON keys onto the camelCase Swift properties.
    enum CodingKeys: String, CodingKey {
      case id
      case seek
      case start
      case end
      case text
      case tokens
      case temperature
      case avgLogprob = "avg_logprob"
      case compressionRatio = "compression_ratio"
      case noSpeechProb = "no_speech_prob"
    }
  }
}

// The custom decoder lives in an extension so the struct keeps its implicit
// memberwise initializer.
extension AudioObject {

  /// Top-level JSON keys of the audio response.
  private enum TopLevelKeys: String, CodingKey {
    case language
    case duration
    case text
    case words
    case segments
  }

  /// Decodes an audio response, accepting `duration` as either a JSON number or a JSON string.
  ///
  /// - Parameter decoder: The decoder to read the response from.
  /// - Throws: `DecodingError` when `text` is missing or any present field has an unsupported type.
  public init(from decoder: Decoder) throws {
    let container = try decoder.container(keyedBy: TopLevelKeys.self)
    language = try container.decodeIfPresent(String.self, forKey: .language)
    text = try container.decode(String.self, forKey: .text)
    words = try container.decodeIfPresent([Word].self, forKey: .words)
    segments = try container.decodeIfPresent([Segment].self, forKey: .segments)

    // A numeric duration would make a plain `String` decode throw and fail the
    // whole response, so try the number first and fall back to a string.
    if let seconds = try? container.decodeIfPresent(Double.self, forKey: .duration) {
      duration = String(seconds)
    } else {
      duration = try container.decodeIfPresent(String.self, forKey: .duration)
    }
  }
}

0 comments on commit d7db8b4

Please sign in to comment.