speechToText method
Convert speech to text with full configuration support
Throws an UnsupportedError if speech-to-text is not supported. Check supportedFeatures before calling.
Implementation
/// Transcribes audio to text by posting a multipart form to
/// `audio/transcriptions`.
///
/// The audio is taken from `request.audioData` when it is set, otherwise
/// from `request.filePath`. The optional `language`, `prompt`,
/// `responseFormat` and `temperature` values are forwarded only when set;
/// `model` falls back to [ProviderDefaults.openaiDefaultSTTModel].
///
/// When word or segment timestamps are requested and the caller did not
/// choose a response format, `verbose_json` is sent, because the
/// transcription API only returns timestamps for that format.
///
/// Throws an [InvalidRequestError] if neither `audioData` nor `filePath`
/// is provided.
@override
Future<STTResponse> speechToText(STTRequest request) async {
  // Copy nullable fields to locals so they promote without `!`.
  final audioData = request.audioData;
  final filePath = request.filePath;

  // Basic validation - let the provider handle specific limits.
  if (audioData == null && filePath == null) {
    throw const InvalidRequestError(
      'Either audioData or filePath must be provided',
    );
  }

  final wantsWordTiming = request.includeWordTiming ||
      request.timestampGranularity == TimestampGranularity.word;
  final wantsSegmentTiming =
      request.timestampGranularity == TimestampGranularity.segment;

  final formData = FormData();

  // In-memory bytes take precedence over a file path.
  if (audioData != null) {
    formData.files.add(
      MapEntry(
        'file',
        MultipartFile.fromBytes(
          audioData,
          filename: 'audio.${request.format ?? 'wav'}',
        ),
      ),
    );
  } else if (filePath != null) {
    formData.files.add(
      MapEntry('file', await MultipartFile.fromFile(filePath)),
    );
  }

  final language = request.language;
  final prompt = request.prompt;
  final temperature = request.temperature;

  // Timestamp granularities are only honoured with `verbose_json`, so use it
  // as the default when timing was requested. An explicit caller choice is
  // never overridden.
  // Reference: https://platform.openai.com/docs/api-reference/audio/createTranscription
  final responseFormat = request.responseFormat ??
      (wantsWordTiming || wantsSegmentTiming ? 'verbose_json' : null);

  formData.fields.addAll([
    MapEntry(
      'model',
      request.model ?? ProviderDefaults.openaiDefaultSTTModel,
    ),
    if (language != null) MapEntry('language', language),
    if (prompt != null) MapEntry('prompt', prompt),
    if (responseFormat != null) MapEntry('response_format', responseFormat),
    if (temperature != null) MapEntry('temperature', temperature.toString()),
    // Each granularity is sent as its own repeated form field.
    if (wantsWordTiming) MapEntry('timestamp_granularities[]', 'word'),
    if (wantsSegmentTiming) MapEntry('timestamp_granularities[]', 'segment'),
  ]);

  final responseData =
      await client.postForm('audio/transcriptions', formData);

  // Parse word timing only if it was requested and the response carries it.
  List<WordTiming>? words;
  final rawWords = responseData['words'];
  if (wantsWordTiming && rawWords != null) {
    words = [
      for (final entry in rawWords as List)
        _wordTimingFromJson(entry as Map<String, dynamic>),
    ];
  }

  return STTResponse(
    text: responseData['text'] as String,
    language: responseData['language'] as String?,
    confidence: null, // OpenAI doesn't provide overall confidence
    words: words,
    model: request.model,
    // Decoded JSON yields an `int` for whole-number durations, so go through
    // `num` instead of casting straight to `double`.
    duration: (responseData['duration'] as num?)?.toDouble(),
    usage: null,
  );
}

/// Builds a [WordTiming] from one entry of the response's `words` list.
WordTiming _wordTimingFromJson(Map<String, dynamic> json) {
  return WordTiming(
    word: json['word'] as String,
    start: (json['start'] as num).toDouble(),
    end: (json['end'] as num).toDouble(),
    confidence: null, // OpenAI doesn't provide word-level confidence
  );
}