speechToText method

@override
Future<STTResponse> speechToText(
  STTRequest request
)
override

Converts speech to text with full configuration support.

Throws an UnsupportedError if the provider does not support speech-to-text. Check supportedFeatures first.
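
For example, a caller might transcribe an audio file as sketched below. This is a minimal sketch: the stt variable stands for any instance exposing this method, and the STTRequest named parameters are inferred from the fields the implementation reads; they are not a verified constructor signature.

// Minimal usage sketch; callers should check supportedFeatures before
// invoking this method. Parameter names are assumptions inferred from
// the implementation below.
final response = await stt.speechToText(
  STTRequest(
    filePath: '/path/to/audio.wav', // or supply raw bytes via audioData
    language: 'en',
    includeWordTiming: true, // request word-level timestamps
  ),
);

print(response.text);
for (final word in response.words ?? const <WordTiming>[]) {
  print('${word.word}: ${word.start}s to ${word.end}s');
}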

Implementation

@override
Future<STTResponse> speechToText(STTRequest request) async {
  // Basic validation - let the provider handle specific limits
  if (request.audioData == null && request.filePath == null) {
    throw const InvalidRequestError(
      'Either audioData or filePath must be provided',
    );
  }

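  // Build the multipart form body for the transcription request.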
  final formData = FormData();

  if (request.audioData != null) {
    formData.files.add(
      MapEntry(
        'file',
        MultipartFile.fromBytes(
          request.audioData!,
          filename: 'audio.${request.format ?? 'wav'}',
        ),
      ),
    );
  } else if (request.filePath != null) {
    formData.files.add(
      MapEntry('file', await MultipartFile.fromFile(request.filePath!)),
    );
  }

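  // Add the model (falling back to the provider default) and any
  // optional transcription parameters that were set on the request.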
  formData.fields.add(MapEntry(
      'model', request.model ?? ProviderDefaults.openaiDefaultSTTModel));
  if (request.language != null) {
    formData.fields.add(MapEntry('language', request.language!));
  }
  if (request.prompt != null) {
    formData.fields.add(MapEntry('prompt', request.prompt!));
  }
  if (request.responseFormat != null) {
    formData.fields.add(MapEntry('response_format', request.responseFormat!));
  }
  if (request.temperature != null) {
    formData.fields.add(
      MapEntry('temperature', request.temperature.toString()),
    );
  }

  // Handle timestamp granularities
  // Reference: https://platform.openai.com/docs/api-reference/audio/createTranscription
  final granularities = <String>[];
  if (request.includeWordTiming ||
      request.timestampGranularity == TimestampGranularity.word) {
    granularities.add('word');
  }
  if (request.timestampGranularity == TimestampGranularity.segment) {
    granularities.add('segment');
  }

  // Add each granularity as a separate field
  for (final granularity in granularities) {
    formData.fields.add(MapEntry('timestamp_granularities[]', granularity));
  }

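  // Post the multipart form and read the decoded JSON response.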
  final responseData =
      await client.postForm('audio/transcriptions', formData);

  // Parse word timing if available
  List<WordTiming>? words;
  if ((request.includeWordTiming ||
          request.timestampGranularity == TimestampGranularity.word) &&
      responseData['words'] != null) {
    final wordsData = responseData['words'] as List;
    words = wordsData.map((w) {
      final wordMap = w as Map<String, dynamic>;
      return WordTiming(
        word: wordMap['word'] as String,
        start: (wordMap['start'] as num).toDouble(),
        end: (wordMap['end'] as num).toDouble(),
        confidence: null, // OpenAI doesn't provide word-level confidence
      );
    }).toList();
  }

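  // Map the decoded response fields into the typed STTResponse.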
  return STTResponse(
    text: responseData['text'] as String,
    language: responseData['language'] as String?,
    confidence: null, // OpenAI doesn't provide overall confidence
    words: words,
    model: request.model,
    duration: responseData['duration'] as double?,
    usage: null,
  );
}