getResponse method

@override
Future&lt;String&gt; getResponse()   (overrides the base session's getResponse)

Implementation

/// Drains the buffered query (plus any pending image/audio attachments)
/// through the FFI client and returns the fully accumulated response text.
///
/// For Gemma 4 models the raw SDK JSON chunks are also cached in
/// [_lastRawResponse] so chat.dart can later read `tool_calls` via
/// [LiteRtLmFfiClient.extractToolCalls]; every other model takes the
/// text-only fast path and leaves the raw cache null.
@override
Future<String> getResponse() async {
  _assertNotClosed();

  // Snapshot and clear the pending query state before generation starts.
  final prompt = _queryBuffer.toString();
  _queryBuffer.clear();
  final audioBytes = _pendingAudio;
  final imageBytes = _pendingImage;
  _pendingAudio = null;
  _pendingImage = null;

  final stopwatch = Stopwatch()..start();
  int? timeToFirstChunkMs;
  var chunksSeen = 0;

  // Shared per-chunk bookkeeping: logs prefill latency on the very first
  // chunk, then bumps the chunk counter.
  void noteChunk() {
    if (timeToFirstChunkMs == null) {
      timeToFirstChunkMs = stopwatch.elapsedMilliseconds;
      debugPrint(
          '[FfiInferenceModelSession/perf] time-to-first-chunk (prefill): ${timeToFirstChunkMs}ms');
    }
    chunksSeen++;
  }

  if (modelType == ModelType.gemma4) {
    // Gemma 4: walk the raw SDK JSON stream, caching it verbatim while
    // extracting the displayable text from each chunk.
    final rawJson = StringBuffer();
    final extractedText = StringBuffer();
    final rawStream = ffiClient.chatRaw(
      prompt,
      imageBytes: imageBytes,
      audioBytes: audioBytes,
      enableThinking: enableThinking,
    );
    await for (final rawChunk in rawStream) {
      noteChunk();
      rawJson.write(rawChunk);
      extractedText.write(LiteRtLmFfiClient.extractTextFromResponse(rawChunk));
    }
    _lastRawResponse = rawJson.toString();
    _logGenerationStats(stopwatch, timeToFirstChunkMs, chunksSeen);
    return extractedText.toString();
  }

  // All other models: plain text chunks, no raw JSON cache.
  _lastRawResponse = null;
  final responseText = StringBuffer();
  final textStream = ffiClient.chat(
    prompt,
    imageBytes: imageBytes,
    audioBytes: audioBytes,
    enableThinking: enableThinking,
  );
  await for (final chunk in textStream) {
    noteChunk();
    responseText.write(chunk);
  }
  _logGenerationStats(stopwatch, timeToFirstChunkMs, chunksSeen);
  return responseText.toString();
}