# getResponseAsync method

```dart
@override
Stream<String> getResponseAsync()
```

*override*

## Implementation
@override
Stream<String> getResponseAsync() async* {
  _assertNotClosed();

  // Snapshot the accumulated query text and any pending attachments, then
  // clear them so the next turn starts from an empty slate.
  final prompt = _queryBuffer.toString();
  _queryBuffer.clear();
  final pendingAudio = _pendingAudio;
  final pendingImage = _pendingImage;
  _pendingAudio = null;
  _pendingImage = null;

  final stopwatch = Stopwatch()..start();
  int? firstChunkMs;
  var emittedChunks = 0;

  // Records prefill latency on the first chunk and counts every chunk.
  void noteChunk() {
    if (firstChunkMs == null) {
      firstChunkMs = stopwatch.elapsedMilliseconds;
      debugPrint(
          '[FfiInferenceModelSession/perf] (async) time-to-first-chunk (prefill): ${firstChunkMs}ms');
    }
    emittedChunks++;
  }

  if (modelType == ModelType.gemma4) {
    // Gemma 4 path: stream raw responses, surface only the extracted text to
    // callers, and retain the full raw payload for later inspection.
    final rawAccumulator = StringBuffer();
    final rawStream = ffiClient.chatRaw(
      prompt,
      imageBytes: pendingImage,
      audioBytes: pendingAudio,
      enableThinking: enableThinking,
    );
    await for (final rawChunk in rawStream) {
      noteChunk();
      rawAccumulator.write(rawChunk);
      yield LiteRtLmFfiClient.extractTextFromResponse(rawChunk);
    }
    _lastRawResponse = rawAccumulator.toString();
  } else {
    // Default path: chunks are already plain text, so no raw copy is kept.
    _lastRawResponse = null;
    final textStream = ffiClient.chat(
      prompt,
      imageBytes: pendingImage,
      audioBytes: pendingAudio,
      enableThinking: enableThinking,
    );
    await for (final chunk in textStream) {
      noteChunk();
      yield chunk;
    }
  }

  _logGenerationStats(stopwatch, firstChunkMs, emittedChunks);
}