// getResponseAsync — implementation.
/// Streams the model's response for the buffered query.
///
/// Consumes (and clears) the accumulated query text plus any pending audio
/// and image attachments, then forwards the FFI client's streamed chunks.
/// For [ModelType.gemma4] the raw protocol chunks are also accumulated into
/// [_lastRawResponse] while only the extracted text is yielded; for all
/// other models [_lastRawResponse] is reset to null and chunks are yielded
/// as-is.
///
/// NOTE(review): assumes [_assertNotClosed] throws when the session has
/// been closed — confirm against its definition.
@override
Stream<String> getResponseAsync() async* {
  _assertNotClosed();
  // Consume the buffered query and pending attachments exactly once, so a
  // subsequent call starts from a clean slate.
  final text = _queryBuffer.toString();
  _queryBuffer.clear();
  final audio = _pendingAudio;
  final image = _pendingImage;
  _pendingAudio = null;
  _pendingImage = null;

  final genSw = Stopwatch()..start();
  int? firstChunkMs;
  var chunkCount = 0;

  // Shared per-chunk bookkeeping (previously duplicated in both branches):
  // records time-to-first-chunk once and counts chunks for the final stats.
  void onChunk() {
    if (firstChunkMs == null) {
      firstChunkMs = genSw.elapsedMilliseconds;
      debugPrint(
          '[FfiInferenceModelSession/perf] (async) time-to-first-chunk (prefill): ${firstChunkMs}ms');
    }
    chunkCount++;
  }

  if (modelType == ModelType.gemma4) {
    // Gemma4 streams raw protocol chunks: keep the full raw stream for
    // later inspection, yield only the extracted text to callers.
    final rawBuffer = StringBuffer();
    await for (final rawChunk in ffiClient.chatRaw(
      text,
      imageBytes: image,
      audioBytes: audio,
      enableThinking: enableThinking,
    )) {
      onChunk();
      rawBuffer.write(rawChunk);
      yield LiteRtLmFfiClient.extractTextFromResponse(rawChunk);
    }
    _lastRawResponse = rawBuffer.toString();
    _logGenerationStats(genSw, firstChunkMs, chunkCount);
    return;
  }

  // All other models stream plain text directly; no raw response is kept.
  _lastRawResponse = null;
  await for (final chunk in ffiClient.chat(
    text,
    imageBytes: image,
    audioBytes: audio,
    enableThinking: enableThinking,
  )) {
    onChunk();
    yield chunk;
  }
  _logGenerationStats(genSw, firstChunkMs, chunkCount);
}