`getResponse` method — implementation
/// Generates a response for the buffered query text plus any pending
/// image/audio attachments, returning the complete response text.
///
/// Clears the query buffer and pending media before generation so the
/// session is ready for the next turn. Throws if the session is closed
/// (via [_assertNotClosed]).
@override
Future<String> getResponse() async {
  _assertNotClosed();
  final text = _queryBuffer.toString();
  _queryBuffer.clear();
  final audio = _pendingAudio;
  final image = _pendingImage;
  _pendingAudio = null;
  _pendingImage = null;
  // Start timing before either stream is created so prefill cost is included.
  final genSw = Stopwatch()..start();
  // For Gemma 4, walk raw SDK JSON so chat.dart can read `tool_calls` via
  // [LiteRtLmFfiClient.extractToolCalls]. Other models keep the existing
  // text-only fast path (raw JSON cache stays null).
  if (modelType == ModelType.gemma4) {
    final rawBuffer = StringBuffer();
    final textBuffer = StringBuffer();
    await _drainChunks(
      genSw,
      ffiClient.chatRaw(
        text,
        imageBytes: image,
        audioBytes: audio,
        enableThinking: enableThinking,
      ),
      (rawChunk) {
        // Keep the raw JSON (for tool-call extraction) and the decoded text.
        rawBuffer.write(rawChunk);
        textBuffer.write(LiteRtLmFfiClient.extractTextFromResponse(rawChunk));
      },
    );
    _lastRawResponse = rawBuffer.toString();
    return textBuffer.toString();
  }
  _lastRawResponse = null;
  final buffer = StringBuffer();
  await _drainChunks(
    genSw,
    ffiClient.chat(
      text,
      imageBytes: image,
      audioBytes: audio,
      enableThinking: enableThinking,
    ),
    buffer.write,
  );
  return buffer.toString();
}

/// Consumes [chunks], forwarding each chunk to [onChunk], while recording
/// time-to-first-chunk (prefill latency) against [genSw] and logging overall
/// generation stats via [_logGenerationStats] when the stream completes.
///
/// Shared by the Gemma-4 raw-JSON path and the plain-text path so the perf
/// instrumentation is not duplicated.
Future<void> _drainChunks(
  Stopwatch genSw,
  Stream<String> chunks,
  void Function(String chunk) onChunk,
) async {
  int? firstChunkMs;
  var chunkCount = 0;
  await for (final chunk in chunks) {
    if (firstChunkMs == null) {
      firstChunkMs = genSw.elapsedMilliseconds;
      debugPrint(
          '[FfiInferenceModelSession/perf] time-to-first-chunk (prefill): ${firstChunkMs}ms');
    }
    chunkCount++;
    onChunk(chunk);
  }
  _logGenerationStats(genSw, firstChunkMs, chunkCount);
}