generateChatResponseAsync method
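Streams the model's reply for the current chat as a sequence of ModelResponse events. Plain text tokens are yielded as they arrive; thinking content is separated out by ModelThinkingFilter when thinking mode is enabled, and stop tokens are stripped for .litertlm files on iOS. When tools are registered and function calls are supported, the incoming text is continuously scanned for JSON function-call patterns, which are yielded as function-call responses (a ParallelFunctionCallResponse when several calls arrive together) instead of raw text. After the stream ends, the accumulated response is counted against the token budget, the session is recreated if the limit is near, and the message is recorded in the chat history.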
Implementation
Stream<ModelResponse> generateChatResponseAsync() async* {
debugPrint('InferenceChat: Starting async stream generation');
final buffer = StringBuffer();
// Smart function handling mode - continuous scanning for JSON patterns
String funcBuffer = '';
debugPrint('InferenceChat: Starting to iterate over native tokens...');
// Track if we emitted a function call (to record correct history and skip session clearing)
bool emittedFunctionCall = false;
String lastFuncBuffer =
''; // Preserve funcBuffer content for history recording
final originalStream =
session.getResponseAsync().map((token) => TextResponse(token));
// Apply thinking filter if needed using ModelThinkingFilter
final Stream<ModelResponse> filteredStream = isThinking
? ModelThinkingFilter.filterThinkingStream(originalStream,
modelType: modelType)
: originalStream;
// Apply stop token filter for .litertlm on iOS (MediaPipe doesn't handle stop tokens)
final Stream<ModelResponse> stopFilteredStream =
StopTokenFilter.filterStopTokens(filteredStream, fileType: fileType);
await for (final response in stopFilteredStream) {
if (response is TextResponse) {
final token = response.token;
debugPrint('InferenceChat: Received filtered token: "$token"');
// Track if this token should be added to buffer (default true)
bool shouldAddToBuffer = true;
// Continuous scanning for function calls in text - for models like DeepSeek
if (tools.isNotEmpty &&
supportsFunctionCalls &&
toolChoice != ToolChoice.none) {
// Check if we're currently buffering potential JSON
if (funcBuffer.isNotEmpty) {
// We're already buffering - add token and check for completion
funcBuffer += token;
debugPrint(
'InferenceChat: Buffering token: "$token", total: ${funcBuffer.length} chars');
// Check if we now have a complete JSON
if (FunctionCallParser.isFunctionCallComplete(funcBuffer,
modelType: modelType)) {
// First try to extract message from any JSON with message field
try {
final jsonData = jsonDecode(funcBuffer);
if (jsonData is Map<String, dynamic> &&
jsonData.containsKey('message')) {
// Found JSON with message field - extract and display the message
final message = jsonData['message'] as String;
debugPrint(
'InferenceChat: Extracted message from JSON: "$message"');
yield TextResponse(message);
funcBuffer = '';
shouldAddToBuffer = false; // Don't add JSON tokens to buffer
continue;
}
} catch (e) {
debugPrint(
'InferenceChat: Failed to parse JSON for message extraction: $e');
}
// If no message field found, try parsing as function call(s)
final allCalls = FunctionCallParser.parseAll(
funcBuffer,
modelType: modelType,
);
if (allCalls.isNotEmpty) {
debugPrint(
'InferenceChat: Found ${allCalls.length} function call(s) in complete buffer!');
emittedFunctionCall = true;
lastFuncBuffer = funcBuffer;
if (allCalls.length == 1) {
yield allCalls.first;
} else {
yield ParallelFunctionCallResponse(calls: allCalls);
}
funcBuffer = '';
shouldAddToBuffer = false;
continue;
} else {
// Not a valid function call - emit as text and clear buffer
debugPrint('InferenceChat: Invalid JSON, emitting as text');
yield TextResponse(funcBuffer);
funcBuffer = '';
shouldAddToBuffer = false;
continue;
}
}
// If buffer gets too long without completing, flush as text
if (funcBuffer.length > _maxFunctionBufferLength) {
debugPrint(
'InferenceChat: Buffer too long without completion, flushing as text');
yield TextResponse(funcBuffer);
funcBuffer = '';
shouldAddToBuffer = false;
continue;
}
// Still buffering, don't emit yet
shouldAddToBuffer = false;
} else {
// Not currently buffering - check if this token starts a function call
if (FunctionCallParser.isFunctionCallStart(token,
modelType: modelType)) {
debugPrint(
'InferenceChat: Found potential function call start in token: "$token"');
funcBuffer = token;
shouldAddToBuffer =
false; // Don't add to main buffer while we determine if it's JSON
} else {
// Normal text token - emit immediately
debugPrint('InferenceChat: Emitting text token: "$token"');
yield response;
shouldAddToBuffer = true; // Add to main buffer for history
}
}
} else {
// No function processing happening - emit token directly
debugPrint(
'InferenceChat: No function processing, emitting token as text: "$token"');
yield response;
shouldAddToBuffer = true; // Add to main buffer for history
}
// Add token to buffer only if it should be included in final message
if (shouldAddToBuffer) {
buffer.write(token);
}
} else {
// For non-TextResponse (like ThinkingResponse), pass through
yield response;
}
}
debugPrint('InferenceChat: Native token stream ended');
final response = buffer.toString();
debugPrint('InferenceChat: Complete response accumulated: "$response"');
// Handle end of stream - process any remaining buffer
if (funcBuffer.isNotEmpty) {
debugPrint(
'InferenceChat: Processing remaining buffer at end of stream: ${funcBuffer.length} chars');
// For FunctionGemma, the function call spans response + funcBuffer
// (e.g., response="<start_function_call>call:fn", funcBuffer="{params}")
// For JSON models, funcBuffer contains the complete JSON
final contentToCheck = modelType == ModelType.functionGemma
? response + funcBuffer
: funcBuffer;
// First try to extract message from JSON if it has message field
if (FunctionCallParser.isFunctionCallComplete(contentToCheck,
modelType: modelType)) {
try {
// For JSON parsing, use funcBuffer (the actual JSON part)
// For FunctionGemma parsing, use contentToCheck (full function call)
if (modelType != ModelType.functionGemma) {
final jsonData = jsonDecode(funcBuffer);
if (jsonData is Map<String, dynamic> &&
jsonData.containsKey('message')) {
final message = jsonData['message'] as String;
debugPrint(
'InferenceChat: Extracted message from end-of-stream JSON: "$message"');
yield TextResponse(message);
return;
}
}
// Try to parse as function call(s)
final allCalls = FunctionCallParser.parseAll(
contentToCheck,
modelType: modelType,
);
if (allCalls.isNotEmpty) {
debugPrint(
'InferenceChat: ${allCalls.length} function call(s) found at end of stream');
emittedFunctionCall = true;
lastFuncBuffer = contentToCheck;
if (allCalls.length == 1) {
yield allCalls.first;
} else {
yield ParallelFunctionCallResponse(calls: allCalls);
}
} else {
yield TextResponse(funcBuffer);
}
} catch (e) {
debugPrint('InferenceChat: Failed to parse end-of-stream JSON: $e');
yield TextResponse(funcBuffer);
}
} else {
debugPrint(
'InferenceChat: No complete JSON at end of stream, emitting remaining as text');
yield TextResponse(funcBuffer);
}
}
try {
debugPrint('InferenceChat: Calculating response tokens...');
final responseTokens = await session.sizeInTokens(response);
debugPrint('InferenceChat: Response tokens: $responseTokens');
_currentTokens += responseTokens;
debugPrint('InferenceChat: Current total tokens: $_currentTokens');
if (_currentTokens >= (maxTokens - tokenBuffer)) {
debugPrint('InferenceChat: Token limit reached, recreating session...');
await _recreateSessionWithReducedChunks();
debugPrint('InferenceChat: Session recreated successfully');
}
} catch (e) {
debugPrint('InferenceChat: Error during token calculation: $e');
}
try {
debugPrint('InferenceChat: Adding message to history...');
// Use toolCall message for function calls, text message otherwise
final chatMessage = emittedFunctionCall
? Message.toolCall(
text: lastFuncBuffer.isNotEmpty ? lastFuncBuffer : response)
: Message(text: response, isUser: false);
debugPrint(
'InferenceChat: Created message object (toolCall=$emittedFunctionCall): ${chatMessage.text}');
_fullHistory.add(chatMessage);
debugPrint('InferenceChat: Added to full history');
_modelHistory.add(chatMessage);
debugPrint('InferenceChat: Added to model history');
debugPrint('InferenceChat: Message added to history successfully');
// Clear model history for single-turn models (e.g., FunctionGemma)
// BUT only if this was NOT a function call - we need context for tool response
if (_isSingleTurnModel && !emittedFunctionCall) {
debugPrint(
'InferenceChat: Single-turn model detected (text response), clearing model history...');
_modelHistory.clear();
_currentTokens = 0;
_toolsInstructionSent = false;
// Recreate session to clear native state
await session.close();
session = await sessionCreator!();
debugPrint(
'InferenceChat: Model history cleared and session recreated');
} else if (_isSingleTurnModel && emittedFunctionCall) {
debugPrint(
'InferenceChat: Single-turn model with function call - keeping history for tool response');
}
} catch (e) {
debugPrint('InferenceChat: Error adding message to history: $e');
rethrow;
}
debugPrint(
'InferenceChat: generateChatResponseAsync completed successfully');
}
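A minimal consumption sketch follows. The runChat wrapper and the dispatch comments are illustrative assumptions, not part of the package API; the response types are the ones yielded by the implementation above.

Future<void> runChat(InferenceChat chat) async {
  final reply = StringBuffer();
  await for (final response in chat.generateChatResponseAsync()) {
    if (response is TextResponse) {
      // Filtered plain-text tokens (thinking and stop tokens already removed).
      reply.write(response.token);
    } else if (response is ParallelFunctionCallResponse) {
      for (final call in response.calls) {
        print(call); // hypothetical: hand each call to your tool executor
      }
    } else {
      // Single function-call responses and ThinkingResponse events
      // pass through the stream unchanged.
    }
  }
  print('Model said: $reply');
}

Because function-call JSON is detected incrementally, a caller never sees partial JSON as text: candidate tokens are withheld while a call is being buffered and are flushed back as plain text only if the buffer turns out not to be a valid call.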