generateChatResponseAsync method
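Streams the model's reply for the current chat as a sequence of ModelResponse events. Plain text tokens are yielded as they arrive; thinking content is separated out by ModelThinkingFilter when thinking mode is enabled, and stop tokens are stripped for .litertlm files on iOS. When tools are registered and function calls are supported, the incoming text is continuously scanned for JSON function-call patterns, which are yielded as function-call responses (a ParallelFunctionCallResponse when several calls arrive together) instead of raw text. After the stream ends, the accumulated response is counted against the token budget, the session is recreated if the limit is near, and the message is recorded in the chat history.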
Implementation
Stream<ModelResponse> generateChatResponseAsync() async* {
debugPrint('InferenceChat: Starting async stream generation');
final buffer = StringBuffer();
// Smart function handling mode - continuous scanning for JSON patterns
String funcBuffer = '';
debugPrint('InferenceChat: Starting to iterate over native tokens...');
// Track if we emitted a function call (to record correct history and skip session clearing)
bool emittedFunctionCall = false;
String lastFuncBuffer =
''; // Preserve funcBuffer content for history recording
final originalStream =
session.getResponseAsync().map((token) => TextResponse(token));
// Apply thinking filter if needed using ModelThinkingFilter
final Stream<ModelResponse> filteredStream = isThinking
? ModelThinkingFilter.filterThinkingStream(originalStream,
modelType: modelType)
: originalStream;
// Apply stop token filter for .litertlm on iOS (MediaPipe doesn't handle stop tokens)
final Stream<ModelResponse> stopFilteredStream =
StopTokenFilter.filterStopTokens(filteredStream, fileType: fileType);
await for (final response in stopFilteredStream) {
if (response is TextResponse) {
final token = response.token;
debugPrint('InferenceChat: Received filtered token: "$token"');
// Track if this token should be added to buffer (default true)
bool shouldAddToBuffer = true;
// Continuous scanning for function calls in text - for models like DeepSeek
if (tools.isNotEmpty &&
supportsFunctionCalls &&
toolChoice != ToolChoice.none) {
// Check if we're currently buffering potential JSON
if (funcBuffer.isNotEmpty) {
// We're already buffering - add token and check for completion
funcBuffer += token;
debugPrint(
'InferenceChat: Buffering token: "$token", total: ${funcBuffer.length} chars');
// Check if we now have a complete JSON
if (FunctionCallParser.isFunctionCallComplete(funcBuffer,
modelType: modelType)) {
// First try to extract message from any JSON with message field
try {
final jsonData = jsonDecode(funcBuffer);
if (jsonData is Map<String, dynamic> &&
jsonData.containsKey('message')) {
// Found JSON with message field - extract and display the message
final message = jsonData['message'] as String;
debugPrint(
'InferenceChat: Extracted message from JSON: "$message"');
yield TextResponse(message);
funcBuffer = '';
shouldAddToBuffer = false; // Don't add JSON tokens to buffer
continue;
}
} catch (e) {
debugPrint(
'InferenceChat: Failed to parse JSON for message extraction: $e');
}
// If no message field found, try parsing as function call(s)
final allCalls = FunctionCallParser.parseAll(
funcBuffer,
modelType: modelType,
);
if (allCalls.isNotEmpty) {
debugPrint(
'InferenceChat: Found ${allCalls.length} function call(s) in complete buffer!');
emittedFunctionCall = true;
lastFuncBuffer = funcBuffer;
if (allCalls.length == 1) {
yield allCalls.first;
} else {
yield ParallelFunctionCallResponse(calls: allCalls);
}
funcBuffer = '';
shouldAddToBuffer = false;
continue;
} else {
// Not a valid function call - emit as text and clear buffer
debugPrint('InferenceChat: Invalid JSON, emitting as text');
yield TextResponse(funcBuffer);
funcBuffer = '';
shouldAddToBuffer = false;
continue;
}
}
// If buffer gets too long without completing, flush as text
if (funcBuffer.length > _maxFunctionBufferLength) {
debugPrint(
'InferenceChat: Buffer too long without completion, flushing as text');
yield TextResponse(funcBuffer);
funcBuffer = '';
shouldAddToBuffer = false;
continue;
}
// Still buffering, don't emit yet
shouldAddToBuffer = false;
} else {
// Not currently buffering - check if this token starts a function call
if (FunctionCallParser.isFunctionCallStart(token,
modelType: modelType)) {
debugPrint(
'InferenceChat: Found potential function call start in token: "$token"');
funcBuffer = token;
shouldAddToBuffer =
false; // Don't add to main buffer while we determine if it's JSON
} else {
// Normal text token - emit immediately
debugPrint('InferenceChat: Emitting text token: "$token"');
yield response;
shouldAddToBuffer = true; // Add to main buffer for history
}
}
} else {
// No function processing happening - emit token directly
debugPrint(
'InferenceChat: No function processing, emitting token as text: "$token"');
yield response;
shouldAddToBuffer = true; // Add to main buffer for history
}
// Add token to buffer only if it should be included in final message
if (shouldAddToBuffer) {
buffer.write(token);
}
} else {
// For non-TextResponse (like ThinkingResponse), pass through
yield response;
}
}
debugPrint('InferenceChat: Native token stream ended');
final response = buffer.toString();
debugPrint('InferenceChat: Complete response accumulated: "$response"');
// Handle end of stream - process any remaining buffer
if (funcBuffer.isNotEmpty) {
debugPrint(
'InferenceChat: Processing remaining buffer at end of stream: ${funcBuffer.length} chars');
// For FunctionGemma, the function call spans response + funcBuffer
// (e.g., response="<start_function_call>call:fn", funcBuffer="{params}")
// For JSON models, funcBuffer contains the complete JSON
final contentToCheck = modelType == ModelType.functionGemma
? response + funcBuffer
: funcBuffer;
// First try to extract message from JSON if it has message field
if (FunctionCallParser.isFunctionCallComplete(contentToCheck,
modelType: modelType)) {
try {
// For JSON parsing, use funcBuffer (the actual JSON part)
// For FunctionGemma parsing, use contentToCheck (full function call)
if (modelType != ModelType.functionGemma) {
final jsonData = jsonDecode(funcBuffer);
if (jsonData is Map<String, dynamic> &&
jsonData.containsKey('message')) {
final message = jsonData['message'] as String;
debugPrint(
'InferenceChat: Extracted message from end-of-stream JSON: "$message"');
yield TextResponse(message);
return;
}
}
// Try to parse as function call(s)
final allCalls = FunctionCallParser.parseAll(
contentToCheck,
modelType: modelType,
);
if (allCalls.isNotEmpty) {
debugPrint(
'InferenceChat: ${allCalls.length} function call(s) found at end of stream');
emittedFunctionCall = true;
lastFuncBuffer = contentToCheck;
if (allCalls.length == 1) {
yield allCalls.first;
} else {
yield ParallelFunctionCallResponse(calls: allCalls);
}
} else {
yield TextResponse(funcBuffer);
}
} catch (e) {
debugPrint('InferenceChat: Failed to parse end-of-stream JSON: $e');
yield TextResponse(funcBuffer);
}
} else {
debugPrint(
'InferenceChat: No complete JSON at end of stream, emitting remaining as text');
yield TextResponse(funcBuffer);
}
}
try {
debugPrint('InferenceChat: Calculating response tokens...');
final responseTokens = await session.sizeInTokens(response);
debugPrint('InferenceChat: Response tokens: $responseTokens');
_currentTokens += responseTokens;
debugPrint('InferenceChat: Current total tokens: $_currentTokens');
if (_currentTokens >= (maxTokens - tokenBuffer)) {
debugPrint('InferenceChat: Token limit reached, recreating session...');
await _recreateSessionWithReducedChunks();
debugPrint('InferenceChat: Session recreated successfully');
}
} catch (e) {
debugPrint('InferenceChat: Error during token calculation: $e');
}
try {
debugPrint('InferenceChat: Adding message to history...');
// Use toolCall message for function calls, text message otherwise
final chatMessage = emittedFunctionCall
? Message.toolCall(
text: lastFuncBuffer.isNotEmpty ? lastFuncBuffer : response)
: Message(text: response, isUser: false);
debugPrint(
'InferenceChat: Created message object (toolCall=$emittedFunctionCall): ${chatMessage.text}');
_fullHistory.add(chatMessage);
debugPrint('InferenceChat: Added to full history');
_modelHistory.add(chatMessage);
debugPrint('InferenceChat: Added to model history');
debugPrint('InferenceChat: Message added to history successfully');
// Clear model history for single-turn models (e.g., FunctionGemma)
// BUT only if this was NOT a function call - we need context for tool response
if (_isSingleTurnModel && !emittedFunctionCall) {
debugPrint(
'InferenceChat: Single-turn model detected (text response), clearing model history...');
_modelHistory.clear();
_currentTokens = 0;
_toolsInstructionSent = false;
// Recreate session to clear native state
await session.close();
session = await sessionCreator!();
debugPrint(
'InferenceChat: Model history cleared and session recreated');
} else if (_isSingleTurnModel && emittedFunctionCall) {
debugPrint(
'InferenceChat: Single-turn model with function call - keeping history for tool response');
}
} catch (e) {
debugPrint('InferenceChat: Error adding message to history: $e');
rethrow;
}
debugPrint(
'InferenceChat: generateChatResponseAsync completed successfully');
}
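A minimal consumption sketch follows. The runChat wrapper and the dispatch comments are illustrative assumptions, not part of the package API; the response types are the ones yielded by the implementation above.

Future<void> runChat(InferenceChat chat) async {
  final reply = StringBuffer();
  await for (final response in chat.generateChatResponseAsync()) {
    if (response is TextResponse) {
      // Filtered plain-text tokens (thinking and stop tokens already removed).
      reply.write(response.token);
    } else if (response is ParallelFunctionCallResponse) {
      for (final call in response.calls) {
        print(call); // hypothetical: hand each call to your tool executor
      }
    } else {
      // Single function-call responses and ThinkingResponse events
      // pass through the stream unchanged.
    }
  }
  print('Model said: $reply');
}

Because function-call JSON is detected incrementally, a caller never sees partial JSON as text: candidate tokens are withheld while a call is being buffered and are flushed back as plain text only if the buffer turns out not to be a valid call.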