generate method

@override
Future<String> generate(
  String prompt, {
  int maxTokens = 256,
})
override

Generates a response from a prompt (single turn, no conversation history).

Implementation

/// Produces a single-turn completion for [prompt].
///
/// A fresh `llama.ChatSession` is created on the loaded engine for each call,
/// and [maxTokens] is forwarded to it through `llama.GenerationParams`.
///
/// Returns the `content` part of what `ThinkingParser.split` yields for the
/// concatenated model output.
///
/// Throws an [Exception] when [isReady] is false. Any failure raised while
/// generating is logged and then rethrown unchanged.
@override
Future<String> generate(String prompt, {int maxTokens = 256}) async {
  if (!isReady) {
    throw Exception('LocalLlamaProvider: Local LLM is not loaded. Call initialize() first.');
  }

  try {
    final chat = llama.ChatSession(_engine!);
    final stream = chat.create(
      [llama.LlamaTextContent(prompt)],
      params: llama.GenerationParams(maxTokens: maxTokens),
    );

    // Drain the whole stream before looking at any chunk.
    final chunks = await stream.toList();

    // Stitch together the text deltas of each chunk's first choice, skipping
    // chunks that carry no content.
    final rawText = chunks
        .map((piece) => piece.choices.first.delta.content)
        .where((text) => text != null)
        .join();

    // Only the content part of the parse result is handed back to the caller.
    return ThinkingParser.split(rawText).content;
  } catch (e) {
    SintSentinel.logger.e('LocalLlamaProvider: Generation failed: $e');
    rethrow;
  }
}