generateStream static method

Future<LLMStreamingResult> generateStream(
  1. String prompt, {
  2. LLMGenerationOptions? options,
})

Performs streaming text generation, delivering tokens incrementally as they are produced.

Matches Swift RunAnywhere.generateStream(_:options:).

Returns an LLMStreamingResult containing:

  • stream: Stream of tokens as they are generated
  • result: Future that completes with final generation metrics
  • cancel: Function to cancel the generation
Example:

final result = await RunAnywhere.generateStream('Tell me a story');

// Consume tokens as they arrive
await for (final token in result.stream) {
  print(token);
}

// Get final metrics after stream completes
final metrics = await result.result;
print('Tokens: ${metrics.tokensUsed}');

// Or cancel early if needed
result.cancel();

Implementation

/// Starts a streaming text generation for [prompt].
///
/// Returns an [LLMStreamingResult] bundling:
///  * `stream` — a broadcast stream of tokens as they arrive,
///  * `result` — a future completing with the final [LLMGenerationResult]
///    metrics once the token stream finishes,
///  * `cancel` — a callback that aborts generation via the bridge.
///
/// Throws [SDKError.notInitialized] if the SDK has not been initialized,
/// and [SDKError.componentNotReady] if no LLM model is currently loaded.
static Future<LLMStreamingResult> generateStream(
  String prompt, {
  LLMGenerationOptions? options,
}) async {
  if (!_isInitialized) {
    throw SDKError.notInitialized();
  }

  // Fall back to default options when the caller passed none.
  final opts = options ?? const LLMGenerationOptions();
  final startTime = DateTime.now();
  // Set by the first token callback below; used for time-to-first-token.
  DateTime? firstTokenTime;

  // Verify model is loaded via DartBridgeLLM (mirrors Swift CppBridge.LLM pattern)
  if (!DartBridge.llm.isLoaded) {
    throw SDKError.componentNotReady(
      'LLM model not loaded. Call loadModel() first.',
    );
  }

  final modelId = DartBridge.llm.currentModelId ?? 'unknown';

  // Get model name from registry for telemetry (may be null if not registered).
  final modelInfo =
      await DartBridgeModelRegistry.instance.getPublicModel(modelId);
  final modelName = modelInfo?.name;

  // Determine effective system prompt - add JSON conversion instructions if structuredOutput is provided
  String? effectiveSystemPrompt = opts.systemPrompt;
  if (opts.structuredOutput != null) {
    final jsonSystemPrompt =
        DartBridgeStructuredOutput.shared.getSystemPrompt(
      opts.structuredOutput!.schema,
    );
    // If user already provided a system prompt, prepend the JSON instructions
    // so the JSON-format directive takes precedence over the user's text.
    if (effectiveSystemPrompt != null && effectiveSystemPrompt.isNotEmpty) {
      effectiveSystemPrompt = '$jsonSystemPrompt\n\n$effectiveSystemPrompt';
    } else {
      effectiveSystemPrompt = jsonSystemPrompt;
    }
  }

  // Broadcast so two consumers can subscribe: the caller's `stream` and
  // the internal `toList()` below that drives the metrics future.
  final controller = StreamController<String>.broadcast();
  // Accumulates every token for the final text and token count.
  final allTokens = <String>[];

  // Start streaming generation via DartBridgeLLM
  final tokenStream = DartBridge.llm.generateStream(
    prompt,
    maxTokens: opts.maxTokens,
    temperature: opts.temperature,
    systemPrompt: effectiveSystemPrompt,
  );

  // Forward tokens and collect them, track subscription in bridge for cancellation
  // (the bridge holds the subscription so cancelGeneration() can tear it down).
  DartBridge.llm.setActiveStreamSubscription(
    tokenStream.listen(
      (token) {
        // Track first token time (only the first assignment sticks).
        firstTokenTime ??= DateTime.now();
        allTokens.add(token);
        if (!controller.isClosed) {
          controller.add(token);
        }
      },
      onError: (Object error) {
        // Track streaming generation error
        TelemetryService.shared.trackError(
          errorCode: 'streaming_generation_failed',
          errorMessage: error.toString(),
          context: {'model_id': modelId},
        );
        // Forward to subscribers; resultFuture below will then complete
        // with this error via toList().
        // NOTE(review): if the caller never awaits `result`, that error
        // surfaces as an unhandled async error — confirm this is intended.
        if (!controller.isClosed) {
          controller.addError(error);
        }
      },
      onDone: () {
        if (!controller.isClosed) {
          // close() returns a Future we deliberately do not await here.
          unawaited(controller.close());
        }
        // Clear subscription when done
        DartBridge.llm.setActiveStreamSubscription(null);
      },
    ),
  );

  // Build result future that completes when stream is done.
  // toList() subscribes to the broadcast stream immediately, so tokens are
  // drained even if the caller never listens on `stream`; its elements are
  // ignored — `allTokens` already holds them.
  final resultFuture = controller.stream.toList().then((_) {
    final endTime = DateTime.now();
    // Microsecond-based difference divided by 1000 keeps sub-millisecond
    // precision in the double.
    final latencyMs = endTime.difference(startTime).inMicroseconds / 1000.0;
    // Guard against division by zero for instantaneous completions.
    final tokensPerSecond =
        latencyMs > 0 ? allTokens.length / (latencyMs / 1000) : 0.0;

    // Calculate time to first token (null if no token ever arrived).
    int? timeToFirstTokenMs;
    if (firstTokenTime != null) {
      timeToFirstTokenMs =
          firstTokenTime!.difference(startTime).inMilliseconds;
    }

    // Estimate tokens (~4 chars per token); completion count is exact
    // because each stream event is one token.
    final promptTokens = (prompt.length / 4).ceil();
    final completionTokens = allTokens.length;

    // Track streaming generation success with full metrics (mirrors other SDKs)
    TelemetryService.shared.trackGeneration(
      modelId: modelId,
      modelName: modelName,
      promptTokens: promptTokens,
      completionTokens: completionTokens,
      latencyMs: latencyMs.round(),
      temperature: opts.temperature,
      maxTokens: opts.maxTokens,
      contextLength: 8192, // Default context length for LlamaCpp
      tokensPerSecond: tokensPerSecond,
      timeToFirstTokenMs: timeToFirstTokenMs,
      isStreaming: true,
    );

    // Extract structured data if structuredOutput is provided
    Map<String, dynamic>? structuredData;
    final fullText = allTokens.join();
    if (opts.structuredOutput != null) {
      try {
        final jsonString =
            DartBridgeStructuredOutput.shared.extractJson(fullText);
        if (jsonString != null) {
          final parsed = jsonDecode(jsonString);
          structuredData = _normalizeStructuredData(parsed);
        }
      } catch (_) {
        // JSON extraction/parse failed — return text result without structured data
      }
    }

    return LLMGenerationResult(
      text: fullText,
      inputTokens: promptTokens,
      tokensUsed: completionTokens,
      modelUsed: modelId,
      latencyMs: latencyMs,
      framework: 'llamacpp',
      tokensPerSecond: tokensPerSecond,
      structuredData: structuredData,
    );
  });

  return LLMStreamingResult(
    stream: controller.stream,
    result: resultFuture,
    cancel: () {
      // Cancel via the bridge (handles both stream subscription and native cancel)
      DartBridge.llm.cancelGeneration();
    },
  );
}