createModel method

@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false,
  bool supportAudio = false,
  bool? enableSpeculativeDecoding,
  int? maxConcurrentSessions,
})
override

Creates and returns a new InferenceModel instance.

- `modelType` — model type to create.
- `maxTokens` — maximum context length for the model.
- `preferredBackend` — backend preference (e.g., CPU, GPU).
- `loraRanks` — optional supported LoRA ranks.
- `maxNumImages` — maximum number of images (for multimodal models).
- `supportImage` — whether the model supports images.
- `supportAudio` — whether the model supports audio (Gemma 3n E4B only).
- `enableSpeculativeDecoding` — Multi-Token Prediction (MTP) toggle for Gemma 4 E2B/E4B (LiteRT-LM v0.11.0+). `null` honors the model's default; `true`/`false` forces it on/off. Older `.litertlm` files without an MTP drafter ignore this flag at the SDK level.
- `maxConcurrentSessions` — optional cap on the number of sessions open at once via `InferenceModel.openSession`. `null` (the default) means no cap, which is backward-compatible. When set, the (cap+1)-th `InferenceModel.openSession` call throws a `StateError`. Use this on mobile with large models to guard against OOM from multiple concurrent KV caches.

Implementation

/// Creates the singleton [InferenceModel] for the currently active inference
/// model, or returns the one that already exists.
///
/// An existing instance is reused when the active model's name and the
/// [supportImage], [supportAudio] and [maxTokens] values match the ones
/// recorded at creation time; otherwise the old instance is closed and a new
/// one is built. Changes to [preferredBackend], [maxNumImages],
/// [enableSpeculativeDecoding] or [maxConcurrentSessions] alone are not part
/// of that comparison and therefore do not trigger recreation. If an
/// initialization is already in flight, its future is returned as-is.
///
/// [modelType], [fileType] and [loraRanks] are accepted for interface
/// compatibility but are not read by this implementation; the engine is
/// selected from the active model's spec via the [EngineRegistry].
///
/// Throws a [StateError] when no active inference model is set or when no
/// registered engine can handle the active model's spec. Throws an
/// [Exception] when the active model is no longer installed or its file paths
/// cannot be resolved. Errors raised by the engine's `createModel` are
/// rethrown after the singleton state has been reset.
@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false,
  bool supportAudio = false,
  bool? enableSpeculativeDecoding,
  int? maxConcurrentSessions,
}) async {
  // Check active model
  final activeModel = _modelManager.activeInferenceModel;
  if (activeModel == null) {
    throw StateError(
      'No active inference model set. Use `FlutterGemma.installModel()` or `modelManager.setActiveModel()` first',
    );
  }

  // Single place that clears all singleton bookkeeping; shared by the
  // recreation path, the model's close listener and the failure path.
  void resetSingletonState() {
    _initCompleter = null;
    _initializedModel = null;
    _lastActiveInferenceSpec = null;
    _lastInferenceParams = null;
  }

  // Check if singleton exists and matches active model + runtime params
  if (_initCompleter != null &&
      _initializedModel != null &&
      _lastActiveInferenceSpec != null) {
    final currentSpec = _lastActiveInferenceSpec!;
    final requestedSpec = activeModel as InferenceModelSpec;

    final modelChanged = currentSpec.name != requestedSpec.name;
    // Only these three runtime params are recorded, so only they can force a
    // rebuild. Missing recorded params are treated as "unchanged".
    final p = _lastInferenceParams;
    final paramsChanged = p != null &&
        (p.supportImage != supportImage ||
            p.supportAudio != supportAudio ||
            p.maxTokens != maxTokens);

    if (modelChanged || paramsChanged) {
      gemmaLog(
          'Model recreation: modelChanged=$modelChanged, paramsChanged=$paramsChanged');
      await _initializedModel?.close();
      resetSingletonState();
    } else {
      gemmaLog('Reusing existing model instance for ${requestedSpec.name}');
      return _initCompleter!.future;
    }
  }

  // Return existing completer if initialization in progress
  if (_initCompleter case Completer<InferenceModel> completer) {
    return completer.future;
  }

  final completer = _initCompleter = Completer<InferenceModel>();

  try {
    // Verify model is installed
    final isInstalled = await _modelManager.isModelInstalled(activeModel);
    if (!isInstalled) {
      throw Exception('Active model is no longer installed');
    }

    // Get model file path
    final modelFilePaths = await _modelManager.getModelFilePaths(activeModel);
    if (modelFilePaths == null || modelFilePaths.isEmpty) {
      throw Exception('Model file paths not found');
    }

    final modelPath = modelFilePaths.values.first;
    gemmaLog('[FlutterGemmaDesktop] Using model: $modelPath');

    // Core resolves the model path + owns the singleton lifecycle, then
    // dispatches construction polymorphically through the EngineRegistry.
    // Desktop registers NO default engine — the LiteRtLmEngine is supplied
    // via FlutterGemma.initialize(inferenceEngines:). If the registry is
    // empty (or no engine canHandle the spec), the findFor==null StateError
    // below fires. Desktop is litertlm-only; a `.task` request would simply
    // find no matching engine.
    final spec = activeModel as InferenceModelSpec;
    final config = RuntimeConfig(
      maxTokens: maxTokens,
      modelPath: modelPath,
      preferredBackend: preferredBackend,
      supportImage: supportImage,
      supportAudio: supportAudio,
      maxNumImages: maxNumImages,
      enableSpeculativeDecoding: enableSpeculativeDecoding,
      maxConcurrentSessions: maxConcurrentSessions,
    );
    final engine = EngineRegistry.instance.findFor(spec);
    if (engine == null) {
      throw StateError(
        'No inference engine can handle this model (ModelFileType.${spec.fileType.name}). '
        'Add the engine package to pubspec.yaml and pass it in inferenceEngines: '
        'of FlutterGemma.initialize(...). Registered engines: '
        '${EngineRegistry.instance.registered.map((e) => e.name).join(", ")}.',
      );
    }
    final model = await engine.createModel(spec, config);

    // Core owns the singleton lifecycle: track it + reset on close. The
    // package-built model fires this via CloseNotifier (addCloseListener).
    _initializedModel = model;
    _lastInferenceParams = (
      supportImage: supportImage,
      supportAudio: supportAudio,
      maxTokens: maxTokens,
    );
    model.addCloseListener(resetSingletonState);

    _lastActiveInferenceSpec = spec;
    completer.complete(model);
    return model;
  } catch (e, st) {
    // This caller gets the failure through `rethrow`, not through
    // `completer.future`. Without a listener, completing that future with an
    // error would additionally surface it as an uncaught asynchronous error.
    // `ignore()` suppresses only that duplicate report; callers that obtained
    // `completer.future` while initialization was in flight still receive
    // the error.
    completer.future.ignore();
    completer.completeError(e, st);
    resetSingletonState();
    rethrow;
  }
}