createModel method

@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false,
  bool supportAudio = false,
  bool? enableSpeculativeDecoding,
  int? maxConcurrentSessions,
})
override

Creates and returns a new InferenceModel instance.

Parameters:

- modelType — model type to create.
- fileType — model file format; defaults to ModelFileType.task. On web, .litertlm files are served by the LiteRT-LM engine and all other file types by the MediaPipe engine.
- maxTokens — maximum context length for the model.
- preferredBackend — backend preference (e.g., CPU, GPU).
- loraRanks — optional supported LoRA ranks.
- maxNumImages — maximum number of images (for multimodal models).
- supportImage — whether the model supports images.
- supportAudio — whether the model supports audio (Gemma 3n E4B only).
- enableSpeculativeDecoding — Multi-Token Prediction toggle for Gemma 4 E2B/E4B (LiteRT-LM v0.11.0+). null honors the model's default; true/false forces it on/off. Older .litertlm files without an MTP drafter ignore this flag at the SDK level.
- maxConcurrentSessions — optional cap on the number of sessions open at once via InferenceModel.openSession. null (the default) means no cap, which is backward-compatible. When set, the (cap+1)-th call to InferenceModel.openSession throws a StateError. Use this on mobile with large models to guard against OOM from multiple concurrent KV caches.

Implementation

/// Creates the web [InferenceModel], or returns the cached one when it still
/// matches the requested configuration.
///
/// A single model is cached in `_initializedModel`. If a cached model exists
/// and its compared parameters differ from the request, it is closed and a new
/// one is built; otherwise the cached instance is returned as-is.
///
/// Compared parameters:
/// * [WebInferenceModel]: [modelType], [fileType], [maxTokens],
///   [supportImage], [supportAudio] and [maxNumImages] (`null` is treated
///   as `0`).
/// * [LiteRtLmWebInferenceModel]: [modelType], [fileType] and [maxTokens].
///
/// [loraRanks] and [maxConcurrentSessions] are forwarded to a newly built
/// model but are NOT part of the comparison, so changing only those returns
/// the cached model unchanged. [preferredBackend] and
/// [enableSpeculativeDecoding] are not read by this method.
///
/// Engine selection: [ModelFileType.litertlm] builds a
/// [LiteRtLmWebInferenceModel]; every other [fileType] builds a
/// [WebInferenceModel].
@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false, // Enabling image support
  bool supportAudio = false, // Enabling audio support (Gemma 3n E4B)
  bool? enableSpeculativeDecoding, // Ignored on web (MediaPipe path).
  int? maxConcurrentSessions,
}) async {
  // TODO: Implement multimodal support for web
  if ((supportImage || maxNumImages != null) && kDebugMode) {
    debugPrint(
        'Warning: Image support is not yet implemented for web platform');
  }

  // Two web engine types coexist (MediaPipe `.task` and LiteRT-LM
  // `.litertlm`), so the cached singleton can be either: type-check first,
  // then compare the parameters that engine exposes.
  final existing = _initializedModel;
  if (existing != null) {
    final bool parametersChanged;
    if (existing is WebInferenceModel) {
      parametersChanged = existing.modelType != modelType ||
          existing.fileType != fileType ||
          existing.maxTokens != maxTokens ||
          existing.supportImage != supportImage ||
          existing.supportAudio != supportAudio ||
          (existing.maxNumImages ?? 0) != (maxNumImages ?? 0);
    } else if (existing is LiteRtLmWebInferenceModel) {
      parametersChanged = existing.modelType != modelType ||
          existing.fileType != fileType ||
          existing.maxTokens != maxTokens;
    } else {
      // Unknown engine type — always replace.
      parametersChanged = true;
    }

    if (!parametersChanged) {
      return existing;
    }

    if (kDebugMode) {
      debugPrint(
          '[FlutterGemmaWeb] Model parameters changed, closing existing model');
    }
    await existing.close();
    _initializedModel = null;
  }

  // Engine selection by file type, mirroring the mobile branch in
  // FlutterGemmaMobile.createModel: .task → MediaPipe (WebInferenceModel),
  // .litertlm → LiteRT-LM JS via @litert-lm/core (LiteRtLmWebInferenceModel).
  // Both share one [WebModelSourceResolver] — the storage-mode branch
  // (Blob URL vs OPFS ReadableStream) lives there, not here.
  final webManager = modelManager as WebModelManager;
  final sourceResolver = WebModelSourceResolver(webManager);

  // Only drop the cache entry when it still refers to the model being
  // closed. An unconditional reset would let a late close() on an already
  // replaced model wipe out the cache entry of its successor, orphaning it.
  // NOTE(review): assumes the constructors do not invoke onClose
  // synchronously (`created` is assigned right after construction).
  late final InferenceModel created;
  void releaseIfCurrent() {
    if (identical(_initializedModel, created)) {
      _initializedModel = null;
    }
  }

  if (fileType == ModelFileType.litertlm) {
    created = LiteRtLmWebInferenceModel(
      modelType: modelType,
      maxTokens: maxTokens,
      sourceResolver: sourceResolver,
      maxConcurrentSessions: maxConcurrentSessions,
      onClose: releaseIfCurrent,
    );
  } else {
    created = WebInferenceModel(
      modelType: modelType,
      fileType: fileType,
      maxTokens: maxTokens,
      loraRanks: loraRanks,
      sourceResolver: sourceResolver,
      supportImage: supportImage, // Passing the flag
      supportAudio: supportAudio, // Passing the audio flag
      maxNumImages: maxNumImages,
      maxConcurrentSessions: maxConcurrentSessions,
      onClose: releaseIfCurrent,
    );
  }

  _initializedModel = created;
  return created;
}