createModel method
- required ModelType modelType,
- ModelFileType fileType = ModelFileType.task,
- int maxTokens = 1024,
- PreferredBackend? preferredBackend,
- List<int>? loraRanks,
- int? maxNumImages,
- bool supportImage = false,
- bool supportAudio = false,
- bool? enableSpeculativeDecoding,
- int? maxConcurrentSessions,
Creates and returns a new InferenceModel instance.
modelType — model type to create.
maxTokens — maximum context length for the model.
preferredBackend — backend preference (e.g., CPU, GPU).
loraRanks — optional supported LoRA ranks.
maxNumImages — maximum number of images (for multimodal models).
supportImage — whether the model supports images.
supportAudio — whether the model supports audio (Gemma 3n E4B only).
enableSpeculativeDecoding — Multi-Token Prediction toggle for Gemma 4
E2B/E4B (LiteRT-LM v0.11.0+). null honors the model's default;
true/false forces on/off. Older .litertlm files without an MTP
drafter ignore this flag at the SDK level.
maxConcurrentSessions — optional cap on the number of sessions open
at once via InferenceModel.openSession. null (the default) means no cap,
which is backward-compatible. When set, the (cap+1)-th call to
InferenceModel.openSession throws StateError. Use this on mobile with
large models to guard against OOM from multiple concurrent KV caches.
Implementation
/// Returns the cached web [InferenceModel], creating it first when needed.
///
/// An already-initialized model is reused only when it is a known engine
/// type and its parameters match the request: [modelType], [fileType] and
/// [maxTokens] for both engines, plus [supportImage], [supportAudio] and
/// [maxNumImages] (with `null` counted as 0) for [WebInferenceModel]. In
/// every other case the old model is closed and a new one is built.
///
/// [fileType] selects the engine: [ModelFileType.litertlm] creates a
/// [LiteRtLmWebInferenceModel]; any other value creates a
/// [WebInferenceModel].
///
/// [loraRanks], [preferredBackend], [enableSpeculativeDecoding] and
/// [maxConcurrentSessions] take no part in the cache comparison, and
/// [preferredBackend] and [enableSpeculativeDecoding] are not forwarded to
/// either engine by this method.
@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false,
  bool supportAudio = false,
  bool? enableSpeculativeDecoding,
  int? maxConcurrentSessions,
}) async {
  // TODO: Implement multimodal support for web
  final wantsImages = supportImage || maxNumImages != null;
  if (wantsImages && kDebugMode) {
    debugPrint(
        'Warning: Image support is not yet implemented for web platform');
  }

  // The cached singleton may belong to either web engine (MediaPipe `.task`
  // or LiteRT-LM `.litertlm`), so check the concrete type before comparing
  // the parameters that engine exposes. Any other type is never reusable.
  final cached = _initializedModel;
  if (cached != null) {
    final reusable = cached is WebInferenceModel
        ? cached.modelType == modelType &&
            cached.fileType == fileType &&
            cached.maxTokens == maxTokens &&
            cached.supportImage == supportImage &&
            cached.supportAudio == supportAudio &&
            (cached.maxNumImages ?? 0) == (maxNumImages ?? 0)
        : cached is LiteRtLmWebInferenceModel &&
            cached.modelType == modelType &&
            cached.fileType == fileType &&
            cached.maxTokens == maxTokens;
    if (reusable) {
      return cached;
    }

    if (kDebugMode) {
      debugPrint(
          '[FlutterGemmaWeb] Model parameters changed, closing existing model');
    }
    await cached.close();
    _initializedModel = null;
  }

  // Handed to the new model as its close hook so the cache slot is cleared.
  void forgetCachedModel() {
    _initializedModel = null;
  }

  // Both engines receive the same resolver built from the web model manager.
  final resolver = WebModelSourceResolver(modelManager as WebModelManager);

  // Engine selection by file type: `.litertlm` goes to LiteRT-LM, everything
  // else goes to the MediaPipe-backed WebInferenceModel.
  if (fileType != ModelFileType.litertlm) {
    _initializedModel = WebInferenceModel(
      modelType: modelType,
      fileType: fileType,
      maxTokens: maxTokens,
      loraRanks: loraRanks,
      sourceResolver: resolver,
      supportImage: supportImage,
      supportAudio: supportAudio,
      maxNumImages: maxNumImages,
      maxConcurrentSessions: maxConcurrentSessions,
      onClose: forgetCachedModel,
    );
  } else {
    _initializedModel = LiteRtLmWebInferenceModel(
      modelType: modelType,
      maxTokens: maxTokens,
      sourceResolver: resolver,
      maxConcurrentSessions: maxConcurrentSessions,
      onClose: forgetCachedModel,
    );
  }
  return _initializedModel!;
}