createModel method

@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false,
  bool supportAudio = false,
  bool? enableSpeculativeDecoding,
  int? maxConcurrentSessions,
})
override

Creates and returns a new InferenceModel instance.

Parameters:

- modelType — model type to create.
- fileType — model file format; selects the inference engine (.task → MediaPipe, .litertlm → LiteRT-LM). Defaults to ModelFileType.task.
- maxTokens — maximum context length for the model.
- preferredBackend — backend preference (e.g., CPU, GPU).
- loraRanks — optional supported LoRA ranks.
- maxNumImages — maximum number of images (for multimodal models).
- supportImage — whether the model supports images.
- supportAudio — whether the model supports audio (Gemma 3n E4B only).
- enableSpeculativeDecoding — Multi-Token Prediction toggle for Gemma 4 E2B/E4B (LiteRT-LM v0.11.0+). null honors the model's default; true/false forces it on/off. Older .litertlm files without an MTP drafter ignore this flag at the SDK level.
- maxConcurrentSessions — optional cap on the number of sessions open at once via InferenceModel.openSession. null (the default) means no cap and is backward-compatible. When set, the (cap+1)-th call to InferenceModel.openSession throws a StateError. Use this on mobile with large models to guard against OOM from multiple concurrent KV caches.

Implementation

/// Creates the web [InferenceModel] for [modelType] / [fileType] and tracks it
/// as the current singleton.
///
/// Any model left over from an earlier call is closed and replaced; it is
/// never reused, because core cannot inspect the parameters of a
/// package-provided model.
///
/// The engine is chosen through [EngineRegistry] from a minimal
/// [InferenceModelSpec] that carries [fileType]. All remaining arguments
/// ([maxTokens], [preferredBackend], [loraRanks], [maxNumImages],
/// [supportImage], [supportAudio], [enableSpeculativeDecoding],
/// [maxConcurrentSessions]) are forwarded unchanged to the engine inside a
/// [RuntimeConfig] whose `modelPath` is empty.
///
/// Requesting image input ([supportImage] or a non-null [maxNumImages]) only
/// logs a debug-mode warning here; the values are still forwarded.
///
/// Throws a [StateError] when no registered engine can handle the spec.
@override
Future<InferenceModel> createModel({
  required ModelType modelType,
  ModelFileType fileType = ModelFileType.task,
  int maxTokens = 1024,
  PreferredBackend? preferredBackend,
  List<int>? loraRanks,
  int? maxNumImages,
  bool supportImage = false, // Enabling image support
  bool supportAudio = false, // Enabling audio support (Gemma 3n E4B)
  bool? enableSpeculativeDecoding, // Ignored on web (MediaPipe path).
  int? maxConcurrentSessions,
}) async {
  // TODO: Implement multimodal support for web
  final wantsImages = supportImage || maxNumImages != null;
  if (wantsImages && kDebugMode) {
    gemmaLog(
        'Warning: Image support is not yet implemented for web platform');
  }

  // A cached singleton may exist from a prior createModel call. Core no longer
  // imports any concrete web inference-model type (the MediaPipe web model
  // moved to flutter_gemma_mediapipe; LiteRT-LM's lives in
  // flutter_gemma_litertlm), so it can't introspect the model's params —
  // any existing model is always closed + replaced.
  //
  // Copy the field to a local so it promotes to non-null without `!`.
  final previous = _initializedModel;
  if (previous != null) {
    if (kDebugMode) {
      gemmaLog(
          '[FlutterGemmaWeb] Replacing existing model, closing it first');
    }
    await previous.close();
    _initializedModel = null;
  }

  // Engine selection routes through [EngineRegistry] (probe-chain), mirroring
  // the mobile/desktop refactor: .task → MediaPipe, .litertlm → LiteRT-LM.
  // Both web engines live in their own packages (flutter_gemma_mediapipe /
  // flutter_gemma_litertlm), supplied via
  // FlutterGemma.initialize(inferenceEngines: ...). Web registers NO default;
  // each engine builds its own WebModelSourceResolver internally. Web has no
  // resolved file path/cache dir (paths are lazy via the resolver), so the
  // RuntimeConfig's modelPath is empty.

  // Web selection has always been by `fileType` alone; build a minimal spec
  // carrying it for the probe (web does not require an active model to be set
  // before createModel, so we don't depend on webManager.activeInferenceModel).
  final spec = InferenceModelSpec(
    name: 'web-active',
    modelSource: AssetSource('models/active.bin'),
    modelType: modelType,
    fileType: fileType,
  );
  final config = RuntimeConfig(
    maxTokens: maxTokens,
    modelPath: '',
    preferredBackend: preferredBackend,
    supportImage: supportImage,
    supportAudio: supportAudio,
    maxNumImages: maxNumImages,
    enableSpeculativeDecoding: enableSpeculativeDecoding,
    maxConcurrentSessions: maxConcurrentSessions,
    loraRanks: loraRanks,
  );

  final engine = EngineRegistry.instance.findFor(spec);
  if (engine == null) {
    throw StateError(
      'No inference engine can handle this model (ModelFileType.${spec.fileType.name}). '
      'Add the engine package to pubspec.yaml and pass it in inferenceEngines: '
      'of FlutterGemma.initialize(...). Registered engines: '
      '${EngineRegistry.instance.registered.map((e) => e.name).join(", ")}.',
    );
  }
  final model = await engine.createModel(spec, config);

  // Core owns the singleton lifecycle: track it + reset on close. The
  // package-built model fires this via CloseNotifier (addCloseListener).
  _initializedModel = model;
  model.addCloseListener(() {
    // Only clear the slot if it still refers to this model. A late or
    // repeated close notification from a model that has already been replaced
    // must not drop the reference to its successor.
    if (identical(_initializedModel, model)) {
      _initializedModel = null;
    }
  });
  return model;
}