createSession method

@override

Future<InferenceModelSession> createSession({

double temperature = .8,
int randomSeed = 1,
int topK = 1,
double? topP,
String? loraPath,
bool? enableVisionModality,
bool? enableAudioModality,
String? systemInstruction,
bool enableThinking = false,
List<Tool> tools = const [],

})

override

Creates a new InferenceModelSession for generation.

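temperature, randomSeed, topK, topP — sampling parameters forwarded to the underlying conversation. loraPath — optional path to a LoRA model; this FFI implementation does not support LoRA weights and fails with an UnsupportedError when loraPath is non-null. enableVisionModality — enable vision modality for multimodal models; when null, the model's own supportImage setting is used. enableAudioModality — enable audio modality for Gemma 3n E4B models; when null, the model's own supportAudio setting is used. systemInstruction — optional system message passed to the conversation. enableThinking — forwarded to the created session. tools — tool declarations; in this implementation they are serialized into the conversation config only when the model type is Gemma 4 and the list is non-empty.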

Implementation

/// Creates a new [InferenceModelSession] backed by a fresh FFI conversation
/// handle.
///
/// Any previously created session is closed before the new conversation is
/// opened. A call that arrives while another creation is still in flight
/// does not start a second creation: it receives the in-flight creation's
/// future, and its own sampling, modality, and tool arguments are not used.
///
/// Parameters:
/// - [temperature], [randomSeed], [topK], [topP]: sampling parameters
///   forwarded to the conversation handle.
/// - [loraPath]: must be `null`; a non-null value is rejected (see below).
/// - [enableVisionModality], [enableAudioModality]: per-session overrides
///   for the model's `supportImage` / `supportAudio`; when `null` the
///   model-level value is used.
/// - [systemInstruction]: passed to the conversation as its system message.
/// - [enableThinking]: forwarded to the created session.
/// - [tools]: serialized into the conversation config only when the model
///   type is [ModelType.gemma4] and the list is non-empty.
///
/// The returned future completes with a [StateError] if the model has been
/// closed, and with an [UnsupportedError] if [loraPath] is non-null. Errors
/// raised while closing the previous session or while creating the new one
/// are also delivered through the returned future.
@override
Future<InferenceModelSession> createSession({
  double temperature = .8,
  int randomSeed = 1,
  int topK = 1,
  double? topP,
  String? loraPath,
  bool? enableVisionModality,
  bool? enableAudioModality,
  String? systemInstruction,
  bool enableThinking = false,
  List<Tool> tools = const [],
}) async {
  // Argument/state validation runs before the single-flight guard, so it
  // applies to every caller, including ones that would otherwise just join
  // an in-flight creation.
  if (_isClosed) {
    throw StateError(
        'Model is closed. Create a new instance to use it again');
  }

  if (loraPath != null) {
    throw UnsupportedError(
      'LoRA weights are not supported on the .litertlm FFI path '
      '(loraPath=$loraPath). Track upstream LiteRT-LM C API support; '
      'remove loraPath or use a MediaPipe .task model on Android/iOS.',
    );
  }

  // Single-flight guard for genuinely *concurrent* callers only. The
  // completer is cleared in the `finally` below once creation settles,
  // so a *sequential* second call falls through and opens a fresh
  // conversation handle (closing the prior session first) instead of
  // returning the cached session. Without this, the cached completer
  // made every later createChat reuse the first session, so the
  // previous conversation's KV cache bled into the next chat. This is
  // the litert/FFI sibling of the MediaPipe fix in #309 (issue #308).
  if (_createCompleter case Completer<InferenceModelSession> completer) {
    return completer.future;
  }

  // Publish the completer before the first `await` so that any caller
  // arriving while this creation is suspended hits the guard above.
  final completer = _createCompleter = Completer<InferenceModelSession>();
  // Used only for the perf log lines emitted below.
  final sessionSw = Stopwatch()..start();

  try {
    // Legacy singleton lane: close the previous conversation BEFORE
    // opening a fresh one. The engine holds at most one live
    // conversation (upstream litert-lm #966), so closing first never
    // leaves two alive — matching the delete-before-create order the
    // virtual-session multiplexer already uses. It also means a
    // teardown error can't leak a handle that was created first and
    // then left unwrapped: each session owns its own conversation
    // pointer, so closing the old one can't touch the not-yet-created
    // new one. See PR #310 review.
    await _session?.close();

    // For Gemma 4, push tools into the SDK conversation config so it can
    // render native `<|tool>declaration:...<tool|>` tokens via minja. Other
    // model types still use Dart-side prompt injection in chat.dart.
    final toolsJson = (modelType == ModelType.gemma4 && tools.isNotEmpty)
        ? SdkResponseParser.serializeToolsForSdk(tools)
        : null;

    // Snapshot the elapsed time so the log below reports only the duration
    // of the FFI conversation creation, not the preceding close.
    final beforeConv = sessionSw.elapsedMilliseconds;
    final handle = ffiClient.createConversationHandle(
      systemMessage: systemInstruction,
      toolsJson: toolsJson,
      temperature: temperature,
      topK: topK,
      topP: topP,
      seed: randomSeed,
    );
    gemmaLog(
        '[FfiInferenceModel/perf] createConversation (FFI): ${sessionSw.elapsedMilliseconds - beforeConv}ms');

    // Declared `late` and assigned separately so the `onClose` closure can
    // refer to `session` itself for the identity check.
    late final FfiInferenceModelSession session;
    session = FfiInferenceModelSession(
      handle: handle,
      modelType: modelType,
      fileType: fileType,
      supportImage: enableVisionModality ?? supportImage,
      supportAudio: enableAudioModality ?? supportAudio,
      enableThinking: enableThinking,
      // Identity-guarded so a late close of a superseded session can't
      // null a newer `_session`. Does NOT touch `_createCompleter` —
      // that is owned by the `finally` below, not by session teardown.
      onClose: () {
        if (identical(_session, session)) _session = null;
      },
    );
    _session = session;

    completer.complete(session);
    gemmaLog(
        '[FfiInferenceModel/perf] createSession total: ${sessionSw.elapsedMilliseconds}ms');
  } catch (e, st) {
    // Deliver the failure through the completer so this caller and any
    // concurrent callers holding `completer.future` observe the same error.
    completer.completeError(e, st);
  } finally {
    // Pure in-flight guard: clear once creation settles (success OR
    // failure) so the next call isn't blocked by a cached completer
    // (the issue #308 KV-cache bleed on success; a permanently cached
    // rejected future on failure).
    _createCompleter = null;
  }
  // Return `completer.future` (rather than the session / a rethrow) so
  // the caller stays the error listener after the completer field is
  // cleared above, and concurrent callers share the same result.
  return completer.future;
}

createSession method

Implementation

FfiInferenceModel class