forward method

Tensor forward(
  1. Tensor audioFeatures,
  2. List<Tensor> tracker
)

Takes an input tensor of shape [actualSequenceLength, featureDim].

Implementation

/// Runs the audio classifier forward pass and returns the classification
/// logits.
///
/// [audioFeatures] is treated as sequence-first: `shape[0]` is read as the
/// number of frames and used to pick the matching positional embeddings.
/// [tracker] is passed to every sub-module and additionally receives the
/// position-augmented embeddings and the pooled representation created here.
Tensor forward(Tensor audioFeatures, List<Tensor> tracker) {
  // Embed the per-frame features.
  // Intended shapes: [Seq, featureDim] -> [Seq, embedSize].
  final projected = featureProjection.forward(audioFeatures, tracker);

  // Take only as many positional rows as there are input frames, so inputs
  // shorter than the positional table are supported.
  // NOTE(review): the slice result is not added to tracker here — confirm
  // the engine does not need it tracked.
  final frameCount = audioFeatures.shape[0];
  final positions = posEmbeddings.slice(0, frameCount);

  // Inject position information into the embeddings.
  final withPositions = projected + positions;
  tracker.add(withPositions);

  // Contextualize the frames with the transformer backbone.
  final hidden = transformerEncoder.forwardEmbeddings(withPositions, tracker);

  // Pool the sequence into a single summary representation.
  // NOTE(review): intended as [Seq, Embed] -> [1, Embed]; confirm that
  // mean() with no arguments reduces over the sequence axis only.
  final summary = hidden.mean();
  tracker.add(summary);

  // Map the pooled summary to class logits.
  return classificationHead.forward(summary, tracker);
}