forward method
Takes an input tensor of shape [actualSequenceLength, featureDim].
Implementation
/// Runs the audio classification pipeline on [audioFeatures] and returns
/// the logits produced by the classification head.
///
/// The steps are, in order: feature projection, addition of positional
/// embeddings cut to the input length, the transformer encoder, a mean
/// reduction, and the classification head.
///
/// [audioFeatures] is expected to be laid out as `[Seq, featureDim]`
/// (e.g. MFCC or spectrogram frames); only its first dimension is read
/// here. [tracker] is handed to every sub-layer and additionally receives
/// the position-augmented embeddings and the pooled tensor created in this
/// method.
Tensor forward(Tensor audioFeatures, List<Tensor> tracker) {
  // Lift the raw feature frames into the embedding space
  // (expected [Seq, featureDim] -> [Seq, embedSize]).
  final projected = featureProjection.forward(audioFeatures, tracker);

  // Inputs may be shorter than the positional table, so take only the
  // first `frameCount` positional rows.
  // NOTE(review): the sliced view is not added to `tracker`; confirm
  // whether the engine requires views to be tracked.
  final frameCount = audioFeatures.shape[0];
  final positions = posEmbeddings.slice(0, frameCount);

  // Inject position information and register the sum with the tracker.
  final embedded = projected + positions;
  tracker.add(embedded);

  // Encode the sequence, then reduce it to a single summary tensor.
  // Intended as global average pooling ([Seq, Embed] -> [1, Embed]).
  // NOTE(review): `mean()` is called without an axis; confirm it averages
  // over the sequence dimension rather than over every element.
  final summary =
      transformerEncoder.forwardEmbeddings(embedded, tracker).mean();
  tracker.add(summary);

  // Map the pooled summary to class logits.
  return classificationHead.forward(summary, tracker);
}