forward method

List<ValueVector> forward(
  List<ValueVector> x
)

Forward pass for a single self-attention head.

It takes a list of token vectors x (one per position in the sequence) and returns a list of the same length, where each output vector is a weighted sum of the value projections; the weights are attention scores derived from scaled query-key dot products.
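In matrix notation this is the standard scaled dot-product attention, softmax(Q·Kᵀ / √headSize)·V, computed here row by row over Value objects rather than with matrix operations.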

Implementation

List<ValueVector> forward(List<ValueVector> x) {
  final T = x.length; // Sequence length

  // Project input vectors into key, query, and value spaces
  final k = x.map((v) => key.forward(v)).toList(); // (T, headSize)
  final q = x.map((v) => query.forward(v)).toList(); // (T, headSize)
  final v = x.map((v) => value.forward(v)).toList(); // (T, headSize)

  // 1. Compute attention scores ("affinities")
  var wei = List.generate(T, (i) {
    final row = List.generate(T, (j) => q[i].dot(k[j]));
    // Scale by 1/sqrt(headSize) so the dot products do not grow with the head dimension
    return ValueVector(row) * Value(1.0 / math.sqrt(headSize.toDouble()));
  });

  // 2. Apply optional mask (for decoder blocks)
  if (masked) {
    wei = List.generate(T, (i) {
      final newVals = wei[i].values.asMap().entries.map((entry) {
        int j = entry.key;
        Value val = entry.value;
        if (j > i) {
          // Set future tokens to -infinity so they become 0 after softmax
          return Value(double.negativeInfinity, {val}, 'mask');
        }
        return val;
      }).toList();
      return ValueVector(newVals);
    });
  }

  // 3. Apply softmax to get attention weights (probabilities)
  final p_attn = wei.map((row) => row.softmax()).toList();

  // 4. Perform the weighted aggregation of the value vectors
  final out = List.generate(T, (i) {
    var pos_out = ValueVector(List.filled(headSize, Value(0.0)));
    for (int j = 0; j < T; j++) {
      final weighted_v = v[j] * p_attn[i].values[j];
      pos_out += weighted_v;
    }
    return pos_out;
  });

  return out;
}
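
A minimal usage sketch, assuming a head instance of this class has already been constructed and that its key/query/value layers expect 16-dimensional inputs (both assumptions for illustration):

// Assumes `head` is an instance of this attention-head class whose
// projections accept 16-dimensional inputs (illustrative sizes only).
final tokens = List.generate(
  3, // T = 3 positions in the sequence
  (_) => ValueVector(List.generate(16, (_) => Value(0.1))),
);

final out = head.forward(tokens); // 3 output vectors, each of length headSize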