TransformerDecoder constructor

TransformerDecoder({
  int vocabSize = 4098,
  int embedSize = 128,
  int blockSize = 16,
  int numLayers = 4,
  int numHeads = 4,
  int encoderEmbedSize = 128,
})

Implementation

/// Creates a decoder and initialises its parameters.
///
/// The initializer list builds the token embeddings [wte] and position
/// embeddings [wpe] as random tensors, generates [numLayers] decoder
/// [blocks], and creates [finalLayerNorm] and the output projection
/// [lmHead] (constructed with `useGelu: false`).
///
/// The constructor body then:
///
/// * overwrites every value of [wte] with a sample drawn uniformly from
///   `[-0.02, 0.02)`. The generator is an unseeded `math.Random`, so the
///   values are not reproducible from one run to the next;
/// * fills the second parameter of [lmHead] with zeros and prints a
///   message, provided [lmHead] reports at least two parameters.
TransformerDecoder({
  this.vocabSize = 4098,
  this.embedSize = 128,
  this.blockSize = 16,
  this.numLayers = 4,
  this.numHeads = 4,
  this.encoderEmbedSize = 128,
}) : wte = Tensor.random([vocabSize, embedSize]),
     wpe = Tensor.random([blockSize, embedSize]),
     blocks = List.generate(
       numLayers,
       (_) => TransformerDecoderBlock(
         embedSize,
         numHeads,
         encoderEmbedSize,
         blockSize,
       ),
     ),
     finalLayerNorm = LayerNorm(embedSize),
     lmHead = Layer(embedSize, vocabSize, useGelu: false) {
  // Half-width of the symmetric range the token embeddings are drawn from.
  const initRange = 0.02;

  // Work on the buffer returned by the tensor so it can be handed straight
  // back through the `data` setter once every slot has been refilled.
  final List<double> embeddingValues = wte.fetchData();
  final rng = math.Random();

  // Maps a sample from [0, 1) onto [-initRange, initRange).
  double nextInitValue() => (rng.nextDouble() * 2 - 1) * initRange;

  for (var index = 0; index < embeddingValues.length; index++) {
    embeddingValues[index] = nextInitValue();
  }
  wte.data = embeddingValues;

  // Start the output projection without a bias offset.
  // NOTE(review): this assumes `parameters()` returns the weights at index 0
  // and the bias at index 1 — confirm against the Layer implementation.
  final headParams = lmHead.parameters();
  if (headParams.length >= 2) {
    final Tensor headBias = headParams[1];
    headBias.data = List.filled(headBias.length, 0.0);
    print("🎯 lmHead bias zeroed to prevent index-collapse.");
  }
}