TransformerDecoder constructor
TransformerDecoder({})
Implementation
/// Creates a [TransformerDecoder] with freshly initialized parameters.
///
/// The initializer list builds, in order:
///  * [wte] from `Tensor.random([vocabSize, embedSize])`,
///  * [wpe] from `Tensor.random([blockSize, embedSize])`,
///  * [blocks], a list of [numLayers] `TransformerDecoderBlock`s, each given
///    [embedSize], [numHeads], [encoderEmbedSize] and [blockSize],
///  * [finalLayerNorm] as `LayerNorm(embedSize)`,
///  * [lmHead] as a `Layer` from [embedSize] to [vocabSize] with
///    `useGelu: false`.
///
/// The body then replaces every value of [wte] with a uniform sample from
/// `[-0.02, 0.02)` and, if [lmHead] reports more than one parameter tensor,
/// fills the second one with zeros and logs a message.
///
/// NOTE(review): the random source is unseeded, so the resulting [wte]
/// values differ from run to run.
TransformerDecoder({
  this.vocabSize = 4098,
  this.embedSize = 128,
  this.blockSize = 16,
  this.numLayers = 4,
  this.numHeads = 4,
  this.encoderEmbedSize = 128,
})  : wte = Tensor.random([vocabSize, embedSize]),
      wpe = Tensor.random([blockSize, embedSize]),
      blocks = List.generate(
        numLayers,
        (_) => TransformerDecoderBlock(
          embedSize,
          numHeads,
          encoderEmbedSize,
          blockSize,
        ),
      ),
      finalLayerNorm = LayerNorm(embedSize),
      lmHead = Layer(embedSize, vocabSize, useGelu: false) {
  // Half-width of the uniform interval used to re-initialize `wte`.
  const initScale = 0.02;

  // Pull the current `wte` values to the host, overwrite each one with a
  // small uniform sample, and write the buffer back in a single assignment.
  // `nextDouble()` is in [0, 1), so each sample lies in [-0.02, 0.02).
  final rng = math.Random();
  final List<double> embeddingValues = wte.fetchData();
  for (var index = 0; index < embeddingValues.length; index++) {
    embeddingValues[index] = (rng.nextDouble() * 2 - 1) * initScale;
  }
  wte.data = embeddingValues;

  // Zero the output head's bias so that, at initialization, the logits are
  // not offset by a random per-token constant.
  // NOTE(review): assumes `parameters()` returns [weights, bias] in that
  // order — confirm against `Layer`. With fewer than two entries nothing is
  // changed and nothing is logged.
  final headParams = lmHead.parameters();
  if (headParams.length <= 1) {
    return;
  }
  final Tensor headBias = headParams[1];
  headBias.data = List.filled(headBias.length, 0.0);
  print('🎯 lmHead bias zeroed to prevent index-collapse.');
}