detect method
Detects the musical key of the given audio samples by analyzing the chroma features and comparing them to the Krumhansl-Schmuckler key profiles. The method performs a short-time Fourier transform (STFT) on the audio to extract the chroma features, then computes the correlation between the chroma vector and each key profile to determine the most likely key. The result is returned as a set of predictions, each containing a Camelot key and a confidence score.
Implementation
/// Estimates the musical key of [samples] and returns every candidate key
/// ranked from most to least likely.
///
/// The audio is cut into frames of `_frameSize` samples (frame positions come
/// from `_frameStarts`), each frame is Hann-windowed and transformed with
/// `FFT.transform`, and the spectrum between `_minimumFrequency` and
/// `_maximumFrequency` is folded into a 12-bin chroma vector. The accumulated
/// chroma is normalised to sum to 1 and scored against `_majorProfile` and
/// `_minorProfile` for all 12 tonics via `_correlation`.
///
/// The returned [CamelotKeyPredictionSet] holds 24 predictions sorted by
/// descending score; each prediction's `confidence` is the raw correlation
/// score of that key.
///
/// Frames whose spectrum contains non-finite values (for example because the
/// input holds NaN or infinite samples) are ignored so they cannot poison the
/// accumulated chroma.
///
/// Throws an [ArgumentError] if [sampleRate] is not positive or [samples] is
/// shorter than one frame. Throws a [StateError] if no frame contributes any
/// usable energy in the analysed frequency band.
@override
CamelotKeyPredictionSet detect(Float32List samples, int sampleRate) {
  if (sampleRate <= 0 || samples.length < _frameSize) {
    throw ArgumentError('Sample rate must be positive and samples must be at least $_frameSize in length.');
  }

  final chroma = Float64List(12);
  final window = _hannWindow(_frameSize);
  // Scratch buffers reused across frames; every element is overwritten
  // before each transform, so nothing leaks from one frame to the next.
  final real = Float64List(_frameSize);
  final imaginary = Float64List(_frameSize);
  final frameChroma = Float64List(12);
  final starts = _frameStarts(samples.length);

  // The analysed bin range depends only on the sample rate, so compute it
  // once instead of once per frame. Bin 0 (DC) is always excluded and the
  // upper bound stays below the Nyquist bin.
  final minimumBin = math.max(
    1,
    (_minimumFrequency * _frameSize / sampleRate).ceil(),
  );
  final maximumBin = math.min(
    _frameSize ~/ 2 - 1,
    (_maximumFrequency * _frameSize / sampleRate).floor(),
  );

  for (final start in starts) {
    for (int i = 0; i < _frameSize; i++) {
      real[i] = samples[start + i] * window[i];
      imaginary[i] = 0;
    }
    // The spectrum is read back out of `real`/`imaginary` below.
    FFT.transform(real, imaginary);
    frameChroma.fillRange(0, 12, 0);

    double frameWeight = 0;
    for (int bin = minimumBin; bin <= maximumBin; bin++) {
      final frequency = bin * sampleRate / _frameSize;
      final magnitude = math.sqrt(
        real[bin] * real[bin] + imaginary[bin] * imaginary[bin],
      );
      // Skip silent bins and, importantly, NaN/infinite ones: a plain
      // `magnitude <= 1e-12` test is false for NaN and would let it through.
      if (!magnitude.isFinite || magnitude <= 1e-12) {
        continue;
      }
      // Compress the dynamic range so loud partials do not dominate.
      final weightedMagnitude = math.pow(magnitude, 0.6).toDouble();
      // Fractional MIDI pitch of this bin (A4 = 440 Hz = MIDI 69).
      final midiPitch =
          69 + 12 * (math.log(frequency / 440.0) / math.ln2);
      final lowerPitch = midiPitch.floor();
      final fraction = midiPitch - lowerPitch;
      // Split the energy linearly between the two neighbouring semitones.
      frameChroma[_pitchClass(lowerPitch)] +=
          weightedMagnitude * (1 - fraction);
      frameChroma[_pitchClass(lowerPitch + 1)] +=
          weightedMagnitude * fraction;
      frameWeight += weightedMagnitude;
    }

    // Frames without usable energy contribute nothing.
    if (frameWeight <= 1e-12) {
      continue;
    }
    for (int i = 0; i < 12; i++) {
      chroma[i] += frameChroma[i];
    }
  }

  final totalEnergy = chroma.reduce((a, b) => a + b);
  if (totalEnergy <= 1e-12) {
    throw StateError('No significant energy detected in the audio.');
  }
  // Normalise so the chroma vector sums to 1.
  for (int i = 0; i < 12; i++) {
    chroma[i] /= totalEnergy;
  }

  // Score all 24 keys; insertion order (major, then minor, per tonic) is kept
  // as in the original so tie handling by `sort` is unchanged.
  final candidates = <KeyCandidate>[];
  for (int tonic = 0; tonic < 12; tonic++) {
    candidates.add(
      KeyCandidate(
        tonic: tonic,
        isMajor: true,
        score: _correlation(chroma, _majorProfile, tonic),
      ),
    );
    candidates.add(
      KeyCandidate(
        tonic: tonic,
        isMajor: false,
        score: _correlation(chroma, _minorProfile, tonic),
      ),
    );
  }
  // Best-scoring key first.
  candidates.sort((a, b) => b.score.compareTo(a.score));

  final predictions = candidates.map((e) {
    // Translate (tonic, mode) into the corresponding Camelot key.
    final camelotKey =
        e.isMajor ? _majorCamelotKeys[e.tonic] : _minorCamelotKeys[e.tonic];
    return CamelotKeyPrediction(
      key: camelotKey,
      confidence: e.score,
    );
  }).toList();
  return CamelotKeyPredictionSet(predictions: predictions);
}