policyIteration method
Policy iteration
Implementation
/// Runs policy iteration and returns the resulting values and policy.
///
/// Starts from the policy that selects action `0` in every state, then
/// repeats two phases for at most [maxIter] rounds:
///
/// 1. **Evaluation** - assembles the linear system
///    `(I - gamma * P_pi) V = r_pi` for the current policy and hands it to
///    [_solveLinear]; the first `nStates` entries of the result become the
///    new state values.
/// 2. **Improvement** - for every state, switches to the action whose
///    one-step lookahead value is strictly greatest (the lowest-indexed
///    action wins ties). If no action compares greater than negative
///    infinity, the state's current action is kept.
///
/// The loop ends early as soon as an improvement phase changes no state.
///
/// NOTE(review): [P] and [R] are indexed as `[state][action][nextState]`;
/// presumably they hold transition probabilities and rewards - confirm
/// against their declarations.
///
/// Returns a map with two entries: `'values'` (the state values from the
/// last evaluation phase) and `'policy'` (the chosen action index per
/// state).
Map<String, dynamic> policyIteration({
  double gamma = 0.99,
  int maxIter = 1000,
}) {
  final actions = List<int>.filled(nStates, 0);
  final values = List<double>.filled(nStates, 0.0);

  // One-step lookahead value of taking `action` in `state`, using the
  // current contents of `values`.
  double lookahead(int state, int action) {
    var total = 0.0;
    for (var next = 0; next < nStates; next++) {
      total += P[state][action][next] *
          (R[state][action][next] + gamma * values[next]);
    }
    return total;
  }

  for (var round = 0; round < maxIter; round++) {
    // Evaluation phase: each row of `lhs` is the identity row minus the
    // discounted transition row of the action the policy currently picks;
    // `rhs` accumulates the matching P * R products.
    final rhs = List<double>.filled(nStates, 0.0);
    final lhs = List.generate(nStates, (state) {
      final row = List<double>.filled(nStates, 0.0);
      row[state] = 1.0;
      final chosen = actions[state];
      for (var next = 0; next < nStates; next++) {
        row[next] -= gamma * P[state][chosen][next];
        rhs[state] += P[state][chosen][next] * R[state][chosen][next];
      }
      return row;
    });

    final solution = _solveLinear(lhs, rhs);
    for (var state = 0; state < nStates; state++) {
      values[state] = solution[state];
    }

    // Improvement phase: act greedily with respect to the new values.
    var stable = true;
    for (var state = 0; state < nStates; state++) {
      final current = actions[state];
      var greedy = current;
      var greedyValue = double.negativeInfinity;
      for (var action = 0; action < nActions; action++) {
        final candidate = lookahead(state, action);
        // Strict comparison: the first action reaching the maximum wins.
        if (candidate > greedyValue) {
          greedyValue = candidate;
          greedy = action;
        }
      }
      if (greedy == current) continue;
      actions[state] = greedy;
      stable = false;
    }

    // No state switched action, so another round would repeat this one.
    if (stable) break;
  }

  return {'values': values, 'policy': actions};
}