AhoCorasick.fromWordList constructor

AhoCorasick.fromWordList(
  1. List<String> patterns, {
  2. bool separateBySpaces = false,
})

initialize the aho corasick algorithm with a given list of words this will create the state machine. If you want to force that recognized words are either at the beginning / end or are surounded by spaces set separateBySpaces to true

Implementation

factory AhoCorasick.fromWordList(List<String> patterns,
    {bool separateBySpaces = false}) {
  // initial state is the empty word
  final states = [WordState(index: 0, words: [], failure: 0)];
  var longest = 0;
  // ignore: omit_local_variable_types
  Map<int, Map<String, int>> transitionMap = {};
  patterns.forEach((inputPattern) {
    final pattern = separateBySpaces ? ' $inputPattern ' : inputPattern;
    var curStateIndex = 0;
    longest = max(longest, pattern.length);
    for (var i = 0; i < pattern.length; i++) {
      final stateMap = transitionMap[curStateIndex] ?? {};
      final resState = stateMap[pattern[i]];
      if (resState == null) {
        final newStateIdx = states.length;
        final newState = WordState(
            index: newStateIdx,
            words: (i == pattern.length - 1) ? [pattern] : [],
            isFinal: i == pattern.length - 1);
        states.add(newState);
        stateMap.putIfAbsent(pattern[i], () => newStateIdx);
        transitionMap.update(curStateIndex, (_) => stateMap,
            ifAbsent: () => stateMap);
        curStateIndex = newStateIdx;
      } else {
        curStateIndex = resState;
        if (i == pattern.length - 1) {
          final curState = WordState(
              index: curStateIndex,
              words: states[curStateIndex].words..add(pattern),
              isFinal: i == pattern.length - 1);
          states[curStateIndex] = curState;
        }
      }
    }
  });
  // edge case when there are no patterns ;)
  if (transitionMap.containsKey(0)) {
    // if the state machine cannot follow up on a pattern it might
    // follow up on other patterns those are tracked with this
    // failure map
    final queue = Queue<int>();
    transitionMap[0]?.forEach((character, resState) {
      states[resState] = states[resState].copyWith(
        failure: 0,
      );
      queue.add(resState);
    });
    while (queue.isNotEmpty) {
      final stateIdx = queue.removeFirst();
      (transitionMap[stateIdx] ?? {}).forEach((character, resState) {
        queue.add(resState);
        var failStateIdx = states[stateIdx].failure;
        while ((transitionMap[failStateIdx] ?? {})[character] == null) {
          if (failStateIdx == 0) {
            break;
          }
          failStateIdx = states[failStateIdx].failure;
        }
        failStateIdx = (transitionMap[failStateIdx] ?? {})[character] ?? 0;
        states[resState] = states[resState].copyWith(
          failure: failStateIdx,
          words: states[resState].words..addAll(states[failStateIdx].words),
          isFinal: states[failStateIdx].isFinal || states[resState].isFinal,
        );
      });
    }
  }
  return AhoCorasick._(
      separateBySpaces: separateBySpaces,
      longestPattern: longest,
      stateMachine: StateMachine<WordState, String>(
          states: states,
          isSuccessState: (state) => state.isFinal,
          transition: (
              {required WordState activeState, required String input}) {
            var curState = activeState.index;
            var endState = (transitionMap[curState] ?? {})[input] ?? -1;

            while (endState == -1) {
              endState = states[curState].failure;
              curState = endState;
            }
            return states[(transitionMap[curState] ?? {})[input] ?? 0];
          }));
}