tokenizeBash function

List<BashToken> tokenizeBash(String input)

Tokenize a bash command string into a list of BashTokens.

Handles quoting (single, double, ANSI-C $''), escapes, operators, heredoc markers, comments, variables, subshells, backticks, globs, and assignments.

Implementation

/// Tokenizes a bash command string into a flat list of [BashToken]s.
///
/// The scanner walks [input] once, left to right, and recognizes:
///
/// * newlines, comments, and the operators `|`, `||`, `&`, `&&`, `;`, `(`,
///   `)`, `{`, `}`;
/// * redirections (`>`, `>>`, `>&`, `<`, `<>`, `<<<`, `&>`, `&>>`, `2>`,
///   `2>>`, `2>&`) and heredoc markers (`<<`, `<<-`);
/// * single-quoted, double-quoted and ANSI-C (`$'...'`) strings, with the
///   surrounding quotes removed from the token value;
/// * backtick and `$(...)` command substitutions (value is the inner text);
/// * variables (`$NAME`, `${...}`, `$?`, `$1`, ...), globs (`*`, `?`,
///   `[...]`), assignments (`NAME=value`) and plain words.
///
/// Every token is created with the offset in [input] where it starts and the
/// number of source characters it covers. Spaces and tabs separate tokens and
/// produce no token themselves.
///
/// A `#` starts a comment when it begins a word: either it directly follows
/// skipped whitespace, or the previous token is one after which a new command
/// begins (see `isAtCommandStart` below). The comment runs to the end of the
/// line; the newline itself is emitted as a separate token.
///
/// Unterminated quotes, backticks, `$(`, `${` and `[` are not errors: they
/// simply extend to the end of [input].
List<BashToken> tokenizeBash(String input) {
  final tokens = <BashToken>[];
  final len = input.length;
  var i = 0;

  // Set when the main loop skips a space/tab, and consumed by the next
  // non-blank character. Blanks swallowed inside a word via a backslash
  // escape never go through that path, so they do not count.
  var afterBlank = false;

  // Characters that always terminate a plain word.
  const operatorChars = '|&;(){}';
  // Second character of the single-character special parameters.
  const specialParams = r'?!#@*-$';

  bool isOperatorChar(String ch) => operatorChars.contains(ch);

  bool isWhitespace(String ch) => ch == ' ' || ch == '\t';

  // Whether the character `offset` positions after the cursor is `expected`.
  bool peekIs(int offset, String expected) =>
      i + offset < len && input[i + offset] == expected;

  void emit(BashTokenType type, String value, int start, int length) {
    tokens.add(BashToken(type, value, start, length));
  }

  // Whether the previous token is one after which a new command begins
  // (or there is no previous token at all).
  bool isAtCommandStart() {
    if (tokens.isEmpty) return true;
    final last = tokens.last.type;
    return last == BashTokenType.newline ||
        last == BashTokenType.semicolon ||
        last == BashTokenType.pipe ||
        last == BashTokenType.and_ ||
        last == BashTokenType.or_ ||
        last == BashTokenType.lparen ||
        last == BashTokenType.lbrace;
  }

  while (i < len) {
    final ch = input[i];

    // Blanks only separate tokens.
    if (isWhitespace(ch)) {
      afterBlank = true;
      i++;
      continue;
    }
    final atWordStart = afterBlank;
    afterBlank = false;

    // Newline.
    if (ch == '\n') {
      emit(BashTokenType.newline, '\n', i, 1);
      i++;
      continue;
    }

    // Comment: a '#' that begins a word, i.e. one that follows whitespace or
    // sits where a new command starts. A '#' glued to preceding text (for
    // example `foo#bar`) is left to the word scanner below.
    if (ch == '#' && (atWordStart || isAtCommandStart())) {
      final start = i;
      final newline = input.indexOf('\n', start);
      i = newline == -1 ? len : newline;
      emit(BashTokenType.comment, input.substring(start, i), start, i - start);
      continue;
    }

    // Operators: ||, |
    if (ch == '|') {
      if (peekIs(1, '|')) {
        emit(BashTokenType.or_, '||', i, 2);
        i += 2;
      } else {
        emit(BashTokenType.pipe, '|', i, 1);
        i++;
      }
      continue;
    }

    // Operators: &&, &>, &>>, &
    if (ch == '&') {
      if (peekIs(1, '&')) {
        emit(BashTokenType.and_, '&&', i, 2);
        i += 2;
      } else if (peekIs(1, '>')) {
        // &> / &>> redirect both stdout and stderr.
        if (peekIs(2, '>')) {
          emit(BashTokenType.redirect, '&>>', i, 3);
          i += 3;
        } else {
          emit(BashTokenType.redirect, '&>', i, 2);
          i += 2;
        }
      } else {
        emit(BashTokenType.background, '&', i, 1);
        i++;
      }
      continue;
    }

    // Single-character punctuation.
    if (ch == ';') {
      emit(BashTokenType.semicolon, ';', i, 1);
      i++;
      continue;
    }
    if (ch == '(') {
      emit(BashTokenType.lparen, '(', i, 1);
      i++;
      continue;
    }
    if (ch == ')') {
      emit(BashTokenType.rparen, ')', i, 1);
      i++;
      continue;
    }
    if (ch == '{') {
      emit(BashTokenType.lbrace, '{', i, 1);
      i++;
      continue;
    }
    if (ch == '}') {
      emit(BashTokenType.rbrace, '}', i, 1);
      i++;
      continue;
    }

    // stderr redirections: 2>, 2>>, 2>&. Only reached when the '2' starts a
    // token; a '2' inside a word is consumed by the word scanner instead.
    if (ch == '2' && peekIs(1, '>')) {
      if (peekIs(2, '>')) {
        emit(BashTokenType.redirect, '2>>', i, 3);
        i += 3;
      } else if (peekIs(2, '&')) {
        // Keep the '&' of `2>&1` attached to the redirect; otherwise it would
        // be emitted as a background operator in the middle of the command.
        emit(BashTokenType.redirect, '2>&', i, 3);
        i += 3;
      } else {
        emit(BashTokenType.redirect, '2>', i, 2);
        i += 2;
      }
      continue;
    }

    // Output redirections: >>, >&, >
    if (ch == '>') {
      if (peekIs(1, '>')) {
        emit(BashTokenType.redirect, '>>', i, 2);
        i += 2;
      } else if (peekIs(1, '&')) {
        emit(BashTokenType.redirect, '>&', i, 2);
        i += 2;
      } else {
        emit(BashTokenType.redirect, '>', i, 1);
        i++;
      }
      continue;
    }

    // Input redirections and heredocs: <<-, <<<, <<, <>, <
    if (ch == '<') {
      if (peekIs(1, '<')) {
        if (peekIs(2, '-')) {
          emit(BashTokenType.heredocMarker, '<<-', i, 3);
          i += 3;
        } else if (peekIs(2, '<')) {
          // <<< here-string
          emit(BashTokenType.redirect, '<<<', i, 3);
          i += 3;
        } else {
          emit(BashTokenType.heredocMarker, '<<', i, 2);
          i += 2;
        }
      } else if (peekIs(1, '>')) {
        emit(BashTokenType.redirect, '<>', i, 2);
        i += 2;
      } else {
        emit(BashTokenType.redirect, '<', i, 1);
        i++;
      }
      continue;
    }

    // Single-quoted string: everything up to the next ' is literal.
    if (ch == "'") {
      final start = i;
      i++; // skip opening quote
      final buf = StringBuffer();
      while (i < len && input[i] != "'") {
        buf.write(input[i]);
        i++;
      }
      if (i < len) i++; // skip closing quote
      emit(BashTokenType.singleQuote, buf.toString(), start, i - start);
      continue;
    }

    // Double-quoted string: a backslash only escapes ", \, $, ` and newline;
    // before any other character it is kept literally.
    if (ch == '"') {
      final start = i;
      i++; // skip opening quote
      final buf = StringBuffer();
      while (i < len && input[i] != '"') {
        if (input[i] == '\\' && i + 1 < len) {
          final next = input[i + 1];
          if (next == '"' ||
              next == '\\' ||
              next == '\$' ||
              next == '`' ||
              next == '\n') {
            buf.write(next);
            i += 2;
            continue;
          }
        }
        buf.write(input[i]);
        i++;
      }
      if (i < len) i++; // skip closing quote
      emit(BashTokenType.doubleQuote, buf.toString(), start, i - start);
      continue;
    }

    // Backtick command substitution: a backslash escapes any next character.
    if (ch == '`') {
      final start = i;
      i++; // skip opening backtick
      final buf = StringBuffer();
      while (i < len && input[i] != '`') {
        if (input[i] == '\\' && i + 1 < len) {
          buf.write(input[i + 1]);
          i += 2;
          continue;
        }
        buf.write(input[i]);
        i++;
      }
      if (i < len) i++; // skip closing backtick
      emit(BashTokenType.backtick, buf.toString(), start, i - start);
      continue;
    }

    // Dollar-prefixed constructs: $(), ${}, $'...', $VAR, $?, $1, bare $.
    if (ch == '\$') {
      final start = i;

      if (peekIs(1, '(')) {
        // $( ... ) — command substitution. Parentheses are balanced by plain
        // counting; quotes inside the substitution are not interpreted.
        i += 2;
        var depth = 1;
        final buf = StringBuffer();
        while (i < len && depth > 0) {
          if (input[i] == '(') depth++;
          if (input[i] == ')') depth--;
          if (depth > 0) buf.write(input[i]);
          i++;
        }
        emit(BashTokenType.subshell, buf.toString(), start, i - start);
        continue;
      }

      if (peekIs(1, '{')) {
        // ${...} — everything up to the first '}' is the expansion body.
        i += 2;
        final buf = StringBuffer();
        while (i < len && input[i] != '}') {
          buf.write(input[i]);
          i++;
        }
        if (i < len) i++; // skip }
        emit(BashTokenType.variable, '\${$buf}', start, i - start);
        continue;
      }

      if (peekIs(1, "'")) {
        // $'...' ANSI-C quoting: decode the known escapes, keep unknown ones
        // verbatim (backslash included).
        i += 2;
        final buf = StringBuffer();
        while (i < len && input[i] != "'") {
          if (input[i] == '\\' && i + 1 < len) {
            final esc = input[i + 1];
            switch (esc) {
              case 'n':
                buf.write('\n');
                break;
              case 't':
                buf.write('\t');
                break;
              case 'r':
                buf.write('\r');
                break;
              case '\\':
                buf.write('\\');
                break;
              case "'":
                buf.write("'");
                break;
              case 'a':
                buf.write('\x07');
                break;
              case 'b':
                buf.write('\b');
                break;
              case 'e':
                buf.write('\x1B');
                break;
              case 'f':
                buf.write('\x0C');
                break;
              case 'v':
                buf.write('\x0B');
                break;
              default:
                buf.write('\\');
                buf.write(esc);
            }
            i += 2;
            continue;
          }
          buf.write(input[i]);
          i++;
        }
        if (i < len) i++; // skip closing '
        emit(BashTokenType.singleQuote, buf.toString(), start, i - start);
        continue;
      }

      if (i + 1 < len && _isVarStartChar(input[i + 1])) {
        // $VAR — name characters are decided by _isVarChar.
        i++; // skip $
        final buf = StringBuffer();
        while (i < len && _isVarChar(input[i])) {
          buf.write(input[i]);
          i++;
        }
        emit(BashTokenType.variable, '\$$buf', start, i - start);
        continue;
      }

      if (i + 1 < len && specialParams.contains(input[i + 1])) {
        // Special variables: $?, $!, $#, $@, $*, $-, $$
        emit(BashTokenType.variable, '\$${input[i + 1]}', start, 2);
        i += 2;
        continue;
      }

      if (i + 1 < len) {
        final code = input.codeUnitAt(i + 1);
        if (code >= 0x30 && code <= 0x39) {
          // Positional: $0..$9 (a single digit only).
          emit(BashTokenType.variable, '\$${input[i + 1]}', start, 2);
          i += 2;
          continue;
        }
      }

      // Bare $ — treat as word.
      i++;
      emit(BashTokenType.word, '\$', start, 1);
      continue;
    }

    // Standalone glob characters.
    if (ch == '*' || ch == '?') {
      emit(BashTokenType.glob, ch, i, 1);
      i++;
      continue;
    }
    if (ch == '[') {
      // Bracket expression: runs to the first ']' (or end of input).
      final start = i;
      i++;
      while (i < len && input[i] != ']') {
        i++;
      }
      if (i < len) i++; // skip ]
      emit(BashTokenType.glob, input.substring(start, i), start, i - start);
      continue;
    }

    // Word / assignment — collect until whitespace, an operator, or the start
    // of any construct that has its own token type.
    final start = i;
    final buf = StringBuffer();
    var sawEquals = false;
    var equalsPos = -1;
    while (i < len) {
      final c = input[i];
      if (isWhitespace(c) ||
          c == '\n' ||
          isOperatorChar(c) ||
          c == '>' ||
          c == '<') {
        break;
      }
      // A '#' after the first character ends this word; what follows is
      // tokenized separately by the main loop.
      if (c == '#' && buf.isNotEmpty) break;
      if (c == '\\' && i + 1 < len) {
        // Backslash escape: keep the escaped character, drop the backslash.
        buf.write(input[i + 1]);
        i += 2;
        continue;
      }
      if (c == "'" || c == '"' || c == '`' || c == '\$') break;
      if (c == '*' || c == '?' || c == '[') break;
      if (c == '=' && !sawEquals) {
        sawEquals = true;
        // Position within the collected word, not within the input, because
        // escapes make the two differ.
        equalsPos = buf.length;
      }
      buf.write(c);
      i++;
    }
    final word = buf.toString();
    if (word.isEmpty) {
      // Safety: advance past an unrecognised character to avoid infinite loop.
      i++;
      continue;
    }

    // NAME=value is an assignment only when NAME is non-empty and accepted by
    // _isValidVarName; anything else stays a plain word.
    final isAssignment = sawEquals &&
        equalsPos > 0 &&
        _isValidVarName(word.substring(0, equalsPos));
    emit(
      isAssignment ? BashTokenType.assignment : BashTokenType.word,
      word,
      start,
      i - start,
    );
  }

  return tokens;
}