tokenizeBash function
Tokenize a bash command string into a list of BashTokens.
Handles quoting (single, double, ANSI-C $''), escapes, operators, heredoc markers, comments, variables, subshells, backticks, globs, and assignments.
Implementation
/// Tokenizes a bash command string into a flat list of [BashToken]s.
///
/// [input] is scanned once, left to right. Each token records the offset in
/// [input] where it starts and the number of source characters it spans. The
/// token value is the decoded text (surrounding quotes removed, recognised
/// escapes resolved), so its length may differ from the span length.
/// Unquoted spaces and tabs separate tokens and produce no token themselves.
///
/// Recognised constructs:
///
/// * newlines and `#` comments (a comment runs to the end of its line);
/// * control operators `|`, `||`, `&`, `&&`, `;`, `(`, `)`, `{`, `}`;
/// * redirections `>`, `>>`, `>&`, `>|`, `<`, `<>`, `<&`, `<<<`, `2>`, `2>>`,
///   `2>&`, `&>`, `&>>`, and the heredoc markers `<<` and `<<-`;
/// * single-quoted, double-quoted and ANSI-C (`$'...'`) strings;
/// * backtick and `$(...)` command substitutions;
/// * `$NAME`, `${...}`, special (`$?`, `$!`, `$#`, `$@`, `$*`, `$-`, `$$`)
///   and single-digit positional variables;
/// * the glob characters `*` and `?`, and `[...]` bracket expressions;
/// * plain words and `NAME=value` assignments.
///
/// Unterminated quotes, substitutions and expansions are tolerated: the
/// token simply extends to the end of [input].
///
/// Known simplifications: `$(...)` is delimited by counting parentheses only
/// (quotes inside it are not interpreted), heredoc bodies are not collected,
/// and numeric/control escapes inside `$'...'` are kept verbatim.
List<BashToken> tokenizeBash(String input) {
  final tokens = <BashToken>[];
  final len = input.length;
  var i = 0;

  // Single-character escapes understood inside $'...'.
  const ansiCEscapes = <String, String>{
    'n': '\n',
    't': '\t',
    'r': '\r',
    '\\': '\\',
    "'": "'",
    '"': '"',
    '?': '?',
    'a': '\x07',
    'b': '\b',
    'e': '\x1B',
    'f': '\x0C',
    'v': '\x0B',
  };

  void emit(BashTokenType type, String value, int start, int length) {
    tokens.add(BashToken(type, value, start, length));
  }

  bool isOperatorChar(String ch) =>
      ch == '|' ||
      ch == '&' ||
      ch == ';' ||
      ch == '(' ||
      ch == ')' ||
      ch == '{' ||
      ch == '}';

  bool isWhitespace(String ch) => ch == ' ' || ch == '\t';

  // Whether the next token would begin a command: nothing has been emitted
  // yet, or the previous token is a newline or a command separator/opener.
  bool isAtCommandStart() {
    if (tokens.isEmpty) return true;
    final last = tokens.last.type;
    return last == BashTokenType.newline ||
        last == BashTokenType.semicolon ||
        last == BashTokenType.pipe ||
        last == BashTokenType.and_ ||
        last == BashTokenType.or_ ||
        last == BashTokenType.lparen ||
        last == BashTokenType.lbrace;
  }

  // Every `[` below this index is already known to have no closing `]`
  // before the next blank, so repeated lookups stay linear overall.
  var bracketFreeUntil = -1;

  // Returns the index just past the `]` that closes the bracket expression
  // opening at [open], or -1 when a space, tab, newline or the end of input
  // comes first. In the latter case the `[` is an ordinary word character
  // (as in `[ -f file ]` or `[[ ... ]]`), not a glob.
  int bracketGlobEnd(int open) {
    if (open < bracketFreeUntil) return -1;
    var j = open + 1;
    while (j < len) {
      final c = input[j];
      if (c == ']') return j + 1;
      if (isWhitespace(c) || c == '\n') break;
      j++;
    }
    bracketFreeUntil = j;
    return -1;
  }

  while (i < len) {
    final ch = input[i];

    // Unquoted blanks only separate tokens.
    if (isWhitespace(ch)) {
      i++;
      continue;
    }

    if (ch == '\n') {
      emit(BashTokenType.newline, '\n', i, 1);
      i++;
      continue;
    }

    // Comment: a `#` that begins a word, i.e. at command start or directly
    // after a blank or a line continuation. A `#` in the middle of a word
    // never reaches this point because the word scanner consumes it.
    if (ch == '#' &&
        (isAtCommandStart() ||
            (i > 0 &&
                (isWhitespace(input[i - 1]) || input[i - 1] == '\n')))) {
      final start = i;
      while (i < len && input[i] != '\n') {
        i++;
      }
      emit(BashTokenType.comment, input.substring(start, i), start, i - start);
      continue;
    }

    // Control operators: ||, |, &&, &, ;, (, ), {, } (and the &>, &>>
    // redirections, which share the leading `&`).
    if (ch == '|') {
      if (i + 1 < len && input[i + 1] == '|') {
        emit(BashTokenType.or_, '||', i, 2);
        i += 2;
      } else {
        emit(BashTokenType.pipe, '|', i, 1);
        i++;
      }
      continue;
    }
    if (ch == '&') {
      final next = i + 1 < len ? input[i + 1] : null;
      if (next == '&') {
        emit(BashTokenType.and_, '&&', i, 2);
        i += 2;
      } else if (next == '>') {
        final op = (i + 2 < len && input[i + 2] == '>') ? '&>>' : '&>';
        emit(BashTokenType.redirect, op, i, op.length);
        i += op.length;
      } else {
        emit(BashTokenType.background, '&', i, 1);
        i++;
      }
      continue;
    }
    if (ch == ';') {
      emit(BashTokenType.semicolon, ';', i, 1);
      i++;
      continue;
    }
    if (ch == '(') {
      emit(BashTokenType.lparen, '(', i, 1);
      i++;
      continue;
    }
    if (ch == ')') {
      emit(BashTokenType.rparen, ')', i, 1);
      i++;
      continue;
    }
    if (ch == '{') {
      emit(BashTokenType.lbrace, '{', i, 1);
      i++;
      continue;
    }
    if (ch == '}') {
      emit(BashTokenType.rbrace, '}', i, 1);
      i++;
      continue;
    }

    // stderr redirections: 2>, 2>>, 2>&. The `&` of `2>&1` belongs to the
    // redirection and must not be emitted as a background operator.
    if (ch == '2' && i + 1 < len && input[i + 1] == '>') {
      final third = i + 2 < len ? input[i + 2] : null;
      final op = third == '>'
          ? '2>>'
          : third == '&'
              ? '2>&'
              : '2>';
      emit(BashTokenType.redirect, op, i, op.length);
      i += op.length;
      continue;
    }

    // Output redirections: >, >>, >&, >| (the `|` of `>|` is not a pipe).
    if (ch == '>') {
      final next = i + 1 < len ? input[i + 1] : null;
      final op = next == '>'
          ? '>>'
          : next == '&'
              ? '>&'
              : next == '|'
                  ? '>|'
                  : '>';
      emit(BashTokenType.redirect, op, i, op.length);
      i += op.length;
      continue;
    }

    // Input redirections and heredoc markers: <, <>, <&, <<, <<-, <<<.
    if (ch == '<') {
      final next = i + 1 < len ? input[i + 1] : null;
      final third = i + 2 < len ? input[i + 2] : null;
      if (next == '<') {
        if (third == '-') {
          emit(BashTokenType.heredocMarker, '<<-', i, 3);
          i += 3;
        } else if (third == '<') {
          // Here-string.
          emit(BashTokenType.redirect, '<<<', i, 3);
          i += 3;
        } else {
          emit(BashTokenType.heredocMarker, '<<', i, 2);
          i += 2;
        }
      } else if (next == '>' || next == '&') {
        final op = next == '>' ? '<>' : '<&';
        emit(BashTokenType.redirect, op, i, 2);
        i += 2;
      } else {
        emit(BashTokenType.redirect, '<', i, 1);
        i++;
      }
      continue;
    }

    // Single-quoted string: everything up to the next ' is literal.
    if (ch == "'") {
      final start = i;
      i++; // opening quote
      final buf = StringBuffer();
      while (i < len && input[i] != "'") {
        buf.write(input[i]);
        i++;
      }
      if (i < len) i++; // closing quote
      emit(BashTokenType.singleQuote, buf.toString(), start, i - start);
      continue;
    }

    // Double-quoted string: backslash is an escape only before ", \, $, `
    // and newline; before anything else it is kept literally.
    if (ch == '"') {
      final start = i;
      i++; // opening quote
      final buf = StringBuffer();
      while (i < len && input[i] != '"') {
        if (input[i] == '\\' && i + 1 < len) {
          final next = input[i + 1];
          if (next == '\n') {
            // Line continuation: both characters vanish.
            i += 2;
            continue;
          }
          if (next == '"' || next == '\\' || next == '\$' || next == '`') {
            buf.write(next);
            i += 2;
            continue;
          }
        }
        buf.write(input[i]);
        i++;
      }
      if (i < len) i++; // closing quote
      emit(BashTokenType.doubleQuote, buf.toString(), start, i - start);
      continue;
    }

    // Backtick command substitution: backslash is an escape only before
    // `, \ and $; any other backslash sequence is kept as written.
    if (ch == '`') {
      final start = i;
      i++; // opening backtick
      final buf = StringBuffer();
      while (i < len && input[i] != '`') {
        if (input[i] == '\\' && i + 1 < len) {
          final next = input[i + 1];
          if (next != '`' && next != '\\' && next != '\$') {
            buf.write('\\');
          }
          buf.write(next);
          i += 2;
          continue;
        }
        buf.write(input[i]);
        i++;
      }
      if (i < len) i++; // closing backtick
      emit(BashTokenType.backtick, buf.toString(), start, i - start);
      continue;
    }

    // Dollar-prefixed constructs: $(...), ${...}, $'...', $NAME, $?, $1, ...
    if (ch == '\$') {
      final start = i;
      final next = i + 1 < len ? input[i + 1] : null;

      if (next == '(') {
        // Command substitution, delimited by balanced parentheses. The
        // value is the text between the outermost pair.
        i += 2;
        var depth = 1;
        final buf = StringBuffer();
        while (i < len && depth > 0) {
          if (input[i] == '(') depth++;
          if (input[i] == ')') depth--;
          if (depth > 0) buf.write(input[i]);
          i++;
        }
        emit(BashTokenType.subshell, buf.toString(), start, i - start);
        continue;
      }

      if (next == '{') {
        // Parameter expansion. A nested `${` opens another level so that
        // `${a:-${b}}` ends at its own closing brace, not the inner one.
        i += 2;
        var depth = 1;
        final buf = StringBuffer();
        while (i < len) {
          if (input.startsWith('\${', i)) {
            depth++;
            buf.write('\${');
            i += 2;
            continue;
          }
          final c = input[i];
          if (c == '}' && --depth == 0) break;
          buf.write(c);
          i++;
        }
        if (i < len) i++; // closing brace
        final body = buf.toString();
        emit(BashTokenType.variable, '\${$body}', start, i - start);
        continue;
      }

      if (next == "'") {
        // ANSI-C quoting. Unrecognised escapes keep their backslash.
        i += 2;
        final buf = StringBuffer();
        while (i < len && input[i] != "'") {
          if (input[i] == '\\' && i + 1 < len) {
            final esc = input[i + 1];
            buf.write(ansiCEscapes[esc] ?? '\\$esc');
            i += 2;
            continue;
          }
          buf.write(input[i]);
          i++;
        }
        if (i < len) i++; // closing quote
        emit(BashTokenType.singleQuote, buf.toString(), start, i - start);
        continue;
      }

      if (next != null) {
        if (_isVarStartChar(next)) {
          // $NAME
          i++; // the dollar sign
          final buf = StringBuffer();
          while (i < len && _isVarChar(input[i])) {
            buf.write(input[i]);
            i++;
          }
          final name = buf.toString();
          emit(BashTokenType.variable, '\$$name', start, i - start);
          continue;
        }

        // Special parameters ($?, $!, $#, $@, $*, $-, $$) and the
        // single-digit positional parameters $0..$9.
        final code = next.codeUnitAt(0);
        if (r'?!#@*-$'.contains(next) || (code >= 0x30 && code <= 0x39)) {
          emit(BashTokenType.variable, '\$$next', start, 2);
          i += 2;
          continue;
        }
      }

      // A `$` that introduces nothing is an ordinary one-character word.
      i++;
      emit(BashTokenType.word, '\$', start, 1);
      continue;
    }

    // Standalone glob characters.
    if (ch == '*' || ch == '?') {
      emit(BashTokenType.glob, ch, i, 1);
      i++;
      continue;
    }

    // Bracket expression such as [abc] or [0-9]. A `[` without a closing
    // `]` in the same word falls through and is scanned as word text.
    if (ch == '[') {
      final end = bracketGlobEnd(i);
      if (end != -1) {
        emit(BashTokenType.glob, input.substring(i, end), i, end - i);
        i = end;
        continue;
      }
    }

    // Word or assignment: collect until a blank, an operator, a redirection
    // or the start of a quote, expansion or glob.
    final start = i;
    final buf = StringBuffer();
    // Offset of the first unescaped `=` within the decoded word, or -1.
    var equalsPos = -1;
    while (i < len) {
      final c = input[i];
      if (isWhitespace(c) ||
          c == '\n' ||
          isOperatorChar(c) ||
          c == '>' ||
          c == '<') {
        break;
      }
      if (c == '\\' && i + 1 < len) {
        // Backslash-newline is a line continuation and contributes
        // nothing; any other escaped character is taken literally.
        if (input[i + 1] != '\n') buf.write(input[i + 1]);
        i += 2;
        continue;
      }
      if (c == "'" || c == '"' || c == '`' || c == '\$') break;
      if (c == '*' || c == '?') break;
      if (c == '[' && bracketGlobEnd(i) != -1) break;
      if (c == '=' && equalsPos < 0) equalsPos = buf.length;
      buf.write(c);
      i++;
    }

    final word = buf.toString();
    if (word.isEmpty) {
      // Either only line continuations were consumed (the index already
      // moved), or nothing was: step over one character so the scan always
      // makes progress.
      if (i == start) i++;
      continue;
    }

    // NAME=value with a valid NAME is an assignment; anything else is a word.
    final isAssignment =
        equalsPos > 0 && _isValidVarName(word.substring(0, equalsPos));
    emit(
      isAssignment ? BashTokenType.assignment : BashTokenType.word,
      word,
      start,
      i - start,
    );
  }

  return tokens;
}