shell-parse | | | Search

The code is a parser for a Bash-like shell scripting language, using regular expressions to tokenize the input string and apply specific parsing rules for quoting and syntax. The parser handles various syntax elements, including single and double quotes, control characters and keywords, and backslashes and dollar signs, to correctly interpret the input string.

Run example

npm run import -- "shell parse"

shell parse


// '<(' is process substitution operator and
// can be parsed the same as control operator
//
// Multi-character operators are listed before the single-character class so
// the alternation prefers the longest operator (e.g. '<<<' over '<').
var CONTROL = '(?:' + [
	'\\|\\|',
	'\\&\\&',
	';;',
	'\\|\\&',
	'\\<\\(',
	'\\<\\<\\<',
	'>>',
	'>\\&',
	'<\\&',
	'[&;()|<>]'
].join('|') + ')';
// Anchored at both ends: matches only a string that is exactly one operator.
var controlRE = new RegExp('^' + CONTROL + '$');
// Characters that end a bareword unless escaped (spliced into character classes).
var META = '|&;()<> \\t';
// NOTE: the two names below are swapped relative to what the patterns match.
// SINGLE_QUOTE matches a "double-quoted" run and DOUBLE_QUOTE a 'single-quoted'
// run. Both are only ever used together in one alternation, so the naming does
// not affect behaviour; they are kept as-is because other code refers to them.
var SINGLE_QUOTE = '"((\\\\"|[^"])*?)"';
var DOUBLE_QUOTE = '\'((\\\\\'|[^\'])*?)\'';
// Tested against a single character to detect the start of a comment.
var hash = /^#$/;

var SQ = "'";
var DQ = '"';
var DS = '$';

// Random per-load sentinel used to fence JSON-encoded object values inside a
// parsed string so they can be recovered later. The product is not an integer,
// so each hex chunk may contain a '.', which as a regex metacharacter still
// matches the literal '.' it came from.
var TOKEN = '';
var mult = 0x100000000; // Math.pow(16, 8);
for (var i = 0; i < 4; i++) {
	TOKEN += (mult * Math.random()).toString(16);
}
var startsWithToken = new RegExp('^' + TOKEN);

/**
 * Collect every match of regex `r` in string `s`.
 *
 * Scanning starts from whatever `r.lastIndex` currently is, and that value is
 * put back before returning, so the caller's regex state is left untouched.
 *
 * @param {string} s - text to scan.
 * @param {RegExp} r - regex to apply repeatedly with `exec`.
 * @returns {Array} the `exec` result objects, in order of occurrence.
 */
function matchAll(s, r) {
	var savedIndex = r.lastIndex;
	var found = [];

	for (var hit = r.exec(s); hit; hit = r.exec(s)) {
		found.push(hit);
		// A zero-width match leaves lastIndex where it was; step forward
		// manually so the next exec does not return the same match again.
		if (hit.index === r.lastIndex) {
			r.lastIndex += 1;
		}
	}

	r.lastIndex = savedIndex;
	return found;
}

/**
 * Resolve one variable reference to the text that is spliced into the output.
 *
 * @param {Object|Function} env - lookup table, or a function called with the
 *   variable name that returns its value.
 * @param {string} pre - text prepended to the resolved value.
 * @param {string} key - variable name; the empty string means a `$` that was
 *   not followed by any name.
 * @returns {string} `pre` followed by the value. Object values are
 *   JSON-encoded and fenced with TOKEN on both sides so they can be located
 *   and decoded again later.
 */
function getVar(env, pre, key) {
	var r = typeof env === 'function' ? env(key) : env[key];

	if (typeof r === 'undefined') {
		// A named variable with no value expands to nothing; a `$` with no
		// name after it stays a literal dollar sign.
		r = key === '' ? '$' : '';
	}

	if (typeof r === 'object') {
		return pre + TOKEN + JSON.stringify(r) + TOKEN;
	}
	return pre + r;
}

/**
 * Split `string` into shell-style tokens, applying Bash-like quoting rules and
 * expanding `$name` / `${name}` references through `env`.
 *
 * @param {string} string - the command line to parse.
 * @param {Object|Function} [env] - variable lookup (object or function of the
 *   variable name); a falsy value is treated as an empty object.
 * @param {Object} [opts] - options; `opts.escape` replaces the default
 *   backslash escape character.
 * @returns {Array} flat list whose entries are plain strings (words),
 *   `{ op: <operator> }` for control operators, `{ op: 'glob', pattern }` for
 *   words containing an unquoted `*` or `?`, and `{ comment }` for the text
 *   after an unquoted `#` (nothing after a comment is emitted).
 * @throws {Error} "Bad substitution" for `${}` or a `${` with no closing `}`.
 */
function parseInternal(string, env, opts) {
	if (!opts) {
		opts = {};
	}
	var BS = opts.escape || '\\';
	var BAREWORD = '(\\' + BS + '[\'"' + META + ']|[^\\s\'"' + META + '])+';

	var chunker = new RegExp([
		'(' + CONTROL + ')', // control chars
		'(' + BAREWORD + '|' + SINGLE_QUOTE + '|' + DOUBLE_QUOTE + ')+'
	].join('|'), 'g');

	var matches = matchAll(string, chunker);

	if (matches.length === 0) {
		return [];
	}
	if (!env) {
		env = {};
	}

	// Set once a comment has been seen; every later chunk is then dropped.
	var commented = false;

	return matches.map(function (match) {
		var s = match[0];
		if (!s || commented) {
			return undefined;
		}
		if (controlRE.test(s)) {
			return { op: s };
		}

		// Hand-written scanner/parser for Bash quoting rules:
		//
		// 1. inside single quotes, all characters are printed literally.
		// 2. inside double quotes, all characters are printed literally
		//    except variables prefixed by '$' and backslashes followed by
		//    either a double quote or another backslash.
		// 3. outside of any quotes, backslashes are treated as escape
		//    characters and not printed (unless they are themselves escaped)
		// 4. quote context can switch mid-token if there is no whitespace
		//     between the two quote contexts (e.g. all'one'"token" parses as
		//     "allonetoken")
		var quote = false;
		var esc = false;
		var out = '';
		var isGlob = false;
		var i;

		// Called with `i` on a '$'. Reads the variable name that follows and
		// returns its value. On return `i` sits on the LAST character that was
		// consumed, because the enclosing loop's i++ then steps past it.
		function parseEnvVar() {
			i += 1;
			var varend;
			var varname;
			var char = s.charAt(i);

			if (char === '{') {
				i += 1;
				if (s.charAt(i) === '}') {
					throw new Error('Bad substitution: ' + s.slice(i - 2, i + 1));
				}
				varend = s.indexOf('}', i);
				if (varend < 0) {
					throw new Error('Bad substitution: ' + s.slice(i));
				}
				varname = s.slice(i, varend);
				i = varend;
			} else if ((/[*@#?$!_-]/).test(char)) {
				// Single-character special parameter. `i` already points at
				// it; advancing here as well would make the loop skip the
				// character that follows (e.g. the "x" in "$?x").
				varname = char;
			} else {
				var slicedFromI = s.slice(i);
				varend = slicedFromI.match(/[^\w\d_]/);
				if (!varend) {
					varname = slicedFromI;
					i = s.length;
				} else {
					varname = slicedFromI.slice(0, varend.index);
					i += varend.index - 1;
				}
			}
			return getVar(env, '', varname);
		}

		for (i = 0; i < s.length; i++) {
			var c = s.charAt(i);
			isGlob = isGlob || (!quote && (c === '*' || c === '?'));
			if (esc) {
				out += c;
				esc = false;
			} else if (quote) {
				if (c === quote) {
					quote = false;
				} else if (quote === SQ) {
					out += c;
				} else { // Double quote
					if (c === BS) {
						i += 1;
						c = s.charAt(i);
						if (c === DQ || c === BS || c === DS) {
							out += c;
						} else {
							// The escape character is kept before any other character.
							out += BS + c;
						}
					} else if (c === DS) {
						out += parseEnvVar();
					} else {
						out += c;
					}
				}
			} else if (c === DQ || c === SQ) {
				quote = c;
			} else if (controlRE.test(c)) {
				return { op: s };
			} else if (hash.test(c)) {
				commented = true;
				// The comment text is taken from the full input, not just this
				// chunk, so it runs to the end of the string.
				var commentObj = { comment: string.slice(match.index + i + 1) };
				if (out.length) {
					return [out, commentObj];
				}
				return [commentObj];
			} else if (c === BS) {
				esc = true;
			} else if (c === DS) {
				out += parseEnvVar();
			} else {
				out += c;
			}
		}

		if (isGlob) {
			return { op: 'glob', pattern: out };
		}

		return out;
	}).reduce(function (prev, arg) { // finalize parsed arguments
		// Drops skipped (undefined) chunks and flattens [word, comment] pairs.
		// TODO: replace this whole reduce with a concat
		return typeof arg === 'undefined' ? prev : prev.concat(arg);
	}, []);
}

module.exports = function parse(s, env, opts) {
	var mapped = parseInternal(s, env, opts);
	if (typeof env !== 'function') {
		return mapped;
	}
	return mapped.reduce(function (acc, s) {
		if (typeof s === 'object') {
			return acc.concat(s);
		}
		var xs = s.split(RegExp('(' + TOKEN + '.*?' + TOKEN + ')', 'g'));
		if (xs.length === 1) {
			return acc.concat(xs[0]);
		}
		return acc.concat(xs.filter(Boolean).map(function (x) {
			if (startsWithToken.test(x)) {
				return JSON.parse(x.split(TOKEN)[1]);
			}
			return x;
		}));
	}, []);
};

What the code could have been:

const controlChars = '|&&;;|&|<(|<<<|>>|>&|&<|[&;()|<>]';
const controlRE = new RegExp(`^(${controlChars})



  
  
  shell parse
  
  
  



  
  
shell-parse | | | Search
); const metaChars = '|&;()<>'; const singleQuote = '"((\\\\\')|[^"])*?"'; const doubleQuote = '\'((\\\\\')|[^\'])*?\''; const token = (0x100000000 * Math.random()).toString(16).slice(0, 8); function* matchAll(s, r) { const origIndex = r.lastIndex; while ((r.lastIndex === 0 || s[r.lastIndex - 1]!== r.lastIndex[0]) && (matchObj = r.exec(s))) { yield matchObj; } r.lastIndex = origIndex; } function getVar(env, pre, key) { let r = typeof env === 'function'? env(key) : env[key]; if (typeof r === 'undefined' && key!= '') { r = ''; } else if (typeof r === 'undefined') { r = ' ; } if (typeof r === 'object') { return `${pre}${token}${JSON.stringify(r)}${token}`; } return `${pre}${r}`; } function parseInternal(string, env, opts) { if (!opts) { opts = {}; } const escape = opts.escape || '\\'; const bareword = `(\\${escape}[\'"\${metaChars}]|[^\\s\'"\${metaChars}])+`; const chunker = new RegExp(`(${controlChars})|(${bareword}|${singleQuote}|${doubleQuote})+`, 'g'); const matches = matchAll(string, chunker); if (matches.length === 0) { return []; } if (!env) { env = {}; } let commented = false; for (const match of matches) { const s = match[0]; if (!s || commented) { continue; } if (controlRE.test(s)) { yield { op: s }; continue; } let out = ''; let quote = false; let esc = false; let isGlob = false; let i; function parseEnvVar() { i += 1; let varname; let char = s.charAt(i); if (char === '{') { i += 1; if (s.charAt(i) === '}') { throw new Error('Bad substitution:'+ s.slice(i - 2, i + 1)); } varname = s.slice(i, s.indexOf('}', i)); i = s.indexOf('}', i) + 1; } else if ((/[*@#?$!_-]/).test(char)) { varname = char; i += 1; } else { var slicedFromI = s.slice(i); const varend = slicedFromI.match(/[^\w\d_]/); if (!varend) { varname = slicedFromI; i = s.length; } else { varname = slicedFromI.slice(0, varend.index); i += varend.index - 1; } } yield getVar(env, '', varname); } for (i = 0; i < s.length; i++) { let c = s.charAt(i); isGlob = isGlob || (!quote && (c === '*' || c === '?')); 
if (esc) { out += c; esc = false; } else if (quote) { if (c === quote) { quote = false; } else if (quote == '"') { out += c; } else { // Single quote if (c === escape) { i += 1; c = s.charAt(i); if (c === quote || c === escape) { out += c; } else { out += escape + c; } } else if (i + 1 < s.length && s.slice(i, i + 2) === escape + quote) { if (out[out.length - 1] === '\\') { out = out.slice(0, -1) + quote; } else { out += '\\'; } } else if (c === escape + quote) { out += c; } else { out += c; } } } else if (c === doubleQuote || c === singleQuote) { quote = c; } else if (controlRE.test(c)) { yield { op: s }; continue; } else if (c === escape) { esc = true; } else if (i + 1 < s.length && s.slice(i, i + 2) === escape + quote) { if (out[out.length - 1] === '\\') { out = out.slice(0, -1) + quote; } else { out += '\\'; } } else if (c === escape + quote) { out += c; } else if (c === quote) { quote = false; } else if (c === ' ) { yield parseEnvVar(); } else { out += c; } } if (isGlob) { yield { op: 'glob', pattern: out }; } yield out; } } module.exports = function parse(s, env, opts) { const mapped = parseInternal(s, env, opts); if (typeof env!== 'function') { return mapped; } const tokenized = Array.from(mapped).map(arg => { if (typeof arg === 'undefined') return arg; if (typeof arg === 'object') return arg; const xs = arg.split(new RegExp(`(${token}.*?${token})`, 'g')); if (xs.length === 1) return xs[0]; return xs.filter(Boolean).map(x => { if (startsWithToken.test(x)) { return JSON.parse(x.split(token)[1]); } return x; }); }).flat(Infinity); return tokenized; };

Code Breakdown

The code appears to be a parser for a Bash-like shell scripting language. It uses regular expressions to scan and tokenize the input string, and then applies specific parsing rules to handle quoting and other syntax.

Variables and Constants

Functions

Parsing Rules

The code implements hand-written parsing rules for Bash quoting, which includes:

  1. Single quotes: all characters are printed literally.
  2. Double quotes: all characters are printed literally, except variables prefixed by `$`, which are expanded, and backslashes followed by a double quote, another backslash, or a dollar sign, which escape that character.
  3. Quoting: the parser handles quoting by identifying the quoting character (single or double quote) and then processing the inner string accordingly.

The parser also handles control characters and keywords, as well as backslashes and dollar signs in the input string.