This code parses simplified glob patterns, converting them into regular expressions for matching files and directories. It handles special characters, escaping, and nested patterns to accurately represent the glob syntax in a regular expression format.
npm run import -- "mini parser"
var reSpecials = charSet('().*{}+?[]^$\\!')
// replace stuff like \* with *
function globUnescape (s) {
return s.replace(/\\(.)/g, '$1')
}
function charSet (s) {
return s.split('').reduce(function (set, c) {
set[c] = true
return set
}, {})
}
var SUBPARSE = {}
function parse (pattern, isSub) {
if (pattern.length > 1024 * 64) {
throw new TypeError('pattern is too long')
}
// shortcuts
if (pattern === '**') return {glob: true}
if (pattern === '') return ''
var re = ''
var hasMagic = false
var escaping = false
// ? => one single character
var patternListStack = []
var negativeLists = []
var stateChar
var inClass = false
var reClassStart = -1
var classStart = -1
// . and .. never match anything that doesn't start with .,
// even when options.dot is set.
var patternStart = pattern.charAt(0) === '.' ? '' // anything
// not (start or / followed by . or .. followed by / or end)
: '(?!(?:^|\\\/)\\.{1,2}(?:$|\\\/))'
function clearStateChar () {
if (stateChar) {
// we had some state-tracking character
// that wasn't consumed by this pass.
switch (stateChar) {
case '*':
re += '[^/]*?'
hasMagic = true
break
case '?':
re += '[^/]'
hasMagic = true
break
default:
re += '\\' + stateChar
break
}
stateChar = false
}
}
for (var i = 0, len = pattern.length, c
; (i < len) && (c = pattern.charAt(i))
; i++) {
// skip over any that are escaped.
if (escaping && reSpecials[c]) {
re += '\\' + c
escaping = false
continue
}
switch (c) {
case '/':
// completely not allowed, even escaped.
// Should already be path-split by now.
return false
case '\\':
clearStateChar()
escaping = true
continue
// the various stateChar values
// for the "extglob" stuff.
case '?':
case '*':
case '+':
case '@':
case '!':
// all of those are literals inside a class, except that
// the glob [!a] means [^a] in regexp
if (inClass) {
if (c === '!' && i === classStart + 1) c = '^'
re += c
continue
}
// if we already have a stateChar, then it means
// that there was something like ** or +? in there.
// Handle the stateChar, then proceed with this one.
clearStateChar()
stateChar = c
// if extglob is disabled, then +(asdf|foo) isn't a thing.
// just clear the statechar *now*, rather than even diving into
// the patternList stuff.
continue
case '(':
if (inClass) {
re += '('
continue
}
if (!stateChar) {
re += '\\('
continue
}
patternListStack.push({
type: stateChar,
start: i - 1,
reStart: re.length,
open: '\\(',
close: '\\)'
})
// negation is (?:(?!js)[^/]*)
re += stateChar === '!' ? '(?:(?!(?:' : '(?:'
stateChar = false
continue
case ')':
if (inClass || !patternListStack.length) {
re += '\\)'
continue
}
clearStateChar()
hasMagic = true
var pl = patternListStack.pop()
// negation is (?:(?!js)[^/]*)
// The others are (?:<pattern>)<type>
re += pl.close
if (pl.type === '!') {
negativeLists.push(pl)
}
pl.reEnd = re.length
continue
case '|':
if (inClass || !patternListStack.length || escaping) {
re += '\\|'
escaping = false
continue
}
clearStateChar()
re += '|'
continue
// these are mostly the same in regexp and glob
case '[':
// swallow any state-tracking char before the [
clearStateChar()
if (inClass) {
re += '\\' + c
continue
}
inClass = true
classStart = i
reClassStart = re.length
re += c
continue
case ']':
// a right bracket shall lose its special
// meaning and represent itself in
// a bracket expression if it occurs
// first in the list. -- POSIX.2 2.8.3.2
if (i === classStart + 1 || !inClass) {
re += '\\' + c
escaping = false
continue
}
// handle the case where we left a class open.
// "[z-a]" is valid, equivalent to "\[z-a\]"
if (inClass) {
// split where the last [ was, make sure we don't have
// an invalid re. if so, re-walk the contents of the
// would-be class to re-translate any characters that
// were passed through as-is
// TODO: It would probably be faster to determine this
// without a try/catch and a new RegExp, but it's tricky
// to do safely. For now, this is safe and works.
var cs = pattern.substring(classStart + 1, i)
try {
RegExp('[' + cs + ']')
} catch (er) {
// not a valid class!
var sp = parse(cs, SUBPARSE)
re = re.substr(0, reClassStart) + '\\[' + sp[0] + '\\]'
hasMagic = hasMagic || sp[1]
inClass = false
continue
}
}
// finish up the class.
hasMagic = true
inClass = false
re += c
continue
default:
// swallow any state char that wasn't consumed
clearStateChar()
if (escaping) {
// no need
escaping = false
} else if (reSpecials[c]
&& !(c === '^' && inClass)) {
re += '\\'
}
re += c
} // switch
} // for
// handle the case where we left a class open.
// "[abc" is valid, equivalent to "\[abc"
if (inClass) {
// split where the last [ was, and escape it
// this is a huge pita. We now have to re-walk
// the contents of the would-be class to re-translate
// any characters that were passed through as-is
cs = pattern.substr(classStart + 1)
sp = parse(cs, SUBPARSE)
re = re.substr(0, reClassStart) + '\\[' + sp[0]
hasMagic = hasMagic || sp[1]
}
// handle the case where we had a +( thing at the *end*
// of the pattern.
// each pattern list stack adds 3 chars, and we need to go through
// and escape any | chars that were passed through as-is for the regexp.
// Go through and escape them, taking care not to double-escape any
// | chars that were already escaped.
for (pl = patternListStack.pop(); pl; pl = patternListStack.pop()) {
var tail = re.slice(pl.reStart + pl.open.length)
// maybe some even number of \, then maybe 1 \, followed by a |
tail = tail.replace(/((?:\\{2}){0,64})(\\?)\|/g, function (_, $1, $2) {
if (!$2) {
// the | isn't already escaped, so escape it.
$2 = '\\'
}
// need to escape all those slashes *again*, without escaping the
// one that we need for escaping the | character. As it works out,
// escaping an even number of slashes can be done by simply repeating
// it exactly after itself. That's why this trick works.
//
// I am sorry that you have to see this.
return $1 + $1 + $2 + '|'
})
var t = pl.type === '*' ? star
: pl.type === '?' ? qmark
: '\\' + pl.type
hasMagic = true
re = re.slice(0, pl.reStart) + t + '\\(' + tail
}
// handle trailing things that only matter at the very end.
clearStateChar()
if (escaping) {
// trailing \\
re += '\\\\'
}
// only need to apply the nodot start if the re starts with
// something that could conceivably capture a dot
var addPatternStart = false
switch (re.charAt(0)) {
case '.':
case '[':
case '(': addPatternStart = true
}
// Hack to work around lack of negative lookbehind in JS
// A pattern like: *.!(x).!(y|z) needs to ensure that a name
// like 'a.xyz.yz' doesn't match. So, the first negative
// lookahead, has to look ALL the way ahead, to the end of
// the pattern.
for (var n = negativeLists.length - 1; n > -1; n--) {
var nl = negativeLists[n]
var nlBefore = re.slice(0, nl.reStart)
var nlFirst = re.slice(nl.reStart, nl.reEnd - 8)
var nlLast = re.slice(nl.reEnd - 8, nl.reEnd)
var nlAfter = re.slice(nl.reEnd)
nlLast += nlAfter
// Handle nested stuff like *(*.js|!(*.json)), where open parens
// mean that we should *not* include the ) in the bit that is considered
// "after" the negated section.
var openParensBefore = nlBefore.split('(').length - 1
var cleanAfter = nlAfter
for (i = 0; i < openParensBefore; i++) {
cleanAfter = cleanAfter.replace(/\)[+*?]?/, '')
}
nlAfter = cleanAfter
var dollar = ''
if (nlAfter === '' && isSub !== SUBPARSE) {
dollar = '