minimatch | , expand | minimatch | Search

This code parses simplified glob patterns, converting them into regular expressions for matching files and directories. It handles special characters, escaping, and nested patterns to accurately represent the glob syntax in a regular expression format.

Run example

npm run import -- "mini parser"

mini parser

// Lookup table (character -> true), built by charSet, of the characters that
// parse() prefixes with a backslash when it copies them into the generated
// regular expression source.
var reSpecials = charSet('().*{}+?[]^$\\!')

// Strip glob escapes: every backslash followed by a (non-newline) character
// is replaced by that character alone, e.g. \* becomes *.
function globUnescape (s) {
  var escapedChar = /\\(.)/g
  return s.replace(escapedChar, function (match, ch) {
    return ch
  })
}

// Build a lookup table from a string: each character (UTF-16 code unit) of
// `s` becomes a key whose value is true.
function charSet (s) {
  var table = {}
  for (var idx = 0; idx < s.length; idx++) {
    table[s.charAt(idx)] = true
  }
  return table
}

// Sentinel passed as `isSub` when parse() recurses on a fragment of a larger
// pattern; compared by identity.
var SUBPARSE = {}

/**
 * Convert a single path segment of a glob pattern into something that can be
 * matched against one file or directory name.
 *
 * @param {string} pattern - one glob path part; it must not contain '/'.
 * @param {*} [isSub] - pass SUBPARSE when parsing a fragment of a larger
 *   pattern (done by the recursive calls that re-translate the contents of a
 *   malformed character class).
 * @returns {RegExp|string|Object|boolean|Array}
 *   - `{glob: true}` when the pattern is exactly '**'
 *   - `''` when the pattern is empty
 *   - `false` when the pattern contains '/'
 *   - the unescaped pattern string when the pattern has no glob magic
 *   - otherwise an anchored, case-insensitive RegExp carrying `_glob` (the
 *     original pattern) and `_src` (the un-anchored regexp source)
 *   - `[source, hasMagic]` when `isSub === SUBPARSE`
 * @throws {TypeError} when the pattern is longer than 64 * 1024 characters.
 */
function parse (pattern, isSub) {
  if (pattern.length > 1024 * 64) {
    throw new TypeError('pattern is too long')
  }

  var isSubParse = isSub === SUBPARSE

  // shortcuts.  A sub-parse must always hand back [source, hasMagic], so the
  // '**' shortcut is skipped there and the empty pattern yields an empty pair.
  if (pattern === '**' && !isSubParse) return {glob: true}
  if (pattern === '') return isSubParse ? ['', false] : ''

  // any single character that is not a path separator
  var qmark = '[^/]'
  // zero or more of those, non-greedy
  var star = qmark + '*?'

  // regexp text that opens and closes each kind of extglob group.
  // negation is (?:(?!(?:<pattern>))[^/]*?), the others are (?:<pattern>)<type>
  var plTypes = {
    '!': { open: '(?:(?!(?:', close: '))[^/]*?)' },
    '?': { open: '(?:', close: ')?' },
    '+': { open: '(?:', close: ')+' },
    '*': { open: '(?:', close: ')*' },
    '@': { open: '(?:', close: ')' }
  }

  var re = ''
  var hasMagic = false
  var escaping = false
  var patternListStack = []
  var negativeLists = []
  var stateChar
  var inClass = false
  var reClassStart = -1
  var classStart = -1
  var pl, cs, sp
  // . and .. never match anything that doesn't start with '.'
  var patternStart = pattern.charAt(0) === '.' ? '' // anything
  // not (start or / followed by . or .. followed by / or end)
  : '(?!(?:^|\\\/)\\.{1,2}(?:$|\\\/))'

  // Flush a pending ?, *, +, @ or ! that turned out not to open a "(" group.
  function clearStateChar () {
    if (!stateChar) return
    switch (stateChar) {
      case '*':
        re += star
        hasMagic = true
        break
      case '?':
        re += qmark
        hasMagic = true
        break
      default:
        re += '\\' + stateChar
        break
    }
    stateChar = false
  }

  for (var i = 0, len = pattern.length, c
    ; (i < len) && (c = pattern.charAt(i))
    ; i++) {
    // an escaped regexp-special character is copied through, still escaped.
    if (escaping && reSpecials[c]) {
      re += '\\' + c
      escaping = false
      continue
    }

    switch (c) {
      case '/':
        // completely not allowed, even escaped.
        // Should already be path-split by now.
        return false

      case '\\':
        clearStateChar()
        escaping = true
        continue

      // the various stateChar values for the "extglob" stuff.
      case '?':
      case '*':
      case '+':
      case '@':
      case '!':
        // all of those are literals inside a class, except that
        // the glob [!a] means [^a] in regexp
        if (inClass) {
          if (c === '!' && i === classStart + 1) c = '^'
          re += c
          continue
        }

        // if we already have a stateChar, then there was something like
        // ** or +? in there.  Handle the old one, then remember this one.
        clearStateChar()
        stateChar = c
        continue

      case '(':
        if (inClass) {
          re += '('
          continue
        }

        if (!stateChar) {
          re += '\\('
          continue
        }

        // stateChar is one of ? * + @ ! here, so the lookup always succeeds.
        var plType = plTypes[stateChar]
        patternListStack.push({
          type: stateChar,
          start: i - 1,
          reStart: re.length,
          open: plType.open,
          close: plType.close
        })
        re += plType.open
        stateChar = false
        continue

      case ')':
        if (inClass || !patternListStack.length) {
          re += '\\)'
          continue
        }

        clearStateChar()
        hasMagic = true
        pl = patternListStack.pop()
        re += pl.close
        if (pl.type === '!') {
          negativeLists.push(pl)
        }
        pl.reEnd = re.length
        continue

      case '|':
        if (inClass || !patternListStack.length || escaping) {
          re += '\\|'
          escaping = false
          continue
        }

        clearStateChar()
        re += '|'
        continue

      // these are mostly the same in regexp and glob
      case '[':
        // swallow any state-tracking char before the [
        clearStateChar()

        if (inClass) {
          re += '\\' + c
          continue
        }

        inClass = true
        classStart = i
        reClassStart = re.length
        re += c
        continue

      case ']':
        //  a right bracket shall lose its special
        //  meaning and represent itself in
        //  a bracket expression if it occurs
        //  first in the list.  -- POSIX.2 2.8.3.2
        if (i === classStart + 1 || !inClass) {
          re += '\\' + c
          escaping = false
          continue
        }

        // "[z-a]" is valid glob, equivalent to "\[z-a\]".  Check whether the
        // would-be class compiles; if not, re-translate its contents as
        // ordinary pattern text between escaped brackets.
        cs = pattern.substring(classStart + 1, i)
        try {
          RegExp('[' + cs + ']')
        } catch (er) {
          // not a valid class!
          sp = parse(cs, SUBPARSE)
          re = re.slice(0, reClassStart) + '\\[' + sp[0] + '\\]'
          hasMagic = hasMagic || sp[1]
          inClass = false
          continue
        }

        // finish up the class.
        hasMagic = true
        inClass = false
        re += c
        continue

      default:
        // swallow any state char that wasn't consumed
        clearStateChar()

        if (escaping) {
          // no need
          escaping = false
        } else if (reSpecials[c] && !(c === '^' && inClass)) {
          re += '\\'
        }

        re += c
    } // switch
  } // for

  // handle the case where we left a class open.
  // "[abc" is valid, equivalent to "\[abc": escape the [ and re-translate
  // everything after it, since it was passed through as class content.
  if (inClass) {
    cs = pattern.slice(classStart + 1)
    sp = parse(cs, SUBPARSE)
    re = re.slice(0, reClassStart) + '\\[' + sp[0]
    hasMagic = hasMagic || sp[1]
  }

  // handle the case where we had a +( thing that was never closed.
  // The group opener is replaced by the literal state char and an escaped
  // "(", and any | that was passed through as alternation gets escaped,
  // taking care not to double-escape a | that was already escaped.
  for (pl = patternListStack.pop(); pl; pl = patternListStack.pop()) {
    var tail = re.slice(pl.reStart + pl.open.length)
    // maybe some even number of \, then maybe 1 \, followed by a |
    tail = tail.replace(/((?:\\{2}){0,64})(\\?)\|/g, function (_, slashes, esc) {
      // Doubling an even run of backslashes escapes each of them again
      // without touching the single backslash that escapes the |.
      return slashes + slashes + (esc || '\\') + '|'
    })

    var t = pl.type === '*' ? star
      : pl.type === '?' ? qmark
      : '\\' + pl.type

    hasMagic = true
    re = re.slice(0, pl.reStart) + t + '\\(' + tail
  }

  // handle trailing things that only matter at the very end.
  clearStateChar()
  if (escaping) {
    // trailing \\
    re += '\\\\'
  }

  // only need to apply the nodot start if the re starts with
  // something that could conceivably capture a dot
  var addPatternStart = false
  switch (re.charAt(0)) {
    case '.':
    case '[':
    case '(': addPatternStart = true
  }

  // Hack to work around lack of negative lookbehind in JS
  // A pattern like: *.!(x).!(y|z) needs to ensure that a name
  // like 'a.xyz.yz' doesn't match.  So, the first negative
  // lookahead, has to look ALL the way ahead, to the end of
  // the pattern.
  for (var n = negativeLists.length - 1; n > -1; n--) {
    var nl = negativeLists[n]

    // 8 is the length of ")[^/]*?)", the part of the '!' closer that
    // follows the end of the negated alternatives.
    var nlBefore = re.slice(0, nl.reStart)
    var nlFirst = re.slice(nl.reStart, nl.reEnd - 8)
    var nlLast = re.slice(nl.reEnd - 8, nl.reEnd)
    var nlAfter = re.slice(nl.reEnd)

    nlLast += nlAfter

    // Handle nested stuff like *(*.js|!(*.json)), where open parens
    // mean that we should *not* include the ) in the bit that is considered
    // "after" the negated section.
    var openParensBefore = nlBefore.split('(').length - 1
    var cleanAfter = nlAfter
    for (var k = 0; k < openParensBefore; k++) {
      cleanAfter = cleanAfter.replace(/\)[+*?]?/, '')
    }
    nlAfter = cleanAfter

    var dollar = ''
    if (nlAfter === '' && !isSubParse) {
      dollar = '$'
    }
    re = nlBefore + nlFirst + nlAfter + dollar + nlLast
  }

  // if the re is not "" at this point, then we need to make sure
  // it doesn't match against an empty path part.
  // Otherwise a/* will match a/, which it should not.
  if (re !== '' && hasMagic) {
    re = '(?=.)' + re
  }

  if (addPatternStart) {
    re = patternStart + re
  }

  // parsing just a piece of a larger pattern.
  if (isSubParse) {
    return [re, hasMagic]
  }

  // skip the regexp for non-magical patterns
  // unescape anything in it, though, so that it'll be
  // an exact match against a file etc.
  if (!hasMagic) {
    return globUnescape(pattern)
  }

  var regExp
  try {
    regExp = new RegExp('^' + re + '$', 'i')
  } catch (er) {
    // If it was an invalid regular expression, then it can't match
    // anything.  This trick looks for a character after the end of
    // the string, which is of course impossible, except in multi-line
    // mode, but it's not a /m regex.
    return new RegExp('$.')
  }

  regExp._glob = pattern
  regExp._src = re

  return regExp
}

// CommonJS export: parse is the only value this module exposes.
module.exports = parse

What the code could have been:

// Lookup table (character -> true), built by charSet, of the characters that
// parse() prefixes with a backslash when copying them into regexp source.
const reSpecials = charSet('().*{}+?[]^$\\!');

// Remove glob escapes: each backslash followed by a (non-newline) character
// collapses to that character alone, e.g. \* becomes *.
function globUnescape(s) {
  const escapedChar = /\\(.)/g;
  return s.replace(escapedChar, (_match, ch) => ch);
}

// Turn a string into a lookup table: every character (UTF-16 code unit) of
// `s` becomes a key mapped to true.
function charSet(s) {
  const table = {};
  for (const ch of s.split('')) {
    table[ch] = true;
  }
  return table;
}

// Sentinel passed as `isSub` when parse() recurses on a fragment of a larger
// pattern; compared by identity.
const SUBPARSE = {};

/**
 * Convert a single path segment of a glob pattern into something that can be
 * matched against one file or directory name.
 *
 * @param {string} pattern - One glob path part; it must not contain '/'.
 * @param {*} [isSub] - Pass SUBPARSE when parsing a fragment of a larger
 *   pattern (done by the recursive calls that re-translate the contents of a
 *   malformed character class).
 * @returns {RegExp|string|Object|boolean|Array}
 *   - `{ glob: true }` when the pattern is exactly '**'
 *   - `''` when the pattern is empty
 *   - `false` when the pattern contains '/'
 *   - the unescaped pattern string when the pattern has no glob magic
 *   - otherwise an anchored, case-insensitive RegExp carrying `_glob` (the
 *     original pattern) and `_src` (the un-anchored regexp source)
 *   - `[source, hasMagic]` when `isSub === SUBPARSE`
 * @throws {TypeError} When the pattern is longer than 64 * 1024 characters.
 */
function parse(pattern, isSub) {
  if (pattern.length > 1024 * 64) {
    throw new TypeError('Pattern is too long');
  }

  const isSubParse = isSub === SUBPARSE;

  // Shortcuts. A sub-parse must always hand back [source, hasMagic], so the
  // '**' shortcut is skipped there and the empty pattern yields an empty pair.
  if (pattern === '**' && !isSubParse) return { glob: true };
  if (pattern === '') return isSubParse ? ['', false] : '';

  // Any single character that is not a path separator.
  const qmark = '[^/]';
  // Zero or more of those, non-greedy.
  const star = qmark + '*?';

  // Regexp text that opens and closes each kind of extglob group.
  // Negation is (?:(?!(?:<pattern>))[^/]*?), the others are (?:<pattern>)<type>.
  const plTypes = {
    '!': { open: '(?:(?!(?:', close: '))[^/]*?)' },
    '?': { open: '(?:', close: ')?' },
    '+': { open: '(?:', close: ')+' },
    '*': { open: '(?:', close: ')*' },
    '@': { open: '(?:', close: ')' },
  };

  let re = '';
  let hasMagic = false;
  let escaping = false;
  const patternListStack = [];
  const negativeLists = [];
  let stateChar;
  let inClass = false;
  let reClassStart = -1;
  let classStart = -1;

  // Prefix that stops a wildcard from matching the names '.' and '..':
  // not (start or / followed by . or .. followed by / or end).
  // Nothing is needed when the pattern itself starts with a dot.
  const patternStart =
    pattern.charAt(0) === '.' ? '' : '(?!(?:^|\\/)\\.{1,2}(?:$|\\/))';

  // Flush a pending ?, *, +, @ or ! that turned out not to open a "(" group.
  const clearStateChar = () => {
    if (!stateChar) return;
    if (stateChar === '*') {
      re += star;
      hasMagic = true;
    } else if (stateChar === '?') {
      re += qmark;
      hasMagic = true;
    } else {
      re += '\\' + stateChar;
    }
    stateChar = false;
  };

  for (let i = 0; i < pattern.length; i++) {
    let c = pattern.charAt(i);

    // An escaped regexp-special character is copied through, still escaped.
    if (escaping && reSpecials[c]) {
      re += '\\' + c;
      escaping = false;
      continue;
    }

    switch (c) {
      case '/':
        // Completely not allowed, even escaped.
        return false;

      case '\\':
        clearStateChar();
        escaping = true;
        continue;

      // The characters that may introduce an extglob group.
      case '?':
      case '*':
      case '+':
      case '@':
      case '!':
        // All of those are literals inside a class, except that the glob
        // [!a] means [^a] in regexp.
        if (inClass) {
          if (c === '!' && i === classStart + 1) c = '^';
          re += c;
          continue;
        }

        // A pending stateChar means there was something like ** or +?.
        // Handle the old one, then remember this one.
        clearStateChar();
        stateChar = c;
        continue;

      case '(': {
        if (inClass) {
          re += '(';
          continue;
        }

        if (!stateChar) {
          re += '\\(';
          continue;
        }

        // stateChar is one of ? * + @ ! here, so the lookup always succeeds.
        const { open, close } = plTypes[stateChar];
        patternListStack.push({
          type: stateChar,
          start: i - 1,
          reStart: re.length,
          open,
          close,
        });
        re += open;
        stateChar = false;
        continue;
      }

      case ')': {
        if (inClass || !patternListStack.length) {
          re += '\\)';
          continue;
        }

        clearStateChar();
        hasMagic = true;
        const closed = patternListStack.pop();
        re += closed.close;
        if (closed.type === '!') {
          negativeLists.push(closed);
        }
        closed.reEnd = re.length;
        continue;
      }

      case '|':
        if (inClass || !patternListStack.length || escaping) {
          re += '\\|';
          escaping = false;
          continue;
        }

        clearStateChar();
        re += '|';
        continue;

      case '[':
        // Swallow any state-tracking char before the [.
        clearStateChar();
        if (inClass) {
          re += '\\' + c;
          continue;
        }

        inClass = true;
        classStart = i;
        reClassStart = re.length;
        re += c;
        continue;

      case ']': {
        // A right bracket represents itself when it is first in the list
        // (POSIX.2 2.8.3.2) or when no class is open.
        if (i === classStart + 1 || !inClass) {
          re += '\\' + c;
          escaping = false;
          continue;
        }

        // "[z-a]" is valid glob, equivalent to "\[z-a\]". Check whether the
        // would-be class compiles; if not, re-translate its contents as
        // ordinary pattern text between escaped brackets.
        const classBody = pattern.substring(classStart + 1, i);
        let classIsValid = true;
        try {
          RegExp('[' + classBody + ']');
        } catch (er) {
          classIsValid = false;
        }

        if (!classIsValid) {
          const [subRe, subMagic] = parse(classBody, SUBPARSE);
          re = re.slice(0, reClassStart) + '\\[' + subRe + '\\]';
          hasMagic = hasMagic || subMagic;
          inClass = false;
          continue;
        }

        // Finish up the class.
        hasMagic = true;
        inClass = false;
        re += c;
        continue;
      }

      default:
        // Swallow any state char that wasn't consumed.
        clearStateChar();
        if (escaping) {
          escaping = false;
        } else if (reSpecials[c] && !(c === '^' && inClass)) {
          re += '\\';
        }

        re += c;
        continue;
    }
  }

  // A class was left open: "[abc" is valid, equivalent to "\[abc". Escape the
  // [ and re-translate everything after it, since it was passed through as
  // class content.
  if (inClass) {
    const [subRe, subMagic] = parse(pattern.slice(classStart + 1), SUBPARSE);
    re = re.slice(0, reClassStart) + '\\[' + subRe;
    hasMagic = hasMagic || subMagic;
  }

  // An extglob group such as "+(" was never closed. Replace the group opener
  // with the literal state char and an escaped "(", and escape any | that was
  // passed through as alternation without double-escaping one that already is.
  while (patternListStack.length > 0) {
    const unclosed = patternListStack.pop();
    // Maybe some even number of \, then maybe 1 \, followed by a |.
    // Doubling an even run of backslashes escapes each of them again without
    // touching the single backslash that escapes the |.
    const tail = re
      .slice(unclosed.reStart + unclosed.open.length)
      .replace(
        /((?:\\{2}){0,64})(\\?)\|/g,
        (_match, slashes, esc) => slashes + slashes + (esc || '\\') + '|',
      );

    let literalType;
    if (unclosed.type === '*') {
      literalType = star;
    } else if (unclosed.type === '?') {
      literalType = qmark;
    } else {
      literalType = '\\' + unclosed.type;
    }

    hasMagic = true;
    re = re.slice(0, unclosed.reStart) + literalType + '\\(' + tail;
  }

  // Handle trailing things that only matter at the very end.
  clearStateChar();
  if (escaping) {
    // Trailing backslash.
    re += '\\\\';
  }

  // Only need to apply the nodot start if the re starts with something that
  // could conceivably capture a dot.
  const addPatternStart = ['.', '[', '('].includes(re.charAt(0));

  // Hack to work around lack of negative lookbehind in JS.
  // A pattern like *.!(x).!(y|z) needs to ensure that a name like 'a.xyz.yz'
  // doesn't match, so each negative lookahead has to look all the way ahead
  // to the end of the pattern.
  for (let n = negativeLists.length - 1; n > -1; n--) {
    const nl = negativeLists[n];

    // 8 is the length of ")[^/]*?)", the part of the '!' closer that follows
    // the end of the negated alternatives.
    const nlBefore = re.slice(0, nl.reStart);
    const nlFirst = re.slice(nl.reStart, nl.reEnd - 8);
    let nlAfter = re.slice(nl.reEnd);
    const nlLast = re.slice(nl.reEnd - 8, nl.reEnd) + nlAfter;

    // Handle nested stuff like *(*.js|!(*.json)), where open parens mean that
    // the ) must not be included in the bit considered "after" the negation.
    const openParensBefore = nlBefore.split('(').length - 1;
    for (let k = 0; k < openParensBefore; k++) {
      nlAfter = nlAfter.replace(/\)[+*?]?/, '');
    }

    const dollar = nlAfter === '' && !isSubParse ? '$' : '';
    re = nlBefore + nlFirst + nlAfter + dollar + nlLast;
  }

  // A magic pattern must not match an empty path part; otherwise a/* would
  // match a/, which it should not.
  if (re !== '' && hasMagic) {
    re = '(?=.)' + re;
  }

  if (addPatternStart) {
    re = patternStart + re;
  }

  // Parsing just a piece of a larger pattern.
  if (isSubParse) {
    return [re, hasMagic];
  }

  // Skip the regexp for non-magical patterns; unescape anything in it so that
  // it is an exact match against a file name.
  if (!hasMagic) {
    return globUnescape(pattern);
  }

  let regExp;
  try {
    regExp = new RegExp('^' + re + '$', 'i');
  } catch (er) {
    // An invalid regular expression can't match anything. '$.' looks for a
    // character after the end of the string, which is impossible outside
    // multi-line mode.
    return new RegExp('$.');
  }

  regExp._glob = pattern;
  regExp._src = re;
  return regExp;
}

// CommonJS export: parse is the only value this module exposes.
module.exports = parse;

This code implements a parser for a simplified glob pattern language, similar to the one used in shell commands.

Here's a breakdown:

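  1. Regular Expression Utilities: `charSet` turns a string into a lookup table of its characters, `reSpecials` uses it to list the characters that must be backslash-escaped in a regular expression, and `globUnescape` strips backslash escapes from a pattern.

  2. parse Function: takes one path segment of a glob pattern and returns `{glob: true}` for `**`, `false` if the segment contains `/`, the unescaped string if the pattern contains no wildcards, or a `RegExp` otherwise.

  3. Pattern Parsing Loop: walks the pattern one character at a time, tracking backslash escapes, character classes (`[...]`), and extglob groups such as `+(a|b)` or `!(x)`, and appends the equivalent regular-expression text for each character.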

In essence, this code provides a way to convert glob patterns into regular expressions, enabling the matching of files and directories based on these patterns.