src/pcre.js - codemirror-mode-pcre (master) - kindwolf.org Git repositories

Tree @master (Download .tar.gz)

pcre.js @master — raw · history · blame

// Declare global variables to avoid warnings in JSHint
/* global CodeMirror, define */

// Module loader shim: hands the CodeMirror object to `mod` (the function that defines the mode)
// through whichever module system is detected at load time.
(function (mod) {
	if (typeof exports === "object" && typeof module === "object") // CommonJS
		mod(require("codemirror/lib/codemirror"));
	else if (typeof define === "function" && define.amd) // AMD
		define(["codemirror/lib/codemirror"], mod);
	else // Plain browser env
		mod(CodeMirror);
})(function (CodeMirror) {
	"use strict";

	CodeMirror.defineMode('pcre', function(editor_options, mode_options) {
		// Effective settings: every entry falls back to its default unless the mode options provide a value.
		var options = {
			// Initial value of the "extended" flag stored in each tokenizer state (defaults to true):
			extended: ('extended' in mode_options) ? Boolean(mode_options.extended) : true,
		};

		// Opening delimiter -> closing delimiter that pairs with it:
		var delimiters = {'<': '>', '[': ']', '{': '}', '(': ')'};
		// Behaviour of alphanumeric characters after a backslash character (normal context).
		// Each value is the style string returned for a backslash followed by that single character; an empty string
		// means the escaped character simply matches itself. Values starting with 'err' flag an error.
		var backslash_in_normal_context = {
			'0': 'non-printing-character',
			'1': 'backreference',
			'2': 'backreference',
			'3': 'backreference',
			'4': 'backreference',
			'5': 'backreference',
			'6': 'backreference',
			'7': 'backreference',
			'8': 'backreference',
			'9': 'backreference',
			'A': 'anchor', // \A  start of subject
			'B': 'anchor', // \B  not a word boundary
			'C': 'generic-character-type', // \C  one data unit, even in UTF mode (best avoided)
			'D': 'generic-character-type', // \D  any character that is not a decimal digit
			'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message
			'F': '', // \F matches F
			'G': 'anchor', // \G  first matching position in subject
			'H': 'generic-character-type', // \H  any character that is not a horizontal white space character
			'I': '', // \I matches I
			'J': '', // \J matches J
			'K': 'anchor', // \K  reset start of match (neither an anchor nor a simple assertion)
			'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'M': '', // \M matches M
			'N': 'generic-character-type', // \N  a character that is not a newline
			'O': '', // \O matches O
			'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
			'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences.
			'R': 'generic-character-type', // \R  a newline sequence
			'S': 'generic-character-type', // \S  any character that is not a white space character
			'T': '', // \T matches T
			'U': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'V': 'generic-character-type', // \V  any character that is not a vertical white space character
			'W': 'generic-character-type', // \W  any "non-word" character
			'X': 'generic-character-type', // \X  a Unicode extended grapheme cluster
			'Y': '', // \Y matches Y
			'Z': 'anchor', // \Z  matches at the end of the subject; also matches before a newline at the end of the subject
			'a': 'non-printing-character', // \a  alarm, that is, the BEL character (hex 07)
			'b': 'anchor', // \b  word boundary
			'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character
			'd': 'generic-character-type', // \d  any decimal digit
			'e': 'non-printing-character', // \e  escape (hex 1B)
			'f': 'non-printing-character', // \f  form feed (hex 0C)
			'g': 'err a-number-reference-must-not-be-zero', // a numbered reference must not be zero
			'h': 'generic-character-type', // \h  any horizontal white space character
			'i': '', // \i matches i
			'j': '', // \j matches j
			'k': 'err backslash-k-is-not-followed-by-a-name', // \k is not followed by a [...] name
			'l': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'm': '', // \m matches m
			'n': 'non-printing-character', // \n  linefeed (hex 0A)
			'o': '', // \o matches o
			'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
			'q': '', // \q matches q
			'r': 'non-printing-character', // \r  carriage return (hex 0D)
			's': 'generic-character-type', // \s  any white space character
			't': 'non-printing-character', // \t  tab (hex 09)
			'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'v': 'generic-character-type', // \v  any vertical white space character
			'w': 'generic-character-type', // any "word" character
			'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT)
			'y': '', // \y matches y
			'z': 'anchor', // \z  end of subject
		};
		// Behaviour of alphanumeric characters after a backslash character (character class context, i.e. [...]).
		// Each value is the style string returned for a backslash followed by that single character; an empty string
		// means the escaped character simply matches itself. Values starting with 'err' flag an error.
		var backslash_in_character_class = {
			'0': 'non-printing-character', // octal code
			'1': 'non-printing-character', // octal code
			'2': 'non-printing-character', // octal code
			'3': 'non-printing-character', // octal code
			'4': 'non-printing-character', // octal code
			'5': 'non-printing-character', // octal code
			'6': 'non-printing-character', // octal code
			'7': 'non-printing-character', // octal code
			'8': '', // \8 matches 8
			'9': '', // \9 matches 9
			'A': '', // \A matches A
			'B': '', // \B matches B -- \B, \R, and \X are not special inside a character class.
			'C': '', // \C matches C
			'D': 'generic-character-type', // \D  any character that is not a decimal digit
			'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message
			'F': '', // \F matches F
			'G': '', // \G matches G
			'H': 'generic-character-type', // \H  any character that is not a horizontal white space character
			'I': '', // \I matches I
			'J': '', // \J matches J
			'K': '', // \K matches K
			'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'M': '', // \M matches M
			'N': 'err backslash-n-is-not-supported-in-a-class', // \N is not allowed in a character class.
			'O': '', // \O matches O
			'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
			'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences.
			'R': '', // \R matches R -- \B, \R, and \X are not special inside a character class.
			'S': 'generic-character-type', // \S  any character that is not a white space character
			'T': '', // \T matches T
			'U': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'V': 'generic-character-type', // \V  any character that is not a vertical white space character
			'W': 'generic-character-type', // \W  any "non-word" character
			'X': '', // \X matches X -- \B, \R, and \X are not special inside a character class.
			'Y': '', // \Y matches Y
			'Z': '', // \Z matches Z
			'a': 'non-printing-character', // \a  alarm, that is, the BEL character (hex 07)
			'b': 'non-printing-character', // inside a character class, \b is interpreted as the backspace character (hex 08)
			'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character
			'd': 'generic-character-type', // \d  any decimal digit
			'e': 'non-printing-character', // \e  escape (hex 1B)
			'f': 'non-printing-character', // \f  form feed (hex 0C)
			'g': '', // \g matches g
			'h': 'generic-character-type', // \h  any horizontal white space character
			'i': '', // \i matches i
			'j': '', // \j matches j
			'k': '', // \k matches k
			'l': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'm': '', // \m matches m
			'n': 'non-printing-character', // \n  linefeed (hex 0A)
			'o': '', // \o matches o
			'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
			'q': '', // \q matches q
			'r': 'non-printing-character', // \r  carriage return (hex 0D)
			's': 'generic-character-type', // \s  any white space character
			't': 'non-printing-character', // \t  tab (hex 09)
			'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
			'v': 'generic-character-type', // \v  any vertical white space character
			'w': 'generic-character-type', // any "word" character
			'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT)
			'y': '', // \y matches y
			'z': '', // \z matches z
		};
		// Property names accepted between the braces of \p{...} and \P{...}.
		// Only the keys are looked up when validating a property name; the values are descriptions (or just `true`
		// for script names).
		var backslask_p_properties = {
			// GENERAL CATEGORY PROPERTIES FOR \p and \P
			'C': 'Other',
			'Cc': 'Control',
			'Cf': 'Format',
			'Cn': 'Unassigned',
			'Co': 'Private use',
			'Cs': 'Surrogate',

			'L': 'Letter',
			'Ll': 'Lower case letter',
			'Lm': 'Modifier letter',
			'Lo': 'Other letter',
			'Lt': 'Title case letter',
			'Lu': 'Upper case letter',
			'L&': 'Ll, Lu, or Lt',

			'M': 'Mark',
			'Mc': 'Spacing mark',
			'Me': 'Enclosing mark',
			'Mn': 'Non-spacing mark',

			'N': 'Number',
			'Nd': 'Decimal number',
			'Nl': 'Letter number',
			'No': 'Other number',

			'P': 'Punctuation',
			'Pc': 'Connector punctuation',
			'Pd': 'Dash punctuation',
			'Pe': 'Close punctuation',
			'Pf': 'Final punctuation',
			'Pi': 'Initial punctuation',
			'Po': 'Other punctuation',
			'Ps': 'Open punctuation',

			'S': 'Symbol',
			'Sc': 'Currency symbol',
			'Sk': 'Modifier symbol',
			'Sm': 'Mathematical symbol',
			'So': 'Other symbol',

			'Z': 'Separator',
			'Zl': 'Line separator',
			'Zp': 'Paragraph separator',
			'Zs': 'Space separator',

			// PCRE SPECIAL CATEGORY PROPERTIES FOR \p and \P
			'Xan': 'Alphanumeric: union of properties L and N',
			'Xps': 'POSIX space: property Z or tab, NL, VT, FF, CR',
			'Xsp': 'Perl space: property Z or tab, NL, VT, FF, CR',
			'Xuc': 'Universally-named character: one that can be represented by a Universal Character Name',
			'Xwd': 'Perl word: property Xan or underscore',

			// SCRIPT NAMES FOR \p AND \P
			'Arabic': true,
			'Armenian': true,
			'Avestan': true,
			'Balinese': true,
			'Bamum': true,
			'Bassa_Vah': true,
			'Batak': true,
			'Bengali': true,
			'Bopomofo': true,
			'Brahmi': true,
			'Braille': true,
			'Buginese': true,
			'Buhid': true,
			'Canadian_Aboriginal': true,
			'Carian': true,
			'Caucasian_Albanian': true,
			'Chakma': true,
			'Cham': true,
			'Cherokee': true,
			'Common': true,
			'Coptic': true,
			'Cuneiform': true,
			'Cypriot': true,
			'Cyrillic': true,
			'Deseret': true,
			'Devanagari': true,
			'Duployan': true,
			'Egyptian_Hieroglyphs': true,
			'Elbasan': true,
			'Ethiopic': true,
			'Georgian': true,
			'Glagolitic': true,
			'Gothic': true,
			'Grantha': true,
			'Greek': true,
			'Gujarati': true,
			'Gurmukhi': true,
			'Han': true,
			'Hangul': true,
			'Hanunoo': true,
			'Hebrew': true,
			'Hiragana': true,
			'Imperial_Aramaic': true,
			'Inherited': true,
			'Inscriptional_Pahlavi': true,
			'Inscriptional_Parthian': true,
			'Javanese': true,
			'Kaithi': true,
			'Kannada': true,
			'Katakana': true,
			'Kayah_Li': true,
			'Kharoshthi': true,
			'Khmer': true,
			'Khojki': true,
			'Khudawadi': true,
			'Lao': true,
			'Latin': true,
			'Lepcha': true,
			'Limbu': true,
			'Linear_A': true,
			'Linear_B': true,
			'Lisu': true,
			'Lycian': true,
			'Lydian': true,
			'Mahajani': true,
			'Malayalam': true,
			'Mandaic': true,
			'Manichaean': true,
			'Meetei_Mayek': true,
			'Mende_Kikakui': true,
			'Meroitic_Cursive': true,
			'Meroitic_Hieroglyphs': true,
			'Miao': true,
			'Modi': true,
			'Mongolian': true,
			'Mro': true,
			'Myanmar': true,
			'Nabataean': true,
			'New_Tai_Lue': true,
			'Nko': true,
			'Ogham': true,
			'Ol_Chiki': true,
			'Old_Italic': true,
			'Old_North_Arabian': true,
			'Old_Permic': true,
			'Old_Persian': true,
			'Old_South_Arabian': true,
			'Old_Turkic': true,
			'Oriya': true,
			'Osmanya': true,
			'Pahawh_Hmong': true,
			'Palmyrene': true,
			'Pau_Cin_Hau': true,
			'Phags_Pa': true,
			'Phoenician': true,
			'Psalter_Pahlavi': true,
			'Rejang': true,
			'Runic': true,
			'Samaritan': true,
			'Saurashtra': true,
			'Sharada': true,
			'Shavian': true,
			'Siddham': true,
			'Sinhala': true,
			'Sora_Sompeng': true,
			'Sundanese': true,
			'Syloti_Nagri': true,
			'Syriac': true,
			'Tagalog': true,
			'Tagbanwa': true,
			'Tai_Le': true,
			'Tai_Tham': true,
			'Tai_Viet': true,
			'Takri': true,
			'Tamil': true,
			'Telugu': true,
			'Thaana': true,
			'Thai': true,
			'Tibetan': true,
			'Tifinagh': true,
			'Tirhuta': true,
			'Ugaritic': true,
			'Vai': true,
			'Warang_Citi': true,
			'Yi': true,
		};
		// Matches what follows the backslash in \p{...} / \P{...} (with an optional "^" after the opening brace);
		// capture group 1 is the property name.
		var backslash_p_regex_string = '[pP]\\{\\^?([\\w&]+)\\}',
			backslash_p_regex = new RegExp(backslash_p_regex_string);

		// POSIX set names recognised inside a character class (the values are short descriptions):
		var posix_named_sets = {
			alnum: 'alphanumeric',
			alpha: 'alphabetic',
			ascii: '0-127',
			blank: 'space or tab',
			cntrl: 'control character',
			digit: 'decimal digit',
			graph: 'printing, excluding space',
			lower: 'lower case letter',
			print: 'printing, including space',
			punct: 'printing, excluding alphanumeric',
			space: 'white space',
			upper: 'upper case letter',
			word: 'same as \\w',
			xdigit: 'hexadecimal digit',
		};
		// Matches [:name:] and [:^name:], capturing the name. '<' and '>' are allowed in the name so that errors such
		// as [a[:<:]b] can be spotted.
		var posix_named_sets_regex_string = '\\[:\\^?([\\w<>]+):]',
			posix_named_sets_regex = new RegExp(posix_named_sets_regex_string);

		// (?C) or (?Cn) callout; capture group 1 holds the digits of n (zero to three of them):
		var callout_regex_string = '\\(\\?C(\\d{0,3})\\)',
			callout_regex = new RegExp(callout_regex_string);

		// Opening of a lookahead or lookbehind assertion: (?=  (?!  (?<=  (?<!
		var assertion_regex_string = '\\(\\?<?[=!]',
			assertion_regex = new RegExp(assertion_regex_string);

		// A callout immediately followed by the opening of an assertion, e.g. (?C7)(?<!
		var condition_callout_regex_string = callout_regex_string + assertion_regex_string,
			condition_callout_regex = new RegExp(condition_callout_regex_string);

		// (?i)     caseless
		// (?J)     allow duplicate names
		// (?m)     multiline
		// (?s)     single line (dotall)
		// (?U)     default ungreedy (lazy)
		// (?x)     extended (ignore white space)
		// (?-...)  unset option(s)
		// + combinations e.g. (?im-sx) or (?iJm-s-U-x)
		// The pattern accepts one run of option letters (optionally preceded by "-") followed by any number of
		// "-letters" runs. It is deliberately written without a nested quantifier: the equivalent form
		// (?:-?[iJmsUx]+)+ backtracks exponentially on a long run of option letters that lacks its terminator.
		var options_regex_string = '-?[iJmsUx]+(?:-[iJmsUx]+)*';

		// Standalone option sequence, e.g. (?x-i)
		var option_sequence_regex_string = '\\(\\?' + options_regex_string + '\\)';
		var option_sequence_regex = new RegExp(option_sequence_regex_string);

		// Start of non-capturing group with options, e.g. (?i-U:
		var group_options_regex_string = '\\(\\?' + options_regex_string + ':';
		var group_options_regex = new RegExp(group_options_regex_string);

		// Helper functions:
		/**
		 * Returns the closing delimiter paired with `ch` in the `delimiters` table, or `ch` itself when the table
		 * has no entry for it.
		 */
		function delimiter(ch) {
			if (ch in delimiters) return delimiters[ch];
			return ch;
		}
		/**
		 * Returns the innermost (last pushed) context name, or false when the context stack is empty.
		 */
		function current(state) {
			var depth = state.context.length;
			return depth ? state.context[depth - 1] : false;
		}
		/**
		 * Advances the stream over a run of word characters if one starts here, otherwise over exactly one character.
		 * This mode may be nested in another one, so it avoids consuming more than that and leaves the nesting mode
		 * in charge; eating \w is usually safe.
		 */
		function consume(stream) {
			var ate_word = stream.match(/\w+/);
			if (ate_word) return;
			stream.next();
		}
		/**
		 * Builds the style string for a token: every context name on the stack, space-separated, followed by
		 * `token` when one is given.
		 */
		function all_tokens(state, token) {
			var contexts = state.context.join(' ');
			if (!token) return contexts;
			// Only insert a separator when there is something before it: leading spaces confuse matchbrackets
			// (see issue #4).
			return contexts ? contexts + ' ' + token : contexts + token;
		}
		/**
		 * Enters a new context.
		 * The returned style string is computed BEFORE the new context is pushed, so it does not include it.
		 * `new_context_state` defaults to a fresh empty object.
		 */
		function push(state, new_context, new_context_state, token) {
			var tokens_before_push = all_tokens(state, token);
			state.context.push(new_context);
			state.context_state.push(new_context_state ? new_context_state : {});
			return tokens_before_push;
		}
		/**
		 * Leaves the innermost context (and drops its context state).
		 * The returned style string covers the remaining contexts plus the context that was just left, itself
		 * followed by `token` when one is given.
		 */
		function pop(state, token) {
			var left_context = state.context.pop();
			state.context_state.pop();
			return all_tokens(state, token ? left_context + ' ' + token : left_context);
		}
		/**
		 * Returns the state object attached to the innermost context (undefined when the stack is empty).
		 */
		function current_context_state(state) {
			var stack = state.context_state;
			return stack[stack.length - 1];
		}
		/**
		 * Starts reading a name: clears the name accumulated so far and enters the 'name' context.
		 * Returns whatever push() returns.
		 */
		function expect_name(state) {
			var name_context = 'name';
			state.name_value = '';
			return push(state, name_context);
		}
		/**
		 * Records, on the innermost context state, the characters that must close the current construct.
		 * `end_string` is stored one character per array entry under `expected`.
		 * Returns the context state that was updated.
		 */
		function expect_end(state, end_string) {
			var context_state = current_context_state(state);
			context_state.expected = Array.prototype.slice.call(end_string);
			return context_state;
		}
		/**
		 * Consumes one character of the pending closing sequence registered by expect_end(), if there is one.
		 * Returns false (without touching the stream) when nothing is pending; otherwise a style string:
		 *   - the read character is the expected one and it was the last: the context is popped;
		 *   - the read character is the expected one and more are pending: the plain context styles;
		 *   - anything else: the context styles plus 'err erroneous-end-of-token'.
		 */
		function read_expected_end(stream, state) {
			var pending = current_context_state(state).expected;
			if (!pending || !pending.length) return false;
			var wanted = pending.shift();
			if (stream.next() !== wanted) return all_tokens(state, 'err erroneous-end-of-token');
			return pending.length ? all_tokens(state) : pop(state);
		}

		/**
		 * Tokenizes an escape sequence. The stream is expected to be positioned on the backslash.
		 *
		 * Multi-character forms are tried first, in the order below; the final fallback is a lookup of the single
		 * character following the backslash in the table matching the current context (character class or not).
		 * Back references and subroutine calls that carry a name push a dedicated context and leave the rest of
		 * the sequence in the stream for that context's handler.
		 *
		 * @returns the style for the consumed text (callers combine it with the context styles).
		 */
		function handle_backslash(stream, state) {
			stream.eat('\\');
			if (!stream.peek()) return 'err backslash-at-end-of-pattern';

			// The backslash character has several uses. Firstly, if it is followed by a character that is not a number
			// or a letter, it takes away any special meaning that character may have.
			if (stream.match(/[^0-9a-zA-Z]/)) return 'escaped-character';

			// \Q is used to start an escaped sequence:
			if (stream.match('Q') && current(state) !== 'escaped-sequence') {
				push(state, 'escaped-sequence');
				return 'escaped-sequence-start';
			}

			// \cx       "control-x", where x is any ASCII character
			if (stream.match(/c[ -~]/)) return 'non-printing-character';

			// \0dd      character with octal code 0dd
			if (stream.match(/0[0-7]{0,2}/)) return 'non-printing-character';
			// \ddd      character with octal code ddd, or back reference
			if (stream.match(/[1-7][0-7]{1,2}/)) return 'non-printing-character';
			// \o{ddd..} character with octal code ddd..
			if (stream.match(/o\{[0-7]+\}/)) return 'non-printing-character';
			// \x{hhh..} character with hex code hhh.. (non-JavaScript mode)
			if (stream.match(/x\{[0-9a-fA-F]+}/)) return 'non-printing-character';
			// \xhh      character with hex code hh
			if (stream.match(/x[0-9a-fA-F]{0,2}/)) return 'non-printing-character';
			// \uhhhh    character with hex code hhhh (JavaScript mode only)
			if (stream.match(/u[0-9a-fA-F]{4}/)) return 'non-printing-character';

			// \p{...} and \P{...}:
			var rem = stream.match(backslash_p_regex);
			if (rem) {
				// Own-property lookup only: the `in` operator would also accept names inherited from
				// Object.prototype, so that \p{constructor} or \p{toString} would pass as known properties.
				if (Object.prototype.hasOwnProperty.call(backslask_p_properties, rem[1])) return 'generic-character-type';
				return 'err unknown-property-name-after-p';
			}

			var in_character_class = (current(state) === 'character-class');
			// Nothing in this condition can be found in a character class:
			if (!in_character_class) {
				// The sequence \g followed by an unsigned or a negative number, optionally enclosed in braces, is an
				// absolute or relative back reference. A named back reference can be coded as \g{name}.
				if (stream.match(/g-?[0-9]+/)) return 'backreference';
				if (stream.match(/g\{-?[0-9]+\}/)) return 'backreference';
				if (stream.match(/g\{/, false)) return push(state, 'backreference');
				// \k<name>  reference by name (Perl)
				// \k'name'  reference by name (Perl)
				// \k{name}  reference by name (.NET)
				if (stream.match(/k[<'{]/, false)) return push(state, 'backreference');
				if (stream.match(/[0-9]+/)) return 'backreference';

				// For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either
				// in angle brackets or single quotes, is an alternative syntax for referencing a subpattern as a
				// "subroutine".
				if (stream.match(/g<[-+]?[0-9]+>/)) return 'subroutine';
				if (stream.match(/g'[-+]?[0-9]+'/)) return 'subroutine';
				if (stream.match(/g[<']/, false)) return push(state, 'subroutine');
			}
			// At this stage, we have looked for:
			//   - a backslash followed by nothing
			//   - a backslash followed by a single non-alphanumeric character
			//   - a backslash followed by 1 or more characters to achieve a special, context-dependent meaning
			// Look for a backslash followed by a single alphanumeric character:
			var backslash_p = in_character_class ? backslash_in_character_class : backslash_in_normal_context;
			return backslash_p[stream.next()];
		}

		/**
		 * Reads (part of) a name while the 'name' context is current, accumulating it into state.name_value.
		 * At least one character is always consumed. The 'name' context is popped once the next character is not
		 * a word character (or the stream is exhausted).
		 *
		 * Errors reported through the returned style:
		 *   - 'err erroneous-start-of-name': the first character is a digit or not a word character;
		 *   - 'err name-too-long': more than 32 characters were read.
		 */
		function handle_name(stream, state) {
			var error_token, matched, remaining;
			var first = stream.next();
			var already_read = state.name_value.length;

			if (!already_read && (!first.match(/\w/) || first.match(/\d/))) {
				// Names must start with a non-digit; consume nothing beyond the offending character.
				error_token = 'err erroneous-start-of-name';
				remaining = 0;
			} else if (already_read > 31) {
				// Names consist of up to 32 alphanumeric characters and underscores; swallow the whole excess.
				error_token = 'err name-too-long';
				remaining = -1;
			} else {
				// Room left in the name once `first` is accounted for.
				remaining = 31 - already_read;
			}
			state.name_value += first;

			if (remaining < 0) {
				matched = stream.match(/^\w+/);
				if (matched) state.name_value += matched[0];
			} else {
				for (; remaining > 0; -- remaining) {
					matched = stream.match(/^\w/);
					if (!matched) break;
					state.name_value += matched[0];
				}
			}

			var lookahead = stream.peek();
			if (lookahead && lookahead.match(/\w/)) return all_tokens(state, error_token);
			return pop(state, error_token);
		}

		/**
		 * Consumes a callout, i.e. (?C) or (?Cn) where n is the callout data, if one starts here.
		 * Returns false when there is none, 'callout' when n is below 256 (or absent), and
		 * 'err erroneous-callout-number' otherwise. `state` is not used.
		 */
		function handle_callout(stream, state) {
			var found = stream.match(callout_regex);
			if (!found) return false;
			if (Number(found[1]) < 256) return 'callout';
			return 'err erroneous-callout-number';
		}

		/**
		 * Handles the 'condition-subroutine' context, i.e. the R, Rn or R&name part of a recursion condition.
		 * When the closing parenthesis is reached, the context is dropped and tokenizing resumes in the enclosing
		 * context. Otherwise an optional "R" is consumed, followed either by "&" (a name is then expected) or by
		 * optional digits (the context is then popped).
		 */
		function handle_condition_subroutines(stream, state) {
			var at_closing_parenthesis = (stream.peek() === ')');
			if (at_closing_parenthesis) {
				pop(state);
				return tokenBase(stream, state);
			}
			stream.eat('R');
			var refers_to_a_name = stream.eat('&');
			if (!refers_to_a_name) {
				stream.match(/\d+/);
				return pop(state);
			}
			return expect_name(state);
		}

		/**
		 * Handles the 'condition' context, i.e. the condition part of a conditional group.
		 *
		 * The context state carries an `ok` flag: once it is set, the next call drops the 'condition' context and
		 * resumes tokenizing in the enclosing context. Each recognised kind of condition either sets that flag or
		 * registers an expected closing delimiter, then delegates the details to another context ('name',
		 * 'condition-subroutine' or a group). Anything unrecognised consumes one character and is flagged with
		 * 'err erroneous-condition'.
		 *
		 * @returns the style string for the consumed text.
		 */
		function handle_conditions(stream, state) {
			var condition_state = current_context_state(state);
			// A closing delimiter registered through expect_end() takes precedence over everything else:
			var expected_end = read_expected_end(stream, state);
			if (expected_end) return expected_end;
			// The condition was already recognised: leave this context and let the enclosing one carry on.
			if (condition_state.ok) {
				pop(state);
				return tokenBase(stream, state);
			}
			// (?(DEFINE)...  define subpattern for reference
			if (stream.match(/DEFINE(?=\))/)) {
				return pop(state, 'define');
			}
			// (?(R)...   overall recursion condition
			// (?(Rn)...  specific group recursion condition
			// (?(R&name)...) specific recursion condition
			if (stream.match(/R(\d+|&\w+|)\)/, false)) {
				condition_state.ok = true;
				push(state, 'condition-subroutine');
				return tokenBase(stream, state);
			}
			// (?(n)...   absolute reference condition
			// (?(+n)...  relative reference condition
			// (?(-n)...  relative reference condition
			if (stream.match(/(-|\+|)\d+/)) {
				condition_state.ok = true;
				return all_tokens(state, 'backreference');
			}
			// A name enclosed in <...> or '...': the matching closing delimiter is read by this context later on.
			var rem = stream.match(/([<'])/);
			if (rem) {
				condition_state.ok = false;
				expect_end(state, delimiter(rem[1]));
				return expect_name(state);
			}
			// A bare name (nothing is consumed here):
			if (stream.match(/\w+/, false)) {
				condition_state.ok = true; // the "name" state will handle everything for us
				return expect_name(state);
			}
			// If the condition is not in any of the above formats, it must be an assertion.  This may be a positive or
			// negative lookahead or lookbehind assertion.
			if (stream.match(/\?<?[=!]/)) {
				condition_state.ok = true; // the "group" state will handle everything for us
				// Ensure "group" leaves the closing parenthesis untouched so "start-group" can consume it:
				var group_options = {'leave_closing_parenthesis': true};
				return push(state, 'group' + (++ state.group_level), group_options, 'start-group');
			}
			stream.next();
			return all_tokens(state, 'err erroneous-condition');
		}

		/**
		 * Handles the 'start-group' context, i.e. the opening of a group that starts with "(?" and needs more than
		 * one token: named groups, groups with shorthand options and conditional groups.
		 *
		 * Progress is tracked by two counters kept in the context state:
		 *   - option_shorthand: 1 = "(?" was consumed and the option letters come next, 2 = the option letters
		 *     were consumed and the ":" comes next;
		 *   - condition_callout: 1 = "(?" was consumed and a callout comes next, 2 = the opening parenthesis of
		 *     the condition comes next.
		 *
		 * @returns the style string for the consumed text.
		 */
		function handle_start_group(stream, state) {
			var start_group_state = current_context_state(state);
			// A closing delimiter registered through expect_end() takes precedence over everything else:
			var expected_end = read_expected_end(stream, state);
			if (expected_end) return expected_end;
			var rem;
			if (start_group_state.option_shorthand === 1) {
				// A shorthand option was spotted, handle it:
				start_group_state.option_shorthand = 2;
				stream.match(/[^:]+/);
				return all_tokens(state, 'option-sequence');
			}
			if (start_group_state.option_shorthand === 2) {
				// A shorthand option was handled, finish the job:
				stream.eat(':');
				return pop(state);
			}
			if (start_group_state.condition_callout === 1) {
				// A pre-condition callout was spotted, handle it:
				start_group_state.condition_callout = 2;
				return all_tokens(state, handle_callout(stream, state));
			}
			if (start_group_state.condition_callout === 2) {
				// A pre-condition callout was handled, resume
				stream.eat('(');
				// The ")" closing the condition is registered on this context's state, before 'condition' is pushed:
				expect_end(state, ')');
				return push(state, 'condition');
			}
			// (?<name>...)   named capturing group (Perl)
			// (?'name'...)   named capturing group (Perl)
			// (?P<name>...)  named capturing group (Python)
			rem = stream.match(/\(\?P?([<'])/);
			if (rem) {
				expect_end(state, delimiter(rem[1]));
				return expect_name(state);
			}
			// Same as (?: but with options, e.g. (?x-i:
			if (stream.match(group_options_regex, false)) {
				// As a convenient shorthand, if any option settings are required at the start of a non-capturing
				// subpattern, the option letters may appear between the "?" and the ":".
				stream.match('(?');
				start_group_state.option_shorthand = 1;
				return all_tokens(state);
			}
			// "(?(" typically marks the start of a condition: (?(condition)yes-pattern|no-pattern)
			if (stream.match('(?') && stream.peek() === '(') {
				// An explicit callout may be set just before an assertion condition: (?(?C7)(?<!abc)def|ghi)
				start_group_state.condition_callout = (stream.match(condition_callout_regex, false)) ? 1 : 2;
				return all_tokens(state);
			}
			stream.next();
			return all_tokens(state, 'err erroneous-start-of-start-group');
		}

		/**
		 * Handles the 'backreference' context, entered for references by name: \k<name>, \k'name', \k{name},
		 * \g{name} and (?P=name).
		 * On the first call, the opening part is consumed, the matching closing delimiter is registered and a name
		 * is expected; later calls read that closing delimiter. Anything else consumes one character and is
		 * flagged with 'err erroneous-backreference'.
		 */
		function handle_backreference(stream, state) {
			var closing = read_expected_end(stream, state);
			if (closing) return closing;

			// Tried in order; capture group 1 is always the opening delimiter.
			var openings = [/k([<'{])/, /g(\{)/, /(\()\?P=/];
			var opening = null;
			for (var i = 0; i < openings.length && !opening; ++i) {
				opening = stream.match(openings[i]);
			}
			if (!opening) {
				stream.next();
				return all_tokens(state, 'err erroneous-backreference');
			}
			expect_end(state, delimiter(opening[1]));
			return expect_name(state);
		}

		/**
		 * Handles the 'subroutine' context, entered for subpattern calls by name: \g<name>, \g'name', (?&name)
		 * and (?P>name).
		 * On the first call, the opening part is consumed, the matching closing delimiter is registered and a name
		 * is expected; later calls read that closing delimiter. Anything else consumes one character and is
		 * flagged with 'err erroneous-subroutine'.
		 */
		function handle_subroutine(stream, state) {
			var closing = read_expected_end(stream, state);
			if (closing) return closing;

			// Tried in order; capture group 1 is always the opening delimiter.
			var openings = [/g([<'])/, /(\()\?(P>|&)/];
			var opening = null;
			for (var i = 0; i < openings.length && !opening; ++i) {
				opening = stream.match(openings[i]);
			}
			if (!opening) {
				stream.next();
				return all_tokens(state, 'err erroneous-subroutine');
			}
			expect_end(state, delimiter(opening[1]));
			return expect_name(state);
		}

		/**
		 * Handles the 'verb' context. Reads the pending closing character if one is registered; otherwise
		 * registers ")" as the closing character and expects a name.
		 */
		function handle_verb(stream, state) {
			var result = read_expected_end(stream, state);
			if (!result) {
				expect_end(state, ')');
				result = expect_name(state);
			}
			return result;
		}

		/**
		 * Applies an option sequence such as "(?x-i)" to the state.
		 * Only the x (extended mode) option is tracked: state.extended is set to true when "x" appears before
		 * any "-", to false when it appears after one, and is left alone when "x" is absent.
		 */
		function update_options(state, options) {
			var unsetting = false;
			var extended; // stays undefined unless an "x" is met
			for (var i = 0; i < options.length; ++i) {
				switch (options[i]) {
					case '-':
						unsetting = true;
						break;
					case 'x':
						extended = !unsetting;
						break;
				}
			}
			if (extended !== undefined) state.extended = extended;
		}

		/**
		 * Tokenizer entry point (exposed as the mode's `token` function).
		 *
		 * Contexts that need several tokens ('name', 'condition', 'start-group', ...) are delegated to their own
		 * handler; everything else is recognised here. The returned style string lists the contexts currently on
		 * the stack followed by the style of the token itself, if any.
		 *
		 * @param stream the CodeMirror stream to read from.
		 * @param state  the mode state (see startState()).
		 * @returns the style string for the consumed text, or undefined when there is nothing to read.
		 */
		function tokenBase(stream, state) {
			var rem, ret; // stand for Regular Expression Match and RETurn, respectively.

			// Get current state, current char, next char:
			var ch = stream.peek();
			if (!ch) return;
			var current_state = current(state);
			var group_state;

			if (current_state === 'name') return handle_name(stream, state);
			if (current_state === 'condition') return handle_conditions(stream, state);
			if (current_state === 'condition-subroutine') return handle_condition_subroutines(stream, state);
			if (current_state === 'start-group') return handle_start_group(stream, state);
			if (current_state === 'backreference') return handle_backreference(stream, state);
			if (current_state === 'subroutine') return handle_subroutine(stream, state);
			if (current_state === 'verb') return handle_verb(stream, state);

			// Inside \Q...\E, nothing is special but the closing \E:
			if (current_state === 'escaped-sequence') {
				if (stream.match('\\E')) return pop(state, 'escaped-sequence-end');
				consume(stream);
				return all_tokens(state);
			}

			// Escaped characters:
			if (stream.match(/\\./, false)) return all_tokens(state, handle_backslash(stream, state));

			if (stream.match('[', false)) {
				if (current_state !== 'character-class') {
					if (stream.match(posix_named_sets_regex)) {
						return all_tokens(state, 'err posix-outside-class-unsupported');
					}
					// In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and
					// [[:>:]] is used for matching "start of word" and "end of word".
					if (stream.match('[[:<:]]') || stream.match('[[:>:]]')) return all_tokens(state, 'anchor');
					// At this stage, we do have a new character class:
					push(state, 'character-class');
					stream.eat('[');
					stream.eat('^');
					// If a closing square bracket is required as a member of the class, it should be the first data
					// character in the class (after an initial circumflex, if present) or escaped with a backslash.
					// Note: ']' should be on the same line as '[', even in extended mode.
					stream.eat(']');
					return all_tokens(state);
				}
			}

			if (current_state === 'character-class') {
				rem = stream.match(posix_named_sets_regex);
				if (rem) {
					// Own-property lookup only: the `in` operator would also accept names inherited from
					// Object.prototype, so that [:constructor:] would pass as a known POSIX class.
					if (Object.prototype.hasOwnProperty.call(posix_named_sets, rem[1])) {
						return all_tokens(state, 'generic-character-type');
					}
					return all_tokens(state, 'err unknown-posix-class-name');
				}
				if (stream.eat(']')) return pop(state);
				consume(stream);
				return all_tokens(state);
			}

			// Regular comments in extended mode:
			if (state.extended && stream.eat('#')) {
				stream.skipToEnd();
				return 'comment';
			}

			if (stream.eat('{')) {
				// exactly n:
				if (stream.match(/\d+\}/)) return all_tokens(state, 'quantifier');
				// "at least n, no more than m" and "n or more", greedy, possessive or lazy:
				if (stream.match(/\d+,\d*\}[+?]?/)) return all_tokens(state, 'quantifier');
				// An opening curly bracket that does not start a quantifier is a literal character. It must be
				// returned on its own: falling through would merge it with whatever follows (an alternation, a
				// parenthesis, a quantifier or the backslash of an escape sequence) into a single token.
				return all_tokens(state);
			}

			if (stream.eat('|')) {
				return all_tokens(state, 'alternation');
			}

			if (stream.peek() === '(') {
				if (stream.match(/\(\*(?:UTF(?:8|16|32|)|UCP|NO_AUTO_POSSESS|NO_START_OPT)\)/)) return all_tokens(state, 'option-sequence');
				if (stream.match(/\(\*LIMIT_(?:RECURSION|MATCH)=[0-9]+\)/)) return all_tokens(state, 'option-sequence');
				// Newline convention + what \R matches:
				if (stream.match(/\(\*(?:CR|LF|CRLF|ANYCRLF|ANY|BSR_(?:ANYCRLF|UNICODE))\)/)) return all_tokens(state, 'option-sequence');
				// Backtracking control:
				if (stream.match(/\(\*(?:ACCEPT|FAIL|F|COMMIT|PRUNE|SKIP|THEN)\)/)) return all_tokens(state, 'verb');
				if (stream.match(/\(\*(?:MARK|PRUNE|SKIP|THEN|):/)) return push(state, 'verb', {}, 'verb');
				rem = stream.match(option_sequence_regex);
				if (rem) {
					update_options(state, rem[0]);
					return all_tokens(state, 'option-sequence');
				}
				// (?#....)  comment (not nestable)
				if (stream.match(/\(\?#[^)]*\)/)) return all_tokens(state, 'comment');
				// (?P=name)  reference by name (Python)
				if (stream.match(/\(\?P=/, false)) return push(state, 'backreference');
				// (?&name)   call subpattern by name (Perl)
				// (?P>name)  call subpattern by name (Python)
				if (stream.match(/\(\?(P>|&)/, false)) return push(state, 'subroutine');
				// (?n)       call subpattern by absolute number
				// (?+n)      call subpattern by relative number
				// (?-n)      call subpattern by relative number
				if (stream.match(/\(\?(\-|\+|)\d+\)/)) return all_tokens(state, 'subroutine');
				// (?R)       recurse whole pattern
				if (stream.match('(?R)')) return all_tokens(state, 'subroutine');
				// Callouts:
				var callout = handle_callout(stream, state);
				if (callout) return all_tokens(state, callout);

				// At this stage, we have a new group:
				++ state.group_level;
				group_state = 'group' + state.group_level;
				push(state, group_state);

				// (?=...)   positive look ahead
				// (?!...)   negative look ahead
				// (?<=...)  positive look behind
				// (?<!...)  negative look behind
				if (stream.match(assertion_regex)) return all_tokens(state, 'start-group');
				// (?:...)   non-capturing group
				// (?|...)   non-capturing group; reset group numbers for capturing groups in each alternative
				// (?>...)   atomic, non-capturing group
				if (stream.match(/\(\?[:|>]/)) return all_tokens(state, 'start-group');

				// Any other "(?" opening needs several tokens: hand it over to the 'start-group' context.
				if (stream.match('(?', false)) {
					push(state, 'start-group');
					return tokenBase(stream, state);
				}
				stream.eat('(');
				return all_tokens(state, 'start-group');
			}

			if (stream.peek() === ')') {
				if (current_state && current_state.match(/^group/)) {
					ret = 'start-group'; // formerly 'end-group' but that used to confuse matchbrackets (see issue #4)
					if (current_context_state(state).leave_closing_parenthesis) ret = '';
					else stream.next();
					-- state.group_level;
					return pop(state, ret);
				}
				stream.next();
				return all_tokens(state, 'err unmatched-closing-parenthesis');
			}

			// Anchors
			if (stream.eat('^') || stream.eat('$')) return all_tokens(state, 'anchor');
			if (stream.eat('.')) return all_tokens(state, 'generic-character-type');
			// Quantifiers: 0 or 1, 0 or more, 1 or more, greedy:
			if (stream.eat('?') || stream.eat('*') || stream.eat('+')) {
				// Handle possessive and lazy variants:
				stream.eat(/[+?]/);
				return all_tokens(state, 'quantifier');
			}
			consume(stream);
			return all_tokens(state);
		}

		/**
		 * Creates the initial mode state:
		 *   - context / context_state: parallel stacks of context names and their state objects;
		 *   - group_level: counter used to number the group contexts;
		 *   - name_value: the name being read, if any;
		 *   - extended: whether extended mode is active, initialised from the mode options.
		 */
		function startState() {
			var state = {};
			state.context = [];
			state.context_state = [];
			state.group_level = 0;
			state.name_value = '';
			state.extended = options.extended;
			return state;
		}

		/**
		 * Returns an independent copy of the state `o`.
		 * The context stack is duplicated, and so is each context state object; within those, the `expected`
		 * array is duplicated as well, whereas every other property is copied as is.
		 */
		function copyState(o) {
			var copy = startState();
			var originals = o.context_state;
			for (var i = 0; i < originals.length; ++i) {
				var duplicate = {};
				for (var key in originals[i]) {
					// `expected` is mutated while closing delimiters are read, hence its own copy:
					if (key === 'expected') duplicate[key] = originals[i][key].slice();
					else duplicate[key] = originals[i][key];
				}
				copy.context_state.push(duplicate);
			}
			copy.extended = o.extended;
			copy.name_value = o.name_value;
			copy.group_level = o.group_level;
			copy.context = o.context.slice();
			return copy;
		}

		// The mode object handed back to CodeMirror: how to create a state, how to copy one, and the tokenizer.
		return {
			startState: startState,
			copyState: copyState,
			token: tokenBase,
		};
	});

	// Associate both MIME types with the 'pcre' mode defined above:
	CodeMirror.defineMIME('text/x-regex', 'pcre');
	CodeMirror.defineMIME('text/x-pcre-regex', 'pcre');

});