Comparing version 0.3.3 to 0.4.0
253
moo.js
@@ -13,2 +13,4 @@ (function(root, factory) { | ||
var hasOwnProperty = Object.prototype.hasOwnProperty | ||
// polyfill assign(), so we support IE9+ | ||
var assign = typeof Object.assign === 'function' ? Object.assign : | ||
@@ -37,5 +39,7 @@ // https://tc39.github.io/ecma262/#sec-object.assign | ||
/***************************************************************************/ | ||
// True if `o` is a RegExp; falsy inputs are passed through unchanged.
function isRegExp(o) {
  if (!o) return o
  return o.constructor === RegExp
}
// True only for plain-ish objects: excludes RegExps and Arrays.
// Falsy inputs are passed through unchanged.
function isObject(o) {
  if (!o) return o
  if (typeof o !== 'object') return false
  return o.constructor !== RegExp && !Array.isArray(o)
}
function reEscape(s) { | ||
@@ -58,3 +62,2 @@ return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&') | ||
function compareLength(a, b) { | ||
@@ -86,3 +89,15 @@ return b.length - a.length | ||
var key = keys[i] | ||
result.push(ruleOptions(key, object[key])) | ||
var thing = object[key] | ||
var rules = Array.isArray(thing) ? thing : [thing] | ||
var match = [] | ||
rules.forEach(function(rule) { | ||
if (isObject(rule)) { | ||
if (match.length) result.push(ruleOptions(key, match)) | ||
result.push(ruleOptions(key, rule)) | ||
match = [] | ||
} else { | ||
match.push(rule) | ||
} | ||
}) | ||
if (match.length) result.push(ruleOptions(key, match)) | ||
} | ||
@@ -117,4 +132,5 @@ return result | ||
error: false, | ||
value: null, | ||
getType: null, | ||
}, obj) | ||
options.keywords = null | ||
@@ -124,57 +140,15 @@ // convert to array | ||
options.match = Array.isArray(match) ? match : match ? [match] : [] | ||
options.match.sort(function(a, b) { | ||
return isRegExp(a) && isRegExp(b) ? 0 | ||
: isRegExp(b) ? -1 : isRegExp(a) ? +1 : b.length - a.length | ||
}) | ||
if (options.keywords) { | ||
options.getType = keywordTransform(options.keywords) | ||
} | ||
return options | ||
} | ||
// Normalise each rule's match list for longest-match lexing:
// literals are sorted longest-first, plain regexps are appended after them,
// and every capturing regexp is split out into its own rule.
function sortRules(rules) {
  var result = []
  rules.forEach(function(options) {
    var literals = []
    var plainPatterns = []
    var capturing = []
    options.match.forEach(function(m) {
      if (!isRegExp(m)) {
        literals.push(m)
      } else if (reGroups(m.source) > 0) {
        capturing.push(m)
      } else {
        plainPatterns.push(m)
      }
    })
    // sort literals by length to ensure longest match; regexps go at the end
    options.match = literals.sort(compareLength).concat(plainPatterns)
    result.push(options)
    // each capturing regexp becomes a separate rule with the same options
    capturing.forEach(function(pat) {
      result.push(assign({}, options, {
        match: [pat],
      }))
    })
  })
  return result
}
// Return the first rule in `otherRules` with a regexp that matches the
// WHOLE of `literal` (e.g. an identifier rule that would swallow a keyword).
// Returns undefined when no rule matches.
function getIdentifier(literal, otherRules) {
  for (var r = 0; r < otherRules.length; r++) {
    var rule = otherRules[r]
    var patterns = rule.match
    for (var p = 0; p < patterns.length; p++) {
      var pat = patterns[p]
      if (!isRegExp(pat)) continue
      var result = pat.exec(literal)
      if (result !== null && result[0] === literal) {
        return rule
      }
    }
  }
}
function compileRules(rules, hasStates) { | ||
rules = Array.isArray(rules) ? arrayToRules(rules) : objectToRules(rules) | ||
rules = sortRules(rules) | ||
var errorRule = null | ||
@@ -193,22 +167,2 @@ var groups = [] | ||
// look for keywords | ||
var match = options.match | ||
var notKeywords = [] | ||
for (var j=0; j<match.length; j++) { | ||
var word = match[j] | ||
if (typeof word === 'string') { | ||
// does it match an existing rule (e.g. identifier?) | ||
var other = getIdentifier(word, rules) | ||
if (other) { | ||
if (!other.keywords) { | ||
other.keywords = Object.create(null) | ||
} | ||
other.keywords[word] = options | ||
continue | ||
} | ||
} | ||
notKeywords.push(word) | ||
} | ||
options.match = notKeywords | ||
// skip rules with no match | ||
@@ -229,4 +183,4 @@ if (options.match.length === 0) { | ||
var groupCount = reGroups(pat) | ||
if (groupCount > 1) { | ||
throw new Error("RegExp has more than one capture group: " + regexp) | ||
if (groupCount > 0) { | ||
throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: β¦ ) instead") | ||
} | ||
@@ -243,5 +197,3 @@ if (!hasStates && (options.pop || options.push || options.next)) { | ||
// store regex | ||
var isCapture = !!groupCount | ||
if (!isCapture) pat = reCapture(pat) | ||
parts.push(pat) | ||
parts.push(reCapture(pat)) | ||
} | ||
@@ -274,3 +226,3 @@ | ||
for (var j=0; j<groups.length; j++) { | ||
var g = groups[i] | ||
var g = groups[j] | ||
var state = g && (g.push || g.next) | ||
@@ -280,2 +232,6 @@ if (state && !map[state]) { | ||
} | ||
var pop = g && g.pop | ||
if (pop && typeof pop !== 'boolean') { | ||
throw new Error("pop must be true (in token '" + g.tokenType + "' of state '" + keys[i] + "')") | ||
} | ||
} | ||
@@ -287,3 +243,42 @@ } | ||
/**
 * Compile a keyword map into a fast lookup function.
 *
 * `map` maps token type names to a keyword (string) or an array of keywords.
 * Returns `getType(value)`, which returns the token type for a known keyword
 * and undefined otherwise.
 *
 * The lookup is code-generated as a switch on string length, then on the
 * string itself, which benchmarks faster than plain property lookup:
 * https://jsperf.com/string-lookups
 */
function keywordTransform(map) {
  var reverseMap = Object.create(null)  // keyword -> token type
  var byLength = Object.create(null)    // keyword length -> [keywords]
  var types = Object.getOwnPropertyNames(map)
  for (var i = 0; i < types.length; i++) {
    var tokenType = types[i]
    var item = map[tokenType]
    var keywordList = Array.isArray(item) ? item : [item]
    keywordList.forEach(function(keyword) {
      (byLength[keyword.length] = byLength[keyword.length] || []).push(keyword)
      reverseMap[keyword] = tokenType
    })
  }

  // fast string lookup
  // https://jsperf.com/string-lookups
  function str(x) { return JSON.stringify(x) }
  var source = ''
  source += '(function(value) {\n'
  source += 'switch (value.length) {\n'
  for (var length in byLength) {
    var keywords = byLength[length]
    source += 'case ' + length + ':\n'
    source += 'switch (value) {\n'
    keywords.forEach(function(keyword) {
      var tokenType = reverseMap[keyword]
      if (typeof tokenType !== 'string') {
        // bug fix: previously concatenated the undefined variable `name`,
        // which itself raised a ReferenceError instead of this Error
        throw new Error('keyword type must be string: ' + tokenType)
      }
      source += 'case ' + str(keyword) + ': return ' + str(tokenType) + '\n'
    })
    source += '}\n'
  }
  source += '}\n'
  source += '})'
  return eval(source) // getType
}
/***************************************************************************/ | ||
var Lexer = function(states, state) { | ||
@@ -297,2 +292,19 @@ this.startState = state | ||
// Begin lexing `data`, optionally restoring position and lexer state from
// an `info` snapshot produced by save(). Returns `this` for chaining.
Lexer.prototype.reset = function(data, info) {
  this.buffer = data || ''
  this.index = 0
  if (info) {
    this.line = info.line
    this.col = info.col
    this.setState(info.state)
  } else {
    this.line = 1
    this.col = 1
    this.setState(this.startState)
  }
  return this
}
// Snapshot enough state to resume later via reset(data, info).
Lexer.prototype.save = function() {
  var snapshot = {
    line: this.line,
    col: this.col,
    state: this.state,
  }
  return snapshot
}
Lexer.prototype.setState = function(state) { | ||
@@ -303,3 +315,3 @@ if (!state || this.state === state) return | ||
this.groups = info.groups | ||
this.error = info.error | ||
this.error = info.error || {lineBreaks: true, shouldThrow: true} | ||
this.re = info.regexp | ||
@@ -317,3 +329,3 @@ } | ||
Lexer.prototype.eat = hasSticky ? function(re) { // assume re is /y | ||
Lexer.prototype._eat = hasSticky ? function(re) { // assume re is /y | ||
return re.exec(this.buffer) | ||
@@ -329,4 +341,18 @@ } : function(re) { // assume re is /g | ||
// Given a regexp exec() result (or null), return the index of the capture
// group that matched, or -1 when there was no match at all.
// Exactly one group is expected to fire; anything else is an internal error.
Lexer.prototype._getGroup = function(match) {
  if (match === null) return -1

  var total = this.groups.length
  for (var index = 0; index < total; index++) {
    if (match[index + 1] !== undefined) {
      return index
    }
  }
  throw new Error('oops')
}
function tokenToString() { | ||
return this.value || this.type | ||
return this.value | ||
} | ||
@@ -343,27 +369,15 @@ | ||
var match = this.eat(re) | ||
var group, value, text | ||
if (match === null) { | ||
var match = this._eat(re) | ||
var i = this._getGroup(match) | ||
var group, value | ||
if (i === -1) { | ||
group = this.error | ||
// consume rest of buffer | ||
text = value = buffer.slice(index) | ||
value = buffer.slice(index) | ||
} else { | ||
text = match[0] | ||
var groups = this.groups | ||
for (var i = 0; i < groups.length; i++) { | ||
value = match[i + 1] | ||
if (value !== undefined) { | ||
group = groups[i] | ||
// TODO is `buffer` being leaked here? | ||
break | ||
} | ||
} | ||
// assert(i < groupCount) | ||
// check for keywords | ||
if (group.keywords) { | ||
group = group.keywords[text] || group | ||
} | ||
value = match[0] // i+1 | ||
group = this.groups[i] | ||
} | ||
@@ -373,16 +387,16 @@ | ||
var lineBreaks = 0 | ||
if (!group || group.lineBreaks) { | ||
if (group.lineBreaks) { | ||
var matchNL = /\n/g | ||
var nl = 1 | ||
if (text === '\n') { | ||
if (value === '\n') { | ||
lineBreaks = 1 | ||
} else { | ||
while (matchNL.exec(text)) { lineBreaks++; nl = matchNL.lastIndex } | ||
while (matchNL.exec(value)) { lineBreaks++; nl = matchNL.lastIndex } | ||
} | ||
} | ||
var size = text.length | ||
var size = value.length | ||
var token = { | ||
type: group && group.tokenType, | ||
value: value, | ||
type: (group.getType && group.getType(value)) || group.tokenType, | ||
value: (group.value && group.value(value)) || value, | ||
toString: tokenToString, | ||
@@ -404,3 +418,3 @@ offset: index, | ||
// throw, if no rule with {error: true} | ||
if (!group) { | ||
if (group.shouldThrow) { | ||
throw new Error(this.formatError(token, "invalid syntax")) | ||
@@ -425,2 +439,6 @@ } | ||
LexerIterator.prototype[Symbol.iterator] = function() { | ||
return this | ||
} | ||
Lexer.prototype[Symbol.iterator] = function() { | ||
@@ -443,19 +461,2 @@ return new LexerIterator(this) | ||
Lexer.prototype.reset = function(data, info) { | ||
this.buffer = data || '' | ||
this.index = 0 | ||
this.line = info ? info.line : 1 | ||
this.col = info ? info.col : 1 | ||
this.setState(info ? info.state : this.startState) | ||
return this | ||
} | ||
Lexer.prototype.save = function() { | ||
return { | ||
line: this.line, | ||
col: this.col, | ||
state: this.state, | ||
} | ||
} | ||
Lexer.prototype.clone = function() { | ||
@@ -462,0 +463,0 @@ return new Lexer(this.states, this.state) |
{ | ||
"name": "moo", | ||
"version": "0.3.3", | ||
"version": "0.4.0", | ||
"description": "Optimised tokenizer/lexer generator! π Uses /y for performance. Moo!", | ||
@@ -5,0 +5,0 @@ "main": "moo.js", |
@@ -47,4 +47,4 @@ ![](cow.png) | ||
comment: /\/\/.*?$/, | ||
number: /(0|[1-9][0-9]*)/, | ||
string: /"((?:\\["\\]|[^\n"\\])*)"/, | ||
number: /0|[1-9][0-9]*/, | ||
string: /"(?:\\["\\]|[^\n"\\])*"/, | ||
lparen: '(', | ||
@@ -80,3 +80,3 @@ rparen: ')', | ||
let lexer = moo.compile({ | ||
string: /"(.*)"/, // greedy quantifier * | ||
string: /".*"/, // greedy quantifier * | ||
// ... | ||
@@ -93,3 +93,3 @@ }) | ||
let lexer = moo.compile({ | ||
string: /"(.*?)"/, // non-greedy quantifier *? | ||
string: /".*?"/, // non-greedy quantifier *? | ||
// ... | ||
@@ -118,7 +118,5 @@ }) | ||
(Note: moo [special-cases keywords](#keywords); in which case order is ignored.) | ||
* Moo uses **multiline RegExps**. This has a few quirks: for example, the **dot `/./` doesn't include newlines**. Use `[^]` instead if you want to match newlines too. | ||
* Since excluding capture groups like `/[^ ]/` (no spaces) _will_ include newlines, you have to be careful not to include them by accident! In particular, the whitespace metacharacter `\s` includes newlines. | ||
* Since an excluding character range like `/[^ ]/` (which matches anything but a space) _will_ include newlines, you have to be careful not to include them by accident! In particular, the whitespace metacharacter `\s` includes newlines.
@@ -136,4 +134,3 @@ | ||
* **`type`**: the name of the group, as passed to compile. | ||
* **`value`**: the contents of the capturing group (or the whole match, if the token RegExp doesn't define a capture). | ||
* **`size`**: the total length of the match (`value` may be shorter if you have capturing groups). | ||
* **`value`**: the match contents. | ||
* **`offset`**: the number of bytes from the start of the buffer where the match starts. | ||
@@ -165,3 +162,3 @@ * **`lineBreaks`**: the number of line breaks found in the match. (Always zero if this rule has `lineBreaks: false`.) | ||
Moo makes it convenient to define literals and keywords. | ||
Moo makes it convenient to define literals. | ||
@@ -178,26 +175,50 @@ ```js | ||
Important! **Always write your literals like this:** | ||
**Keywords** should be written using the `keywords` attribute. | ||
```js | ||
['while', 'if', 'else', 'moo', 'cows'] | ||
moo.compile({ | ||
IDEN: {match: /[a-zA-Z]+/, keywords: { | ||
KW: ['while', 'if', 'else', 'moo', 'cows'],
}}, | ||
SPACE: {match: /\s+/, lineBreaks: true}, | ||
}) | ||
``` | ||
And **not** like this: | ||
You need to do this to ensure the **longest match** principle applies, even in edge cases. | ||
Imagine trying to parse the input `className` with the following rules: | ||
```js | ||
/while|if|else|moo|cows/ | ||
['keyword', ['class']], | ||
['identifier', /[a-zA-Z]+/], | ||
``` | ||
### Why? ### | ||
You'll get _two_ tokens — `['class', 'Name']` — which is _not_ what you want! If you swap the order of the rules, you'll fix this example; but now you'll lex `class` wrong (as an `identifier`).
The reason: Moo special-cases keywords to ensure the **longest match** principle applies, even in edge cases. | ||
The keywords helper checks matches against the list of keywords; if any of them match, it uses the type `'keyword'` instead of `'identifier'` (for this example). | ||
Imagine trying to parse the input `className` with the following rules: | ||
Keywords can also have **individual types**. | ||
['keyword', ['class']], | ||
['identifier', /[a-zA-Z]+/], | ||
```js | ||
let lexer = moo.compile({ | ||
name: {match: /[a-zA-Z]+/, keywords: { | ||
'kw-class': 'class', | ||
'kw-def': 'def', | ||
'kw-if': 'if', | ||
}}, | ||
// ... | ||
}) | ||
lexer.reset('def foo') | ||
lexer.next() // -> { type: 'kw-def', value: 'def' } | ||
lexer.next() // space | ||
lexer.next() // -> { type: 'name', value: 'foo' } | ||
``` | ||
You'll get _two_ tokens — `['class', 'Name']` — which is _not_ what you want! If you swap the order of the rules, you'll fix this example; but now you'll lex `class` wrong (as an `identifier`).
Use [itt](https://github.com/nathan/itt)'s iterator adapters to make constructing keyword objects easier: | ||
Moo solves this by checking to see if any of your literals can be matched by one of your other rules; if so, it doesn't lex the keyword separately, but instead handles it at a later stage (by checking identifiers against a list of keywords). | ||
```js | ||
itt(['class', 'def', 'if']) | ||
.map(k => ['kw-' + k, k]) | ||
.toObject() | ||
``` | ||
@@ -216,3 +237,3 @@ | ||
lbrace: {match: '{', push: 'main'}, | ||
rbrace: {match: '}', pop: 1}, | ||
rbrace: {match: '}', pop: true}, | ||
colon: ':', | ||
@@ -224,3 +245,3 @@ space: {match: /\s+/, lineBreaks: true}, | ||
escape: /\\./, | ||
strend: {match: '`', pop: 1}, | ||
strend: {match: '`', pop: true}, | ||
const: {match: /(?:[^$`]|\$(?!\{))+/, lineBreaks: true}, | ||
@@ -311,2 +332,19 @@ }, | ||
Transform | ||
--------- | ||
Moo doesn't allow capturing groups, but you can supply a transform function, `value()`, which will be called on the value before storing it in the Token object. | ||
```js | ||
moo.compile({ | ||
STRING: [ | ||
{match: /"""[^]*?"""/, lineBreaks: true, value: x => x.slice(3, -3)}, | ||
{match: /"(?:\\["\\rn]|[^"\\])*?"/, lineBreaks: true, value: x => x.slice(1, -1)}, | ||
{match: /'(?:\\['\\rn]|[^'\\])*?'/, lineBreaks: true, value: x => x.slice(1, -1)}, | ||
], | ||
// ... | ||
}) | ||
``` | ||
Contributing | ||
@@ -313,0 +351,0 @@ ------------ |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
25704
4
403
345