Comparing version 0.4.3 to 0.5.0
352
moo.js
@@ -13,25 +13,3 @@ (function(root, factory) { | ||
var hasOwnProperty = Object.prototype.hasOwnProperty | ||
// polyfill assign(), so we support IE9+ | ||
var assign = typeof Object.assign === 'function' ? Object.assign : | ||
// https://tc39.github.io/ecma262/#sec-object.assign | ||
function(target, sources) { | ||
if (target == null) { | ||
throw new TypeError('Target cannot be null or undefined'); | ||
} | ||
target = Object(target) | ||
for (var i = 1; i < arguments.length; i++) { | ||
var source = arguments[i] | ||
if (source == null) continue | ||
for (var key in source) { | ||
if (hasOwnProperty.call(source, key)) { | ||
target[key] = source[key] | ||
} | ||
} | ||
} | ||
return target | ||
} | ||
var toString = Object.prototype.toString | ||
var hasSticky = typeof new RegExp().sticky === 'boolean' | ||
@@ -41,4 +19,4 @@ | ||
function isRegExp(o) { return o && o.constructor === RegExp } | ||
function isObject(o) { return o && typeof o === 'object' && o.constructor !== RegExp && !Array.isArray(o) } | ||
function isRegExp(o) { return o && toString.call(o) === '[object RegExp]' } | ||
function isObject(o) { return o && typeof o === 'object' && !isRegExp(o) && !Array.isArray(o) } | ||
@@ -56,2 +34,3 @@ function reEscape(s) { | ||
function reUnion(regexps) { | ||
if (!regexps.length) return '(?!)' | ||
var source = regexps.map(function(s) { | ||
@@ -69,10 +48,11 @@ return "(?:" + s + ")" | ||
// TODO: consider /u support | ||
if (obj.ignoreCase) { throw new Error('RegExp /i flag not allowed') } | ||
if (obj.global) { throw new Error('RegExp /g flag is implied') } | ||
if (obj.sticky) { throw new Error('RegExp /y flag is implied') } | ||
if (obj.multiline) { throw new Error('RegExp /m flag is implied') } | ||
if (obj.ignoreCase) throw new Error('RegExp /i flag not allowed') | ||
if (obj.global) throw new Error('RegExp /g flag is implied') | ||
if (obj.sticky) throw new Error('RegExp /y flag is implied') | ||
if (obj.multiline) throw new Error('RegExp /m flag is implied') | ||
if (obj.unicode) throw new Error('RegExp /u flag is not allowed') | ||
return obj.source | ||
} else { | ||
throw new Error('not a pattern: ' + obj) | ||
throw new Error('Not a pattern: ' + obj) | ||
} | ||
@@ -84,6 +64,12 @@ } | ||
var result = [] | ||
for (var i=0; i<keys.length; i++) { | ||
for (var i = 0; i < keys.length; i++) { | ||
var key = keys[i] | ||
var thing = object[key] | ||
var rules = Array.isArray(thing) ? thing : [thing] | ||
var rules = [].concat(thing) | ||
if (key === 'include') { | ||
for (var j = 0; j < rules.length; j++) { | ||
result.push({include: rules[j]}) | ||
} | ||
continue | ||
} | ||
var match = [] | ||
@@ -106,8 +92,15 @@ rules.forEach(function(rule) { | ||
var result = [] | ||
for (var i=0; i<array.length; i++) { | ||
for (var i = 0; i < array.length; i++) { | ||
var obj = array[i] | ||
if (!obj.name) { | ||
throw new Error('Rule has no name: ' + JSON.stringify(obj)) | ||
if (obj.include) { | ||
var include = [].concat(obj.include) | ||
for (var j = 0; j < include.length; j++) { | ||
result.push({include: include[j]}) | ||
} | ||
continue | ||
} | ||
result.push(ruleOptions(obj.name, obj)) | ||
if (!obj.type) { | ||
throw new Error('Rule has no type: ' + JSON.stringify(obj)) | ||
} | ||
result.push(ruleOptions(obj.type, obj)) | ||
} | ||
@@ -117,11 +110,14 @@ return result | ||
function ruleOptions(name, obj) { | ||
if (typeof obj !== 'object' || Array.isArray(obj) || isRegExp(obj)) { | ||
function ruleOptions(type, obj) { | ||
if (!isObject(obj)) { | ||
obj = { match: obj } | ||
} | ||
if (obj.include) { | ||
throw new Error('Matching rules cannot also include states') | ||
} | ||
// nb. error implies lineBreaks | ||
var options = assign({ | ||
tokenType: name, | ||
lineBreaks: !!obj.error, | ||
// nb. error and fallback imply lineBreaks | ||
var options = { | ||
defaultType: type, | ||
lineBreaks: !!obj.error || !!obj.fallback, | ||
pop: false, | ||
@@ -131,6 +127,20 @@ next: null, | ||
error: false, | ||
fallback: false, | ||
value: null, | ||
getType: null, | ||
}, obj) | ||
type: null, | ||
shouldThrow: false, | ||
} | ||
// Avoid Object.assign(), so we support IE9+ | ||
for (var key in obj) { | ||
if (hasOwnProperty.call(obj, key)) { | ||
options[key] = obj[key] | ||
} | ||
} | ||
// type transform cannot be a string | ||
if (typeof options.type === 'string' && type !== options.type) { | ||
throw new Error("Type transform cannot be a string (type '" + options.type + "' for token '" + type + "')") | ||
} | ||
// convert to array | ||
@@ -143,20 +153,40 @@ var match = options.match | ||
}) | ||
if (options.keywords) { | ||
options.getType = keywordTransform(options.keywords) | ||
} | ||
return options | ||
} | ||
function toRules(spec) { | ||
return Array.isArray(spec) ? arrayToRules(spec) : objectToRules(spec) | ||
} | ||
var defaultErrorRule = ruleOptions('error', {lineBreaks: true, shouldThrow: true}) | ||
function compileRules(rules, hasStates) { | ||
rules = Array.isArray(rules) ? arrayToRules(rules) : objectToRules(rules) | ||
var errorRule = null | ||
var fast = Object.create(null) | ||
var fastAllowed = true | ||
var groups = [] | ||
var parts = [] | ||
for (var i=0; i<rules.length; i++) { | ||
// If there is a fallback rule, then disable fast matching | ||
for (var i = 0; i < rules.length; i++) { | ||
if (rules[i].fallback) { | ||
fastAllowed = false | ||
} | ||
} | ||
for (var i = 0; i < rules.length; i++) { | ||
var options = rules[i] | ||
if (options.error) { | ||
if (options.include) { | ||
// all valid inclusions are removed by states() preprocessor | ||
throw new Error('Inheritance is not allowed in stateless lexers') | ||
} | ||
if (options.error || options.fallback) { | ||
// errorRule can only be set once | ||
if (errorRule) { | ||
throw new Error("Multiple error rules not allowed: (for token '" + options.tokenType + "')") | ||
if (!options.fallback === !errorRule.fallback) { | ||
throw new Error("Multiple " + (options.fallback ? "fallback" : "error") + " rules not allowed (for token '" + options.defaultType + "')") | ||
} else { | ||
throw new Error("fallback and error are mutually exclusive (for token '" + options.defaultType + "')") | ||
} | ||
} | ||
@@ -166,10 +196,30 @@ errorRule = options | ||
// skip rules with no match | ||
if (options.match.length === 0) { | ||
var match = options.match | ||
if (fastAllowed) { | ||
while (match.length && typeof match[0] === 'string' && match[0].length === 1) { | ||
var word = match.shift() | ||
fast[word.charCodeAt(0)] = options | ||
} | ||
} | ||
// Warn about inappropriate state-switching options | ||
if (options.pop || options.push || options.next) { | ||
if (!hasStates) { | ||
throw new Error("State-switching options are not allowed in stateless lexers (for token '" + options.defaultType + "')") | ||
} | ||
if (options.fallback) { | ||
throw new Error("State-switching options are not allowed on fallback tokens (for token '" + options.defaultType + "')") | ||
} | ||
} | ||
// Only rules with a .match are included in the RegExp | ||
if (match.length === 0) { | ||
continue | ||
} | ||
fastAllowed = false | ||
groups.push(options) | ||
// convert to RegExp | ||
var pat = reUnion(options.match.map(regexpOrLiteral)) | ||
var pat = reUnion(match.map(regexpOrLiteral)) | ||
@@ -185,5 +235,2 @@ // validate | ||
} | ||
if (!hasStates && (options.pop || options.push || options.next)) { | ||
throw new Error("State-switching options are not allowed in stateless lexers (for token '" + options.tokenType + "')") | ||
} | ||
@@ -199,36 +246,84 @@ // try and detect rules matching newlines | ||
var suffix = hasSticky ? '' : '|(?:)' | ||
var flags = hasSticky ? 'ym' : 'gm' | ||
// If there's no fallback rule, use the sticky flag so we only look for | ||
// matches at the current index. | ||
// | ||
// If we don't support the sticky flag, then fake it using an irrefutable | ||
// match (i.e. an empty pattern). | ||
var fallbackRule = errorRule && errorRule.fallback | ||
var flags = hasSticky && !fallbackRule ? 'ym' : 'gm' | ||
var suffix = hasSticky || fallbackRule ? '' : '|' | ||
var combined = new RegExp(reUnion(parts) + suffix, flags) | ||
return {regexp: combined, groups: groups, error: errorRule} | ||
return {regexp: combined, groups: groups, fast: fast, error: errorRule || defaultErrorRule} | ||
} | ||
function compile(rules) { | ||
var result = compileRules(rules) | ||
var result = compileRules(toRules(rules)) | ||
return new Lexer({start: result}, 'start') | ||
} | ||
function checkStateGroup(g, name, map) { | ||
var state = g && (g.push || g.next) | ||
if (state && !map[state]) { | ||
throw new Error("Missing state '" + state + "' (in token '" + g.defaultType + "' of state '" + name + "')") | ||
} | ||
if (g && g.pop && +g.pop !== 1) { | ||
throw new Error("pop must be 1 (in token '" + g.defaultType + "' of state '" + name + "')") | ||
} | ||
} | ||
function compileStates(states, start) { | ||
var all = states.$all ? toRules(states.$all) : [] | ||
delete states.$all | ||
var keys = Object.getOwnPropertyNames(states) | ||
if (!start) start = keys[0] | ||
var ruleMap = Object.create(null) | ||
for (var i = 0; i < keys.length; i++) { | ||
var key = keys[i] | ||
ruleMap[key] = toRules(states[key]).concat(all) | ||
} | ||
for (var i = 0; i < keys.length; i++) { | ||
var key = keys[i] | ||
var rules = ruleMap[key] | ||
var included = Object.create(null) | ||
for (var j = 0; j < rules.length; j++) { | ||
var rule = rules[j] | ||
if (!rule.include) continue | ||
var splice = [j, 1] | ||
if (rule.include !== key && !included[rule.include]) { | ||
included[rule.include] = true | ||
var newRules = ruleMap[rule.include] | ||
if (!newRules) { | ||
throw new Error("Cannot include nonexistent state '" + rule.include + "' (in state '" + key + "')") | ||
} | ||
for (var k = 0; k < newRules.length; k++) { | ||
var newRule = newRules[k] | ||
if (rules.indexOf(newRule) !== -1) continue | ||
splice.push(newRule) | ||
} | ||
} | ||
rules.splice.apply(rules, splice) | ||
j-- | ||
} | ||
} | ||
var map = Object.create(null) | ||
for (var i=0; i<keys.length; i++) { | ||
for (var i = 0; i < keys.length; i++) { | ||
var key = keys[i] | ||
map[key] = compileRules(states[key], true) | ||
map[key] = compileRules(ruleMap[key], true) | ||
} | ||
for (var i=0; i<keys.length; i++) { | ||
var groups = map[keys[i]].groups | ||
for (var j=0; j<groups.length; j++) { | ||
var g = groups[j] | ||
var state = g && (g.push || g.next) | ||
if (state && !map[state]) { | ||
throw new Error("Missing state '" + state + "' (in token '" + g.tokenType + "' of state '" + keys[i] + "')") | ||
} | ||
if (g && g.pop && +g.pop !== 1) { | ||
throw new Error("pop must be 1 (in token '" + g.tokenType + "' of state '" + keys[i] + "')") | ||
} | ||
for (var i = 0; i < keys.length; i++) { | ||
var name = keys[i] | ||
var state = map[name] | ||
var groups = state.groups | ||
for (var j = 0; j < groups.length; j++) { | ||
checkStateGroup(groups[j], name, map) | ||
} | ||
var fastKeys = Object.getOwnPropertyNames(state.fast) | ||
for (var j = 0; j < fastKeys.length; j++) { | ||
checkStateGroup(state.fast[fastKeys[j]], name, map) | ||
} | ||
} | ||
@@ -243,3 +338,3 @@ | ||
var types = Object.getOwnPropertyNames(map) | ||
for (var i=0; i<types.length; i++) { | ||
for (var i = 0; i < types.length; i++) { | ||
var tokenType = types[i] | ||
@@ -261,3 +356,2 @@ var item = map[tokenType] | ||
var source = '' | ||
source += '(function(value) {\n' | ||
source += 'switch (value.length) {\n' | ||
@@ -275,4 +369,3 @@ for (var length in byLength) { | ||
source += '}\n' | ||
source += '})' | ||
return eval(source) // getType | ||
return Function('value', source) // type | ||
} | ||
@@ -295,3 +388,6 @@ | ||
this.col = info ? info.col : 1 | ||
this.queuedToken = info ? info.queuedToken : null | ||
this.queuedThrow = info ? info.queuedThrow : null | ||
this.setState(info ? info.state : this.startState) | ||
this.stack = info && info.stack ? info.stack.slice() : [] | ||
return this | ||
@@ -305,2 +401,5 @@ } | ||
state: this.state, | ||
stack: this.stack.slice(), | ||
queuedToken: this.queuedToken, | ||
queuedThrow: this.queuedThrow, | ||
} | ||
@@ -314,4 +413,5 @@ } | ||
this.groups = info.groups | ||
this.error = info.error || {lineBreaks: true, shouldThrow: true} | ||
this.error = info.error | ||
this.re = info.regexp | ||
this.fast = info.fast | ||
} | ||
@@ -328,6 +428,6 @@ | ||
Lexer.prototype._eat = hasSticky ? function(re) { // assume re is /y | ||
return re.exec(this.buffer) | ||
} : function(re) { // assume re is /g | ||
var match = re.exec(this.buffer) | ||
var eat = hasSticky ? function(re, buffer) { // assume re is /y | ||
return re.exec(buffer) | ||
} : function(re, buffer) { // assume re is /g | ||
var match = re.exec(buffer) | ||
// will always match, since we used the |(?:) trick | ||
@@ -341,13 +441,9 @@ if (match[0].length === 0) { | ||
Lexer.prototype._getGroup = function(match) { | ||
if (match === null) { | ||
return -1 | ||
} | ||
var groupCount = this.groups.length | ||
for (var i = 0; i < groupCount; i++) { | ||
if (match[i + 1] !== undefined) { | ||
return i | ||
return this.groups[i] | ||
} | ||
} | ||
throw new Error('oops') | ||
throw new Error('Cannot find token type for matched text') | ||
} | ||
@@ -360,6 +456,13 @@ | ||
Lexer.prototype.next = function() { | ||
var re = this.re | ||
var index = this.index | ||
// If a fallback token matched, we don't need to re-run the RegExp | ||
if (this.queuedGroup) { | ||
var token = this._token(this.queuedGroup, this.queuedText, index) | ||
this.queuedGroup = null | ||
this.queuedText = "" | ||
return token | ||
} | ||
var buffer = this.buffer | ||
var index = re.lastIndex = this.index | ||
if (index === buffer.length) { | ||
@@ -369,17 +472,34 @@ return // EOF | ||
var match = this._eat(re) | ||
var i = this._getGroup(match) | ||
// Fast matching for single characters | ||
var group = this.fast[buffer.charCodeAt(index)] | ||
if (group) { | ||
return this._token(group, buffer.charAt(index), index) | ||
} | ||
var group, text | ||
if (i === -1) { | ||
group = this.error | ||
// Execute RegExp | ||
var re = this.re | ||
re.lastIndex = index | ||
var match = eat(re, buffer) | ||
// consume rest of buffer | ||
text = buffer.slice(index) | ||
// Error tokens match the remaining buffer | ||
var error = this.error | ||
if (match == null) { | ||
return this._token(error, buffer.slice(index, buffer.length), index) | ||
} | ||
} else { | ||
text = match[0] | ||
group = this.groups[i] | ||
var group = this._getGroup(match) | ||
var text = match[0] | ||
if (error.fallback && match.index !== index) { | ||
this.queuedGroup = group | ||
this.queuedText = text | ||
// Fallback tokens contain the unmatched portion of the buffer | ||
return this._token(error, buffer.slice(index, match.index), index) | ||
} | ||
return this._token(group, text, index) | ||
} | ||
Lexer.prototype._token = function(group, text, offset) { | ||
// count line breaks | ||
@@ -398,7 +518,7 @@ var lineBreaks = 0 | ||
var token = { | ||
type: (group.getType && group.getType(text)) || group.tokenType, | ||
value: group.value ? group.value(text) : text, | ||
type: (typeof group.type === 'function' && group.type(text)) || group.defaultType, | ||
value: typeof group.value === 'function' ? group.value(text) : text, | ||
text: text, | ||
toString: tokenToString, | ||
offset: index, | ||
offset: offset, | ||
lineBreaks: lineBreaks, | ||
@@ -418,2 +538,3 @@ line: this.line, | ||
} | ||
// throw, if no rule with {error: true} | ||
@@ -427,2 +548,3 @@ if (group.shouldThrow) { | ||
else if (group.next) this.setState(group.next) | ||
return token | ||
@@ -451,3 +573,3 @@ } | ||
Lexer.prototype.formatError = function(token, message) { | ||
var value = token.value | ||
var value = token.text | ||
var index = token.offset | ||
@@ -468,13 +590,3 @@ var eol = token.lineBreaks ? value.indexOf('\n') : value.length | ||
Lexer.prototype.has = function(tokenType) { | ||
for (var s in this.states) { | ||
var groups = this.states[s].groups | ||
for (var i=0; i<groups.length; i++) { | ||
var group = groups[i] | ||
if (group.tokenType === tokenType) return true | ||
if (group.keywords && hasOwnProperty.call(group.keywords, tokenType)) { | ||
return true | ||
} | ||
} | ||
} | ||
return false | ||
return true | ||
} | ||
@@ -487,4 +599,6 @@ | ||
error: Object.freeze({error: true}), | ||
fallback: Object.freeze({fallback: true}), | ||
keywords: keywordTransform, | ||
} | ||
})) | ||
})); |
{ | ||
"name": "moo", | ||
"version": "0.4.3", | ||
"version": "0.5.0", | ||
"description": "Optimised tokenizer/lexer generator! 🐄 Much performance. Moo!", | ||
@@ -17,3 +17,2 @@ "main": "moo.js", | ||
"test": "jest .", | ||
"lint": "eslint moo.js", | ||
"benchmark": "benchr test/benchmark.js", | ||
@@ -25,4 +24,3 @@ "moo": "echo 'Mooooo!'" | ||
"chevrotain": "^0.27.1", | ||
"eslint": "^3.17.1", | ||
"jest": "^19.0.2", | ||
"jest": "^23.6.0", | ||
"lex": "^1.7.9", | ||
@@ -29,0 +27,0 @@ "lexing": "^0.8.0", |
@@ -86,5 +86,5 @@ ![](cow.png) | ||
``` | ||
Better: | ||
```js | ||
@@ -140,3 +140,4 @@ let lexer = moo.compile({ | ||
* **`type`**: the name of the group, as passed to compile. | ||
* **`value`**: the match contents. | ||
* **`text`**: the string that was matched. | ||
* **`value`**: the string that was matched, transformed by your `value` function (if any). | ||
* **`offset`**: the number of characters (UTF-16 code units) from the start of the buffer where the match starts. | ||
@@ -148,2 +149,19 @@ * **`lineBreaks`**: the number of line breaks found in the match. (Always zero if this rule has `lineBreaks: false`.) | ||
### Value vs. Text ### | ||
The `value` is the same as the `text`, unless you provide a [value transform](#transform). | ||
```js | ||
const moo = require('moo') | ||
const lexer = moo.compile({ | ||
ws: /[ \t]+/, | ||
string: {match: /"(?:\\["\\]|[^\n"\\])*"/, value: s => s.slice(1, -1)}, | ||
}) | ||
lexer.reset('"test"') | ||
lexer.next() /* { value: 'test', text: '"test"', ... } */ | ||
``` | ||
### Reset ### | ||
@@ -181,9 +199,9 @@ | ||
**Keywords** should be written using the `keywords` attribute. | ||
**Keywords** should be written using the `keywords` transform. | ||
```js | ||
moo.compile({ | ||
IDEN: {match: /[a-zA-Z]+/, keywords: { | ||
KW: ['while', 'if', 'else', 'moo', 'cows']), | ||
}}, | ||
IDEN: {match: /[a-zA-Z]+/, type: moo.keywords({ | ||
KW: ['while', 'if', 'else', 'moo', 'cows'], | ||
})}, | ||
SPACE: {match: /\s+/, lineBreaks: true}, | ||
@@ -216,7 +234,7 @@ }) | ||
let lexer = moo.compile({ | ||
name: {match: /[a-zA-Z]+/, keywords: { | ||
name: {match: /[a-zA-Z]+/, type: moo.keywords({ | ||
'kw-class': 'class', | ||
'kw-def': 'def', | ||
'kw-if': 'if', | ||
}}, | ||
})}, | ||
// ... | ||
@@ -242,4 +260,14 @@ }) | ||
Sometimes you want your lexer to support different states. This is useful for string interpolation, for example: to tokenize `a${{c: d}}e`, you might use: | ||
Moo allows you to define multiple lexer **states**. Each state defines its own separate set of token rules. Your lexer will start off in the first state given to `moo.states({})`. | ||
Rules can be annotated with `next`, `push`, and `pop`, to change the current state after that token is matched. A "stack" of past states is kept, which is used by `push` and `pop`. | ||
* **`next: 'bar'`** moves to the state named `bar`. (The stack is not changed.) | ||
* **`push: 'bar'`** moves to the state named `bar`, and pushes the old state onto the stack. | ||
* **`pop: 1`** removes one state from the top of the stack, and moves to that state. (Only `1` is supported.) | ||
Only rules from the current state can be matched. You need to copy your rule into all the states you want it to be matched in. | ||
For example, to tokenize JS-style string interpolation such as `a${{c: d}}e`, you might use: | ||
```js | ||
@@ -266,17 +294,11 @@ let lexer = moo.states({ | ||
It's also nice to let states inherit rules from other states and be able to count things, e.g. the interpolated expression state needs a `}` rule that can tell if it's a closing brace or the end of the interpolation, but is otherwise identical to the normal expression state. | ||
The `rbrace` rule is annotated with `pop`, so it moves from the `main` state into either `lit` or `main`, depending on the stack. | ||
To support this, Moo allows annotating tokens with `push`, `pop` and `next`: | ||
* **`push`** moves the lexer to a new state, and pushes the old state onto the stack. | ||
* **`pop`** returns to a previous state, by removing one or more states from the stack. | ||
* **`next`** moves to a new state, but does not affect the stack. | ||
Errors | ||
------ | ||
If no token matches, Moo will throw an Error. | ||
If none of your rules match, Moo will throw an Error, since it doesn't know what else to do. | ||
If you'd rather treat errors as just another kind of token, you can ask Moo to do so. | ||
If you prefer, you can have Moo return an error token instead of throwing an exception. The error token will contain the whole of the rest of the buffer. | ||
@@ -288,8 +310,8 @@ ```js | ||
}) | ||
moo.reset('invalid') | ||
moo.next() // -> { type: 'myError', value: 'invalid' } | ||
moo.next() // -> { type: 'myError', value: 'invalid', text: 'invalid', offset: 0, lineBreaks: 0, line: 1, col: 1 } | ||
moo.next() // -> undefined | ||
``` | ||
You can have a token type that both matches tokens _and_ contains error values. | ||
@@ -304,2 +326,4 @@ | ||
### Formatting errors ### | ||
If you want to throw an error from your parser, you might find `formatError` helpful. Call it with the offending token: | ||
@@ -311,3 +335,3 @@ | ||
And it returns a string with a pretty error message. | ||
It returns a string with a pretty error message. | ||
@@ -314,0 +338,0 @@ ``` |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
31117
7
500
386