Comparing version 0.3.3 to 0.4.0
253
moo.js
@@ -13,2 +13,4 @@ (function(root, factory) { | ||
var hasOwnProperty = Object.prototype.hasOwnProperty | ||
// polyfill assign(), so we support IE9+ | ||
var assign = typeof Object.assign === 'function' ? Object.assign : | ||
@@ -37,5 +39,7 @@ // https://tc39.github.io/ecma262/#sec-object.assign | ||
/***************************************************************************/ | ||
// True if `o` is a RegExp; falsy inputs are passed through unchanged.
function isRegExp(o) {
  if (!o) return o
  return o.constructor === RegExp
}
// True only for plain-ish objects: excludes RegExps and Arrays.
// Falsy inputs are passed through unchanged.
function isObject(o) {
  if (!o) return o
  if (typeof o !== 'object') return false
  return o.constructor !== RegExp && !Array.isArray(o)
}
function reEscape(s) { | ||
@@ -58,3 +62,2 @@ return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&') | ||
function compareLength(a, b) { | ||
@@ -86,3 +89,15 @@ return b.length - a.length | ||
var key = keys[i] | ||
result.push(ruleOptions(key, object[key])) | ||
var thing = object[key] | ||
var rules = Array.isArray(thing) ? thing : [thing] | ||
var match = [] | ||
rules.forEach(function(rule) { | ||
if (isObject(rule)) { | ||
if (match.length) result.push(ruleOptions(key, match)) | ||
result.push(ruleOptions(key, rule)) | ||
match = [] | ||
} else { | ||
match.push(rule) | ||
} | ||
}) | ||
if (match.length) result.push(ruleOptions(key, match)) | ||
} | ||
@@ -117,4 +132,5 @@ return result | ||
error: false, | ||
value: null, | ||
getType: null, | ||
}, obj) | ||
options.keywords = null | ||
@@ -124,57 +140,15 @@ // convert to array | ||
options.match = Array.isArray(match) ? match : match ? [match] : [] | ||
options.match.sort(function(a, b) { | ||
return isRegExp(a) && isRegExp(b) ? 0 | ||
: isRegExp(b) ? -1 : isRegExp(a) ? +1 : b.length - a.length | ||
}) | ||
if (options.keywords) { | ||
options.getType = keywordTransform(options.keywords) | ||
} | ||
return options | ||
} | ||
// Normalise each rule's match list for longest-match lexing:
// literals are sorted longest-first, plain regexps are appended after them,
// and every capturing regexp is split out into its own rule.
function sortRules(rules) {
  var result = []
  rules.forEach(function(options) {
    var literals = []
    var plainPatterns = []
    var capturing = []
    options.match.forEach(function(m) {
      if (!isRegExp(m)) {
        literals.push(m)
      } else if (reGroups(m.source) > 0) {
        capturing.push(m)
      } else {
        plainPatterns.push(m)
      }
    })
    // sort literals by length to ensure longest match; regexps go at the end
    options.match = literals.sort(compareLength).concat(plainPatterns)
    result.push(options)
    // each capturing regexp becomes a separate rule with the same options
    capturing.forEach(function(pat) {
      result.push(assign({}, options, {
        match: [pat],
      }))
    })
  })
  return result
}
// Return the first rule in `otherRules` with a regexp that matches the
// WHOLE of `literal` (e.g. an identifier rule that would swallow a keyword).
// Returns undefined when no rule matches.
function getIdentifier(literal, otherRules) {
  for (var r = 0; r < otherRules.length; r++) {
    var rule = otherRules[r]
    var patterns = rule.match
    for (var p = 0; p < patterns.length; p++) {
      var pat = patterns[p]
      if (!isRegExp(pat)) continue
      var result = pat.exec(literal)
      if (result !== null && result[0] === literal) {
        return rule
      }
    }
  }
}
function compileRules(rules, hasStates) { | ||
rules = Array.isArray(rules) ? arrayToRules(rules) : objectToRules(rules) | ||
rules = sortRules(rules) | ||
var errorRule = null | ||
@@ -193,22 +167,2 @@ var groups = [] | ||
// look for keywords | ||
var match = options.match | ||
var notKeywords = [] | ||
for (var j=0; j<match.length; j++) { | ||
var word = match[j] | ||
if (typeof word === 'string') { | ||
// does it match an existing rule (e.g. identifier?) | ||
var other = getIdentifier(word, rules) | ||
if (other) { | ||
if (!other.keywords) { | ||
other.keywords = Object.create(null) | ||
} | ||
other.keywords[word] = options | ||
continue | ||
} | ||
} | ||
notKeywords.push(word) | ||
} | ||
options.match = notKeywords | ||
// skip rules with no match | ||
@@ -229,4 +183,4 @@ if (options.match.length === 0) { | ||
var groupCount = reGroups(pat) | ||
if (groupCount > 1) { | ||
throw new Error("RegExp has more than one capture group: " + regexp) | ||
if (groupCount > 0) { | ||
throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: β¦ ) instead") | ||
} | ||
@@ -243,5 +197,3 @@ if (!hasStates && (options.pop || options.push || options.next)) { | ||
// store regex | ||
var isCapture = !!groupCount | ||
if (!isCapture) pat = reCapture(pat) | ||
parts.push(pat) | ||
parts.push(reCapture(pat)) | ||
} | ||
@@ -274,3 +226,3 @@ | ||
for (var j=0; j<groups.length; j++) { | ||
var g = groups[i] | ||
var g = groups[j] | ||
var state = g && (g.push || g.next) | ||
@@ -280,2 +232,6 @@ if (state && !map[state]) { | ||
} | ||
var pop = g && g.pop | ||
if (pop && typeof pop !== 'boolean') { | ||
throw new Error("pop must be true (in token '" + g.tokenType + "' of state '" + keys[i] + "')") | ||
} | ||
} | ||
@@ -287,3 +243,42 @@ } | ||
/**
 * Compile a keyword map into a fast lookup function.
 *
 * `map` maps token type names to a keyword (string) or an array of keywords.
 * Returns `getType(value)`, which returns the token type for a known keyword
 * and undefined otherwise.
 *
 * The lookup is code-generated as a switch on string length, then on the
 * string itself, which benchmarks faster than plain property lookup:
 * https://jsperf.com/string-lookups
 */
function keywordTransform(map) {
  var reverseMap = Object.create(null)  // keyword -> token type
  var byLength = Object.create(null)    // keyword length -> [keywords]
  var types = Object.getOwnPropertyNames(map)
  for (var i = 0; i < types.length; i++) {
    var tokenType = types[i]
    var item = map[tokenType]
    var keywordList = Array.isArray(item) ? item : [item]
    keywordList.forEach(function(keyword) {
      (byLength[keyword.length] = byLength[keyword.length] || []).push(keyword)
      reverseMap[keyword] = tokenType
    })
  }

  // fast string lookup
  // https://jsperf.com/string-lookups
  function str(x) { return JSON.stringify(x) }
  var source = ''
  source += '(function(value) {\n'
  source += 'switch (value.length) {\n'
  for (var length in byLength) {
    var keywords = byLength[length]
    source += 'case ' + length + ':\n'
    source += 'switch (value) {\n'
    keywords.forEach(function(keyword) {
      var tokenType = reverseMap[keyword]
      if (typeof tokenType !== 'string') {
        // bug fix: previously concatenated the undefined variable `name`,
        // which itself raised a ReferenceError instead of this Error
        throw new Error('keyword type must be string: ' + tokenType)
      }
      source += 'case ' + str(keyword) + ': return ' + str(tokenType) + '\n'
    })
    source += '}\n'
  }
  source += '}\n'
  source += '})'
  return eval(source) // getType
}
/***************************************************************************/ | ||
var Lexer = function(states, state) { | ||
@@ -297,2 +292,19 @@ this.startState = state | ||
// Begin lexing `data`, optionally restoring position and lexer state from
// an `info` snapshot produced by save(). Returns `this` for chaining.
Lexer.prototype.reset = function(data, info) {
  this.buffer = data || ''
  this.index = 0
  if (info) {
    this.line = info.line
    this.col = info.col
    this.setState(info.state)
  } else {
    this.line = 1
    this.col = 1
    this.setState(this.startState)
  }
  return this
}
// Snapshot enough state to resume later via reset(data, info).
Lexer.prototype.save = function() {
  var snapshot = {
    line: this.line,
    col: this.col,
    state: this.state,
  }
  return snapshot
}
Lexer.prototype.setState = function(state) { | ||
@@ -303,3 +315,3 @@ if (!state || this.state === state) return | ||
this.groups = info.groups | ||
this.error = info.error | ||
this.error = info.error || {lineBreaks: true, shouldThrow: true} | ||
this.re = info.regexp | ||
@@ -317,3 +329,3 @@ } | ||
Lexer.prototype.eat = hasSticky ? function(re) { // assume re is /y | ||
Lexer.prototype._eat = hasSticky ? function(re) { // assume re is /y | ||
return re.exec(this.buffer) | ||
@@ -329,4 +341,18 @@ } : function(re) { // assume re is /g | ||
// Given a regexp exec() result (or null), return the index of the capture
// group that matched, or -1 when there was no match at all.
// Exactly one group is expected to fire; anything else is an internal error.
Lexer.prototype._getGroup = function(match) {
  if (match === null) return -1

  var total = this.groups.length
  for (var index = 0; index < total; index++) {
    if (match[index + 1] !== undefined) {
      return index
    }
  }
  throw new Error('oops')
}
function tokenToString() { | ||
return this.value || this.type | ||
return this.value | ||
} | ||
@@ -343,27 +369,15 @@ | ||
var match = this.eat(re) | ||
var group, value, text | ||
if (match === null) { | ||
var match = this._eat(re) | ||
var i = this._getGroup(match) | ||
var group, value | ||
if (i === -1) { | ||
group = this.error | ||
// consume rest of buffer | ||
text = value = buffer.slice(index) | ||
value = buffer.slice(index) | ||
} else { | ||
text = match[0] | ||
var groups = this.groups | ||
for (var i = 0; i < groups.length; i++) { | ||
value = match[i + 1] | ||
if (value !== undefined) { | ||
group = groups[i] | ||
// TODO is `buffer` being leaked here? | ||
break | ||
} | ||
} | ||
// assert(i < groupCount) | ||
// check for keywords | ||
if (group.keywords) { | ||
group = group.keywords[text] || group | ||
} | ||
value = match[0] // i+1 | ||
group = this.groups[i] | ||
} | ||
@@ -373,16 +387,16 @@ | ||
var lineBreaks = 0 | ||
if (!group || group.lineBreaks) { | ||
if (group.lineBreaks) { | ||
var matchNL = /\n/g | ||
var nl = 1 | ||
if (text === '\n') { | ||
if (value === '\n') { | ||
lineBreaks = 1 | ||
} else { | ||
while (matchNL.exec(text)) { lineBreaks++; nl = matchNL.lastIndex } | ||
while (matchNL.exec(value)) { lineBreaks++; nl = matchNL.lastIndex } | ||
} | ||
} | ||
var size = text.length | ||
var size = value.length | ||
var token = { | ||
type: group && group.tokenType, | ||
value: value, | ||
type: (group.getType && group.getType(value)) || group.tokenType, | ||
value: (group.value && group.value(value)) || value, | ||
toString: tokenToString, | ||
@@ -404,3 +418,3 @@ offset: index, | ||
// throw, if no rule with {error: true} | ||
if (!group) { | ||
if (group.shouldThrow) { | ||
throw new Error(this.formatError(token, "invalid syntax")) | ||
@@ -425,2 +439,6 @@ } | ||
LexerIterator.prototype[Symbol.iterator] = function() { | ||
return this | ||
} | ||
Lexer.prototype[Symbol.iterator] = function() { | ||
@@ -443,19 +461,2 @@ return new LexerIterator(this) | ||
Lexer.prototype.reset = function(data, info) { | ||
this.buffer = data || '' | ||
this.index = 0 | ||
this.line = info ? info.line : 1 | ||
this.col = info ? info.col : 1 | ||
this.setState(info ? info.state : this.startState) | ||
return this | ||
} | ||
Lexer.prototype.save = function() { | ||
return { | ||
line: this.line, | ||
col: this.col, | ||
state: this.state, | ||
} | ||
} | ||
Lexer.prototype.clone = function() { | ||
@@ -462,0 +463,0 @@ return new Lexer(this.states, this.state) |
{ | ||
"name": "moo", | ||
"version": "0.3.3", | ||
"version": "0.4.0", | ||
"description": "Optimised tokenizer/lexer generator! π Uses /y for performance. Moo!", | ||
@@ -5,0 +5,0 @@ "main": "moo.js", |
@@ -47,4 +47,4 @@ ![](cow.png) | ||
comment: /\/\/.*?$/, | ||
number: /(0|[1-9][0-9]*)/, | ||
string: /"((?:\\["\\]|[^\n"\\])*)"/, | ||
number: /0|[1-9][0-9]*/, | ||
string: /"(?:\\["\\]|[^\n"\\])*"/, | ||
lparen: '(', | ||
@@ -80,3 +80,3 @@ rparen: ')', | ||
let lexer = moo.compile({ | ||
string: /"(.*)"/, // greedy quantifier * | ||
string: /".*"/, // greedy quantifier * | ||
// ... | ||
@@ -93,3 +93,3 @@ }) | ||
let lexer = moo.compile({ | ||
string: /"(.*?)"/, // non-greedy quantifier *? | ||
string: /".*?"/, // non-greedy quantifier *? | ||
// ... | ||
@@ -118,7 +118,5 @@ }) | ||
(Note: moo [special-cases keywords](#keywords); in which case order is ignored.) | ||
* Moo uses **multiline RegExps**. This has a few quirks: for example, the **dot `/./` doesn't include newlines**. Use `[^]` instead if you want to match newlines too. | ||
* Since excluding capture groups like `/[^ ]/` (no spaces) _will_ include newlines, you have to be careful not to include them by accident! In particular, the whitespace metacharacter `\s` includes newlines. | ||
* Since an excluding character range like `/[^ ]/` (which matches anything but a space) _will_ include newlines, you have to be careful not to include them by accident! In particular, the whitespace metacharacter `\s` includes newlines.
@@ -136,4 +134,3 @@ | ||
* **`type`**: the name of the group, as passed to compile. | ||
* **`value`**: the contents of the capturing group (or the whole match, if the token RegExp doesn't define a capture). | ||
* **`size`**: the total length of the match (`value` may be shorter if you have capturing groups). | ||
* **`value`**: the match contents. | ||
* **`offset`**: the number of bytes from the start of the buffer where the match starts. | ||
@@ -165,3 +162,3 @@ * **`lineBreaks`**: the number of line breaks found in the match. (Always zero if this rule has `lineBreaks: false`.) | ||
Moo makes it convenient to define literals and keywords. | ||
Moo makes it convenient to define literals. | ||
@@ -178,26 +175,50 @@ ```js | ||
Important! **Always write your literals like this:** | ||
**Keywords** should be written using the `keywords` attribute. | ||
```js | ||
['while', 'if', 'else', 'moo', 'cows'] | ||
moo.compile({ | ||
IDEN: {match: /[a-zA-Z]+/, keywords: { | ||
KW: ['while', 'if', 'else', 'moo', 'cows'],
}}, | ||
SPACE: {match: /\s+/, lineBreaks: true}, | ||
}) | ||
``` | ||
And **not** like this: | ||
You need to do this to ensure the **longest match** principle applies, even in edge cases. | ||
Imagine trying to parse the input `className` with the following rules: | ||
```js | ||
/while|if|else|moo|cows/ | ||
['keyword', ['class']], | ||
['identifier', /[a-zA-Z]+/], | ||
``` | ||
### Why? ### | ||
You'll get _two_ tokens — `['class', 'Name']` — which is _not_ what you want! If you swap the order of the rules, you'll fix this example; but now you'll lex `class` wrong (as an `identifier`).
The reason: Moo special-cases keywords to ensure the **longest match** principle applies, even in edge cases. | ||
The keywords helper checks matches against the list of keywords; if any of them match, it uses the type `'keyword'` instead of `'identifier'` (for this example). | ||
Imagine trying to parse the input `className` with the following rules: | ||
Keywords can also have **individual types**. | ||
['keyword', ['class']], | ||
['identifier', /[a-zA-Z]+/], | ||
```js | ||
let lexer = moo.compile({ | ||
name: {match: /[a-zA-Z]+/, keywords: { | ||
'kw-class': 'class', | ||
'kw-def': 'def', | ||
'kw-if': 'if', | ||
}}, | ||
// ... | ||
}) | ||
lexer.reset('def foo') | ||
lexer.next() // -> { type: 'kw-def', value: 'def' } | ||
lexer.next() // space | ||
lexer.next() // -> { type: 'name', value: 'foo' } | ||
``` | ||
You'll get _two_ tokens — `['class', 'Name']` — which is _not_ what you want! If you swap the order of the rules, you'll fix this example; but now you'll lex `class` wrong (as an `identifier`).
Use [itt](https://github.com/nathan/itt)'s iterator adapters to make constructing keyword objects easier: | ||
Moo solves this by checking to see if any of your literals can be matched by one of your other rules; if so, it doesn't lex the keyword separately, but instead handles it at a later stage (by checking identifiers against a list of keywords). | ||
```js | ||
itt(['class', 'def', 'if']) | ||
.map(k => ['kw-' + k, k]) | ||
.toObject() | ||
``` | ||
@@ -216,3 +237,3 @@ | ||
lbrace: {match: '{', push: 'main'}, | ||
rbrace: {match: '}', pop: 1}, | ||
rbrace: {match: '}', pop: true}, | ||
colon: ':', | ||
@@ -224,3 +245,3 @@ space: {match: /\s+/, lineBreaks: true}, | ||
escape: /\\./, | ||
strend: {match: '`', pop: 1}, | ||
strend: {match: '`', pop: true}, | ||
const: {match: /(?:[^$`]|\$(?!\{))+/, lineBreaks: true}, | ||
@@ -311,2 +332,19 @@ }, | ||
Transform | ||
--------- | ||
Moo doesn't allow capturing groups, but you can supply a transform function, `value()`, which will be called on the value before storing it in the Token object. | ||
```js | ||
moo.compile({ | ||
STRING: [ | ||
{match: /"""[^]*?"""/, lineBreaks: true, value: x => x.slice(3, -3)}, | ||
{match: /"(?:\\["\\rn]|[^"\\])*?"/, lineBreaks: true, value: x => x.slice(1, -1)}, | ||
{match: /'(?:\\['\\rn]|[^'\\])*?'/, lineBreaks: true, value: x => x.slice(1, -1)}, | ||
], | ||
// ... | ||
}) | ||
``` | ||
Contributing | ||
@@ -313,0 +351,0 @@ ------------ |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
25704
4
403
345