@desertnet/scanner - npm Package Compare versions

Comparing version 1.1.0 to 2.0.0

package.json

```diff
 {
   "name": "@desertnet/scanner",
-  "version": "1.1.0",
+  "version": "2.0.0",
   "description": "A regex-based string scanner/tokenizer",
-  "main": "index.js",
+  "main": "dist/index.js",
   "scripts": {
-    "test": "mocha --compilers js:babel-register",
-    "build": "rimraf ./dist && babel -d ./dist/es5 -s inline ./lib",
-    "preversion": "npm test",
-    "prepublish": "npm run build"
+    "build": "rimraf dist && babel -s inline -D src -d dist",
+    "clean": "rimraf dist coverage .nyc_output",
+    "lint": "eslint src test",
+    "prepare": "npm run build",
+    "prepublishOnly": "npm test",
+    "preversion": "npm test && npm run lint",
+    "test": "cross-env NODE_ENV=test mocha --require @babel/register --throw-deprecation",
+    "test:coverage": "cross-env NODE_ENV=test nyc mocha",
+    "test:watch": "chokidar src test mock fixtures --initial -c 'npm t'"
   },
@@ -27,9 +32,24 @@ "repository": {
   "devDependencies": {
-    "babel-cli": "^6.11.4",
-    "babel-preset-es2015": "^6.13.2",
-    "babel-register": "^6.11.6",
-    "chai": "^3.5.0",
-    "mocha": "^3.0.2",
-    "rimraf": "^2.5.4"
+    "@babel/cli": "^7.0.0-beta.54",
+    "@babel/core": "^7.0.0-beta.54",
+    "@babel/preset-env": "^7.0.0-beta.54",
+    "@babel/register": "^7.0.0-beta.54",
+    "babel-eslint": "^8.2.6",
+    "babel-plugin-istanbul": "^4.1.6",
+    "chai": "^4.1.2",
+    "chokidar-cli": "^1.2.0",
+    "cross-env": "^5.2.0",
+    "eslint": "^5.2.0",
+    "mocha": "^5.2.0",
+    "nyc": "^12.0.2",
+    "rimraf": "^2.6.2",
+    "sinon": "^6.1.4",
+    "sinon-chai": "^3.2.0"
   },
+  "dependencies": {
+    "lodash.isplainobject": "^4.0.6",
+    "lodash.isstring": "^4.0.1",
+    "lodash.maxby": "^4.6.0",
+    "node-interval-tree": "^1.3.3"
+  }
 }
```

````diff
@@ -1,150 +1,186 @@
-# scanner
+# @desertnet/scanner

-A regex-based string scanner/tokenizer for JavaScript
+A lexical analyzer for JavaScript.
+
+## Compatibility
+
+Version 2 is a complete rewrite with a new API, and requires a number of ES6 features. If you need compatibility with older browsers, the [version 1 API](https://github.com/desertnet/scanner/tree/v1) is still supported.

 ## Installation

 ```shell
-npm install --save @desertnet/scanner
+npm install @desertnet/scanner
 ```

 ## Usage

 ### An Example

 Let’s say you want to design a language for embedding things like images or videos in arbitrary text, using tags that look something like `[img-1]`. We can use this package to find those embedded tags.

-```javascript
-const Scanner = require('@desertnet/scanner')
-
-const body = `Hi! [img-1]
-[img-2]
-Bye[!]`
-
-const embedTagScanner = new Scanner([
-  {'tag': /\[\w+-\d+\]/},
-  {'text': /[^\[]+/},
-  {'bracket': /\[/}
-])
-
-let token
-embedTagScanner.setSource(body)
-while (token = embedTagScanner.nextToken()) {
-  console.log(token)
-}
-```
-
-Outputs:
-
-```
-Token { type: 'text', value: 'Hi! ', index: 0, line: 1, column: 1 }
-Token { type: 'tag', value: '[img-1]', index: 4, line: 1, column: 5 }
-Token { type: 'text', value: '\n', index: 11, line: 2, column: 12 }
-Token { type: 'tag', value: '[img-2]', index: 12, line: 2, column: 1 }
-Token { type: 'text', value: '\nBye', index: 19, line: 3, column: 8 }
-Token { type: 'bracket', value: '[', index: 23, line: 3, column: 4 }
-Token { type: 'text', value: '!]', index: 24, line: 3, column: 5 }
-```
+```javascript
+import {createDialect, BufferedRegExpScanner} from '@desertnet/scanner'
+
+const input = `Hi! [img-1]
+[img-2]
+Bye[!]`
+```
+
+First, declare the types of tokens you want to recognize.
+
+```javascript
+const tag = Symbol('tag')
+const text = Symbol('text')
+```
+
+Describe your language using regular expressions. `String.raw` can help you avoid needing to escape backslashes.
+
+```javascript
+const r = String.raw
+const embedTagLang = createDialect(
+  [ tag, r`\[\w+-\d+\]` ],
+  [ text, r`[^\[]+` ],
+  [ text, r`\[` ],
+)
+```
+
+Create a scanner to read the input string.
+
+```javascript
+const embedTagScanner = new BufferedRegExpScanner(input)
+```
+
+Iterate over the tokens and do something with them.
+
+```javascript
+for (const token of embedTagScanner.generateTokensUsingDialect(embedTagLang)) {
+  const {type, start, end, line, column, value} = token
+  console.log({type, start, end, line, column, value})
+}
+```
+
+Here’s what that outputs.
+
+```
+{ type: Symbol(text), start: 0, end: 4, line: 1, column: 1, value: 'Hi! ' }
+{ type: Symbol(tag), start: 4, end: 11, line: 1, column: 5, value: '[img-1]' }
+{ type: Symbol(text), start: 11, end: 12, line: 1, column: 12, value: '\n' }
+{ type: Symbol(tag), start: 12, end: 19, line: 2, column: 1, value: '[img-2]' }
+{ type: Symbol(text), start: 19, end: 23, line: 2, column: 8, value: '\nBye' }
+{ type: Symbol(text), start: 23, end: 24, line: 3, column: 4, value: '[' }
+{ type: Symbol(text), start: 24, end: 26, line: 3, column: 5, value: '!]' }
+```

-### Dialects
-
-In the context of this module, a **dialect** is an ordered mapping of a token-type to a regular expression that describes the token. In the above example you will see a `Scanner` object instantiated with an array of objects, with each object containing a single `RegExp` property. This pattern allows you to succinctly define a dialect for your scanner.
-
-We call a dialect an ordered mapping because the order in which the token types are defined is important. The scanner will attempt to match the input string against the first regex in the dialect. If that fails, it will go on to the next one in the list. Once a match is made, a token with the matching type is generated. No attempts to match any remaining regexes in the dialect are undertaken.
-
-It is generally a very good idea to ensure that your dialect accepts all inputs. If you do not, then an input string could cause the scanner to throw an "unexpected character" error. Notice in the example how the `text` token description matches everything that is not a `[` character, and the last token description matches a lone `[` character. This ensures that all inputs will generate a token. An alternate way of doing this would be to end your dialect with the following token description to match any single character:
-
-```javascript
-{'catchall': /[^]/} // This is the same as `qr/./s` in Perl
-```
-
-### Multi-Dialect Scanners
-
-Often a language is simply not describable using a single dialect. For example, HTML often contains inline CSS. Or JavaDoc annotations inside Java comments. Even something simple like HTML attribute values denoted by `"` and HTML attribute values denoted by `'` may necessitate distinct dialect definitions.
-
-You can create a multi-dialect scanner by passing an object to the `Scanner` constructor. This object should be a mapping of dialect names to dialect definitions. For example, here is the start of a rather stripped down HTML multi-dialect scanner:
-
-```javascript
-const htmlScanner = new Scanner({
-  // Starting dialect, for content "outside of a tag".
-  "content": [
-    {"text": /[^<>]+/},
-    {"tagStart": /<[a-z][^\t\n\ \/\>]*/i},
-    {"closeTagStart": /<\/[a-z][^\t\n\ \/\>]*/i},
-    {"error": /[<>]/}
-  ],
-
-  // Dialect for the inside of tags.
-  "tag": [
-    {"tagEnd": />/},
-    {"whitespace": /\s+/},
-    {"selfClose": /\//},
-    {"error": /['"<=]/},
-    {"attributeStart": /[^>=\s\/]+/i}
-  ],
-
-  // Initial dialect for attributes.
-  "attribute": [
-    {"whitespace": /\s+/},
-    {"attributeValueQuotedStart": /=['"]/},
-    {"attributeValueStart": /=/},
-    {"tagEnd": />/},
-    {"selfClose": /\//},
-    {"error": /['"<]/},
-  ],
-
-  // Dialect for closing tags.
-  "closeTag": [
-    {"tagEnd": />/},
-    {"whitespace": /\s+/},
-    {"error": /[^\s>]+/}
-  ],
-
-  // ...
-})
-```
-
-Once you have a multi-dialect scanner you must call `scanner.pushDialect(dialectName)` to set the initial dialect. While you tokenize your string, you can call `.pushDialect()` and `.popDialect()` whenever your input changes contexts to a different dialect. (Alternately you may call `.setDialect()` directly, but do not mix this with using the scanner's dialect stack.)
+## API
+
+### new BufferedRegExpScanner(inputString)
+
+A `Scanner` subclass that scans a string using JavaScript `RegExp` patterns. “Buffered” refers to the fact that the entire input must be buffered into a string before scanning can start. It operates on the principle of “maximal munch”: if more than one possible `TokenDefinition` matches, the longest match is the token that is produced. In the event of a tie, the first `TokenDefinition` passed to `Dialect` is chosen amongst the longest matches.
+
+Supported `TokenDefinition` `flags`:
+
+- `ignoreCase`: When set to `true`, acts like the `i` flag for regular expressions, causing the pattern to be case insensitive.
+
+There are no specific properties for this `Scanner` subclass; see `Scanner` for how to use it to extract tokens from `inputString`.
+
+### createDialect(...tokenDefinitions)
+
+This is a convenience function for creating a `Dialect` object with `TokenDefinition` objects. You pass it arrays of arguments to the `TokenDefinition` constructor, and a new `Dialect` object will be returned with those definitions.
+
+### new Dialect(tokenDefinitions)
+
+A `Dialect` object is an ordered collection of `TokenDefinition` objects, passed as an array to this constructor. A dialect may be a complete language definition, or it may be a subset of a language.
+
+#### dialect.tokenDefinitions
+
+An array of `TokenDefinition` objects that define the dialect.
````
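To make the maximal-munch and tie-breaking rules above concrete, here is a small illustrative sketch. It is not taken from the package's README: the token names and patterns are invented, and it assumes the `createDialect` and `BufferedRegExpScanner` exports documented above.

```javascript
import {createDialect, BufferedRegExpScanner} from '@desertnet/scanner'

// Hypothetical token types, invented for this sketch.
const keyword = Symbol('keyword')
const word = Symbol('word')
const space = Symbol('space')

// `keyword` is listed before `word`, so on a tie for length the keyword wins.
const lang = createDialect(
  [ keyword, 'for' ],
  [ word, '[a-z]+' ],
  [ space, '\\s+' ],
)

const scanner = new BufferedRegExpScanner('for forest')
for (const token of scanner.generateTokensUsingDialect(lang)) {
  console.log(token.type, JSON.stringify(token.value))
}
// Expected, per the maximal-munch description above:
//   Symbol(keyword) "for"     (tie between keyword and word resolved by order)
//   Symbol(space)   " "
//   Symbol(word)    "forest"  (longer match beats the shorter keyword match)
```

The same dialect could be written with `new Dialect(...)` and explicit `TokenDefinition` objects; a sketch of that form follows the `TokenDefinition` section below.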
```diff
-## API
-
-### new Scanner(dialectDefinition)
-
-Constructs a new `Scanner` object with the given dialect definition(s). If `dialectDefinition` is an Array, then it is expected that the array contains a list of token descriptors, which is an object containing a single property that maps a token-type to a `RegExp`. If it is an object, then it is expected that each property maps a dialect name to an array of token descriptors. See the above definition of Dialects and Multi-Dialect scanners.
-
-#### .setSource(input)
-
-Sets the input string for the scanner. If you are reusing a scanner instance, calling this method will reset the scanner to the beginning of the new string.
-
-#### .currentDialect()
-
-Returns a string which is the name of the current dialect. For single-dialect scanners this will always be `'main'`.
-
-#### .setDialect(name)
-
-Sets the current dialect to the dialect specified by the string `name`. You do not need to call this for single-dialect scanners. For multi-dialect scanners, you are probably better off using the methods for managing the scanner's dialect stack.
-
-#### .pushDialect(name)
-
-Pushes the dialect specified by name onto the scanner's dialect stack, and sets it as the current dialect.
-
-#### .popDialect()
-
-Pops the topmost dialect from the scanner's dialect stack, and sets the current dialect to the dialect that is now at the top of the stack.
+### Scanner
+
+The `Scanner` class should not be instantiated directly. Instead, instantiate a subclass like `BufferedRegExpScanner`.
+
+#### scanner.generateTokensUsingDialect(dialect)
+
+Returns a JavaScript `Generator` that yields `Token`s based on the passed `Dialect`. You can use this as the `Iterable` in a `for...of` loop.
+
+In the event that no `TokenDefinition` in `dialect` matches, the generator will produce a final `Token` with a `type` property equal to the `UnexpectedCharacter` symbol and containing the `start` and `end` of the character. The `UnexpectedCharacter` symbol is exported by `@desertnet/scanner`. When this token is produced, the `position` property of the scanner will not be updated.
+
+#### scanner.lineNumberForOffset(offset)
+
+Returns the line number for the passed `offset` of the input string.
+
+#### scanner.columnNumberForOffset(offset)
+
+Returns the column number for the passed `offset` of the input string.
+
+#### scanner.determineNextTokenUsingDialect(dialect)
+
+This method should never be invoked by anything other than the `Scanner` class. However, if you are making a subclass of `Scanner` you must implement this method. The expected return value is an array with two values: a token type identifier (the value of `token.type`), and the offset within the input string where the token ends (the value of `token.end`). If the end of the input is reached, it should return an array with the first value being that of the `EOF` symbol exported by `@desertnet/scanner`. If no token can be matched, `undefined` should be returned.
```
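The contract above is what a custom `Scanner` subclass has to satisfy. The following is a minimal, hypothetical sketch (not part of the package) that treats every `TokenDefinition` pattern as a literal string. It assumes the documented `[typeIdentifier, endOffset]` return convention, that `Scanner` and the `EOF` symbol are importable from the package root, and the `subject`, `position`, and `dialect.tokenDefinitions` properties described in this README.

```javascript
import {Scanner, EOF} from '@desertnet/scanner'

// Hypothetical subclass for illustration only. It matches TokenDefinition
// patterns as literal strings and takes the first match rather than the
// longest, unlike BufferedRegExpScanner's maximal-munch behaviour.
class LiteralStringScanner extends Scanner {
  determineNextTokenUsingDialect (dialect) {
    if (this.position >= this.subject.length) {
      return [EOF, this.position]  // end of input reached
    }
    for (const definition of dialect.tokenDefinitions) {
      if (this.subject.startsWith(definition.pattern, this.position)) {
        // [token type identifier, offset where the token ends]
        return [definition.identifier, this.position + definition.pattern.length]
      }
    }
    return undefined  // no match: the generator yields an UnexpectedCharacter token
  }
}
```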
```diff
-#### .nextToken([expectedTokens])
-
-Returns a `Token` object for the next matching token in the source string. To scan a string, you keep calling this method until it returns `null`, which indicates the entire string has been scanned. This method can throw an error if the dialect cannot match the next token, so be sure to define your dialects to cover all inputs.
-
-The optional `expectedTokens` parameter is an array of token type names for the current dialect. Use this if it makes sense to only scan for a subset of token types. However, if you find yourself doing this, it may be better to switch to a multi-dialect scanner.
+#### scanner.subject
+
+The input string.
+
+#### scanner.position
+
+The offset into the input string that the scanner is currently evaluating.
```
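Putting the pieces above together, here is a hedged sketch of error reporting with the 2.0.0 API. The dialect and input are invented for illustration, and it assumes `UnexpectedCharacter` is importable from the package root as stated above.

```javascript
import {createDialect, BufferedRegExpScanner, UnexpectedCharacter} from '@desertnet/scanner'

// Invented dialect that only accepts digits and newlines.
const digits = Symbol('digits')
const newline = Symbol('newline')
const numbersOnly = createDialect(
  [ digits, '[0-9]+' ],
  [ newline, '\\n' ],
)

const scanner = new BufferedRegExpScanner('123\n456\noops')
for (const token of scanner.generateTokensUsingDialect(numbersOnly)) {
  if (token.type === UnexpectedCharacter) {
    // Turn the token's start offset into a 1-based line/column position.
    const line = scanner.lineNumberForOffset(token.start)
    const column = scanner.columnNumberForOffset(token.start)
    console.error(`Unexpected character at line ${line}, column ${column}`)
    break  // this is the final token; scanner.position is not advanced past it
  }
  console.log(token.value)
}
```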
```diff
-### new Scanner.Token(type, value, index, line, column)
-
-Generally, instances of the `Token` class should not be instantiated directly. However the constructor is publicly available as it can be convenient to instantiate your own in tests.
-
-In typical usage, they are returned to you by `scanner.nextToken()`. They have the following readable properties:
-
-- `type`: The name of the token type. (A string.)
-- `value`: The matching substring from the input.
-- `index`: The zero-based index of the start of the token in the input string.
-- `line`: The one-based index of the line the token starts at in the input string.
-- `column`: The one-based index of the column the token starts at within the line.
-
-#### Memory Usage
-
-Instances of `Token` retain a reference to the `Scanner` that generated them, which will include a reference to the input string. This is so that properties like `value`, `line` and `column` can be computed on demand. However it does mean that to reclaim the memory used by the input string you must release any `Token` instances.
+### Token
+
+The `Token` class should only be instantiated by the `Scanner` class.
+
+#### token.type
+
+The token’s type identifier.
+
+#### token.start
+
+The offset into the input string where the token begins.
+
+#### token.end
+
+The offset into the input string where the token has ended. This is exclusive, so the offset of the last character of the token is actually `token.end - 1`. This is in keeping with how JavaScript string methods like `.slice()` work. It also allows for zero-length tokens to be represented, though it’s not clear if it is useful for a language to define tokens that can not have a length.
+
+#### token.value
+
+The string value of the token.
+
+#### token.line
+
+The line number that the token starts on. Note that this is a computed property, and the first access of it (or `token.column`) will trigger line number indexing of the input string. This means the first access of either `line` or `column` will be relatively slow, but subsequent accesses will be very fast.
+
+#### token.column
+
+The column number that the token starts on. Like `line`, this is a computed property; see `token.line` above for details.
```
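A short sketch of how the offset properties above behave, reusing the embed-tag dialect from the Usage section. The equality shown in the comment is implied by the exclusive-`end` description rather than quoted from the README.

```javascript
import {createDialect, BufferedRegExpScanner} from '@desertnet/scanner'

const tag = Symbol('tag')
const text = Symbol('text')
const r = String.raw
const embedTagLang = createDialect(
  [ tag, r`\[\w+-\d+\]` ],
  [ text, r`[^\[]+` ],
  [ text, r`\[` ],
)

const input = 'Hi! [img-1]'
const scanner = new BufferedRegExpScanner(input)

for (const token of scanner.generateTokensUsingDialect(embedTagLang)) {
  // end is exclusive, so slicing the subject with [start, end) reproduces value.
  console.log(token.value === input.slice(token.start, token.end))  // true
  // line/column are computed lazily; the first access builds the line index.
  console.log(token.line, token.column)
}
```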
```diff
+### new TokenDefinition(typeIdentifier, pattern, flags)
+
+A `TokenDefinition` defines a mapping of a pattern to a token type identifier. In other words, you provide a `pattern`, and when the scanner matches that pattern in the input string, a `Token` object with its `type` property set to the provided `typeIdentifier` is generated.
+
+- `typeIdentifier`: A value used to identify the type of `Token` objects this definition produces. It can be of any type, but it is recommended that you use `Symbol`s for efficiency and to prevent unintended naming collisions.
+- `pattern`: A string defining a pattern to be used by the scanner to match the input string. The format of this depends on the scanner implementation, but `BufferedRegExpScanner` expects JavaScript `RegExp` syntax.
+- `flags`: A plain object with flags for the scanner implementation. For example, to tell `BufferedRegExpScanner` to match case insensitively for this pattern, you would pass `{ignoreCase: true}`.
+
+#### tokenDefinition.identifier
+
+The passed `typeIdentifier` constructor parameter.
+
+#### tokenDefinition.pattern
+
+The passed `pattern` constructor parameter.
+
+#### tokenDefinition.flags
+
+The passed `flags` constructor parameter.
```
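As noted earlier, here is the same kind of dialect built with the constructors directly instead of `createDialect`, including the `ignoreCase` flag. This is an illustrative sketch: the token names are invented, and it assumes `Dialect` and `TokenDefinition` are exported from the package root, as the test suite below suggests.

```javascript
import {Dialect, TokenDefinition, BufferedRegExpScanner} from '@desertnet/scanner'

// Invented token types for this sketch.
const keyword = Symbol('keyword')
const other = Symbol('other')

const dialect = new Dialect([
  new TokenDefinition(keyword, 'select|from|where', {ignoreCase: true}),
  new TokenDefinition(other, '[^]'),  // catch-all so every input produces a token
])

const scanner = new BufferedRegExpScanner('SELECT x FROM y')
for (const {type, value} of scanner.generateTokensUsingDialect(dialect)) {
  console.log(type, JSON.stringify(value))
}
// 'SELECT' and 'FROM' come out as keyword tokens despite the lower-case
// pattern, because of the ignoreCase flag.
```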
```diff
 import {expect} from 'chai'
-
-import Scanner from '../lib/Scanner'
-
-describe("Scanner", function () {
-  var scanner = null;
-  var dialectedScanner = null;
-
-  beforeEach(function () {
-    scanner = new Scanner([
-      {"dot": /\./},
-      {"ident": /\w+/},
-      {"space": /\s+/}
-    ]);
-
-    dialectedScanner = new Scanner({
-      "foo": [
-        {"num": /\d+/},
-        {"other": /.+/}
-      ],
-      "bar": [
-        {"word": /[a-z_]/i},
-        {"other": /.+/}
-      ]
-    });
-  });
-
-  describe("#setSource", function () {
-    it("should reset the dialect stack", function () {
-      dialectedScanner.pushDialect("foo");
-      dialectedScanner.pushDialect("bar");
-      dialectedScanner.setSource("hi");
-      expect(dialectedScanner.dialectStack()).to.deep.equal([]);
-    })
-
-    it("should set the current dialect to null if scanner has more than one dialect", function () {
-      dialectedScanner.pushDialect("foo");
-      dialectedScanner.pushDialect("bar");
-      dialectedScanner.setSource("hi");
-      expect(dialectedScanner.currentDialect()).to.be.null;
-    })
-
-    it("should not set the current dialect to null if the scanner has only one dialect", function () {
-      scanner.setSource("hi");
-      expect(scanner.currentDialect()).not.to.be.null;
-    })
-  })
-
-  describe("#nextToken", function () {
-    it("should be able to produce tokens", function () {
-      scanner.setSource("net.desert");
-
-      var netTok = scanner.nextToken();
-      var dotTok = scanner.nextToken();
-      var desTok = scanner.nextToken();
-      var eofTok = scanner.nextToken();
-
-      expect(netTok.type).to.equal("ident");
-      expect(netTok.value).to.equal("net");
-      expect(dotTok.type).to.equal("dot");
-      expect(desTok.type).to.equal("ident");
-      expect(desTok.value).to.equal("desert");
-      expect(eofTok).to.equal(null);
-    });
-
-    it("should produce tokens with correct column and line numbers", function () {
-      scanner.setSource("one two\n three\n\nfour");
-
-      var tok;
-      var tokens = [];
-      while (tok = scanner.nextToken()) {
-        if (tok.type === "space") continue;
-        tokens.push(tok);
-      }
-
-      expect(tokens[0].line).to.equal(1);
-      expect(tokens[0].column).to.equal(1);
-      expect(tokens[1].line).to.equal(1);
-      expect(tokens[1].column).to.equal(5);
-      expect(tokens[2].line).to.equal(2);
-      expect(tokens[2].column).to.equal(2);
-      expect(tokens[3].line).to.equal(4);
-      expect(tokens[3].column).to.equal(1);
-    });
-
-    it("should throw errors with relevant line and column numbers", function () {
-      scanner.setSource("one\r\ntwo\r\nfail:here\r\nneversee");
-
-      var tok;
-      var error = null;
-      var tokens = [];
-      try {
-        while (tok = scanner.nextToken()) {
-          if (tok.type === "space") continue;
-          tokens.push(tok);
-        }
-      }
-      catch (e) {
-        error = e;
-      }
-
-      expect(tokens.length).to.equal(3);
-      expect(error).not.to.be.null;
-      expect(error.name).to.equal("ScannerError");
-      expect(error.index).to.equal(14);
-      expect(error.line).to.equal(3);
-      expect(error.column).to.equal(5);
-    });
-
-    it("should only process expected tokens", function () {
-      scanner.setSource("word. word");
-
-      var tok = scanner.nextToken(["dot", "ident"]);
-      expect(tok.type).to.equal("ident");
-
-      tok = scanner.nextToken(["dot", "ident"]);
-      expect(tok.type).to.equal("dot");
-
-      var error = null;
-      try {
-        tok = scanner.nextToken(["dot", "ident"]);
-      }
-      catch (e) {
-        error = e;
-      }
-      expect(error).not.to.be.null;
-    });
-
-    it("should throw an error when there is no current dialect", function () {
-      expect(function () { dialectedScanner.nextToken() }).to.throw();
-    })
-
-    it("should not crash when a LINE SEPARATOR character is in the input string", function () {
-      scanner = new Scanner([ {"tagstart": /</}, {"text": /[^<]+/} ]);
-      scanner.setSource("hello\n world \u2028 <br>\n foo\n");
-      var generateTokens = function () { while (scanner.nextToken()) { /* noop */ } };
-      expect(generateTokens).not.to.throw();
-    })
-  })
-
-  describe("#currentDialect", function () {
-    it("should return 'main' when the original dialect is the active one", function () {
-      expect(scanner.currentDialect()).to.equal("main");
-    })
-  })
-
-  describe("#dialects", function () {
-    it("should return an array of available dialect names", function () {
-      expect(scanner.dialects()).to.deep.equal(["main"]);
-    })
-  })
-
-  describe("#setDialect", function () {
-    it("should cause the scanner to use the specified dialect when fetching the next token", function () {
-      var str = "42hello";
-      dialectedScanner.setSource(str);
-      dialectedScanner.setDialect("foo");
-      var tok1 = dialectedScanner.nextToken();
-      expect(tok1.type).to.equal("num");
-      dialectedScanner.setDialect("bar");
-      var tok2 = dialectedScanner.nextToken();
-      expect(tok2.type).to.equal("word");
-    })
-  });
-
-  describe("#pushDialect", function () {
-    it("should set the current dialect", function () {
-      dialectedScanner.pushDialect("bar");
-      expect(dialectedScanner.currentDialect()).to.equal("bar");
-    })
-  })
-
-  describe("#popDialect", function () {
-    it("should set the current dialect to the dialect on top of the stack", function () {
-      dialectedScanner.pushDialect("foo");
-      dialectedScanner.pushDialect("bar");
-      dialectedScanner.popDialect();
-      expect(dialectedScanner.currentDialect()).to.equal("foo");
-    })
-  })
-});
+import sinon from 'sinon'
+
+import {
+  Scanner, Dialect, TokenDefinition, Token, UnexpectedCharacter
+} from '../src'
+
+describe(`Scanner`, function () {
+  describe(`constructor()`, function () {
+    it(`should throw an error if no subject is passed`, function () {
+      expect(() => new Scanner()).to.throw(TypeError)
+    })
+
+    it(`should set the subject property`, function () {
+      const scanner = new Scanner('foo')
+      expect(scanner.subject).to.equal('foo')
+    })
+
+    it(`should set the position property to 0`, function () {
+      const scanner = new Scanner('foo')
+      expect(scanner).to.have.property('position').that.equals(0)
+    })
+  })
+
+  describe(`instance property`, function () {
+    const dialect = new Dialect([new TokenDefinition('foo', 'foo')])
+
+    let scanner
+    beforeEach(function () {
+      scanner = new Scanner('foo\nbar\n🤣\n\uD83E\n\uD83E')
+    })
+
+    describe(`determineNextTokenUsingDialect()`, function () {
+      it(`should always throw`, function () {
+        expect(() => scanner.determineNextTokenUsingDialect())
+          .to.throw(/overridden/i)
+      })
+    })
+
+    describe(`generateTokensUsingDialect()`, function () {
+      it(`should throw when passed no arguments`, async function () {
+        await expect(() => scanner.generateTokensUsingDialect().next())
+          .to.throw(TypeError)
+      })
+
+      it(`should throw when passed anything other than a Dialect`, async function () {
+        await expect(() => scanner.generateTokensUsingDialect('foo').next())
+          .to.throw(TypeError)
+      })
+
+      it(`should return an iterable`, function () {
+        expect(scanner.generateTokensUsingDialect(dialect))
+          .to.have.property(Symbol.iterator)
+          .that.is.a('function')
+      })
+
+      context(`when nothing in dialect matches`, function () {
+        beforeEach(function () {
+          sinon.stub(scanner, 'determineNextTokenUsingDialect')
+            .returns(undefined)
+        })
+
+        describe(`returned Token`, function () {
+          let token
+
+          beforeEach(function () {
+            token = scanner.generateTokensUsingDialect(dialect).next().value
+          })
+
+          it(`should have type of UnexpectedCharacter`, function () {
+            expect(token)
+              .to.be.an.instanceof(Token)
+              .and.to.include({
+                type: UnexpectedCharacter,
+                value: 'f'
+              })
+          })
+
+          it(`should have the correct length when next character is single-code-unit`, function () {
+            expect(token).to.have.property('start').that.equals(0)
+            expect(token).to.have.property('end').that.equals(1)
+          })
+
+          it(`should have the correct length when next character is multi-code-unit`, function () {
+            scanner.position = 8
+            const token = scanner.generateTokensUsingDialect(dialect).next().value
+            expect(token).to.have.property('start').that.equals(8)
+            expect(token).to.have.property('end').that.equals(10)
+          })
+
+          it(`should have the correct length when next character is invalid multi-code-unit`, function () {
+            scanner.position = 11
+            const token = scanner.generateTokensUsingDialect(dialect).next().value
+            expect(token).to.have.property('start').that.equals(11)
+            expect(token).to.have.property('end').that.equals(12)
+          })
+
+          it(`should have the correct length when next character is an incomplete multi-code-unit`, function () {
+            scanner.position = 13
+            const token = scanner.generateTokensUsingDialect(dialect).next().value
+            expect(token).to.have.property('start').that.equals(13)
+            expect(token).to.have.property('end').that.equals(14)
+          })
+        })
+      })
+    })
+
+    describe(`lineNumberForOffset()`, function () {
+      it(`should return the line number for the offset in the subject`, function () {
+        expect(scanner.lineNumberForOffset(0)).to.equal(1)
+        expect(scanner.lineNumberForOffset(4)).to.equal(2)
+      })
+    })
+
+    describe(`columnNumberForOffset()`, function () {
+      it(`should return the column number for the offset in the subject`, function () {
+        expect(scanner.columnNumberForOffset(0)).to.equal(1)
+        expect(scanner.columnNumberForOffset(1)).to.equal(2)
+        expect(scanner.columnNumberForOffset(4)).to.equal(1)
+      })
+    })
+  })
+})
```
