Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

lexing

Package Overview
Dependencies
Maintainers
1
Versions
28
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

lexing - npm Package Compare versions

Comparing version 0.1.4 to 0.2.0

test/simple.js

388

index.js

@@ -1,27 +0,56 @@

var __extends = this.__extends || function (d, b) {
for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
function __() { this.constructor = d; }
__.prototype = b.prototype;
d.prototype = new __();
};
/// <reference path="type_declarations/DefinitelyTyped/node/node.d.ts" />
var fs = require('fs');
var BufferedBufferReader = (function () {
function BufferedBufferReader(buffer) {
this.buffer = buffer;
// #############################################################################
// BASIC BUFFER READER
/**
Wraps a Buffer as a stateful iterable.
*/
var BufferIterator = (function () {
function BufferIterator(_buffer, _position) {
if (_position === void 0) { _position = 0; }
this._buffer = _buffer;
this._position = _position;
}
BufferedBufferReader.prototype.peekByte = function () {
return this.buffer[0];
BufferIterator.fromString = function (str, encoding) {
var buffer = new Buffer(str, encoding);
return new BufferIterator(buffer);
};
BufferedBufferReader.prototype.peekBuffer = function (length) {
return this.buffer.slice(0, length);
Object.defineProperty(BufferIterator.prototype, "position", {
/**
Return the current position within the underlying Buffer.
*/
get: function () {
return this._position;
},
enumerable: true,
configurable: true
});
Object.defineProperty(BufferIterator.prototype, "size", {
/**
Return the total length of the underlying Buffer.
*/
get: function () {
return this._buffer.length;
},
enumerable: true,
configurable: true
});
/**
Read the next `length` bytes from the underlying Buffer, or fewer iff we reach
EOF, without advancing our position within the Buffer. Returns a Buffer slice.
*/
BufferIterator.prototype.peek = function (length) {
return this._buffer.slice(this._position, this._position + length);
};
BufferedBufferReader.prototype.readByte = function () {
var byte = this.peekByte();
this.buffer = this.buffer.slice(1);
return byte;
};
BufferedBufferReader.prototype.readBuffer = function (length) {
var buffer = this.peekBuffer(length);
this.buffer = this.buffer.slice(length);
/**
Read the next `length` bytes from the underlying Buffer, or fewer iff we reach
EOF, and advance our position within the Buffer. Returns a Buffer slice.
Buffer#slice never returns entries beyond the end of the buffer:
`new Buffer([1, 2, 3, 4]).slice(2, 10)` produces `<Buffer 03 04>`
*/
BufferIterator.prototype.next = function (length) {
var buffer = this._buffer.slice(this._position, this._position + length);
this._position += buffer.length;
return buffer;

@@ -32,50 +61,83 @@ };

characters (which may be < `length` iff EOF has been reached).
We do not allow skipping beyond the end of the buffer.
*/
BufferedBufferReader.prototype.skip = function (length) {
// we cannot skip more than `this.buffer.length` bytes
var bytesSkipped = Math.min(length, this.buffer.length);
this.buffer = this.buffer.slice(length);
BufferIterator.prototype.skip = function (length) {
var bytesSkipped = Math.min(length, this._buffer.length - this._position);
this._position += bytesSkipped;
return bytesSkipped;
};
BufferedBufferReader.prototype.toString = function () {
return this.buffer.toString();
return BufferIterator;
})();
exports.BufferIterator = BufferIterator;
/**
Wrap an Array as an iterable.
*/
var ArrayIterator = (function () {
function ArrayIterator(_array, position) {
if (position === void 0) { position = 0; }
this._array = _array;
this.position = position;
}
Object.defineProperty(ArrayIterator.prototype, "size", {
get: function () {
return this._array.length;
},
enumerable: true,
configurable: true
});
ArrayIterator.prototype.next = function () {
return this._array[this.position++];
};
return BufferedBufferReader;
ArrayIterator.prototype.peek = function () {
return this._array[this.position + 1];
};
ArrayIterator.prototype.skip = function () {
if (this.position < this._array.length) {
this.position++;
return true;
}
return false;
};
return ArrayIterator;
})();
exports.BufferedBufferReader = BufferedBufferReader;
var BufferedStringReader = (function (_super) {
__extends(BufferedStringReader, _super);
function BufferedStringReader(input, encoding) {
_super.call(this, new Buffer(input, encoding));
}
return BufferedStringReader;
})(BufferedBufferReader);
exports.BufferedStringReader = BufferedStringReader;
exports.ArrayIterator = ArrayIterator;
// #############################################################################
// FILE READERS
// SYNCHRONOUS FILE READER
/**
Provide buffered (and Buffer-friendly) access to a file.
Provide iterative access to a file.
It is buffered, which means you can call `peek(same_number)` repeatedly without
triggering a `read(2)` system call on the underlying file each time. Likewise,
calling `read(small_number)` repeatedly will issue a `read(2)` system call only
when the buffer doesn't have enough data.
When calling `read()` on the underlying file, it will read batches of
`_block_size` (default: 1024) bytes.
*/
var BufferedFileReader = (function () {
function BufferedFileReader(fd, file_position) {
if (file_position === void 0) { file_position = 0; }
this.fd = fd;
this.file_position = file_position;
this.buffer = new Buffer(0);
var FileIterator = (function () {
// when reading more data, pull in chunks of `_block_size` bytes.
function FileIterator(_fd, _position, _block_size) {
if (_position === void 0) { _position = 0; }
if (_block_size === void 0) { _block_size = 1024; }
this._fd = _fd;
this._position = _position;
this._block_size = _block_size;
this._buffer = new Buffer(0);
}
BufferedFileReader.open = function (filepath) {
FileIterator.open = function (filepath) {
var fd = fs.openSync(filepath, 'r');
return new BufferedFileReader(fd);
return new FileIterator(fd);
};
BufferedFileReader.prototype.close = function () {
fs.closeSync(this.fd);
FileIterator.prototype.close = function () {
fs.closeSync(this._fd);
};
Object.defineProperty(BufferedFileReader.prototype, "position", {
Object.defineProperty(FileIterator.prototype, "position", {
/**
Return the position in the file that would be read from if we called
readBuffer(...). This is different from the internally-held position, which
read(...). This is different from the internally-held position, which
points to the end of the currently held buffer.
*/
get: function () {
return this.file_position - this.buffer.length;
return this._position - this._buffer.length;
},

@@ -85,25 +147,24 @@ enumerable: true,

});
Object.defineProperty(FileIterator.prototype, "size", {
/**
Return the total size (in bytes) of the underlying file.
*/
get: function () {
return fs.fstatSync(this._fd).size;
},
enumerable: true,
configurable: true
});
/**
Calls fs.readSync on the underlying file descriptor with pretty much the same
argument signature.
Returns `bytesRead`, the number of bytes that were read into the given Buffer.
Node.js documentation for fs.read() / fs.readSync():
> position is an integer specifying where to begin reading from in the file.
> If position is null, data will be read from the current file position.
*/
BufferedFileReader.prototype.read = function (buffer, offset, length, position) {
return fs.readSync(this.fd, buffer, offset, length, position);
};
/**
Ensure that the available buffer is at least `length` bytes long.
This may return without the condition being met of this.buffer.length >= length,
This may return without the condition being met (this.buffer.length >= length),
if the end of the underlying file has been reached.
TODO: pull _fillBuffer into the loop, with the Buffer declaration outside.
*/
BufferedFileReader.prototype.ensureLength = function (length) {
while (length > this.buffer.length) {
FileIterator.prototype._ensureLength = function (length) {
while (length > this._buffer.length) {
// all the action happens only if we need more bytes than are in the buffer
var EOF = this.fillBuffer(BufferedFileReader.BLOCK_SIZE);
var EOF = this._fillBuffer(this._block_size);
if (EOF) {

@@ -118,30 +179,22 @@ break;

Returns false iff EOF has been reached, otherwise returns true. */
BufferedFileReader.prototype.fillBuffer = function (length) {
FileIterator.prototype._fillBuffer = function (length) {
var buffer = new Buffer(length);
// always read from the reader's current position
var bytesRead = this.read(buffer, 0, length, this.file_position);
// always read from the current position
var bytesRead = fs.readSync(this._fd, buffer, 0, length, this._position);
// and update it accordingly
this.file_position += bytesRead;
this._position += bytesRead;
// use the Buffer.concat totalLength argument to slice the fresh buffer if needed
this.buffer = Buffer.concat([this.buffer, buffer], this.buffer.length + bytesRead);
this._buffer = Buffer.concat([this._buffer, buffer], this._buffer.length + bytesRead);
return bytesRead < length;
};
BufferedFileReader.prototype.peekByte = function () {
this.ensureLength(1);
return this.buffer[0];
FileIterator.prototype.next = function (length) {
this._ensureLength(length);
var buffer = this._buffer.slice(0, length);
this._buffer = this._buffer.slice(length);
return buffer;
};
BufferedFileReader.prototype.peekBuffer = function (length) {
this.ensureLength(length);
return this.buffer.slice(0, length);
FileIterator.prototype.peek = function (length) {
this._ensureLength(length);
return this._buffer.slice(0, length);
};
BufferedFileReader.prototype.readByte = function () {
var byte = this.peekByte();
this.buffer = this.buffer.slice(1);
return byte;
};
BufferedFileReader.prototype.readBuffer = function (length) {
var buffer = this.peekBuffer(length);
this.buffer = this.buffer.slice(length);
return buffer;
};
/**

@@ -151,37 +204,61 @@ Skip over the next `length` characters, returning the number of skipped

*/
BufferedFileReader.prototype.skip = function (length) {
this.ensureLength(length);
FileIterator.prototype.skip = function (length) {
this._ensureLength(length);
// we cannot skip more than `this.buffer.length` bytes
var bytesSkipped = Math.min(length, this.buffer.length);
this.buffer = this.buffer.slice(length);
var bytesSkipped = Math.min(length, this._buffer.length);
this._buffer = this._buffer.slice(length);
return bytesSkipped;
};
// when reading more data, pull in chunks of `BLOCK_SIZE` bytes.
BufferedFileReader.BLOCK_SIZE = 1024;
return BufferedFileReader;
return FileIterator;
})();
exports.BufferedFileReader = BufferedFileReader;
var BufferedLexer = (function () {
function BufferedLexer(default_rules, state_rules) {
exports.FileIterator = FileIterator;
function Token(name, value) {
if (value === void 0) { value = null; }
return { name: name, value: value };
}
exports.Token = Token;
/**
The type T is the type of each token value, usually `any` (the token name is
always a string).
BufferIterable
*/
var Tokenizer = (function () {
function Tokenizer(default_rules, state_rules) {
if (state_rules === void 0) { state_rules = {}; }
this.default_rules = default_rules;
this.state_rules = state_rules;
this.reset();
}
Tokenizer.prototype.getRules = function (state_name) {
return (state_name === undefined) ? this.default_rules : this.state_rules[state_name];
};
/**
Reset the Lexer back to its initial state.
Create a closure around the iterable.
Unfortunately, it seems that TypeScript doesn't like inline functions, so we
use a helper class (TokenizerIterator).
*/
BufferedLexer.prototype.reset = function () {
this.states = [];
Tokenizer.prototype.map = function (iterable, states) {
if (states === void 0) { states = []; }
return new TokenizerIterator(this, iterable, states);
};
return Tokenizer;
})();
exports.Tokenizer = Tokenizer;
var TokenizerIterator = (function () {
function TokenizerIterator(tokenizer, iterable, states) {
this.tokenizer = tokenizer;
this.iterable = iterable;
this.states = states;
}
/**
Returns the next available pair from the input reader (usually [token, data]).
Returns the next available Token from the input reader.
If the matching rule's action returns null, this will return null.
If the matching rule's action returns null, this will return null.
TODO: optimize string conversion; abstract out the peek + toString, back into the reader?
*/
BufferedLexer.prototype.read = function () {
// TODO: abstract out the peekBuffer + toString, back into the reader?
// optimize string conversion
var input = this.reader.peekBuffer(256).toString('utf8');
TokenizerIterator.prototype._next = function () {
var state = this.states[this.states.length - 1];
var rules = state ? this.state_rules[state] : this.default_rules;
var rules = this.tokenizer.getRules(state);
var input = this.iterable.peek(256).toString('utf8');
for (var i = 0, rule; (rule = rules[i]); i++) {

@@ -191,3 +268,3 @@ var match = input.match(rule[0]);

var byteLength = Buffer.byteLength(match[0], 'utf8');
this.reader.skip(byteLength);
this.iterable.skip(byteLength);
return rule[1].call(this, match);

@@ -202,14 +279,83 @@ }

This will never return null.
This will never return null, but may return undefined if one of the rules
returns undefined, which the rule should not do! It will never a Token with
a null name.
*/
BufferedLexer.prototype.next = function () {
var result;
do {
result = this.read();
} while (result === null);
return result;
TokenizerIterator.prototype.next = function () {
while (1) {
var token = this._next();
if (token !== null && token.name !== null) {
return token;
}
}
};
return BufferedLexer;
return TokenizerIterator;
})();
exports.BufferedLexer = BufferedLexer;
/**
Recombine a stream of tokens using a stack of lists, e.g.,
WORD:BT START:STRING CHAR:A CHAR:b CHAR:c END:STRING WORD:ET
becomes:
WORD:BT STRING:Abc WORD:ET
*/
var Combiner = (function () {
function Combiner(rules) {
this.rules = rules;
}
Combiner.prototype.findRule = function (name) {
for (var i = 0, rule; (rule = this.rules[i]); i++) {
if (rule[0] === name) {
return rule;
}
}
throw new Error("No combiner rule found with the name: " + name);
};
Combiner.prototype.map = function (iterable, stack) {
if (stack === void 0) { stack = []; }
return new CombinerIterator(this, iterable, stack);
};
return Combiner;
})();
exports.Combiner = Combiner;
var CombinerIterator = (function () {
function CombinerIterator(combiner, iterable, stack) {
this.combiner = combiner;
this.iterable = iterable;
this.stack = stack;
}
/**
Returns the next available pair from the input reader (usually [token, data]).
If the matching rule's action returns null, this will return null.
*/
CombinerIterator.prototype.next = function () {
var token = this.iterable.next();
if (token.name == 'END') {
// TODO: check that the preceding START token has the same value
var tokens = this.stack.pop();
// type hack with <any>
var rule = this.combiner.findRule(token.value);
// reduce into combined token
token = rule[1](tokens);
}
if (token.name == 'START') {
// TODO: store the START token's value somewhere so that we can verify the END token's value matches
this.stack.push([]);
return this.next();
}
else if (this.stack.length > 0) {
// push it onto the list at the top of the stack
this.stack[this.stack.length - 1].push(token);
return this.next();
}
else {
// tokens at root level pass through transparently
return token;
}
};
return CombinerIterator;
})();
//// }

@@ -7,41 +7,80 @@ /// <reference path="type_declarations/DefinitelyTyped/node/node.d.ts" />

// #############################################################################
// READERS
// ITERABLES
export interface Reader {
/**
Reads a single byte.
*/
readByte(): number;
/**
Reads a series of bytes.
*/
readBuffer(length: number): Buffer;
/**
Any sort of sequence can implement Iterable<T>. It's a lot like Array<T>, but
read-only, and without random access.
*/
export interface Iterable<T> {
next(): T;
}
export interface StatefulIterable<T> extends Iterable<T> {
position: number;
size: number;
peek(): T;
skip(): boolean;
}
export interface BufferedReader extends Reader {
peekByte(): number;
peekBuffer(length: number): Buffer;
/**
In some cases, it makes more sense to iterate in batches, or chunks, through an
iterable of a particular type. The T in ChunkedIterable<T> should itself be a
sequence type, like string[] or Buffer.
*/
export interface ChunkedIterable<T> {
next(length: number): T;
}
export interface StatefulChunkedIterable<T> extends ChunkedIterable<T> {
position: number;
size: number;
peek(length: number): T;
skip(length: number): number;
}
export class BufferedBufferReader implements BufferedReader {
constructor(private buffer: Buffer) { }
// #############################################################################
// BASIC BUFFER READER
peekByte(): number {
return this.buffer[0];
/**
Wraps a Buffer as a stateful iterable.
*/
export class BufferIterator implements StatefulChunkedIterable<Buffer> {
constructor(private _buffer: Buffer, private _position = 0) { }
static fromString(str: string, encoding?: string): BufferIterator {
var buffer = new Buffer(str, encoding);
return new BufferIterator(buffer);
}
peekBuffer(length: number): Buffer {
return this.buffer.slice(0, length);
/**
Return the current position within the underlying Buffer.
*/
get position(): number {
return this._position;
}
readByte(): number {
var byte = this.peekByte();
this.buffer = this.buffer.slice(1);
return byte;
/**
Return the total length of the underlying Buffer.
*/
get size(): number {
return this._buffer.length;
}
readBuffer(length: number): Buffer {
var buffer = this.peekBuffer(length);
this.buffer = this.buffer.slice(length);
/**
Read the next `length` bytes from the underlying Buffer, or fewer iff we reach
EOF, without advancing our position within the Buffer. Returns a Buffer slice.
*/
peek(length: number): Buffer {
return this._buffer.slice(this._position, this._position + length);
}
/**
Read the next `length` bytes from the underlying Buffer, or fewer iff we reach
EOF, and advance our position within the Buffer. Returns a Buffer slice.
Buffer#slice never returns entries beyond the end of the buffer:
`new Buffer([1, 2, 3, 4]).slice(2, 10)` produces `<Buffer 03 04>`
*/
next(length: number): Buffer {
var buffer = this._buffer.slice(this._position, this._position + length);
this._position += buffer.length;
return buffer;

@@ -53,41 +92,66 @@ }

characters (which may be < `length` iff EOF has been reached).
We do not allow skipping beyond the end of the buffer.
*/
skip(length: number): number {
// we cannot skip more than `this.buffer.length` bytes
var bytesSkipped = Math.min(length, this.buffer.length);
this.buffer = this.buffer.slice(length);
var bytesSkipped = Math.min(length, this._buffer.length - this._position);
this._position += bytesSkipped;
return bytesSkipped;
}
}
toString(): string {
return this.buffer.toString();
/**
Wrap an Array as an iterable.
*/
export class ArrayIterator<T> implements StatefulIterable<T> {
constructor(private _array: Array<T>, public position = 0) { }
get size(): number {
return this._array.length;
}
}
export class BufferedStringReader extends BufferedBufferReader {
constructor(input: string, encoding?: string) {
super(new Buffer(input, encoding))
next(): T {
return this._array[this.position++];
}
peek(): T {
return this._array[this.position + 1];
}
skip(): boolean {
if (this.position < this._array.length) {
this.position++;
return true;
}
return false;
}
}
// #############################################################################
// FILE READERS
// SYNCHRONOUS FILE READER
/**
Provide buffered (and Buffer-friendly) access to a file.
Provide iterative access to a file.
It is buffered, which means you can call `peek(same_number)` repeatedly without
triggering a `read(2)` system call on the underlying file each time. Likewise,
calling `read(small_number)` repeatedly will issue a `read(2)` system call only
when the buffer doesn't have enough data.
When calling `read()` on the underlying file, it will read batches of
`_block_size` (default: 1024) bytes.
*/
export class BufferedFileReader implements BufferedReader {
// when reading more data, pull in chunks of `BLOCK_SIZE` bytes.
static BLOCK_SIZE = 1024;
private buffer: Buffer = new Buffer(0);
export class FileIterator implements StatefulChunkedIterable<Buffer> {
private _buffer: Buffer = new Buffer(0);
constructor(private fd: number, private file_position: number = 0) { }
// when reading more data, pull in chunks of `_block_size` bytes.
constructor(private _fd: number, private _position = 0, private _block_size = 1024) { }
static open(filepath: string): BufferedFileReader {
static open(filepath: string): FileIterator {
var fd = fs.openSync(filepath, 'r');
return new BufferedFileReader(fd);
return new FileIterator(fd);
}
close(): void {
fs.closeSync(this.fd);
fs.closeSync(this._fd);
}

@@ -97,21 +161,14 @@

Return the position in the file that would be read from if we called
readBuffer(...). This is different from the internally-held position, which
read(...). This is different from the internally-held position, which
points to the end of the currently held buffer.
*/
get position(): number {
return this.file_position - this.buffer.length;
return this._position - this._buffer.length;
}
/**
Calls fs.readSync on the underlying file descriptor with pretty much the same
argument signature.
Returns `bytesRead`, the number of bytes that were read into the given Buffer.
Node.js documentation for fs.read() / fs.readSync():
> position is an integer specifying where to begin reading from in the file.
> If position is null, data will be read from the current file position.
Return the total size (in bytes) of the underlying file.
*/
read(buffer: Buffer, offset: number, length: number, position: number): number {
return fs.readSync(this.fd, buffer, offset, length, position);
get size(): number {
return fs.fstatSync(this._fd).size;
}

@@ -122,9 +179,11 @@

This may return without the condition being met of this.buffer.length >= length,
This may return without the condition being met (this.buffer.length >= length),
if the end of the underlying file has been reached.
TODO: pull _fillBuffer into the loop, with the Buffer declaration outside.
*/
private ensureLength(length: number): void {
while (length > this.buffer.length) {
private _ensureLength(length: number): void {
while (length > this._buffer.length) {
// all the action happens only if we need more bytes than are in the buffer
var EOF = this.fillBuffer(BufferedFileReader.BLOCK_SIZE);
var EOF = this._fillBuffer(this._block_size);
if (EOF) {

@@ -141,35 +200,25 @@ // exit regardless

Returns false iff EOF has been reached, otherwise returns true. */
private fillBuffer(length: number): boolean {
private _fillBuffer(length: number): boolean {
var buffer = new Buffer(length);
// always read from the reader's current position
var bytesRead = this.read(buffer, 0, length, this.file_position);
// always read from the current position
var bytesRead = fs.readSync(this._fd, buffer, 0, length, this._position);
// and update it accordingly
this.file_position += bytesRead;
this._position += bytesRead;
// use the Buffer.concat totalLength argument to slice the fresh buffer if needed
this.buffer = Buffer.concat([this.buffer, buffer], this.buffer.length + bytesRead);
this._buffer = Buffer.concat([this._buffer, buffer], this._buffer.length + bytesRead);
return bytesRead < length;
}
peekByte(): number {
this.ensureLength(1);
return this.buffer[0];
next(length: number): Buffer {
this._ensureLength(length);
var buffer = this._buffer.slice(0, length);
this._buffer = this._buffer.slice(length);
return buffer;
}
peekBuffer(length: number): Buffer {
this.ensureLength(length);
return this.buffer.slice(0, length);
peek(length: number): Buffer {
this._ensureLength(length);
return this._buffer.slice(0, length);
}
readByte(): number {
var byte = this.peekByte();
this.buffer = this.buffer.slice(1);
return byte;
}
readBuffer(length: number): Buffer {
var buffer = this.peekBuffer(length);
this.buffer = this.buffer.slice(length);
return buffer;
}
/**

@@ -180,6 +229,6 @@ Skip over the next `length` characters, returning the number of skipped

skip(length: number): number {
this.ensureLength(length);
this._ensureLength(length);
// we cannot skip more than `this.buffer.length` bytes
var bytesSkipped = Math.min(length, this.buffer.length);
this.buffer = this.buffer.slice(length);
var bytesSkipped = Math.min(length, this._buffer.length);
this._buffer = this._buffer.slice(length);
return bytesSkipped;

@@ -192,33 +241,77 @@ }

// LEXERS
// (the good stuff)
export interface RuleAction<T> { (match: RegExpMatchArray): [string, T]; }
export interface Rule<T> extends Array<RegExp | RuleAction<T>> { 0: RegExp; 1: RuleAction<T>; }
/**
Commonly used special case.
*/
export interface BufferIterable extends StatefulChunkedIterable<Buffer> { }
export class BufferedLexer<T> {
reader: BufferedReader;
states: string[];
/**
Tokenizer#map() and Combiner#map() both return Token iterators.
constructor(private default_rules: Rule<T>[], private state_rules: {[index: string]: Rule<T>[]}) {
this.reset();
Tokens with a null name and null Tokens should be treated the same way (as
insignificant / ignorable objects that should be skipped).
Tokens with an undefined name and undefined Tokens are always errors.
*/
export interface Token<T> {
name: string;
value: T;
}
/**
Another generic but frequently used alias.
*/
export interface TokenIterable<T> extends Iterable<Token<T>> { }
export function Token<T>(name: string, value: T = null): Token<T> {
return {name: name, value: value};
}
// -----------------------------------------------------------------------------
// TOKENIZER
export interface RegexAction<T> { (match: RegExpMatchArray): Token<T>; }
export interface RegexRule<T> extends Array<RegExp | RegexAction<T>> { 0: RegExp; 1: RegexAction<T>; }
/**
The type T is the type of each token value, usually `any` (the token name is
always a string).
BufferIterable
*/
export class Tokenizer<T> {
constructor(private default_rules: RegexRule<T>[],
private state_rules: {[index: string]: RegexRule<T>[]} = {}) { }
getRules(state_name: string): RegexRule<T>[] {
return (state_name === undefined) ? this.default_rules : this.state_rules[state_name];
}
/**
Reset the Lexer back to its initial state.
Create a closure around the iterable.
Unfortunately, it seems that TypeScript doesn't like inline functions, so we
use a helper class (TokenizerIterator).
*/
reset(): void {
this.states = [];
map(iterable: BufferIterable, states: string[] = []): TokenIterable<T> {
return new TokenizerIterator(this, iterable, states);
}
}
class TokenizerIterator<T> implements TokenIterable<T> {
constructor(private tokenizer: Tokenizer<T>,
public iterable: BufferIterable,
public states: string[]) { }
/**
Returns the next available pair from the input reader (usually [token, data]).
Returns the next available Token from the input reader.
If the matching rule's action returns null, this will return null.
If the matching rule's action returns null, this will return null.
TODO: optimize string conversion; abstract out the peek + toString, back into the reader?
*/
read(): T {
// TODO: abstract out the peekBuffer + toString, back into the reader?
// optimize string conversion
var input = this.reader.peekBuffer(256).toString('utf8');
private _next(): Token<T> {
var state = this.states[this.states.length - 1];
var rules = state ? this.state_rules[state] : this.default_rules;
var rules = this.tokenizer.getRules(state);
var input = this.iterable.peek(256).toString('utf8');
for (var i = 0, rule; (rule = rules[i]); i++) {

@@ -228,3 +321,3 @@ var match = input.match(rule[0]);

var byteLength = Buffer.byteLength(match[0], 'utf8');
this.reader.skip(byteLength);
this.iterable.skip(byteLength);
return rule[1].call(this, match);

@@ -241,13 +334,88 @@ }

This will never return null.
This will never return null, but may return undefined if one of the rules
returns undefined, which the rule should not do! It will never a Token with
a null name.
*/
next(): T {
var result;
do {
result = this.read();
} while (result === null);
return result;
public next(): Token<T> {
while (1) {
var token = this._next();
if (token !== null && token.name !== null) {
return token;
}
}
}
}
// -----------------------------------------------------------------------------
// COMBINER
export interface CombinerAction<T, U> { (tokens: Token<T>[]): Token<U>; }
export interface CombinerRule<T, U> extends Array<string | CombinerAction<T, U>> { 0: string; 1: CombinerAction<T, U>; }
/**
Recombine a stream of tokens using a stack of lists, e.g.,
WORD:BT START:STRING CHAR:A CHAR:b CHAR:c END:STRING WORD:ET
becomes:
WORD:BT STRING:Abc WORD:ET
*/
export class Combiner<T> {
constructor(private rules: CombinerRule<T, T>[]) { }
findRule(name: string): CombinerRule<T, T> {
for (var i = 0, rule; (rule = this.rules[i]); i++) {
if (rule[0] === name) {
return rule;
}
}
throw new Error(`No combiner rule found with the name: ${name}`);
}
map(iterable: TokenIterable<T>, stack: Array<Array<Token<T>>> = []): TokenIterable<T> {
return new CombinerIterator(this, iterable, stack);
}
}
class CombinerIterator<T> implements TokenIterable<T> {
constructor(private combiner: Combiner<T>,
public iterable: TokenIterable<T>,
public stack: Array<Array<Token<T>>>) { }
/**
Returns the next available pair from the input reader (usually [token, data]).
If the matching rule's action returns null, this will return null.
*/
next(): Token<T> {
var token = this.iterable.next();
if (token.name == 'END') {
// TODO: check that the preceding START token has the same value
var tokens = this.stack.pop();
// type hack with <any>
var rule = this.combiner.findRule(<any>token.value);
// reduce into combined token
token = rule[1](tokens);
}
if (token.name == 'START') {
// TODO: store the START token's value somewhere so that we can verify the END token's value matches
this.stack.push([]);
return this.next();
}
else if (this.stack.length > 0) {
// push it onto the list at the top of the stack
this.stack[this.stack.length - 1].push(token);
return this.next();
}
else {
// tokens at root level pass through transparently
return token;
}
}
}
//// }
/// <reference path="../../type_declarations/DefinitelyTyped/node/node.d.ts" />
declare module "lexing" {
interface Reader {
/**
Any sort of sequence can implement Iterable<T>. It's a lot like Array<T>, but
read-only, and without random access.
*/
interface Iterable<T> {
next(): T;
}
interface StatefulIterable<T> extends Iterable<T> {
position: number;
size: number;
peek(): T;
skip(): boolean;
}
/**
In some cases, it makes more sense to iterate in batches, or chunks, through an
iterable of a particular type. The T in ChunkedIterable<T> should itself be a
sequence type, like string[] or Buffer.
*/
interface ChunkedIterable<T> {
next(length: number): T;
}
interface StatefulChunkedIterable<T> extends ChunkedIterable<T> {
position: number;
size: number;
peek(length: number): T;
skip(length: number): number;
}
/**
Wraps a Buffer as a stateful iterable.
*/
class BufferIterator implements StatefulChunkedIterable<Buffer> {
private _buffer;
private _position;
constructor(_buffer: Buffer, _position?: number);
static fromString(str: string, encoding?: string): BufferIterator;
/**
Reads a single byte.
Return the current position within the underlying Buffer.
*/
readByte(): number;
position: number;
/**
Reads a series of bytes.
Return the total length of the underlying Buffer.
*/
readBuffer(length: number): Buffer;
}
interface BufferedReader extends Reader {
peekByte(): number;
peekBuffer(length: number): Buffer;
skip(length: number): number;
}
class BufferedBufferReader implements BufferedReader {
private buffer;
constructor(buffer: Buffer);
peekByte(): number;
peekBuffer(length: number): Buffer;
readByte(): number;
readBuffer(length: number): Buffer;
size: number;
/**
Read the next `length` bytes from the underlying Buffer, or fewer iff we reach
EOF, without advancing our position within the Buffer. Returns a Buffer slice.
*/
peek(length: number): Buffer;
/**
Read the next `length` bytes from the underlying Buffer, or fewer iff we reach
EOF, and advance our position within the Buffer. Returns a Buffer slice.
Buffer#slice never returns entries beyond the end of the buffer:
`new Buffer([1, 2, 3, 4]).slice(2, 10)` produces `<Buffer 03 04>`
*/
next(length: number): Buffer;
/**
Skip over the next `length` characters, returning the number of skipped
characters (which may be < `length` iff EOF has been reached).
We do not allow skipping beyond the end of the buffer.
*/
skip(length: number): number;
toString(): string;
}
class BufferedStringReader extends BufferedBufferReader {
constructor(input: string, encoding?: string);
/**
Wrap an Array as an iterable.
*/
class ArrayIterator<T> implements StatefulIterable<T> {
private _array;
position: number;
constructor(_array: Array<T>, position?: number);
size: number;
next(): T;
peek(): T;
skip(): boolean;
}
/**
Provide buffered (and Buffer-friendly) access to a file.
Provide iterative access to a file.
It is buffered, which means you can call `peek(same_number)` repeatedly without
triggering a `read(2)` system call on the underlying file each time. Likewise,
calling `read(small_number)` repeatedly will issue a `read(2)` system call only
when the buffer doesn't have enough data.
When calling `read()` on the underlying file, it will read batches of
`_block_size` (default: 1024) bytes.
*/
class BufferedFileReader implements BufferedReader {
private fd;
private file_position;
static BLOCK_SIZE: number;
private buffer;
constructor(fd: number, file_position?: number);
static open(filepath: string): BufferedFileReader;
class FileIterator implements StatefulChunkedIterable<Buffer> {
private _fd;
private _position;
private _block_size;
private _buffer;
constructor(_fd: number, _position?: number, _block_size?: number);
static open(filepath: string): FileIterator;
close(): void;
/**
Return the position in the file that would be read from if we called
readBuffer(...). This is different from the internally-held position, which
read(...). This is different from the internally-held position, which
points to the end of the currently held buffer.

@@ -53,19 +106,14 @@ */

/**
Calls fs.readSync on the underlying file descriptor with pretty much the same
argument signature.
Returns `bytesRead`, the number of bytes that were read into the given Buffer.
Node.js documentation for fs.read() / fs.readSync():
> position is an integer specifying where to begin reading from in the file.
> If position is null, data will be read from the current file position.
Return the total size (in bytes) of the underlying file.
*/
read(buffer: Buffer, offset: number, length: number, position: number): number;
size: number;
/**
Ensure that the available buffer is at least `length` bytes long.
This may return without the condition being met of this.buffer.length >= length,
This may return without the condition being met (this.buffer.length >= length),
if the end of the underlying file has been reached.
TODO: pull _fillBuffer into the loop, with the Buffer declaration outside.
*/
private ensureLength(length);
private _ensureLength(length);
/**

@@ -75,7 +123,5 @@ Read data from the underlying file and append it to the buffer.

Returns false iff EOF has been reached, otherwise returns true. */
private fillBuffer(length);
peekByte(): number;
peekBuffer(length: number): Buffer;
readByte(): number;
readBuffer(length: number): Buffer;
private _fillBuffer(length);
next(length: number): Buffer;
peek(length: number): Buffer;
/**

@@ -87,35 +133,76 @@ Skip over the next `length` characters, returning the number of skipped

}
interface RuleAction<T> {
(match: RegExpMatchArray): [string, T];
/**
Commonly used special case.
*/
interface BufferIterable extends StatefulChunkedIterable<Buffer> {
}
interface Rule<T> extends Array<RegExp | RuleAction<T>> {
/**
Tokenizer#map() and Combiner#map() both return Token iterators.
Tokens with a null name and null Tokens should be treated the same way (as
insignificant / ignorable objects that should be skipped).
Tokens with an undefined name and undefined Tokens are always errors.
*/
interface Token<T> {
name: string;
value: T;
}
/**
Another generic but frequently used alias.
*/
interface TokenIterable<T> extends Iterable<Token<T>> {
}
function Token<T>(name: string, value?: T): Token<T>;
interface RegexAction<T> {
(match: RegExpMatchArray): Token<T>;
}
interface RegexRule<T> extends Array<RegExp | RegexAction<T>> {
0: RegExp;
1: RuleAction<T>;
1: RegexAction<T>;
}
class BufferedLexer<T> {
/**
The type T is the type of each token value, usually `any` (the token name is
always a string).
BufferIterable
*/
class Tokenizer<T> {
private default_rules;
private state_rules;
reader: BufferedReader;
states: string[];
constructor(default_rules: Rule<T>[], state_rules: {
[index: string]: Rule<T>[];
constructor(default_rules: RegexRule<T>[], state_rules?: {
[index: string]: RegexRule<T>[];
});
getRules(state_name: string): RegexRule<T>[];
/**
Reset the Lexer back to its initial state.
*/
reset(): void;
/**
Returns the next available pair from the input reader (usually [token, data]).
Create a closure around the iterable.
If the matching rule's action returns null, this will return null.
Unfortunately, it seems that TypeScript doesn't like inline functions, so we
use a helper class (TokenizerIterator).
*/
read(): T;
/**
Returns the next available non-null token / symbol output from the input
reader (usually a token_data: [string, any] tuple).
This will never return null.
*/
next(): T;
map(iterable: BufferIterable, states?: string[]): TokenIterable<T>;
}
interface CombinerAction<T, U> {
(tokens: Token<T>[]): Token<U>;
}
interface CombinerRule<T, U> extends Array<string | CombinerAction<T, U>> {
0: string;
1: CombinerAction<T, U>;
}
/**
Recombine a stream of tokens using a stack of lists, e.g.,
WORD:BT START:STRING CHAR:A CHAR:b CHAR:c END:STRING WORD:ET
becomes:
WORD:BT STRING:Abc WORD:ET
*/
class Combiner<T> {
private rules;
constructor(rules: CombinerRule<T, T>[]);
findRule(name: string): CombinerRule<T, T>;
map(iterable: TokenIterable<T>, stack?: Array<Array<Token<T>>>): TokenIterable<T>;
}
}
{
"name": "lexing",
"version": "0.1.4",
"version": "0.2.0",
"description": "Regex-based lexer",
"keywords": [],
"keywords": [
"lexer",
"parser"
],
"homepage": "https://github.com/chbrown/lexing",

@@ -11,4 +14,8 @@ "repository": "git://github.com/chbrown/lexing.git",

"devDependencies": {
"mocha": "*",
"typescript": "*"
},
"scripts": {
"test": "make test"
}
}

@@ -26,6 +26,27 @@ # lexing

The `new lexing.BufferedLexer(default_rules [, state_rules])` implementation provided in this module represents state as a stack of things (hopefully just strings), but this could be abused. The lexer constructor takes an optional second argument: an object mapping state names to lists of rules that apply only in those states. These operate like exclusive conditional states in `flex`, except there are no exceptions to the exclusivity, i.e., there is no `<*>` condition specifier. The current state is the last (top) state in the state stack. The `default_rules` rules apply only when the state stack is empty (the default).
The lexer has one main function, `lexer.read()`. This reads an input_string from the `lexer.reader` instance, and iterates over the rules that apply in the current state.
## Implementation
The `new lexing.Tokenizer(default_rules [, state_rules])` implementation provided in this module is the most basic lexer provided, representing state as a stack of strings. The `lexing.Tokenizer` constructor takes an optional second argument: an object mapping state names to lists of rules that apply only in those states. These operate like exclusive conditional states in `flex`, except there are no exceptions to the exclusivity, i.e., there is no `<*>` condition specifier. The current state is the last (top) state in the state stack. The `default_rules` rules apply only when the state stack is empty (the default).
The tokenizer has one main function, `tokenizer.map(buffer_iterable)`, which returns a `TokenIterable`. `buffer_iterable` should implement the `BufferIterable` interface, i.e.:
interface BufferIterable {
position: number;
size: number;
next(length: number): Buffer;
peek(length: number): Buffer;
skip(length: number): number;
}
The following readers defined in `lexing` all return instances implementing the `BufferIterable` interface:
* `new lexing.BufferIterator(buffer)`
* `lexing.BufferIterator.fromString(str, encoding)`
* `new lexing.FileIterator(file_descriptor)`
* `lexing.FileIterator.open(file_path)`
The `TokenIterable` instance returned by `tokenizer.map(...)` has one method: `next()`, which returns a non-null `Token`.
Every `Token` has a non-null `name` field (a string) and a `value` field (of any type; potentially null or undefined).
Each rule is a `[RegExp, Function]` tuple. When a rule's regular expression matches the input, the following happens:

@@ -35,9 +56,7 @@

2. The input cursor is advanced over the length of the full match (`match[0]`).
3. The lexer returns the result of calling `input_string.match(rule[0])`, with the lexer bound as `this` inside the function.
3. The tokenizer returns the result of calling `input_string.match(rule[0])`, with the tokenizer bound as `this` inside the rule's function.
If no rules in the current state match the current input, the lexer will throw an "Invalid language" error.
If no rules in the current state match the current input, the tokenizer will throw an "Invalid language" error.
The lexer has another function: `lexer.next()`, which calls `lexer.read()` in a loop until it returns a non-null result, and returns that result.
## Quickstart

@@ -47,3 +66,3 @@

npm install lexing --save
npm install lexing --save

@@ -58,3 +77,3 @@ In your code:

[/^$/, function(match) {
return ['EOF', null];
return lexing.Token('EOF', null);
}],

@@ -64,18 +83,18 @@ [/^\s+/, function(match) {

}],
[/^[^!"#$%&'()*+,\-./:;<=>?@[\\\]\^_`{|}~]+/, function(match) {
return ['TOKEN', match[0]];
[/^[^!"#$%&'()*+,\-./:;<=>?@[\\\]\^_`{|}~\s]+/, function(match) {
return lexing.Token('WORD', match[0]);
}],
[/^./, function(match) {
return ['PUNCTUATION', match[0]];
return lexing.Token('PUNCTUATION', match[0]);
}],
];
var lexer = new lexing.BufferedLexer(rules);
var tokenizer = new lexing.Tokenizer(rules);
var input = lexing.BufferIterator.fromString("'It wasn't at all my fault', I cried.");
var output = tokenizer.map(input);
lexer.reader = new lexing.BufferedStringReader("'It wasn't at all my fault', I cried.");
do {
var token_value = lexer.next();
console.log('token=%s => %j', token_value[0], token_value[1]);
} while (token_value[0] !== 'EOF');
var token = output.next();
console.log('token=%s => %j', token.name, token.value);
} while (token.name !== 'EOF');

@@ -82,0 +101,0 @@

Sorry, the diff of this file is not supported yet

Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc