// Quickstart: a plain sentence is split into words and punctuation.
var sentence = 'Tokenize this!';
var collected = [];

// Record every token in the order the tokenizer reports it.
var collect = function (token) {
    collected.push(token);
};

new TokenizeThis().tokenize(sentence, collect);
equals(collected, ['Tokenize', 'this', '!']);

By default, it can tokenize math-based strings.

// Default configuration applied to an arithmetic expression: numbers come
// back as numeric literals; operators, parentheses and commas are tokens.
var mathTokenizer = new TokenizeThis();
var expression = '5 + 6 -(4/2) + gcd(10, 5)';
var pieces = [];

mathTokenizer.tokenize(expression, function (piece) {
    pieces.push(piece);
});

equals(pieces, [
    5, '+', 6, '-',
    '(', 4, '/', 2, ')',
    '+',
    'gcd', '(', 10, ',', 5, ')'
]);

...Or SQL.

// Default configuration applied to a SQL statement.
var sqlTokenizer = new TokenizeThis();
var sql = 'SELECT COUNT(id), 5+6 FROM `users` WHERE name = "shaun persad" AND hobby IS NULL';
var sqlTokens = [];

sqlTokenizer.tokenize(sql, function (token, surroundedBy) {
    // Quoted tokens are re-wrapped in their quote character; everything else
    // (including converted literals such as 5 and null) is stored as-is.
    sqlTokens.push(surroundedBy ? surroundedBy + token + surroundedBy : token);
});

equals(sqlTokens, [
    'SELECT',
    'COUNT', '(', 'id', ')',
    ',',
    5, '+', 6,
    'FROM', '`users`',
    'WHERE',
    'name', '=', '"shaun persad"',
    'AND',
    'hobby', 'IS', null
]);

Installation

Run: npm install tokenize-this

// or if in the browser: <script src="tokenize-this/tokenize-this.min.js"></script>

Usage

Require it, create a new instance, then call tokenize.

// var TokenizeThis = require('tokenize-this');
// OR
// var TokenizeThis = require('tokenize-this/tokenize-this.min.js'); // for node.js < 4.0
// OR
// <script src="tokenize-this/tokenize-this.min.js"></script> <!-- if in browser -->

// Basic usage: mixed words, punctuation and arithmetic in one string.
var message = 'Hi!, I want to add 5+6';
var seen = [];

// Append each reported token to the result list.
var remember = function (token) {
    seen.push(token);
};

new TokenizeThis().tokenize(message, remember);
equals(seen, ['Hi', '!', ',', 'I', 'want', 'to', 'add', 5, '+', 6]);

Advanced Usage

Supplying a config object to the constructor

See the .defaultConfig section below for all options and their default values.

This can be used to tokenize many forms of data, like JSON into key-value pairs.

// JSON-like input: braces and brackets are tokens, double-quoted text is
// kept whole, and whitespace, ':' and ',' only separate tokens (they do not
// appear in the expected output).
var jsonTokenizer = new TokenizeThis({
    shouldTokenize: ['{', '}', '[', ']'],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t", ':', ','],
    convertLiterals: true
});
var json = '[{name:"Shaun Persad", id: 5}, { gender : null}]';
var jsonTokens = [];

jsonTokenizer.tokenize(json, function (piece) {
    jsonTokens.push(piece);
});

equals(jsonTokens, [
    '[',
    '{', 'name', 'Shaun Persad', 'id', 5, '}',
    '{', 'gender', null, '}',
    ']'
]);

Here it is tokenizing XML like a boss.

// XML-like input: markup delimiters and '=' are tokens, double-quoted
// attribute values are kept whole, and whitespace separates the rest.
var xmlTokenizer = new TokenizeThis({
    shouldTokenize: ['<?', '?>', '<!', '<', '</', '>', '/>', '='],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true
});
var markup = `
<?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
<!DOCTYPE catalog SYSTEM "catalog.dtd">
<catalog>
   <product description="Cardigan Sweater" product_image="cardigan.jpg">
      <size description="Large" />
      <color_swatch image="red_cardigan.jpg">
        Red
      </color_swatch>
   </product>
</catalog>                
`;
var xmlTokens = [];

xmlTokenizer.tokenize(markup, function (piece) {
    xmlTokens.push(piece);
});

// One sub-array per source line of the document, flattened into a single
// expected token list.
var expectedXmlTokens = [].concat(
    ['<?', 'xml-stylesheet', 'href', '=', 'catalog.xsl', 'type', '=', 'text/xsl', '?>'],
    ['<!', 'DOCTYPE', 'catalog', 'SYSTEM', 'catalog.dtd', '>'],
    ['<', 'catalog', '>'],
    ['<', 'product', 'description', '=', 'Cardigan Sweater', 'product_image', '=', 'cardigan.jpg', '>'],
    ['<', 'size', 'description', '=', 'Large', '/>'],
    ['<', 'color_swatch', 'image', '=', 'red_cardigan.jpg', '>'],
    ['Red'],
    ['</', 'color_swatch', '>'],
    ['</', 'product', '>'],
    ['</', 'catalog', '>']
);

equals(xmlTokens, expectedXmlTokens);

The above examples are the first steps in writing parsers for those formats. The next step would be to parse the stream of tokens according to the format-specific rules (for example, the rules of SQL).

API

Methods

#tokenize(str:String, forEachToken:Function)

Sends each token to the forEachToken(token:String, surroundedBy:String) callback.

// Demonstrates both callback arguments: the token text and the quote
// character it was wrapped in.
var quotedInput = 'Tokenize "this"!';
var rebuilt = [];

// Put the surrounding quote character back around each token. In the
// expected output below, the unquoted tokens come out unchanged.
function forEachToken(token, surroundedBy) {
    var wrapped = surroundedBy + token + surroundedBy;
    rebuilt.push(wrapped);
}

new TokenizeThis().tokenize(quotedInput, forEachToken);
equals(rebuilt, ['Tokenize', '"this"', '!']);

It converts true, false, null (in lower or upper case, as in the example below), and numbers into their literal versions.

// With the default configuration, boolean/null keywords (lower or upper
// case) and numeric strings are reported as JavaScript literals.
var literalText = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var converted = [];

var keepToken = function (token, surroundedBy) {
    converted.push(token);
};

new TokenizeThis().tokenize(literalText, keepToken);
equals(converted, [true, false, null, true, false, null, 1, 2, 3.4, 5.6789]);

.defaultConfig:Object

The default config object used when no config is supplied.

// Expected contents of the built-in configuration, compared against the
// TokenizeThis.defaultConfig property.
var expectedDefaults = {
    shouldTokenize: [
        '(', ')', ',',
        '*', '/', '%', '+', '-',
        '=', '!=', '!',
        '<', '>', '<=', '>=',
        '^'
    ],
    shouldMatch: ['"', "'", '`'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true,
    escapeCharacter: "\\"
};

equals(TokenizeThis.defaultConfig, expectedDefaults);

You can turn off conversion to literals with the convertLiterals config option.

// With convertLiterals disabled, every token is reported as the string that
// appeared in the input.
var rawTokenizer = new TokenizeThis({ convertLiterals: false });
var literalText = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var rawTokens = [];

rawTokenizer.tokenize(literalText, function (rawToken, surroundedBy) {
    rawTokens.push(rawToken);
});

equals(rawTokens, [
    'true', 'false', 'null',
    'TRUE', 'FALSE', 'NULL',
    '1', '2', '3.4', '5.6789'
]);

Any strings surrounded by the quotes specified in the shouldMatch option are treated as whole tokens.

// Custom quote characters: text between matching ", ` or # marks is
// reported as a single token.
var quoteTokenizer = new TokenizeThis({
    shouldMatch: ['"', '`', '#']
});
var quotedText = '"hi there" `this is a test` #of quotes#';
var bare = [];
var wrapped = [];

quoteTokenizer.tokenize(quotedText, function (token, surroundedBy) {
    // Keep the token both without and with its surrounding quote character.
    bare.push(token);
    wrapped.push(surroundedBy + token + surroundedBy);
});

equals(bare, ['hi there', 'this is a test', 'of quotes']);
equals(wrapped, ['"hi there"', '`this is a test`', '#of quotes#']);

Quotes can be escaped via a backslash.

// Backslash-escaped quotes inside a quoted string stay part of the token.
var escapedText = 'These are "\\"quotes\\""';
var unescaped = [];

var keepToken = function (token, surroundedBy) {
    unescaped.push(token);
};

new TokenizeThis().tokenize(escapedText, keepToken);

equals(unescaped, ['These', 'are', '"quotes"']);

The escape character can be specified with the escapeCharacter option.

// Same escaping example, with '#' configured as the escape character.
var hashEscapeTokenizer = new TokenizeThis({ escapeCharacter: '#' });
var hashEscapedText = 'These are "#"quotes#""';
var unescaped = [];

hashEscapeTokenizer.tokenize(hashEscapedText, function (piece, surroundedBy) {
    unescaped.push(piece);
});

equals(unescaped, ['These', 'are', '"quotes"']);

Keywords

FAQs

What is tokenize-this?

Is tokenize-this popular?

Is tokenize-this well maintained?

Package last updated on 29 Dec 2016

Did you know?

Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.

Install

tokenize-this

TokenizeThis

Quickstart