tokenize-this

Turns a string into tokens.

Version: 1.4.2 (latest)
Source: npm
Weekly downloads: 7.3K
Maintainers: 1

TokenizeThis

Quickstart

It turns a string into tokens.

var tokenizer = new TokenizeThis();
var str = 'Tokenize this!';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['Tokenize', 'this', '!']);
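
Note: the equals calls throughout these examples are assertions on the expected output; equals is not part of this package. A minimal stand-in, assuming Node's built-in assert module, could be:

var assert = require('assert');

// Hypothetical helper used only by the README examples:
// asserts that the actual tokens deep-equal the expected ones.
function equals(actual, expected) {
    assert.deepStrictEqual(actual, expected);
}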

By default, it can tokenize math-based strings.

var tokenizer = new TokenizeThis();
var str = '5 + 6 -(4/2) + gcd(10, 5)';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});

equals(tokens, [5, '+', 6, '-', '(', 4, '/', 2, ')', '+', 'gcd', '(', 10, ',', 5, ')']);

...Or SQL.

var tokenizer = new TokenizeThis();
var str = 'SELECT COUNT(id), 5+6 FROM `users` WHERE name = "shaun persad" AND hobby IS NULL';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    
    if (surroundedBy) {
        tokens.push(surroundedBy+token+surroundedBy);
    } else {
        tokens.push(token);
    }
});
equals(tokens, [
    'SELECT',
    'COUNT', '(', 'id', ')',
    ',',
    5, '+', 6,
    'FROM', '`users`',
    'WHERE',
    'name', '=', '"shaun persad"',
    'AND',
    'hobby', 'IS', null
]);

Installation

npm install tokenize-this

// or if in the browser: <script src="tokenize-this/tokenize-this.min.js"></script>

Usage

Require it, create a new instance, then call tokenize.

// var TokenizeThis = require('tokenize-this');
// OR
// var TokenizeThis = require('tokenize-this/tokenize-this.min.js'); // for node.js < 4.0
// OR
// <script src="tokenize-this/tokenize-this.min.js"></script> <!-- if in browser -->

var tokenizer = new TokenizeThis();

var str = 'Hi!, I want to add 5+6';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['Hi', '!', ',', 'I', 'want', 'to', 'add', 5, '+', 6]);

Advanced Usage

Supplying a config object to the constructor

See the .defaultConfig section under API below for all options.

This can be used to tokenize many forms of data, like JSON into key-value pairs.

var jsonConfig = {
    shouldTokenize: ['{', '}', '[', ']'],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t", ':', ','],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(jsonConfig);
var str = '[{name:"Shaun Persad", id: 5}, { gender : null}]';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens, ['[', '{', 'name', 'Shaun Persad', 'id', 5, '}', '{', 'gender', null, '}', ']']);

Here it is tokenizing XML like a boss.

var xmlConfig = {
    shouldTokenize: ['<?', '?>', '<!', '<', '</', '>', '/>', '='],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(xmlConfig);
var str = `
<?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
<!DOCTYPE catalog SYSTEM "catalog.dtd">
<catalog>
   <product description="Cardigan Sweater" product_image="cardigan.jpg">
      <size description="Large" />
      <color_swatch image="red_cardigan.jpg">
        Red
      </color_swatch>
   </product>
</catalog>                
`;
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
equals(tokens,
    [
        '<?', 'xml-stylesheet', 'href', '=', 'catalog.xsl', 'type', '=', 'text/xsl', '?>',
        '<!', 'DOCTYPE', 'catalog', 'SYSTEM', 'catalog.dtd', '>',
        '<', 'catalog', '>',
        '<', 'product', 'description', '=', 'Cardigan Sweater', 'product_image', '=', 'cardigan.jpg', '>',
        '<', 'size', 'description', '=', 'Large', '/>',
        '<', 'color_swatch', 'image', '=', 'red_cardigan.jpg', '>',
        'Red',
        '</', 'color_swatch', '>',
        '</', 'product', '>',
        '</', 'catalog', '>'
    ]
);

The above examples are the first steps in writing parsers for those formats. The next step would be parsing the stream of tokens according to each format's rules, e.g. for SQL.
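
As a rough sketch of that next step (not something this package provides), here is one way to walk the token stream from the jsonConfig example above and pair the tokens back into objects:

var tokenizer = new TokenizeThis(jsonConfig); // jsonConfig from the JSON example above
var str = '[{name:"Shaun Persad", id: 5}, { gender : null}]';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});

// Walk the flat token stream and rebuild the objects it describes.
var objects = [];
var current = null;
var pendingKey = null;

tokens.forEach(function(token) {
    if (token === '{') {
        current = {};                // start a new object
    } else if (token === '}') {
        objects.push(current);       // finish the current object
        current = null;
    } else if (token === '[' || token === ']') {
        // ignore the array brackets in this simplified walk
    } else if (current && pendingKey === null) {
        pendingKey = token;          // the first token inside an object is a key
    } else if (current) {
        current[pendingKey] = token; // the next token is that key's value
        pendingKey = null;
    }
});

equals(objects, [{ name: 'Shaun Persad', id: 5 }, { gender: null }]);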

API

Methods

#tokenize(str:String, forEachToken:Function)

Sends each token to the forEachToken(token:String, surroundedBy:String, index:Integer) callback.

var tokenizer = new TokenizeThis();
var str = 'Tokenize "this"!';

var tokens = [];
var indices = [];
var forEachToken = function(token, surroundedBy, index) {

    tokens.push(surroundedBy+token+surroundedBy);
    indices.push(index);
};

tokenizer.tokenize(str, forEachToken);

equals(tokens, ['Tokenize', '"this"', '!']);
equals(indices, [8, 14, 15]);

By default, it converts true, false, null, and numbers into their literal versions.

var tokenizer = new TokenizeThis();
var str = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, [true, false, null, true, false, null, 1, 2, 3.4, 5.6789]);

.defaultConfig:Object

The default config object used when no config is supplied.

var config = {
    shouldTokenize: ['(', ')', ',', '*', '/', '%', '+', '-', '=', '!=', '!', '<', '>', '<=', '>=', '^'],
    shouldMatch: ['"', "'", '`'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true,
    escapeCharacter: "\\"
};
equals(TokenizeThis.defaultConfig, config);
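
Because the defaults are exposed, one convenient pattern (a sketch, not something the package documents) is to copy defaultConfig and extend a single option, e.g. adding ';' as a tokenizable character:

var config = Object.assign({}, TokenizeThis.defaultConfig);
// Add ';' alongside the default single-character tokens.
config.shouldTokenize = config.shouldTokenize.concat([';']);

var tokenizer = new TokenizeThis(config);
var str = 'SELECT 5; SELECT 6';
var tokens = [];
tokenizer.tokenize(str, function(token) {
    tokens.push(token);
});
// Expected output, assuming ';' behaves like the other single-character tokens:
equals(tokens, ['SELECT', 5, ';', 'SELECT', 6]);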

You can turn literal conversion off with the convertLiterals config option.

var config = {
    convertLiterals: false
};
var tokenizer = new TokenizeThis(config);
var str = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, ['true', 'false', 'null', 'TRUE', 'FALSE', 'NULL', '1', '2', '3.4', '5.6789']);

Any strings surrounded by the quotes specified in the shouldMatch option are treated as whole tokens.

var config = {
    shouldMatch: ['"', '`', '#']
};
var tokenizer = new TokenizeThis(config);
var str = '"hi there" `this is a test` #of quotes#';
var tokens = [];
var tokensQuoted = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
    tokensQuoted.push(surroundedBy+token+surroundedBy);
});
equals(tokens, ['hi there', 'this is a test', 'of quotes']);
equals(tokensQuoted, ['"hi there"', '`this is a test`', '#of quotes#']);

Quotes can be escaped via a backslash.

var tokenizer = new TokenizeThis();
var str = 'These are "\\"quotes\\""';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});

equals(tokens, ['These', 'are', '"quotes"']);

The escape character can be specified with the escapeCharacter option.

var config = {
    escapeCharacter: '#'
};
var tokenizer = new TokenizeThis(config);
var str = 'These are "#"quotes#""';
var tokens = [];
tokenizer.tokenize(str, function(token, surroundedBy) {
    tokens.push(token);
});
equals(tokens, ['These', 'are', '"quotes"']);

Keywords

string

Package last updated on 16 Oct 2019
