
Security News
Open Source Maintainers Demand Ability to Block Copilot-Generated Issues and PRs
Open source maintainers are urging GitHub to let them block Copilot from submitting AI-generated issues and pull requests to their repositories.
tokenize-this
Advanced tools
It turns a string into tokens.
// Basic usage: split a short sentence into word and punctuation tokens.
var tokenizer = new TokenizeThis();
var tokens = [];
var collectToken = function(token) {
    // Append each token the tokenizer reports to the result list.
    tokens.push(token);
};
tokenizer.tokenize('Tokenize this!', collectToken);
equals(tokens, ['Tokenize', 'this', '!']);
By default, it can tokenize math-based strings.
// Default configuration applied to an arithmetic expression: operators,
// parentheses and commas come out as separate tokens, and numeric text is
// delivered as numbers (see the expected list below).
var expression = '5 + 6 -(4/2) + gcd(10, 5)';
var expectedTokens = [
    5, '+', 6,
    '-', '(', 4, '/', 2, ')',
    '+', 'gcd', '(', 10, ',', 5, ')'
];
var tokens = [];
var tokenizer = new TokenizeThis();
tokenizer.tokenize(expression, function(token) {
    tokens.push(token);
});
equals(tokens, expectedTokens);
...Or SQL.
// SQL example: quoted tokens are pushed with their quote character re-attached
// on both sides, so `users` and "shaun persad" stay distinguishable from bare
// words in the result.
var tokenizer = new TokenizeThis();
var sql = 'SELECT COUNT(id), 5+6 FROM `users` WHERE name = "shaun persad" AND hobby IS NULL';
var tokens = [];
tokenizer.tokenize(sql, function(token, surroundedBy) {
    // A falsy surroundedBy means the token was not quoted: push it unchanged.
    tokens.push(surroundedBy ? surroundedBy+token+surroundedBy : token);
});
equals(tokens, [
    'SELECT', 'COUNT', '(', 'id', ')', ',', 5, '+', 6,
    'FROM', '`users`',
    'WHERE', 'name', '=', '"shaun persad"',
    'AND', 'hobby', 'IS', null
]);
Install it with `npm install tokenize-this`.
// or if in the browser: <script src="tokenize-this/tokenize-this.min.js"></script>
`require` it, create a new instance, then call `tokenize`.
// Make TokenizeThis available using whichever option fits your environment:
//   var TokenizeThis = require('tokenize-this');
//   var TokenizeThis = require('tokenize-this/tokenize-this.min.js'); // for node.js < 4.0
//   <script src="tokenize-this/tokenize-this.min.js"></script> <!-- if in browser -->

// Then create an instance and collect the tokens of a mixed text/math string.
var tokenizer = new TokenizeThis();
var tokens = [];
var collectToken = function(token) {
    tokens.push(token);
};
tokenizer.tokenize('Hi!, I want to add 5+6', collectToken);
equals(tokens, [
    'Hi', '!', ',',
    'I', 'want', 'to', 'add',
    5, '+', 6
]);
This can be used to tokenize many forms of data, like JSON into key-value pairs.
// JSON-like input: braces and brackets are emitted as tokens, double-quoted
// strings are kept whole, and whitespace, ':' and ',' only separate tokens.
var jsonConfig = {
    shouldTokenize: [
        '{', '}',
        '[', ']'
    ],
    shouldMatch: ['"'],
    shouldDelimitBy: [
        ' ', "\n", "\r", "\t",
        ':', ','
    ],
    convertLiterals: true
};
var jsonText = '[{name:"Shaun Persad", id: 5}, { gender : null}]';
var tokens = [];
var tokenizer = new TokenizeThis(jsonConfig);
tokenizer.tokenize(jsonText, function(token) {
    tokens.push(token);
});
equals(tokens, [
    '[',
    '{', 'name', 'Shaun Persad', 'id', 5, '}',
    '{', 'gender', null, '}',
    ']'
]);
Here it is tokenizing XML like a boss.
// XML input: tag delimiters and '=' are emitted as tokens, double-quoted
// attribute values are kept whole, and whitespace only separates tokens.
var xmlConfig = {
    shouldTokenize: [
        '<?', '?>',
        '<!',
        '<', '</', '>', '/>',
        '='
    ],
    shouldMatch: ['"'],
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true
};
var tokenizer = new TokenizeThis(xmlConfig);
// The template literal is deliberately left unindented: its content is the input.
var xmlText = `
<?xml-stylesheet href="catalog.xsl" type="text/xsl"?>
<!DOCTYPE catalog SYSTEM "catalog.dtd">
<catalog>
<product description="Cardigan Sweater" product_image="cardigan.jpg">
<size description="Large" />
<color_swatch image="red_cardigan.jpg">
Red
</color_swatch>
</product>
</catalog>
`;
var tokens = [];
var collectToken = function(token) {
    tokens.push(token);
};
tokenizer.tokenize(xmlText, collectToken);
// One row per line of the XML input above.
var expectedTokens = [
    '<?', 'xml-stylesheet', 'href', '=', 'catalog.xsl', 'type', '=', 'text/xsl', '?>',
    '<!', 'DOCTYPE', 'catalog', 'SYSTEM', 'catalog.dtd', '>',
    '<', 'catalog', '>',
    '<', 'product', 'description', '=', 'Cardigan Sweater', 'product_image', '=', 'cardigan.jpg', '>',
    '<', 'size', 'description', '=', 'Large', '/>',
    '<', 'color_swatch', 'image', '=', 'red_cardigan.jpg', '>',
    'Red',
    '</', 'color_swatch', '>',
    '</', 'product', '>',
    '</', 'catalog', '>'
];
equals(tokens, expectedTokens);
The above examples are the first steps in writing parsers for those formats. The next would be parsing the stream of tokens based on the format-specific rules, e.g. SQL.
`tokenize(str, forEachToken)` sends each token to the `forEachToken(token:String, surroundedBy:String)` callback.
// Callback signature demo: every token is pushed with surroundedBy attached on
// both sides, so the quoted word keeps its double quotes in the result.
var tokens = [];
var forEachToken = function(token, surroundedBy) {
    var rewrapped = surroundedBy+token+surroundedBy;
    tokens.push(rewrapped);
};
var tokenizer = new TokenizeThis();
tokenizer.tokenize('Tokenize "this"!', forEachToken);
equals(tokens, ['Tokenize', '"this"', '!']);
By default, it converts `true`, `false`, `null`, and numbers into their literal versions.
// Literal conversion with the default configuration: boolean/null keywords in
// either case and numeric text come back as JavaScript literals.
var literalText = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var expectedLiterals = [
    true, false, null,
    true, false, null,
    1, 2, 3.4, 5.6789
];
var tokens = [];
var tokenizer = new TokenizeThis();
var collectToken = function(token, surroundedBy) {
    tokens.push(token);
};
tokenizer.tokenize(literalText, collectToken);
equals(tokens, expectedLiterals);
The default config object used when no config is supplied.
// Expected contents of TokenizeThis.defaultConfig.
var config = {
    // Character sequences emitted as standalone tokens.
    shouldTokenize: [
        '(', ')', ',',
        '*', '/', '%', '+', '-',
        '=', '!=', '!',
        '<', '>', '<=', '>=',
        '^'
    ],
    // Quote characters; text enclosed in them is kept as one token.
    shouldMatch: ['"', "'", '`'],
    // Characters that separate tokens.
    shouldDelimitBy: [' ', "\n", "\r", "\t"],
    convertLiterals: true,
    // A single backslash.
    escapeCharacter: "\\"
};
equals(TokenizeThis.defaultConfig, config);
You can turn literal conversion on or off with the `convertLiterals` config option.
// With convertLiterals disabled, every token is delivered as the original text.
var config = { convertLiterals: false };
var literalText = 'true false null TRUE FALSE NULL 1 2 3.4 5.6789';
var tokens = [];
var tokenizer = new TokenizeThis(config);
var collectToken = function(token, surroundedBy) {
    tokens.push(token);
};
tokenizer.tokenize(literalText, collectToken);
equals(tokens, [
    'true', 'false', 'null',
    'TRUE', 'FALSE', 'NULL',
    '1', '2', '3.4', '5.6789'
]);
Any strings surrounded by the quotes specified in the `shouldMatch` option are treated as whole tokens.
// Custom quote characters: text between a matching pair of ", ` or # is
// delivered as a single token, and surroundedBy carries the quote used.
var config = {
    shouldMatch: ['"', '`', '#']
};
var quotedText = '"hi there" `this is a test` #of quotes#';
var tokens = [];        // token text only
var tokensQuoted = [];  // token text with its quote character re-attached
var tokenizer = new TokenizeThis(config);
tokenizer.tokenize(quotedText, function(token, surroundedBy) {
    var rewrapped = surroundedBy+token+surroundedBy;
    tokens.push(token);
    tokensQuoted.push(rewrapped);
});
equals(tokens, [
    'hi there',
    'this is a test',
    'of quotes'
]);
equals(tokensQuoted, [
    '"hi there"',
    '`this is a test`',
    '#of quotes#'
]);
Quotes can be escaped via a backslash.
// Escaped quotes: the JS literal below holds the text
//   These are "\"quotes\""
// and the expected third token keeps the inner double quotes.
var escapedText = 'These are "\\"quotes\\""';
var tokens = [];
var tokenizer = new TokenizeThis();
var collectToken = function(token, surroundedBy) {
    tokens.push(token);
};
tokenizer.tokenize(escapedText, collectToken);
equals(tokens, ['These', 'are', '"quotes"']);
The escape character can be specified with the `escapeCharacter` option.
// Custom escape character: '#' takes the place of the backslash, so #" inside
// the quoted section yields a literal double quote in the token.
var config = { escapeCharacter: '#' };
var escapedText = 'These are "#"quotes#""';
var tokens = [];
var tokenizer = new TokenizeThis(config);
var collectToken = function(token, surroundedBy) {
    tokens.push(token);
};
tokenizer.tokenize(escapedText, collectToken);
equals(tokens, ['These', 'are', '"quotes"']);
FAQs
Turns a string into tokens.
The npm package tokenize-this receives a total of 5,696 weekly downloads. As such, the popularity of tokenize-this is classified as popular.
We found that tokenize-this has an unhealthy version release cadence and level of project activity because the last version was released a year ago. It has 1 open source maintainer collaborating on the project.
Did you know?
Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.
Security News
Open source maintainers are urging GitHub to let them block Copilot from submitting AI-generated issues and pull requests to their repositories.
Research
Security News
Malicious Koishi plugin silently exfiltrates messages with hex strings to a hardcoded QQ account, exposing secrets in chatbots across platforms.
Research
Security News
Malicious PyPI checkers validate stolen emails against TikTok and Instagram APIs, enabling targeted account attacks and dark web credential sales.