Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

gettext-parser

Package Overview
Dependencies
Maintainers
1
Versions
44
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

gettext-parser - npm Package Compare versions

Comparing version 1.0.0 to 1.1.0

5

CHANGELOG.md
# Changelog
## v1.1.0 2015-01-21
* Added `po.createParseStream` method for parsing PO files from a Stream source
* Updated documentation
## v1.0.0 2015-01-21

@@ -4,0 +9,0 @@

5

index.js
'use strict';
var poParser = require('./lib/poparser');
module.exports = {
po: {
parse: require('./lib/poparser'),
parse: poParser.parse,
createParseStream: poParser.stream,
compile: require('./lib/pocompiler')

@@ -7,0 +10,0 @@ },

324

lib/poparser.js

@@ -5,2 +5,4 @@ 'use strict';

var sharedFuncs = require('./shared');
var Transform = require('stream').Transform;
var util = require('util');

@@ -14,3 +16,3 @@ /**

*/
module.exports = function(buffer, defaultCharset) {
module.exports.parse = function(buffer, defaultCharset) {
var parser = new Parser(buffer, defaultCharset);

@@ -21,2 +23,13 @@ return parser.parse();

/**
 * Parses a PO stream, emits translation table in object mode.
 * A single 'data' event with the full translation table is emitted
 * once the input stream ends.
 *
 * @param {String} [defaultCharset] Default charset to use if the PO header does not define one
 * @param {Object} [options] Stream options passed to the Transform constructor
 * @return {Stream} Transform stream
 */
module.exports.stream = function(defaultCharset, options) {
return new PoParserTransform(defaultCharset, options);
};
/**
* Creates a PO parser object. If PO object is a string,

@@ -33,2 +46,7 @@ * UTF-8 will be used as the charset

this._lex = [];
this._escaped = false;
this._node;
this._state = this.states.none;
if (typeof fileContents === 'string') {

@@ -40,6 +58,15 @@ this._charset = 'utf-8';

}
}
/**
 * Parses the PO object and returns translation table
 *
 * Runs the lexer over the full file contents (tokens accumulate in
 * this._lex) and then converts the token list into a translation table.
 *
 * @return {Object} Translation table
 */
Parser.prototype.parse = function() {
this._lexer(this._fileContents);
return this._finalize(this._lex);
};
/**
* Detects charset for PO strings from the header

@@ -67,6 +94,10 @@ *

} else {
this._fileContents = encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
this._fileContents = this._toString(buf);
}
};
/**
 * Converts a Buffer to an UTF-8 string using the charset detected for this file
 *
 * @param {Buffer} buf Input buffer
 * @return {String} UTF-8 string decoded from this._charset
 */
Parser.prototype._toString = function(buf) {
return encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
};
/**

@@ -102,19 +133,15 @@ * State constants for parsing FSM

/**
* Token parser
* Token parser. Parsed state can be found from this._lex
*
* @return {Object} Parsed tokens
* @param {String} chunk String
*/
Parser.prototype._lexer = function() {
var chr,
escaped = false,
lex = [],
node,
state = this.states.none;
Parser.prototype._lexer = function(chunk) {
var chr;
for (var i = 0, len = this._fileContents.length; i < len; i++) {
chr = this._fileContents.charAt(i);
switch (state) {
for (var i = 0, len = chunk.length; i < len; i++) {
chr = chunk.charAt(i);
switch (this._state) {
case this.states.none:
if (chr.match(this.symbols.quotes)) {
node = {
this._node = {
type: this.types.string,

@@ -124,18 +151,18 @@ value: '',

};
lex.push(node);
state = this.states.string;
this._lex.push(this._node);
this._state = this.states.string;
} else if (chr.match(this.symbols.comments)) {
node = {
this._node = {
type: this.types.comments,
value: ''
};
lex.push(node);
state = this.states.comments;
this._lex.push(this._node);
this._state = this.states.comments;
} else if (!chr.match(this.symbols.whitespace)) {
node = {
this._node = {
type: this.types.key,
value: chr
};
lex.push(node);
state = this.states.key;
this._lex.push(this._node);
this._state = this.states.key;
}

@@ -145,33 +172,33 @@ break;

if (chr === '\n') {
state = this.states.none;
this._state = this.states.none;
} else if (chr !== '\r') {
node.value += chr;
this._node.value += chr;
}
break;
case this.states.string:
if (escaped) {
if (this._escaped) {
switch (chr) {
case 't':
node.value += '\t';
this._node.value += '\t';
break;
case 'n':
node.value += '\n';
this._node.value += '\n';
break;
case 'r':
node.value += '\r';
this._node.value += '\r';
break;
default:
node.value += chr;
this._node.value += chr;
}
escaped = false;
this._escaped = false;
} else {
if (chr === node.quote) {
state = this.states.none;
if (chr === this._node.quote) {
this._state = this.states.none;
} else if (chr === '\\') {
escaped = true;
this._escaped = true;
break;
} else {
node.value += chr;
this._node.value += chr;
}
escaped = false;
this._escaped = false;
}

@@ -181,6 +208,6 @@ break;

if (!chr.match(this.symbols.key)) {
state = this.states.none;
this._state = this.states.none;
i--;
} else {
node.value += chr;
this._node.value += chr;
}

@@ -190,4 +217,2 @@ break;

}
return lex;
};

@@ -198,16 +223,16 @@

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Parsed tokens, with multi line strings joined into one
*/
Parser.prototype._joinStringValues = function(lex) {
Parser.prototype._joinStringValues = function(tokens) {
var lastNode, response = [];
for (var i = 0, len = lex.length; i < len; i++) {
if (lastNode && lex[i].type === this.types.string && lastNode.type === this.types.string) {
lastNode.value += lex[i].value;
} else if (lastNode && lex[i].type === this.types.comments && lastNode.type === this.types.comments) {
lastNode.value += '\n' + lex[i].value;
for (var i = 0, len = tokens.length; i < len; i++) {
if (lastNode && tokens[i].type === this.types.string && lastNode.type === this.types.string) {
lastNode.value += tokens[i].value;
} else if (lastNode && tokens[i].type === this.types.comments && lastNode.type === this.types.comments) {
lastNode.value += '\n' + tokens[i].value;
} else {
response.push(lex[i]);
lastNode = lex[i];
response.push(tokens[i]);
lastNode = tokens[i];
}

@@ -222,7 +247,7 @@ }

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
*/
Parser.prototype._parseComments = function(lex) {
Parser.prototype._parseComments = function(tokens) {
// parse comments
lex.forEach((function(node) {
tokens.forEach((function(node) {
var comment, lines;

@@ -272,21 +297,21 @@

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Tokens
*/
Parser.prototype._handleKeys = function(lex) {
Parser.prototype._handleKeys = function(tokens) {
var response = [],
lastNode;
for (var i = 0, len = lex.length; i < len; i++) {
if (lex[i].type === this.types.key) {
for (var i = 0, len = tokens.length; i < len; i++) {
if (tokens[i].type === this.types.key) {
lastNode = {
key: lex[i].value
key: tokens[i].value
};
if (i && lex[i - 1].type === this.types.comments) {
lastNode.comments = lex[i - 1].value;
if (i && tokens[i - 1].type === this.types.comments) {
lastNode.comments = tokens[i - 1].value;
}
lastNode.value = '';
response.push(lastNode);
} else if (lex[i].type === this.types.string && lastNode) {
lastNode.value += lex[i].value;
} else if (tokens[i].type === this.types.string && lastNode) {
lastNode.value += tokens[i].value;
}

@@ -301,16 +326,16 @@ }

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Tokens
*/
Parser.prototype._handleValues = function(lex) {
Parser.prototype._handleValues = function(tokens) {
var response = [],
lastNode, curContext, curComments;
for (var i = 0, len = lex.length; i < len; i++) {
if (lex[i].key.toLowerCase() === 'msgctxt') {
curContext = lex[i].value;
curComments = lex[i].comments;
} else if (lex[i].key.toLowerCase() === 'msgid') {
for (var i = 0, len = tokens.length; i < len; i++) {
if (tokens[i].key.toLowerCase() === 'msgctxt') {
curContext = tokens[i].value;
curComments = tokens[i].comments;
} else if (tokens[i].key.toLowerCase() === 'msgid') {
lastNode = {
msgid: lex[i].value
msgid: tokens[i].value
};

@@ -326,4 +351,4 @@

if (lex[i].comments && !lastNode.comments) {
lastNode.comments = lex[i].comments;
if (tokens[i].comments && !lastNode.comments) {
lastNode.comments = tokens[i].comments;
}

@@ -334,9 +359,9 @@

response.push(lastNode);
} else if (lex[i].key.toLowerCase() === 'msgid_plural') {
} else if (tokens[i].key.toLowerCase() === 'msgid_plural') {
if (lastNode) {
lastNode.msgid_plural = lex[i].value;
lastNode.msgid_plural = tokens[i].value;
}
if (lex[i].comments && !lastNode.comments) {
lastNode.comments = lex[i].comments;
if (tokens[i].comments && !lastNode.comments) {
lastNode.comments = tokens[i].comments;
}

@@ -346,9 +371,9 @@

curComments = false;
} else if (lex[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
} else if (tokens[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
if (lastNode) {
lastNode.msgstr = (lastNode.msgstr || []).concat(lex[i].value);
lastNode.msgstr = (lastNode.msgstr || []).concat(tokens[i].value);
}
if (lex[i].comments && !lastNode.comments) {
lastNode.comments = lex[i].comments;
if (tokens[i].comments && !lastNode.comments) {
lastNode.comments = tokens[i].comments;
}

@@ -367,6 +392,6 @@

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Translation table
*/
Parser.prototype._normalize = function(lex) {
Parser.prototype._normalize = function(tokens) {
var msgctxt,

@@ -379,4 +404,4 @@ table = {

for (var i = 0, len = lex.length; i < len; i++) {
msgctxt = lex[i].msgctxt || '';
for (var i = 0, len = tokens.length; i < len; i++) {
msgctxt = tokens[i].msgctxt || '';

@@ -387,7 +412,7 @@ if (!table.translations[msgctxt]) {

if (!table.headers && !msgctxt && !lex[i].msgid) {
table.headers = sharedFuncs.parseHeader(lex[i].msgstr[0]);
if (!table.headers && !msgctxt && !tokens[i].msgid) {
table.headers = sharedFuncs.parseHeader(tokens[i].msgstr[0]);
}
table.translations[msgctxt][lex[i].msgid] = lex[i];
table.translations[msgctxt][tokens[i].msgid] = tokens[i];
}

@@ -399,15 +424,124 @@

/**
 * Converts parsed tokens to a translation table
 *
 * @param {Object} tokens Parsed tokens
 * @returns {Object} Translation table
 */
Parser.prototype._finalize = function(tokens) {
    // merge adjacent string/comment tokens into single values
    var joined = this._joinStringValues(tokens);
    // annotate tokens with structured comment data (mutates in place)
    this._parseComments(joined);
    // group strings under their preceding keys, then fold key/value
    // pairs into translation entries
    var keyed = this._handleKeys(joined);
    var entries = this._handleValues(keyed);
    return this._normalize(entries);
};
/**
 * Creates a transform stream for parsing PO input
 *
 * Can be called as (defaultCharset, options), as (options) only,
 * or with no arguments at all.
 *
 * @constructor
 * @param {String} [defaultCharset] Default charset to use if the PO header does not define one
 * @param {Object} [options] Stream options
 */
function PoParserTransform(defaultCharset, options) {
    // allow passing a single options object as the first argument
    if (!options && defaultCharset && typeof defaultCharset === 'object') {
        options = defaultCharset;
        defaultCharset = undefined;
    }
    // guard against undefined options: the README documents
    // createParseStream() with no arguments, which would otherwise
    // crash on the property access below
    options = options || {};

    this.defaultCharset = defaultCharset;

    this._parser = false;
    this._tokens = {};

    this._cache = [];
    this._cacheSize = 0;

    // how many bytes to buffer before creating the Parser; the buffered
    // beginning of the stream lets the Parser detect the charset from the
    // PO header before any data is decoded
    this.initialTreshold = options.initialTreshold || 2 * 1024;

    Transform.call(this, options);
    // input is raw bytes, output is a single parsed translation table object
    this._writableState.objectMode = false;
    this._readableState.objectMode = true;
}
util.inherits(PoParserTransform, Transform);
/**
 * Processes a chunk of the input stream
 *
 * Buffers input until initialTreshold bytes are available, so the Parser
 * can detect the charset from the PO header, then lexes each chunk
 * incrementally. Trailing 8bit bytes are cached between calls so that
 * multi-byte sequences split across chunk boundaries decode correctly.
 *
 * @param {Buffer} chunk Raw input bytes
 * @param {String} encoding Ignored; input is treated as binary
 * @param {Function} done Callback invoked when the chunk has been consumed
 */
PoParserTransform.prototype._transform = function(chunk, encoding, done) {
var i, len = 0;
if (!chunk || !chunk.length) {
return done();
}
if (!this._parser) {
this._cache.push(chunk);
this._cacheSize += chunk.length;
// wait until initialTreshold bytes have accumulated before parsing headers for charset
if (this._cacheSize < this.initialTreshold) {
return setImmediate(done);
} else if (this._cacheSize) {
chunk = Buffer.concat(this._cache, this._cacheSize);
this._cacheSize = 0;
this._cache = [];
}
this._parser = new Parser(chunk, this.defaultCharset);
} else if (this._cacheSize) {
// this only happens if we had an uncompleted 8bit sequence from the last iteration
this._cache.push(chunk);
this._cacheSize += chunk.length;
chunk = Buffer.concat(this._cache, this._cacheSize);
this._cacheSize = 0;
this._cache = [];
}
// cache 8bit bytes from the end of the chunk
// helps if the chunk ends in the middle of an utf-8 sequence
for (i = chunk.length - 1; i >= 0; i--) {
if (chunk[i] >= 0x80) {
len++;
continue;
}
break;
}
// it seems we found some 8bit bytes from the end of the string, so let's cache these
if (len) {
this._cache = [chunk.slice(chunk.length - len)];
this._cacheSize = this._cache[0].length;
chunk = chunk.slice(0, chunk.length - len);
}
// chunk might be empty if it only consisted of 8bit bytes and these were all cached
if (chunk.length) {
this._parser._lexer(this._parser._toString(chunk));
}
setImmediate(done);
};
/**
 * Once all input has been processed emit the parsed translation table as an object
 *
 * Flushes any bytes still cached by _transform (short inputs below
 * initialTreshold, or a trailing partial 8bit sequence), then pushes the
 * finalized translation table downstream in object mode.
 *
 * @param {Function} done Callback invoked when flushing is complete
 */
PoParserTransform.prototype._flush = function(done) {
var chunk;
if (this._cacheSize) {
chunk = Buffer.concat(this._cache, this._cacheSize);
}
// input may have ended before initialTreshold was reached, so the
// Parser might not exist yet
if (!this._parser && chunk) {
this._parser = new Parser(chunk, this.defaultCharset);
}
if (chunk) {
this._parser._lexer(this._parser._toString(chunk));
}
// no parser means the input was completely empty; emit nothing
if (this._parser) {
this.push(this._parser._finalize(this._parser._lex));
}
setImmediate(done);
};
{
"name": "gettext-parser",
"description": "Parse and compile gettext po and mo files to/from json, nothing more, nothing less",
"version": "1.0.0",
"version": "1.1.0",
"author": "Andris Reinman",

@@ -6,0 +6,0 @@ "homepage": "http://github.com/andris9/gettext-parser",

@@ -11,6 +11,2 @@ gettext-parser

## ICONV NOTICE
By default *gettext-parser* uses pure JS [iconv-lite](https://github.com/ashtuchkin/iconv-lite) for encoding and decoding non UTF-8 charsets. If you need to support more complex encodings like EUC or Shift_JIS, you need to add [iconv](https://github.com/bnoordhuis/node-iconv) as a dependency for your project.
## Usage

@@ -22,17 +18,125 @@

Available methods:
* `gettextParser.po.parse(buf[, defaultCharset])` where `buf` is a *po* file as a Buffer or a unicode string. `defaultCharset` is the charset to use if charset is not defined or is the default `"CHARSET"`. Returns gettext-parser specific translation object (see below)
* `gettextParser.po.compile(obj)` where `obj` is a translation object, returns a *po* file as a Buffer
* `gettextParser.mo.parse(buf[, defaultCharset])` where `buf` is a *mo* file as a Buffer (*mo* is binary format, so do not use strings). `defaultCharset` is the charset to use if charset is not defined or is the default `"CHARSET"`. Returns translation object
* `gettextParser.mo.compile(obj)` where `obj` is a translation object, returns a *mo* file as a Buffer
### Parse PO files
**NB** if you are compiling a previously parsed translation object, you can override the output charset with the `charset` property (applies both for compiling *mo* and *po* files).
Parse a PO file with
var obj = gettextParser.po.parse(inputBuf);
obj.charset = "windows-1257";
outputBuf = gettextParser.po.compile(obj);
gettextParser.po.parse(input[, defaultCharset]) → Object
Where
* **input** is a *po* file as a Buffer or a unicode string. Charset is converted to unicode from other encodings only if the input is a Buffer, otherwise the charset information is discarded
* **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"` (applies only if *input* is a Buffer)
Method returns gettext-parser specific translation object (see below)
**Example**
```javascript
var input = require('fs').readFileSync('en.po');
var po = gettextParser.po.parse(input);
console.log(po.translations['']); // output translations for the default context
```
### Parse PO as a Stream
PO files can also be parsed from a stream source. After all input is processed the parser emits a single 'data' event which contains the parsed translation object.
gettextParser.po.createParseStream([defaultCharset][, streamOptions]) → Transform Stream
Where
* **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"`
* **streamOptions** are the standard stream options
**Example**
```javascript
var input = require('fs').createReadStream('en.po');
var po = gettextParser.po.createParseStream();
input.pipe(po);
po.on('data', function(data){
console.log(data.translations['']); // output translations for the default context
});
```
### Compile PO from a translation object
If you have a translation object you can convert this to a valid PO file with
gettextParser.po.compile(data) → Buffer
Where
* **data** is a translation object either got from parsing a PO/MO file or composed by other means
**Example**
```javascript
var data = {
...
};
var output = gettextParser.po.compile(data);
require('fs').writeFileSync('output.po', output);
```
### Parse MO files
Parse a MO file with
gettextParser.mo.parse(input[, defaultCharset]) → Object
Where
* **input** is a *mo* file as a Buffer
* **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"`
Method returns gettext-parser specific translation object (see below)
**Example**
```javascript
var input = require('fs').readFileSync('en.mo');
var mo = gettextParser.mo.parse(input);
console.log(mo.translations['']); // output translations for the default context
```
### Compile MO from a translation object
If you have a translation object you can convert this to a valid MO file with
gettextParser.mo.compile(data) → Buffer
Where
* **data** is a translation object either got from parsing a PO/MO file or composed by other means
**Example**
```javascript
var data = {
...
};
var output = gettextParser.mo.compile(data);
require('fs').writeFileSync('output.mo', output);
```
### Notes
#### Overriding charset
If you are compiling a previously parsed translation object, you can override the output charset with the `charset` property (applies both for compiling *mo* and *po* files).
```javascript
var obj = gettextParser.po.parse(inputBuf);
obj.charset = "windows-1257";
outputBuf = gettextParser.po.compile(obj);
```
Headers for the output are modified to match the updated charset.
#### ICONV support
By default *gettext-parser* uses pure JS [iconv-lite](https://github.com/ashtuchkin/iconv-lite) for encoding and decoding non UTF-8 charsets. If you need to support more complex encodings that are not supported by *iconv-lite*, you need to add [iconv](https://github.com/bnoordhuis/node-iconv) as an additional dependency for your project (*gettext-parser* will detect if it is available and tries to use it instead of *iconv-lite*).
## Data structure of parsed mo/po files

@@ -42,4 +146,5 @@

The data is always in unicode but the original charset of the file can
be found from the `charset` property.
Parsed data is always in unicode but the original charset of the file can
be found from the `charset` property. This value is also used when compiling translations
to a *mo* or *po* file.

@@ -66,30 +171,30 @@ ### Headers

{
"charset": "iso-8859-1",
"charset": "iso-8859-1",
"headers": {
"content-type": "text/plain; charset=iso-8859-1",
"plural-forms": "nplurals=2; plural=(n!=1);"
},
"headers": {
"content-type": "text/plain; charset=iso-8859-1",
"plural-forms": "nplurals=2; plural=(n!=1);"
},
"translations":{
"": {
"": {
"msgid": "",
"msgstr": ["Content-Type: text/plain; charset=iso-8859-1\n..."]
}
},
"translations": {
"": {
"": {
"msgid": "",
"msgstr": ["Content-Type: text/plain; charset=iso-8859-1\n..."]
}
}
},
"another context":{
"%s example":{
"msgctx": "another context",
"msgid": "%s example",
"msgid_plural": "%s examples",
"msgstr": ["% näide", "%s näidet"],
"comments": {
"translator": "This is regular comment",
"reference": "/path/to/file:123"
}
}
}
"another context": {
"%s example": {
"msgctx": "another context",
"msgid": "%s example",
"msgid_plural": "%s examples",
"msgstr": ["% näide", "%s näidet"],
"comments": {
"translator": "This is regular comment",
"reference": "/path/to/file:123"
}
}
}
}

@@ -96,0 +201,0 @@ ```

@@ -7,3 +7,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -10,0 +10,0 @@ describe('Folding tests', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -11,0 +11,0 @@ describe('MO Compiler', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -11,0 +11,0 @@ describe('MO Parser', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -11,0 +11,0 @@ describe('PO Compiler', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -31,2 +31,24 @@ describe('PO Parser', function() {

// Verifies that stream parsing produces the same translation table as the
// fixture JSON even when chunks split multi-byte utf-8 sequences.
describe('Stream input', function() {
it('should parse', function(done) {
var po = fs.createReadStream(__dirname + '/fixtures/utf8.po', {
highWaterMark: 1 // ensure that any utf-8 sequences will be broken when streaming
});
var json = JSON.parse(fs.readFileSync(__dirname + '/fixtures/utf8-po.json', 'utf-8'));
var parsed;
var stream = po.pipe(gettextParser.po.createParseStream({
initialTreshold: 800 // how many bytes to cache for parsing the header
}));
stream.on('data', function(data) {
parsed = data;
});
stream.on('end', function() {
expect(parsed).to.deep.equal(json);
done();
});
});
});
describe('Latin-13', function() {

@@ -33,0 +55,0 @@ it('should parse', function() {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc