Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

gettext-parser

Package Overview
Dependencies
Maintainers
1
Versions
44
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

gettext-parser - npm Package Compare versions

Comparing version 1.0.0 to 1.1.0

5

CHANGELOG.md
# Changelog
## v1.1.0 2015-01-21
* Added `po.createParseStream` method for parsing PO files from a Stream source
* Updated documentation
## v1.0.0 2015-01-21

@@ -4,0 +9,0 @@

5

index.js
'use strict';
var poParser = require('./lib/poparser');
module.exports = {
po: {
parse: require('./lib/poparser'),
parse: poParser.parse,
createParseStream: poParser.stream,
compile: require('./lib/pocompiler')

@@ -7,0 +10,0 @@ },

324

lib/poparser.js

@@ -5,2 +5,4 @@ 'use strict';

var sharedFuncs = require('./shared');
var Transform = require('stream').Transform;
var util = require('util');

@@ -14,3 +16,3 @@ /**

*/
module.exports = function(buffer, defaultCharset) {
module.exports.parse = function(buffer, defaultCharset) {
var parser = new Parser(buffer, defaultCharset);

@@ -21,2 +23,13 @@ return parser.parse();

/**
 * Parses a PO stream, emits translation table in object mode.
 * A single 'data' event with the full translation table is emitted
 * once the input stream ends.
 *
 * @param {String} [defaultCharset] Default charset to use if the PO header does not define one
 * @param {Object} [options] Stream options passed to the Transform constructor
 * @return {Stream} Transform stream
 */
module.exports.stream = function(defaultCharset, options) {
return new PoParserTransform(defaultCharset, options);
};
/**
* Creates a PO parser object. If PO object is a string,

@@ -33,2 +46,7 @@ * UTF-8 will be used as the charset

this._lex = [];
this._escaped = false;
this._node;
this._state = this.states.none;
if (typeof fileContents === 'string') {

@@ -40,6 +58,15 @@ this._charset = 'utf-8';

}
}
/**
 * Parses the PO object and returns translation table
 *
 * Runs the lexer over the full file contents (tokens accumulate in
 * this._lex) and then converts the token list into a translation table.
 *
 * @return {Object} Translation table
 */
Parser.prototype.parse = function() {
this._lexer(this._fileContents);
return this._finalize(this._lex);
};
/**
* Detects charset for PO strings from the header

@@ -67,6 +94,10 @@ *

} else {
this._fileContents = encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
this._fileContents = this._toString(buf);
}
};
/**
 * Converts a Buffer to an UTF-8 string using the charset detected for this file
 *
 * @param {Buffer} buf Input buffer
 * @return {String} UTF-8 string decoded from this._charset
 */
Parser.prototype._toString = function(buf) {
return encoding.convert(buf, 'utf-8', this._charset).toString('utf-8');
};
/**

@@ -102,19 +133,15 @@ * State constants for parsing FSM

/**
* Token parser
* Token parser. Parsed state can be found from this._lex
*
* @return {Object} Parsed tokens
* @param {String} chunk String
*/
Parser.prototype._lexer = function() {
var chr,
escaped = false,
lex = [],
node,
state = this.states.none;
Parser.prototype._lexer = function(chunk) {
var chr;
for (var i = 0, len = this._fileContents.length; i < len; i++) {
chr = this._fileContents.charAt(i);
switch (state) {
for (var i = 0, len = chunk.length; i < len; i++) {
chr = chunk.charAt(i);
switch (this._state) {
case this.states.none:
if (chr.match(this.symbols.quotes)) {
node = {
this._node = {
type: this.types.string,

@@ -124,18 +151,18 @@ value: '',

};
lex.push(node);
state = this.states.string;
this._lex.push(this._node);
this._state = this.states.string;
} else if (chr.match(this.symbols.comments)) {
node = {
this._node = {
type: this.types.comments,
value: ''
};
lex.push(node);
state = this.states.comments;
this._lex.push(this._node);
this._state = this.states.comments;
} else if (!chr.match(this.symbols.whitespace)) {
node = {
this._node = {
type: this.types.key,
value: chr
};
lex.push(node);
state = this.states.key;
this._lex.push(this._node);
this._state = this.states.key;
}

@@ -145,33 +172,33 @@ break;

if (chr === '\n') {
state = this.states.none;
this._state = this.states.none;
} else if (chr !== '\r') {
node.value += chr;
this._node.value += chr;
}
break;
case this.states.string:
if (escaped) {
if (this._escaped) {
switch (chr) {
case 't':
node.value += '\t';
this._node.value += '\t';
break;
case 'n':
node.value += '\n';
this._node.value += '\n';
break;
case 'r':
node.value += '\r';
this._node.value += '\r';
break;
default:
node.value += chr;
this._node.value += chr;
}
escaped = false;
this._escaped = false;
} else {
if (chr === node.quote) {
state = this.states.none;
if (chr === this._node.quote) {
this._state = this.states.none;
} else if (chr === '\\') {
escaped = true;
this._escaped = true;
break;
} else {
node.value += chr;
this._node.value += chr;
}
escaped = false;
this._escaped = false;
}

@@ -181,6 +208,6 @@ break;

if (!chr.match(this.symbols.key)) {
state = this.states.none;
this._state = this.states.none;
i--;
} else {
node.value += chr;
this._node.value += chr;
}

@@ -190,4 +217,2 @@ break;

}
return lex;
};

@@ -198,16 +223,16 @@

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Parsed tokens, with multi line strings joined into one
*/
Parser.prototype._joinStringValues = function(lex) {
Parser.prototype._joinStringValues = function(tokens) {
var lastNode, response = [];
for (var i = 0, len = lex.length; i < len; i++) {
if (lastNode && lex[i].type === this.types.string && lastNode.type === this.types.string) {
lastNode.value += lex[i].value;
} else if (lastNode && lex[i].type === this.types.comments && lastNode.type === this.types.comments) {
lastNode.value += '\n' + lex[i].value;
for (var i = 0, len = tokens.length; i < len; i++) {
if (lastNode && tokens[i].type === this.types.string && lastNode.type === this.types.string) {
lastNode.value += tokens[i].value;
} else if (lastNode && tokens[i].type === this.types.comments && lastNode.type === this.types.comments) {
lastNode.value += '\n' + tokens[i].value;
} else {
response.push(lex[i]);
lastNode = lex[i];
response.push(tokens[i]);
lastNode = tokens[i];
}

@@ -222,7 +247,7 @@ }

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
*/
Parser.prototype._parseComments = function(lex) {
Parser.prototype._parseComments = function(tokens) {
// parse comments
lex.forEach((function(node) {
tokens.forEach((function(node) {
var comment, lines;

@@ -272,21 +297,21 @@

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Tokens
*/
Parser.prototype._handleKeys = function(lex) {
Parser.prototype._handleKeys = function(tokens) {
var response = [],
lastNode;
for (var i = 0, len = lex.length; i < len; i++) {
if (lex[i].type === this.types.key) {
for (var i = 0, len = tokens.length; i < len; i++) {
if (tokens[i].type === this.types.key) {
lastNode = {
key: lex[i].value
key: tokens[i].value
};
if (i && lex[i - 1].type === this.types.comments) {
lastNode.comments = lex[i - 1].value;
if (i && tokens[i - 1].type === this.types.comments) {
lastNode.comments = tokens[i - 1].value;
}
lastNode.value = '';
response.push(lastNode);
} else if (lex[i].type === this.types.string && lastNode) {
lastNode.value += lex[i].value;
} else if (tokens[i].type === this.types.string && lastNode) {
lastNode.value += tokens[i].value;
}

@@ -301,16 +326,16 @@ }

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Tokens
*/
Parser.prototype._handleValues = function(lex) {
Parser.prototype._handleValues = function(tokens) {
var response = [],
lastNode, curContext, curComments;
for (var i = 0, len = lex.length; i < len; i++) {
if (lex[i].key.toLowerCase() === 'msgctxt') {
curContext = lex[i].value;
curComments = lex[i].comments;
} else if (lex[i].key.toLowerCase() === 'msgid') {
for (var i = 0, len = tokens.length; i < len; i++) {
if (tokens[i].key.toLowerCase() === 'msgctxt') {
curContext = tokens[i].value;
curComments = tokens[i].comments;
} else if (tokens[i].key.toLowerCase() === 'msgid') {
lastNode = {
msgid: lex[i].value
msgid: tokens[i].value
};

@@ -326,4 +351,4 @@

if (lex[i].comments && !lastNode.comments) {
lastNode.comments = lex[i].comments;
if (tokens[i].comments && !lastNode.comments) {
lastNode.comments = tokens[i].comments;
}

@@ -334,9 +359,9 @@

response.push(lastNode);
} else if (lex[i].key.toLowerCase() === 'msgid_plural') {
} else if (tokens[i].key.toLowerCase() === 'msgid_plural') {
if (lastNode) {
lastNode.msgid_plural = lex[i].value;
lastNode.msgid_plural = tokens[i].value;
}
if (lex[i].comments && !lastNode.comments) {
lastNode.comments = lex[i].comments;
if (tokens[i].comments && !lastNode.comments) {
lastNode.comments = tokens[i].comments;
}

@@ -346,9 +371,9 @@

curComments = false;
} else if (lex[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
} else if (tokens[i].key.substr(0, 6).toLowerCase() === 'msgstr') {
if (lastNode) {
lastNode.msgstr = (lastNode.msgstr || []).concat(lex[i].value);
lastNode.msgstr = (lastNode.msgstr || []).concat(tokens[i].value);
}
if (lex[i].comments && !lastNode.comments) {
lastNode.comments = lex[i].comments;
if (tokens[i].comments && !lastNode.comments) {
lastNode.comments = tokens[i].comments;
}

@@ -367,6 +392,6 @@

*
* @param {Object} lex Parsed tokens
* @param {Object} tokens Parsed tokens
* @return {Object} Translation table
*/
Parser.prototype._normalize = function(lex) {
Parser.prototype._normalize = function(tokens) {
var msgctxt,

@@ -379,4 +404,4 @@ table = {

for (var i = 0, len = lex.length; i < len; i++) {
msgctxt = lex[i].msgctxt || '';
for (var i = 0, len = tokens.length; i < len; i++) {
msgctxt = tokens[i].msgctxt || '';

@@ -387,7 +412,7 @@ if (!table.translations[msgctxt]) {

if (!table.headers && !msgctxt && !lex[i].msgid) {
table.headers = sharedFuncs.parseHeader(lex[i].msgstr[0]);
if (!table.headers && !msgctxt && !tokens[i].msgid) {
table.headers = sharedFuncs.parseHeader(tokens[i].msgstr[0]);
}
table.translations[msgctxt][lex[i].msgid] = lex[i];
table.translations[msgctxt][tokens[i].msgid] = tokens[i];
}

@@ -399,15 +424,124 @@

/**
 * Converts parsed tokens to a translation table
 *
 * @param {Object} tokens Parsed tokens
 * @returns {Object} Translation table
 */
Parser.prototype._finalize = function(tokens) {
    // merge adjacent string/comment tokens into single values
    var joined = this._joinStringValues(tokens);
    // annotate tokens with structured comment data (mutates in place)
    this._parseComments(joined);
    // group strings under their preceding keys, then fold key/value
    // pairs into translation entries
    var keyed = this._handleKeys(joined);
    var entries = this._handleValues(keyed);
    return this._normalize(entries);
};
/**
 * Creates a transform stream for parsing PO input
 *
 * Can be called as (defaultCharset, options), as (options) only,
 * or with no arguments at all.
 *
 * @constructor
 * @param {String} [defaultCharset] Default charset to use if the PO header does not define one
 * @param {Object} [options] Stream options
 */
function PoParserTransform(defaultCharset, options) {
    // allow passing a single options object as the first argument
    if (!options && defaultCharset && typeof defaultCharset === 'object') {
        options = defaultCharset;
        defaultCharset = undefined;
    }
    // guard against undefined options: the README documents
    // createParseStream() with no arguments, which would otherwise
    // crash on the property access below
    options = options || {};

    this.defaultCharset = defaultCharset;

    this._parser = false;
    this._tokens = {};

    this._cache = [];
    this._cacheSize = 0;

    // how many bytes to buffer before creating the Parser; the buffered
    // beginning of the stream lets the Parser detect the charset from the
    // PO header before any data is decoded
    this.initialTreshold = options.initialTreshold || 2 * 1024;

    Transform.call(this, options);
    // input is raw bytes, output is a single parsed translation table object
    this._writableState.objectMode = false;
    this._readableState.objectMode = true;
}
util.inherits(PoParserTransform, Transform);
/**
 * Processes a chunk of the input stream
 *
 * Buffers input until initialTreshold bytes are available, so the Parser
 * can detect the charset from the PO header, then lexes each chunk
 * incrementally. Trailing 8bit bytes are cached between calls so that
 * multi-byte sequences split across chunk boundaries decode correctly.
 *
 * @param {Buffer} chunk Raw input bytes
 * @param {String} encoding Ignored; input is treated as binary
 * @param {Function} done Callback invoked when the chunk has been consumed
 */
PoParserTransform.prototype._transform = function(chunk, encoding, done) {
var i, len = 0;
if (!chunk || !chunk.length) {
return done();
}
if (!this._parser) {
this._cache.push(chunk);
this._cacheSize += chunk.length;
// wait until initialTreshold bytes have accumulated before parsing headers for charset
if (this._cacheSize < this.initialTreshold) {
return setImmediate(done);
} else if (this._cacheSize) {
chunk = Buffer.concat(this._cache, this._cacheSize);
this._cacheSize = 0;
this._cache = [];
}
this._parser = new Parser(chunk, this.defaultCharset);
} else if (this._cacheSize) {
// this only happens if we had an uncompleted 8bit sequence from the last iteration
this._cache.push(chunk);
this._cacheSize += chunk.length;
chunk = Buffer.concat(this._cache, this._cacheSize);
this._cacheSize = 0;
this._cache = [];
}
// cache 8bit bytes from the end of the chunk
// helps if the chunk ends in the middle of an utf-8 sequence
for (i = chunk.length - 1; i >= 0; i--) {
if (chunk[i] >= 0x80) {
len++;
continue;
}
break;
}
// it seems we found some 8bit bytes from the end of the string, so let's cache these
if (len) {
this._cache = [chunk.slice(chunk.length - len)];
this._cacheSize = this._cache[0].length;
chunk = chunk.slice(0, chunk.length - len);
}
// chunk might be empty if it only consisted of 8bit bytes and these were all cached
if (chunk.length) {
this._parser._lexer(this._parser._toString(chunk));
}
setImmediate(done);
};
/**
 * Once all input has been processed emit the parsed translation table as an object
 *
 * Flushes any bytes still cached by _transform (short inputs below
 * initialTreshold, or a trailing partial 8bit sequence), then pushes the
 * finalized translation table downstream in object mode.
 *
 * @param {Function} done Callback invoked when flushing is complete
 */
PoParserTransform.prototype._flush = function(done) {
var chunk;
if (this._cacheSize) {
chunk = Buffer.concat(this._cache, this._cacheSize);
}
// input may have ended before initialTreshold was reached, so the
// Parser might not exist yet
if (!this._parser && chunk) {
this._parser = new Parser(chunk, this.defaultCharset);
}
if (chunk) {
this._parser._lexer(this._parser._toString(chunk));
}
// no parser means the input was completely empty; emit nothing
if (this._parser) {
this.push(this._parser._finalize(this._parser._lex));
}
setImmediate(done);
};
{
"name": "gettext-parser",
"description": "Parse and compile gettext po and mo files to/from json, nothing more, nothing less",
"version": "1.0.0",
"version": "1.1.0",
"author": "Andris Reinman",

@@ -6,0 +6,0 @@ "homepage": "http://github.com/andris9/gettext-parser",

@@ -11,6 +11,2 @@ gettext-parser

## ICONV NOTICE
By default *gettext-parser* uses pure JS [iconv-lite](https://github.com/ashtuchkin/iconv-lite) for encoding and decoding non UTF-8 charsets. If you need to support more complex encodings like EUC or Shift_JIS, you need to add [iconv](https://github.com/bnoordhuis/node-iconv) as a dependency for your project.
## Usage

@@ -22,17 +18,125 @@

Available methods:
* `gettextParser.po.parse(buf[, defaultCharset])` where `buf` is a *po* file as a Buffer or a unicode string. `defaultCharset` is the charset to use if charset is not defined or is the default `"CHARSET"`. Returns gettext-parser specific translation object (see below)
* `gettextParser.po.compile(obj)` where `obj` is a translation object, returns a *po* file as a Buffer
* `gettextParser.mo.parse(buf[, defaultCharset])` where `buf` is a *mo* file as a Buffer (*mo* is binary format, so do not use strings). `defaultCharset` is the charset to use if charset is not defined or is the default `"CHARSET"`. Returns translation object
* `gettextParser.mo.compile(obj)` where `obj` is a translation object, returns a *mo* file as a Buffer
### Parse PO files
**NB** if you are compiling a previously parsed translation object, you can override the output charset with the `charset` property (applies both for compiling *mo* and *po* files).
Parse a PO file with
var obj = gettextParser.po.parse(inputBuf);
obj.charset = "windows-1257";
outputBuf = gettextParser.po.compile(obj);
gettextParser.po.parse(input[, defaultCharset]) → Object
Where
* **input** is a *po* file as a Buffer or a unicode string. Charset is converted to unicode from other encodings only if the input is a Buffer, otherwise the charset information is discarded
* **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"` (applies only if *input* is a Buffer)
Method returns gettext-parser specific translation object (see below)
**Example**
```javascript
var input = require('fs').readFileSync('en.po');
var po = gettextParser.po.parse(input);
console.log(po.translations['']); // output translations for the default context
```
### Parse PO as a Stream
PO files can also be parsed from a stream source. After all input is processed the parser emits a single 'data' event which contains the parsed translation object.
gettextParser.po.createParseStream([defaultCharset][, streamOptions]) → Transform Stream
Where
* **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"`
* **streamOptions** are the standard stream options
**Example**
```javascript
var input = require('fs').createReadStream('en.po');
var po = gettextParser.po.createParseStream();
input.pipe(po);
po.on('data', function(data){
console.log(data.translations['']); // output translations for the default context
});
```
### Compile PO from a translation object
If you have a translation object you can convert this to a valid PO file with
gettextParser.po.compile(data) → Buffer
Where
* **data** is a translation object either got from parsing a PO/MO file or composed by other means
**Example**
```javascript
var data = {
...
};
var output = gettextParser.po.compile(data);
require('fs').writeFileSync('output.po', output);
```
### Parse MO files
Parse a MO file with
gettextParser.mo.parse(input[, defaultCharset]) → Object
Where
* **input** is a *mo* file as a Buffer
* **defaultCharset** is the charset to use if charset is not defined or is the default `"CHARSET"`
Method returns gettext-parser specific translation object (see below)
**Example**
```javascript
var input = require('fs').readFileSync('en.mo');
var mo = gettextParser.mo.parse(input);
console.log(mo.translations['']); // output translations for the default context
```
### Compile MO from a translation object
If you have a translation object you can convert this to a valid MO file with
gettextParser.mo.compile(data) → Buffer
Where
* **data** is a translation object either got from parsing a PO/MO file or composed by other means
**Example**
```javascript
var data = {
...
};
var output = gettextParser.mo.compile(data);
require('fs').writeFileSync('output.mo', output);
```
### Notes
#### Overriding charset
If you are compiling a previously parsed translation object, you can override the output charset with the `charset` property (applies both for compiling *mo* and *po* files).
```javascript
var obj = gettextParser.po.parse(inputBuf);
obj.charset = "windows-1257";
outputBuf = gettextParser.po.compile(obj);
```
Headers for the output are modified to match the updated charset.
#### ICONV support
By default *gettext-parser* uses pure JS [iconv-lite](https://github.com/ashtuchkin/iconv-lite) for encoding and decoding non UTF-8 charsets. If you need to support more complex encodings that are not supported by *iconv-lite*, you need to add [iconv](https://github.com/bnoordhuis/node-iconv) as an additional dependency for your project (*gettext-parser* will detect if it is available and tries to use it instead of *iconv-lite*).
## Data structure of parsed mo/po files

@@ -42,4 +146,5 @@

The data is always in unicode but the original charset of the file can
be found from the `charset` property.
Parsed data is always in unicode but the original charset of the file can
be found from the `charset` property. This value is also used when compiling translations
to a *mo* or *po* file.

@@ -66,30 +171,30 @@ ### Headers

{
"charset": "iso-8859-1",
"charset": "iso-8859-1",
"headers": {
"content-type": "text/plain; charset=iso-8859-1",
"plural-forms": "nplurals=2; plural=(n!=1);"
},
"headers": {
"content-type": "text/plain; charset=iso-8859-1",
"plural-forms": "nplurals=2; plural=(n!=1);"
},
"translations":{
"": {
"": {
"msgid": "",
"msgstr": ["Content-Type: text/plain; charset=iso-8859-1\n..."]
}
},
"translations": {
"": {
"": {
"msgid": "",
"msgstr": ["Content-Type: text/plain; charset=iso-8859-1\n..."]
}
}
},
"another context":{
"%s example":{
"msgctx": "another context",
"msgid": "%s example",
"msgid_plural": "%s examples",
"msgstr": ["% näide", "%s näidet"],
"comments": {
"translator": "This is regular comment",
"reference": "/path/to/file:123"
}
}
}
"another context": {
"%s example": {
"msgctx": "another context",
"msgid": "%s example",
"msgid_plural": "%s examples",
"msgstr": ["% näide", "%s näidet"],
"comments": {
"translator": "This is regular comment",
"reference": "/path/to/file:123"
}
}
}
}

@@ -96,0 +201,0 @@ ```

@@ -7,3 +7,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -10,0 +10,0 @@ describe('Folding tests', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -11,0 +11,0 @@ describe('MO Compiler', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -11,0 +11,0 @@ describe('MO Parser', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -11,0 +11,0 @@ describe('PO Compiler', function() {

@@ -8,3 +8,3 @@ 'use strict';

var expect = chai.expect;
chai.Assertion.includeStack = true;
chai.config.includeStack = true;

@@ -31,2 +31,24 @@ describe('PO Parser', function() {

// Verifies that stream parsing produces the same translation table as the
// fixture JSON even when chunks split multi-byte utf-8 sequences.
describe('Stream input', function() {
it('should parse', function(done) {
var po = fs.createReadStream(__dirname + '/fixtures/utf8.po', {
highWaterMark: 1 // ensure that any utf-8 sequences will be broken when streaming
});
var json = JSON.parse(fs.readFileSync(__dirname + '/fixtures/utf8-po.json', 'utf-8'));
var parsed;
var stream = po.pipe(gettextParser.po.createParseStream({
initialTreshold: 800 // how many bytes to cache for parsing the header
}));
stream.on('data', function(data) {
parsed = data;
});
stream.on('end', function() {
expect(parsed).to.deep.equal(json);
done();
});
});
});
describe('Latin-13', function() {

@@ -33,0 +55,0 @@ it('should parse', function() {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc