sdf-parser - npm Package Compare versions

Comparing version 3.1.0 to 4.0.0

History.md

package.json

		{
		"name": "sdf-parser",
		"version": "3.1.0",
		"version": "4.0.0",
		"description": "SDF parser",
		@@ -12,4 +12,5 @@ "main": "./src/index.js",
		"eslint-fix": "npm run eslint -- --fix",
		"test": "jest && npm run eslint",
		"build": "cheminfo build --root SDFParser"
		"test": "npm run test-coverage && npm run eslint",
		"test-coverage": "jest --coverage",
		"test-only": "jest"
		},
		@@ -37,17 +38,18 @@ "browser": {
		"devDependencies": {
		"babel-eslint": "^10.0.1",
		"callback-stream": "^1.1.0",
		"cheminfo-tools": "^1.20.2",
		"eslint": "^4.16.0",
		"eslint-config-cheminfo": "^1.14.1",
		"eslint-plugin-no-only-tests": "^2.0.0",
		"jest": "^22.1.4",
		"openchemlib": "^5.5.0",
		"should": "^13.2.1"
		"eslint": "^5.16.0",
		"eslint-config-cheminfo": "^1.20.1",
		"eslint-plugin-import": "^2.17.3",
		"eslint-plugin-jest": "^22.6.4",
		"jest": "^24.8.0",
		"openchemlib": "^7.1.0"
		},
		"dependencies": {
		"multipipe": "^2.0.1",
		"split2": "^2.2.0",
		"through2": "^2.0.3",
		"through2-filter": "^2.0.0"
		"@targos/pumpify": "^2.0.0",
		"readable-stream": "^3.4.0",
		"split2": "^3.1.1",
		"through2": "^3.0.1",
		"through2-filter": "^3.0.0"
		}
		}

README.md

		# sdf-parser

		[![NPM version][npm-image]][npm-url]
		[![build status][travis-image]][travis-url]
		[![David deps][david-image]][david-url]
		[![npm download][download-image]][download-url]
		[![NPM version][npm-image]][npm-url]
		[![build status][travis-image]][travis-url]
		[![npm download][download-image]][download-url]

		Allow to parse a SDF file and convert it to an array of objects
		Allow to parse a SDF file and convert it to an array of objects.

		@@ -17,4 +16,4 @@ ## Use of the package
		In node script:

		```js

		// allows to parse a file test.sdf that would be present in the same directory
		@@ -29,3 +28,2 @@
		console.log(result);

		```
		@@ -36,25 +34,27 @@
		options:
		* exclude : array of string containing the fields to discard
		* include : array of string containing the fields to keep
		* modifiers : object of functions that need to be converted during the parsing
		* filter : function that allows to filter the result
		* mixedEOL : if set to true will try to deal with mixed End Of Line separator
		* dynamicTyping : convert fields containing only number to numbers (default: true)

		- exclude : array of string containing the fields to discard
		- include : array of string containing the fields to keep
		- modifiers : object of functions that need to be converted during the parsing
		- filter : function that allows to filter the result
		- mixedEOL : if set to true will try to deal with mixed End Of Line separator
		- dynamicTyping : convert fields containing only number to numbers (default: true)

		## Advanced example with filtering and modifiers

		```
		```js
		var result = parse(sdf, {
		exclude:["Number of H-Donors"],
		include:["Number of H-Donors",'CLogP','Code'],
		modifiers: {
		CLogP: function(field) {
		return {
		low: field*1-0.2,
		high: field*1+0.2
		}
		}
		},
		filter: function(entry) {
		return (entry.CLogP && entry.CLogP.low>4);
		exclude: ['Number of H-Donors'],
		include: ['Number of H-Donors', 'CLogP', 'Code'],
		modifiers: {
		CLogP: function(field) {
		return {
		low: field * 1 - 0.2,
		high: field * 1 + 0.2
		};
		}
		},
		filter: function(entry) {
		return entry.CLogP && entry.CLogP.low > 4;
		}
		});
		@@ -65,3 +65,3 @@ ```

		This API is only available on Node.js
		This API is only available on Node.js.

		@@ -78,8 +78,8 @@ ### molecules(options)
		```js
		const stream = require('sdf-parser').stream;
		const { stream } = require('sdf-parser');
		fs.createReadStream('test.sdf')
		.pipe(stream.molecules())
		.on('data', (molecule) => {
		console.log(molecule.molfile);
		});
		.pipe(stream.molecules())
		.on('data', (molecule) => {
		console.log(molecule.molfile);
		});
		```
		@@ -92,8 +92,8 @@
		```js
		const stream = require('sdf-parser').stream;
		const { stream } = require('sdf-parser');
		fs.createReadStream('test.sdf')
		.pipe(stream.entries())
		.on('data', (entry) => {
		// sdf entry as a string
		});
		.pipe(stream.entries())
		.on('data', (entry) => {
		// sdf entry as a string
		});
		```
		@@ -103,3 +103,3 @@

		[MIT](./LICENSE)
		[MIT](./LICENSE)

		@@ -110,5 +110,3 @@ [npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
		[travis-url]: https://travis-ci.org/cheminfo-js/sdf-parser
		[david-image]: https://img.shields.io/david/cheminfo-js/sdf-parser.svg?style=flat-square
		[david-url]: https://david-dm.org/cheminfo-js/sdf-parser
		[download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
		[download-url]: https://www.npmjs.com/package/sdf-parser

236

src/parse.js

		'use strict';

		function parse(sdf, options = {}) {
		const {
		include,
		exclude,
		filter,
		modifiers = {},
		forEach = {},
		dynamicTyping = true
		} = options;
		const {
		include,
		exclude,
		filter,
		modifiers = {},
		forEach = {},
		dynamicTyping = true
		} = options;

		if (typeof sdf !== 'string') {
		throw new TypeError('Parameter "sdf" must be a string');
		}
		if (typeof sdf !== 'string') {
		throw new TypeError('Parameter "sdf" must be a string');
		}

		var eol = '\n';
		if (options.mixedEOL) {
		sdf = sdf.replace(/\r\n/g, '\n');
		sdf = sdf.replace(/\r/g, '\n');
		} else {
		// we will find the delimiter in order to be much faster and not use regular expression
		var header = sdf.substr(0, 1000);
		if (header.indexOf('\r\n') > -1) {
		eol = '\r\n';
		} else if (header.indexOf('\r') > -1) {
		eol = '\r';
		}
		var eol = '\n';
		if (options.mixedEOL) {
		sdf = sdf.replace(/\r\n/g, '\n');
		sdf = sdf.replace(/\r/g, '\n');
		} else {
		// we will find the delimiter in order to be much faster and not use regular expression
		var header = sdf.substr(0, 1000);
		if (header.indexOf('\r\n') > -1) {
		eol = '\r\n';
		} else if (header.indexOf('\r') > -1) {
		eol = '\r';
		}
		}

		var sdfParts = sdf.split(new RegExp(eol + '\\$\\$\\$\\$.*' + eol));
		var molecules = [];
		var labels = {};
		var sdfParts = sdf.split(new RegExp(`${eol}\\$\\$\\$\\$.*${eol}`));
		var molecules = [];
		var labels = {};

		var start = Date.now();
		var start = Date.now();

		for (var i = 0; i < sdfParts.length; i++) {
		var sdfPart = sdfParts[i];
		var parts = sdfPart.split(eol + '>');
		if (parts.length > 0 && parts[0].length > 5) {
		var molecule = {};
		var currentLabels = [];
		molecule.molfile = parts[0] + eol;
		for (var j = 1; j < parts.length; j++) {
		var lines = parts[j].split(eol);
		var from = lines[0].indexOf('<');
		var to = lines[0].indexOf('>');
		var label = lines[0].substring(from + 1, to);
		currentLabels.push(label);
		if (!labels[label]) {
		labels[label] = {
		counter: 0,
		isNumeric: dynamicTyping,
		keep: false
		};
		if (
		(!exclude || exclude.indexOf(label) === -1) &&
		(!include || include.indexOf(label) > -1)
		) {
		labels[label].keep = true;
		if (modifiers[label]) labels[label].modifier = modifiers[label];
		if (forEach[label]) labels[label].forEach = forEach[label];
		}
		}
		if (labels[label].keep) {
		for (var k = 1; k < lines.length - 1; k++) {
		if (molecule[label]) {
		molecule[label] += eol + lines[k];
		} else {
		molecule[label] = lines[k];
		}
		}
		if (labels[label].modifier) {
		var modifiedValue = labels[label].modifier(molecule[label]);
		if (modifiedValue === undefined || modifiedValue === null) {
		delete molecule[label];
		} else {
		molecule[label] = modifiedValue;
		}
		}
		if (labels[label].isNumeric) {
		if (!isFinite(molecule[label]) || molecule[label].match(/^0[0-9]/)) {
		labels[label].isNumeric = false;
		}
		}
		}
		for (var i = 0; i < sdfParts.length; i++) {
		var sdfPart = sdfParts[i];
		var parts = sdfPart.split(`${eol}>`);
		if (parts.length > 0 && parts[0].length > 5) {
		var molecule = {};
		var currentLabels = [];
		molecule.molfile = parts[0] + eol;
		for (var j = 1; j < parts.length; j++) {
		var lines = parts[j].split(eol);
		var from = lines[0].indexOf('<');
		var to = lines[0].indexOf('>');
		var label = lines[0].substring(from + 1, to);
		currentLabels.push(label);
		if (!labels[label]) {
		labels[label] = {
		counter: 0,
		isNumeric: dynamicTyping,
		keep: false
		};
		if (
		(!exclude || exclude.indexOf(label) === -1) &&
		(!include || include.indexOf(label) > -1)
		) {
		labels[label].keep = true;
		if (modifiers[label]) labels[label].modifier = modifiers[label];
		if (forEach[label]) labels[label].forEach = forEach[label];
		}
		}
		if (labels[label].keep) {
		for (var k = 1; k < lines.length - 1; k++) {
		if (molecule[label]) {
		molecule[label] += eol + lines[k];
		} else {
		molecule[label] = lines[k];
		}
		if (!filter || filter(molecule)) {
		molecules.push(molecule);
		// only now we can increase the counter
		for (j = 0; j < currentLabels.length; j++) {
		var currentLabel = currentLabels[j];
		labels[currentLabel].counter++;
		}
		}
		if (labels[label].modifier) {
		var modifiedValue = labels[label].modifier(molecule[label]);
		if (modifiedValue === undefined || modifiedValue === null) {
		delete molecule[label];
		} else {
		molecule[label] = modifiedValue;
		}
		}
		}

		// all numeric fields should be converted to numbers
		for (label in labels) {
		currentLabel = labels[label];
		if (currentLabel.isNumeric) {
		currentLabel.minValue = Infinity;
		currentLabel.maxValue = -Infinity;
		for (j = 0; j < molecules.length; j++) {
		if (molecules[j][label]) {
		var value = parseFloat(molecules[j][label]);
		molecules[j][label] = value;
		if (value > currentLabel.maxValue) currentLabel.maxValue = value;
		if (value < currentLabel.minValue) currentLabel.minValue = value;
		}
		}
		if (labels[label].isNumeric) {
		if (
		!isFinite(molecule[label]) ||
		molecule[label].match(/^0[0-9]/)
		) {
		labels[label].isNumeric = false;
		}
		}
		}
		}
		if (!filter || filter(molecule)) {
		molecules.push(molecule);
		// only now we can increase the counter
		for (j = 0; j < currentLabels.length; j++) {
		var currentLabel = currentLabels[j];
		labels[currentLabel].counter++;
		}
		}
		}
		}

		// we check that a label is in all the records
		for (var key in labels) {
		if (labels[key].counter === molecules.length) {
		labels[key].always = true;
		} else {
		labels[key].always = false;
		// all numeric fields should be converted to numbers
		for (label in labels) {
		currentLabel = labels[label];
		if (currentLabel.isNumeric) {
		currentLabel.minValue = Infinity;
		currentLabel.maxValue = -Infinity;
		for (j = 0; j < molecules.length; j++) {
		if (molecules[j][label]) {
		var value = parseFloat(molecules[j][label]);
		molecules[j][label] = value;
		if (value > currentLabel.maxValue) currentLabel.maxValue = value;
		if (value < currentLabel.minValue) currentLabel.minValue = value;
		}
		}
		}
		}

		var statistics = [];
		for (key in labels) {
		var statistic = labels[key];
		statistic.label = key;
		statistics.push(statistic);
		// we check that a label is in all the records
		for (var key in labels) {
		if (labels[key].counter === molecules.length) {
		labels[key].always = true;
		} else {
		labels[key].always = false;
		}
		}

		return {
		time: Date.now() - start,
		molecules: molecules,
		labels: Object.keys(labels),
		statistics: statistics
		};
		var statistics = [];
		for (key in labels) {
		var statistic = labels[key];
		statistic.label = key;
		statistics.push(statistic);
		}

		return {
		time: Date.now() - start,
		molecules: molecules,
		labels: Object.keys(labels),
		statistics: statistics
		};
		}

		module.exports = parse;

src/stream.js

		'use strict';

		const combine = require('multipipe');
		const pipeline = require('@targos/pumpify');
		const split2 = require('split2');
		@@ -10,43 +10,43 @@ const filter = require('through2-filter');

		const filterStream = filter.bind(null, {objectMode: true});
		const filterStream = filter.bind(null, { objectMode: true });
		function filterCb(chunk) {
		return chunk.length > 1 && chunk.trim().length > 1;
		return chunk.length > 1 && chunk.trim().length > 1;
		}

		function entries() {
		return combine(
		split2(/\r?\n\${4}.*\r?\n/),
		filterStream(filterCb),
		through2({objectMode: true}, function (value, encoding, callback) {
		const eol = value.includes('\r\n') ? '\r\n' : '\n';
		this.push(value + eol + '$$$$' + eol);
		callback();
		})
		);
		return pipeline.obj(
		split2(/\r?\n\${4}.*\r?\n/),
		filterStream(filterCb),
		through2({ objectMode: true }, function (value, encoding, callback) {
		const eol = value.includes('\r\n') ? '\r\n' : '\n';
		this.push(`${value + eol}$$$$${eol}`);
		callback();
		})
		);
		}

		function molecules(options) {
		return combine(
		entries(),
		through2({objectMode: true}, function (value, encoding, callback) {
		try {
		const parsed = parse(value, options);
		if (parsed.molecules.length === 1) {
		if (options && options.fullResult) {
		this.push(parsed);
		} else {
		this.push(parsed.molecules[0]);
		}
		}
		callback();
		} catch (e) {
		callback(e);
		}
		})
		);
		return pipeline.obj(
		entries(),
		through2({ objectMode: true }, function (value, encoding, callback) {
		try {
		const parsed = parse(value, options);
		if (parsed.molecules.length === 1) {
		if (options && options.fullResult) {
		this.push(parsed);
		} else {
		this.push(parsed.molecules[0]);
		}
		}
		callback();
		} catch (e) {
		callback(e);
		}
		})
		);
		}

		module.exports = {
		entries,
		molecules
		entries,
		molecules
		};

sdf-parser - npm Package Compare versions

Improved metrics

Worsened metrics

Dependency changes