sdf-parser
Advanced tools
Comparing version 3.1.0 to 4.0.0
{ | ||
"name": "sdf-parser", | ||
"version": "3.1.0", | ||
"version": "4.0.0", | ||
"description": "SDF parser", | ||
@@ -12,4 +12,5 @@ "main": "./src/index.js", | ||
"eslint-fix": "npm run eslint -- --fix", | ||
"test": "jest && npm run eslint", | ||
"build": "cheminfo build --root SDFParser" | ||
"test": "npm run test-coverage && npm run eslint", | ||
"test-coverage": "jest --coverage", | ||
"test-only": "jest" | ||
}, | ||
@@ -37,17 +38,18 @@ "browser": { | ||
"devDependencies": { | ||
"babel-eslint": "^10.0.1", | ||
"callback-stream": "^1.1.0", | ||
"cheminfo-tools": "^1.20.2", | ||
"eslint": "^4.16.0", | ||
"eslint-config-cheminfo": "^1.14.1", | ||
"eslint-plugin-no-only-tests": "^2.0.0", | ||
"jest": "^22.1.4", | ||
"openchemlib": "^5.5.0", | ||
"should": "^13.2.1" | ||
"eslint": "^5.16.0", | ||
"eslint-config-cheminfo": "^1.20.1", | ||
"eslint-plugin-import": "^2.17.3", | ||
"eslint-plugin-jest": "^22.6.4", | ||
"jest": "^24.8.0", | ||
"openchemlib": "^7.1.0" | ||
}, | ||
"dependencies": { | ||
"multipipe": "^2.0.1", | ||
"split2": "^2.2.0", | ||
"through2": "^2.0.3", | ||
"through2-filter": "^2.0.0" | ||
"@targos/pumpify": "^2.0.0", | ||
"readable-stream": "^3.4.0", | ||
"split2": "^3.1.1", | ||
"through2": "^3.0.1", | ||
"through2-filter": "^3.0.0" | ||
} | ||
} |
# sdf-parser | ||
[![NPM version][npm-image]][npm-url] | ||
[![build status][travis-image]][travis-url] | ||
[![David deps][david-image]][david-url] | ||
[![npm download][download-image]][download-url] | ||
[![NPM version][npm-image]][npm-url] | ||
[![build status][travis-image]][travis-url] | ||
[![npm download][download-image]][download-url] | ||
Allow to parse a SDF file and convert it to an array of objects | ||
Parses an SDF file and converts it to an array of objects. | ||
@@ -17,4 +16,4 @@ ## Use of the package | ||
In node script: | ||
```js | ||
// allows to parse a file test.sdf that would be present in the same directory | ||
@@ -29,3 +28,2 @@ | ||
console.log(result); | ||
``` | ||
@@ -36,25 +34,27 @@ | ||
options: | ||
* exclude : array of string containing the fields to discard | ||
* include : array of string containing the fields to keep | ||
* modifiers : object of functions that need to be converted during the parsing | ||
* filter : function that allows to filter the result | ||
* mixedEOL : if set to true will try to deal with mixed End Of Line separator | ||
* dynamicTyping : convert fields containing only number to numbers (default: true) | ||
- exclude : array of strings containing the fields to discard | ||
- include : array of strings containing the fields to keep | ||
- modifiers : object mapping field names to functions that are applied to the field value during parsing | ||
- filter : function that allows filtering the result | ||
- mixedEOL : if set to true, will try to deal with mixed end-of-line separators | ||
- dynamicTyping : convert fields containing only numbers to numbers (default: true) | ||
## Advanced example with filtering and modifiers | ||
``` | ||
```js | ||
var result = parse(sdf, { | ||
exclude:["Number of H-Donors"], | ||
include:["Number of H-Donors",'CLogP','Code'], | ||
modifiers: { | ||
CLogP: function(field) { | ||
return { | ||
low: field*1-0.2, | ||
high: field*1+0.2 | ||
} | ||
} | ||
}, | ||
filter: function(entry) { | ||
return (entry.CLogP && entry.CLogP.low>4); | ||
exclude: ['Number of H-Donors'], | ||
include: ['Number of H-Donors', 'CLogP', 'Code'], | ||
modifiers: { | ||
CLogP: function(field) { | ||
return { | ||
low: field * 1 - 0.2, | ||
high: field * 1 + 0.2 | ||
}; | ||
} | ||
}, | ||
filter: function(entry) { | ||
return entry.CLogP && entry.CLogP.low > 4; | ||
} | ||
}); | ||
@@ -65,3 +65,3 @@ ``` | ||
This API is only available on Node.js | ||
This API is only available on Node.js. | ||
@@ -78,8 +78,8 @@ ### molecules(options) | ||
```js | ||
const stream = require('sdf-parser').stream; | ||
const { stream } = require('sdf-parser'); | ||
fs.createReadStream('test.sdf') | ||
.pipe(stream.molecules()) | ||
.on('data', (molecule) => { | ||
console.log(molecule.molfile); | ||
}); | ||
.pipe(stream.molecules()) | ||
.on('data', (molecule) => { | ||
console.log(molecule.molfile); | ||
}); | ||
``` | ||
@@ -92,8 +92,8 @@ | ||
```js | ||
const stream = require('sdf-parser').stream; | ||
const { stream } = require('sdf-parser'); | ||
fs.createReadStream('test.sdf') | ||
.pipe(stream.entries()) | ||
.on('data', (entry) => { | ||
// sdf entry as a string | ||
}); | ||
.pipe(stream.entries()) | ||
.on('data', (entry) => { | ||
// sdf entry as a string | ||
}); | ||
``` | ||
@@ -103,3 +103,3 @@ | ||
[MIT](./LICENSE) | ||
[MIT](./LICENSE) | ||
@@ -110,5 +110,3 @@ [npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square | ||
[travis-url]: https://travis-ci.org/cheminfo-js/sdf-parser | ||
[david-image]: https://img.shields.io/david/cheminfo-js/sdf-parser.svg?style=flat-square | ||
[david-url]: https://david-dm.org/cheminfo-js/sdf-parser | ||
[download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square | ||
[download-url]: https://www.npmjs.com/package/sdf-parser |
236
src/parse.js
'use strict'; | ||
function parse(sdf, options = {}) { | ||
const { | ||
include, | ||
exclude, | ||
filter, | ||
modifiers = {}, | ||
forEach = {}, | ||
dynamicTyping = true | ||
} = options; | ||
const { | ||
include, | ||
exclude, | ||
filter, | ||
modifiers = {}, | ||
forEach = {}, | ||
dynamicTyping = true | ||
} = options; | ||
if (typeof sdf !== 'string') { | ||
throw new TypeError('Parameter "sdf" must be a string'); | ||
} | ||
if (typeof sdf !== 'string') { | ||
throw new TypeError('Parameter "sdf" must be a string'); | ||
} | ||
var eol = '\n'; | ||
if (options.mixedEOL) { | ||
sdf = sdf.replace(/\r\n/g, '\n'); | ||
sdf = sdf.replace(/\r/g, '\n'); | ||
} else { | ||
// we will find the delimiter in order to be much faster and not use regular expression | ||
var header = sdf.substr(0, 1000); | ||
if (header.indexOf('\r\n') > -1) { | ||
eol = '\r\n'; | ||
} else if (header.indexOf('\r') > -1) { | ||
eol = '\r'; | ||
} | ||
var eol = '\n'; | ||
if (options.mixedEOL) { | ||
sdf = sdf.replace(/\r\n/g, '\n'); | ||
sdf = sdf.replace(/\r/g, '\n'); | ||
} else { | ||
// we will find the delimiter in order to be much faster and not use regular expression | ||
var header = sdf.substr(0, 1000); | ||
if (header.indexOf('\r\n') > -1) { | ||
eol = '\r\n'; | ||
} else if (header.indexOf('\r') > -1) { | ||
eol = '\r'; | ||
} | ||
} | ||
var sdfParts = sdf.split(new RegExp(eol + '\\$\\$\\$\\$.*' + eol)); | ||
var molecules = []; | ||
var labels = {}; | ||
var sdfParts = sdf.split(new RegExp(`${eol}\\$\\$\\$\\$.*${eol}`)); | ||
var molecules = []; | ||
var labels = {}; | ||
var start = Date.now(); | ||
var start = Date.now(); | ||
for (var i = 0; i < sdfParts.length; i++) { | ||
var sdfPart = sdfParts[i]; | ||
var parts = sdfPart.split(eol + '>'); | ||
if (parts.length > 0 && parts[0].length > 5) { | ||
var molecule = {}; | ||
var currentLabels = []; | ||
molecule.molfile = parts[0] + eol; | ||
for (var j = 1; j < parts.length; j++) { | ||
var lines = parts[j].split(eol); | ||
var from = lines[0].indexOf('<'); | ||
var to = lines[0].indexOf('>'); | ||
var label = lines[0].substring(from + 1, to); | ||
currentLabels.push(label); | ||
if (!labels[label]) { | ||
labels[label] = { | ||
counter: 0, | ||
isNumeric: dynamicTyping, | ||
keep: false | ||
}; | ||
if ( | ||
(!exclude || exclude.indexOf(label) === -1) && | ||
(!include || include.indexOf(label) > -1) | ||
) { | ||
labels[label].keep = true; | ||
if (modifiers[label]) labels[label].modifier = modifiers[label]; | ||
if (forEach[label]) labels[label].forEach = forEach[label]; | ||
} | ||
} | ||
if (labels[label].keep) { | ||
for (var k = 1; k < lines.length - 1; k++) { | ||
if (molecule[label]) { | ||
molecule[label] += eol + lines[k]; | ||
} else { | ||
molecule[label] = lines[k]; | ||
} | ||
} | ||
if (labels[label].modifier) { | ||
var modifiedValue = labels[label].modifier(molecule[label]); | ||
if (modifiedValue === undefined || modifiedValue === null) { | ||
delete molecule[label]; | ||
} else { | ||
molecule[label] = modifiedValue; | ||
} | ||
} | ||
if (labels[label].isNumeric) { | ||
if (!isFinite(molecule[label]) || molecule[label].match(/^0[0-9]/)) { | ||
labels[label].isNumeric = false; | ||
} | ||
} | ||
} | ||
for (var i = 0; i < sdfParts.length; i++) { | ||
var sdfPart = sdfParts[i]; | ||
var parts = sdfPart.split(`${eol}>`); | ||
if (parts.length > 0 && parts[0].length > 5) { | ||
var molecule = {}; | ||
var currentLabels = []; | ||
molecule.molfile = parts[0] + eol; | ||
for (var j = 1; j < parts.length; j++) { | ||
var lines = parts[j].split(eol); | ||
var from = lines[0].indexOf('<'); | ||
var to = lines[0].indexOf('>'); | ||
var label = lines[0].substring(from + 1, to); | ||
currentLabels.push(label); | ||
if (!labels[label]) { | ||
labels[label] = { | ||
counter: 0, | ||
isNumeric: dynamicTyping, | ||
keep: false | ||
}; | ||
if ( | ||
(!exclude || exclude.indexOf(label) === -1) && | ||
(!include || include.indexOf(label) > -1) | ||
) { | ||
labels[label].keep = true; | ||
if (modifiers[label]) labels[label].modifier = modifiers[label]; | ||
if (forEach[label]) labels[label].forEach = forEach[label]; | ||
} | ||
} | ||
if (labels[label].keep) { | ||
for (var k = 1; k < lines.length - 1; k++) { | ||
if (molecule[label]) { | ||
molecule[label] += eol + lines[k]; | ||
} else { | ||
molecule[label] = lines[k]; | ||
} | ||
if (!filter || filter(molecule)) { | ||
molecules.push(molecule); | ||
// only now we can increase the counter | ||
for (j = 0; j < currentLabels.length; j++) { | ||
var currentLabel = currentLabels[j]; | ||
labels[currentLabel].counter++; | ||
} | ||
} | ||
if (labels[label].modifier) { | ||
var modifiedValue = labels[label].modifier(molecule[label]); | ||
if (modifiedValue === undefined || modifiedValue === null) { | ||
delete molecule[label]; | ||
} else { | ||
molecule[label] = modifiedValue; | ||
} | ||
} | ||
} | ||
// all numeric fields should be converted to numbers | ||
for (label in labels) { | ||
currentLabel = labels[label]; | ||
if (currentLabel.isNumeric) { | ||
currentLabel.minValue = Infinity; | ||
currentLabel.maxValue = -Infinity; | ||
for (j = 0; j < molecules.length; j++) { | ||
if (molecules[j][label]) { | ||
var value = parseFloat(molecules[j][label]); | ||
molecules[j][label] = value; | ||
if (value > currentLabel.maxValue) currentLabel.maxValue = value; | ||
if (value < currentLabel.minValue) currentLabel.minValue = value; | ||
} | ||
} | ||
if (labels[label].isNumeric) { | ||
if ( | ||
!isFinite(molecule[label]) || | ||
molecule[label].match(/^0[0-9]/) | ||
) { | ||
labels[label].isNumeric = false; | ||
} | ||
} | ||
} | ||
} | ||
if (!filter || filter(molecule)) { | ||
molecules.push(molecule); | ||
// only now we can increase the counter | ||
for (j = 0; j < currentLabels.length; j++) { | ||
var currentLabel = currentLabels[j]; | ||
labels[currentLabel].counter++; | ||
} | ||
} | ||
} | ||
} | ||
// we check that a label is in all the records | ||
for (var key in labels) { | ||
if (labels[key].counter === molecules.length) { | ||
labels[key].always = true; | ||
} else { | ||
labels[key].always = false; | ||
// all numeric fields should be converted to numbers | ||
for (label in labels) { | ||
currentLabel = labels[label]; | ||
if (currentLabel.isNumeric) { | ||
currentLabel.minValue = Infinity; | ||
currentLabel.maxValue = -Infinity; | ||
for (j = 0; j < molecules.length; j++) { | ||
if (molecules[j][label]) { | ||
var value = parseFloat(molecules[j][label]); | ||
molecules[j][label] = value; | ||
if (value > currentLabel.maxValue) currentLabel.maxValue = value; | ||
if (value < currentLabel.minValue) currentLabel.minValue = value; | ||
} | ||
} | ||
} | ||
} | ||
var statistics = []; | ||
for (key in labels) { | ||
var statistic = labels[key]; | ||
statistic.label = key; | ||
statistics.push(statistic); | ||
// we check that a label is in all the records | ||
for (var key in labels) { | ||
if (labels[key].counter === molecules.length) { | ||
labels[key].always = true; | ||
} else { | ||
labels[key].always = false; | ||
} | ||
} | ||
return { | ||
time: Date.now() - start, | ||
molecules: molecules, | ||
labels: Object.keys(labels), | ||
statistics: statistics | ||
}; | ||
var statistics = []; | ||
for (key in labels) { | ||
var statistic = labels[key]; | ||
statistic.label = key; | ||
statistics.push(statistic); | ||
} | ||
return { | ||
time: Date.now() - start, | ||
molecules: molecules, | ||
labels: Object.keys(labels), | ||
statistics: statistics | ||
}; | ||
} | ||
module.exports = parse; |
'use strict'; | ||
const combine = require('multipipe'); | ||
const pipeline = require('@targos/pumpify'); | ||
const split2 = require('split2'); | ||
@@ -10,43 +10,43 @@ const filter = require('through2-filter'); | ||
const filterStream = filter.bind(null, {objectMode: true}); | ||
const filterStream = filter.bind(null, { objectMode: true }); | ||
function filterCb(chunk) { | ||
return chunk.length > 1 && chunk.trim().length > 1; | ||
return chunk.length > 1 && chunk.trim().length > 1; | ||
} | ||
function entries() { | ||
return combine( | ||
split2(/\r?\n\${4}.*\r?\n/), | ||
filterStream(filterCb), | ||
through2({objectMode: true}, function (value, encoding, callback) { | ||
const eol = value.includes('\r\n') ? '\r\n' : '\n'; | ||
this.push(value + eol + '$$$$' + eol); | ||
callback(); | ||
}) | ||
); | ||
return pipeline.obj( | ||
split2(/\r?\n\${4}.*\r?\n/), | ||
filterStream(filterCb), | ||
through2({ objectMode: true }, function (value, encoding, callback) { | ||
const eol = value.includes('\r\n') ? '\r\n' : '\n'; | ||
this.push(`${value + eol}$$$$${eol}`); | ||
callback(); | ||
}) | ||
); | ||
} | ||
function molecules(options) { | ||
return combine( | ||
entries(), | ||
through2({objectMode: true}, function (value, encoding, callback) { | ||
try { | ||
const parsed = parse(value, options); | ||
if (parsed.molecules.length === 1) { | ||
if (options && options.fullResult) { | ||
this.push(parsed); | ||
} else { | ||
this.push(parsed.molecules[0]); | ||
} | ||
} | ||
callback(); | ||
} catch (e) { | ||
callback(e); | ||
} | ||
}) | ||
); | ||
return pipeline.obj( | ||
entries(), | ||
through2({ objectMode: true }, function (value, encoding, callback) { | ||
try { | ||
const parsed = parse(value, options); | ||
if (parsed.molecules.length === 1) { | ||
if (options && options.fullResult) { | ||
this.push(parsed); | ||
} else { | ||
this.push(parsed.molecules[0]); | ||
} | ||
} | ||
callback(); | ||
} catch (e) { | ||
callback(e); | ||
} | ||
}) | ||
); | ||
} | ||
module.exports = { | ||
entries, | ||
molecules | ||
entries, | ||
molecules | ||
}; |
12358
8
184
5
105
+ Added @targos/pumpify@^2.0.0
+ Added readable-stream@^3.4.0
+ Added @targos/pumpify@2.0.0 (transitive)
+ Added duplexify@4.1.3 (transitive)
+ Added end-of-stream@1.4.4 (transitive)
+ Added once@1.4.0 (transitive)
+ Added pump@3.0.2 (transitive)
+ Added readable-stream@3.6.2 (transitive)
+ Added safe-buffer@5.2.1 (transitive)
+ Added split2@3.2.2 (transitive)
+ Added stream-shift@1.0.3 (transitive)
+ Added string_decoder@1.3.0 (transitive)
+ Added through2@3.0.2, 4.0.2 (transitive)
+ Added through2-filter@3.1.0 (transitive)
+ Added wrappy@1.0.2 (transitive)
- Removed multipipe@^2.0.1
- Removed core-util-is@1.0.3 (transitive)
- Removed duplexer2@0.1.4 (transitive)
- Removed isarray@1.0.0 (transitive)
- Removed multipipe@2.0.3 (transitive)
- Removed object-assign@4.1.1 (transitive)
- Removed process-nextick-args@2.0.1 (transitive)
- Removed readable-stream@2.3.8 (transitive)
- Removed safe-buffer@5.1.2 (transitive)
- Removed split2@2.2.0 (transitive)
- Removed string_decoder@1.1.1 (transitive)
- Removed through2@2.0.5 (transitive)
- Removed through2-filter@2.0.0 (transitive)
- Removed xtend@4.0.2 (transitive)
Updated split2@^3.1.1
Updated through2@^3.0.1
Updated through2-filter@^3.0.0