Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

sdf-parser

Package Overview
Dependencies
Maintainers
4
Versions
16
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

sdf-parser - npm Package Compare versions

Comparing version 5.0.2 to 6.0.0

src/__tests__/iterator.test.js

271

lib/index.js

@@ -6,14 +6,5 @@ 'use strict';

var ensureString = require('ensure-string');
var pipeline = require('pumpify');
var split2 = require('split2');
var through2 = require('through2');
var filter = require('through2-filter');
var readline = require('readline');
var dynamicTyping = require('dynamic-typing');
/**
 * Bundler interop helper: hand back `e` untouched when it is an object that
 * already has a `default` property, otherwise wrap it as `{ default: e }`.
 * @param {*} e Value returned by `require`
 * @returns {*} `e` itself, or an object whose `default` property is `e`
 */
function _interopDefaultLegacy(e) {
  const alreadyWrapped = e && typeof e === 'object' && 'default' in e;
  if (alreadyWrapped) {
    return e;
  }
  return { 'default': e };
}
var pipeline__default = /*#__PURE__*/_interopDefaultLegacy(pipeline);
var split2__default = /*#__PURE__*/_interopDefaultLegacy(split2);
var through2__default = /*#__PURE__*/_interopDefaultLegacy(through2);
var filter__default = /*#__PURE__*/_interopDefaultLegacy(filter);
function getEntriesBoundaries(string, substring, eol) {

@@ -41,21 +32,75 @@ const res = [];

/**
 * Build a molecule object from the text of a single SDF entry.
 *
 * The entry is split on `${options.eol}>`: the first part becomes
 * `molecule.molfile`, every following part is a data field whose label is the
 * text between `<` and `>` on its first line and whose value is the remaining
 * lines (the last line of each part is ignored) joined with `options.eol`.
 *
 * Side effects on the arguments:
 * - every label met is pushed to `currentLabels`, kept or not;
 * - a label seen for the first time gets an entry in `labels`
 *   (`counter`, `isNumeric`, `keep`, and optionally `modifier` / `forEach`);
 * - `labels[label].isNumeric` is switched to `false` as soon as one value of
 *   that field does not look like a plain number.
 *
 * @param {string} sdfPart Text of one SDF entry
 * @param {object} labels Per-label bookkeeping shared between entries (mutated)
 * @param {string[]} currentLabels Receives the labels found in this entry (mutated)
 * @param {object} options
 * @param {string} options.eol End of line sequence used in `sdfPart`
 * @param {boolean} [options.dynamicTyping] Initial value of `isNumeric` for new labels
 * @param {string[]} [options.include] If set, only these labels are kept
 * @param {string[]} [options.exclude] If set, these labels are dropped
 * @param {object} options.modifiers Callbacks per label; a result of `undefined` or `null` removes the field
 * @param {object} options.forEach Callbacks per label, only recorded in `labels` here
 * @returns {object|undefined} The molecule, or `undefined` when the first part has 5 characters or fewer
 */
function getMolecule$1(sdfPart, labels, currentLabels, options) {
  const { eol } = options;
  const parts = sdfPart.split(`${eol}>`);
  if (parts.length === 0 || parts[0].length <= 5) return;

  const molecule = {};
  molecule.molfile = parts[0] + eol;

  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    const header = lines[0];
    const label = header.substring(
      header.indexOf('<') + 1,
      header.indexOf('>'),
    );
    currentLabels.push(label);

    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      const isExcluded =
        Boolean(options.exclude) && options.exclude.indexOf(label) !== -1;
      const isIncluded = !options.include || options.include.indexOf(label) > -1;
      if (!isExcluded && isIncluded) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }

    const info = labels[label];
    if (!info.keep) continue;

    // The last element of `lines` is skipped on purpose (text after the final eol).
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }

    if (info.modifier) {
      const modifiedValue = info.modifier(molecule[label]);
      if (modifiedValue === undefined || modifiedValue === null) {
        delete molecule[label];
      } else {
        molecule[label] = modifiedValue;
      }
    }

    if (info.isNumeric) {
      const value = molecule[label];
      // A modifier may return something that is not a string. Calling `.match`
      // on such a value used to throw, so the check is now split by type:
      // strings keep the historical rule (coercible to a finite number and no
      // leading zero followed by a digit), numbers must be finite, and
      // anything else is not numeric.
      const isNumericString =
        typeof value === 'string' && isFinite(value) && !/^0[0-9]/.test(value);
      const isFiniteNumber = typeof value === 'number' && Number.isFinite(value);
      if (!isNumericString && !isFiniteNumber) {
        info.isNumeric = false;
      }
    }
  }
  return molecule;
}
/**
* Parse a SDF file
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
* @param {any} [options={}]
* @param {array<string>} [options.include] List of fields to include
* @param {array<string>} [options.exclude] List of fields to exclude
* @param {object} [options={}]
* @param {string[]} [options.include] List of fields to include
* @param {string[]} [options.exclude] List of fields to exclude
* @param {Function} [options.filter] Callback allowing to filter the molecules
* @param {boolean} [options.dynamicTyping] Dynamically type the data
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
* @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
*/
function parse(sdf, options = {}) {
const {
include,
exclude,
filter,
modifiers = {},
forEach = {},
dynamicTyping = true,
} = options;
options = { ...options };
if (options.modifiers === undefined) options.modifiers = {};
if (options.forEach === undefined) options.forEach = {};
if (options.dynamicTyping === undefined) options.dynamicTyping = true;

@@ -67,17 +112,23 @@ sdf = ensureString.ensureString(sdf);

let eol = '\n';
if (options.mixedEOL) {
sdf = sdf.replace(/\r\n/g, '\n');
sdf = sdf.replace(/\r/g, '\n');
} else {
// we will find the delimiter in order to be much faster and not use regular expression
let header = sdf.substr(0, 1000);
if (header.indexOf('\r\n') > -1) {
eol = '\r\n';
} else if (header.indexOf('\r') > -1) {
eol = '\r';
if (options.eol === undefined) {
options.eol = '\n';
if (options.mixedEOL) {
sdf = sdf.replace(/\r\n/g, '\n');
sdf = sdf.replace(/\r/g, '\n');
} else {
// we will find the delimiter in order to be much faster and not use regular expression
let header = sdf.substr(0, 1000);
if (header.indexOf('\r\n') > -1) {
options.eol = '\r\n';
} else if (header.indexOf('\r') > -1) {
options.eol = '\r';
}
}
}
let entriesBoundaries = getEntriesBoundaries(sdf, `${eol}$$$$`, eol);
let entriesBoundaries = getEntriesBoundaries(
sdf,
`${options.eol}$$$$`,
options.eol,
);
let molecules = [];

@@ -90,68 +141,14 @@ let labels = {};

let sdfPart = sdf.substring(...entriesBoundaries[i]);
let parts = sdfPart.split(`${eol}>`);
if (parts.length > 0 && parts[0].length > 5) {
let molecule = {};
let currentLabels = [];
molecule.molfile = parts[0] + eol;
for (let j = 1; j < parts.length; j++) {
let lines = parts[j].split(eol);
let from = lines[0].indexOf('<');
let to = lines[0].indexOf('>');
let label = lines[0].substring(from + 1, to);
currentLabels.push(label);
if (!labels[label]) {
labels[label] = {
counter: 0,
isNumeric: dynamicTyping,
keep: false,
};
if (
(!exclude || exclude.indexOf(label) === -1) &&
(!include || include.indexOf(label) > -1)
) {
labels[label].keep = true;
if (modifiers[label]) {
labels[label].modifier = modifiers[label];
}
if (forEach[label]) {
labels[label].forEach = forEach[label];
}
}
}
if (labels[label].keep) {
for (let k = 1; k < lines.length - 1; k++) {
if (molecule[label]) {
molecule[label] += eol + lines[k];
} else {
molecule[label] = lines[k];
}
}
if (labels[label].modifier) {
let modifiedValue = labels[label].modifier(molecule[label]);
if (modifiedValue === undefined || modifiedValue === null) {
delete molecule[label];
} else {
molecule[label] = modifiedValue;
}
}
if (labels[label].isNumeric) {
if (
!isFinite(molecule[label]) ||
molecule[label].match(/^0[0-9]/)
) {
labels[label].isNumeric = false;
}
}
}
let currentLabels = [];
const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
if (!molecule) continue;
if (!options.filter || options.filter(molecule)) {
molecules.push(molecule);
// only now we can increase the counter
for (let j = 0; j < currentLabels.length; j++) {
labels[currentLabels[j]].counter++;
}
if (!filter || filter(molecule)) {
molecules.push(molecule);
// only now we can increase the counter
for (let j = 0; j < currentLabels.length; j++) {
labels[currentLabels[j]].counter++;
}
}
}
}
// all numeric fields should be converted to numbers

@@ -202,43 +199,55 @@ for (let label in labels) {

const filterStream = filter__default["default"].bind(null, { objectMode: true });
/**
* Parse a SDF file
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
* @param {object} [options={}]
* @param {Function} [options.filter] Callback allowing to filter the molecules
* @param {boolean} [options.dynamicTyping] Dynamically type the data
*/
/**
 * Predicate telling whether a chunk is worth keeping: it must be longer than
 * one character both before and after trimming surrounding whitespace.
 * @param {string} chunk
 * @returns {boolean}
 */
function filterCb(chunk) {
  if (!(chunk.length > 1)) {
    return false;
  }
  const trimmed = chunk.trim();
  return trimmed.length > 1;
}
async function* iterator(readStream, options = {}) {
const lines = readline.createInterface(readStream);
const currentLines = [];
options = { ...options };
if (options.dynamicTyping === undefined) options.dynamicTyping = true;
function entries() {
return pipeline__default["default"].obj(
split2__default["default"](/\r?\n\${4}.*\r?\n/),
filterStream(filterCb),
through2__default["default"]({ objectMode: true }, function process(value, encoding, callback) {
const eol = value.includes('\r\n') ? '\r\n' : '\n';
this.push(`${value + eol}$$$$${eol}`);
callback();
}),
);
options.eol = '\n';
for await (let line of lines) {
if (line.startsWith('$$$$')) {
const molecule = getMolecule(currentLines.join(options.eol), options);
if (!options.filter || options.filter(molecule)) {
yield molecule;
}
currentLines.length = 0;
} else {
currentLines.push(line);
}
}
}
function molecules(options) {
return pipeline__default["default"].obj(
entries(),
through2__default["default"]({ objectMode: true }, function process(value, encoding, callback) {
try {
const parsed = parse(value, options);
if (parsed.molecules.length === 1) {
if (options && options.fullResult) {
this.push(parsed);
} else {
this.push(parsed.molecules[0]);
}
}
callback();
} catch (e) {
callback(e);
function getMolecule(sdfPart, options) {
let parts = sdfPart.split(`${options.eol}>`);
if (parts.length === 0 || parts[0].length <= 5) return;
let molecule = {};
molecule.molfile = parts[0] + options.eol;
for (let j = 1; j < parts.length; j++) {
let lines = parts[j].split(options.eol);
let from = lines[0].indexOf('<');
let to = lines[0].indexOf('>');
let label = lines[0].substring(from + 1, to);
for (let k = 1; k < lines.length - 1; k++) {
if (molecule[label]) {
molecule[label] += options.eol + lines[k];
} else {
molecule[label] = lines[k];
}
}),
);
}
if (options.dynamicTyping) {
molecule[label] = dynamicTyping.parseString(molecule[label]);
}
}
return molecule;
}
exports.entries = entries;
exports.molecules = molecules;
exports.iterator = iterator;
exports.parse = parse;
{
"name": "sdf-parser",
"version": "5.0.2",
"version": "6.0.0",
"description": "SDF parser",

@@ -52,2 +52,3 @@ "main": "lib/index.js",

"eslint-config-cheminfo": "^8.0.2",
"filelist-utils": "^0.6.0",
"jest": "^28.1.3",

@@ -58,8 +59,5 @@ "openchemlib": "^8.0.1",

"dependencies": {
"ensure-string": "^1.2.0",
"pumpify": "^2.0.1",
"split2": "^4.1.0",
"through2": "^4.0.2",
"through2-filter": "^3.0.0"
"dynamic-typing": "^1.0.0",
"ensure-string": "^1.2.0"
}
}

@@ -60,37 +60,16 @@ # sdf-parser

## Streams
## Iterator
This API is only available on Node.js.
### molecules(options)
Transform an input text stream to a stream of molecule objects.
#### options
- `fullResult`: true to emit the full result of `parse` instead of just the molecules.
- All other options from the `parse` function.
```js
const { stream } = require('sdf-parser');
fs.createReadStream('test.sdf')
.pipe(stream.molecules())
.on('data', (molecule) => {
console.log(molecule.molfile);
});
const { iterator } = require('sdf-parser');
const readStream = createReadStream(join(__dirname, 'test.sdf.gz'));
const stream = readStream.pipe(createGunzip());
const results = [];
for await (const entry of iterator(stream)) {
results.push(entry);
}
```
### entries()
Transform an input text stream to a stream of sdf entries.
```js
const { stream } = require('sdf-parser');
fs.createReadStream('test.sdf')
.pipe(stream.entries())
.on('data', (entry) => {
// sdf entry as a string
});
```
## License

@@ -102,5 +81,5 @@

[npm-url]: https://www.npmjs.com/package/sdf-parser
[travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/master.svg?style=flat-square
[travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/main.svg?style=flat-square
[travis-url]: https://travis-ci.org/cheminfo/sdf-parser
[download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
[download-url]: https://www.npmjs.com/package/sdf-parser
export * from './parse';
export * from './stream';
export * from './iterator';
import { ensureString } from 'ensure-string';
import { getEntriesBoundaries } from './getEntriesBoundaries';
import { getMolecule } from './util/getMolecule';
/**
* Parse a SDF file
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
* @param {any} [options={}]
* @param {array<string>} [options.include] List of fields to include
* @param {array<string>} [options.exclude] List of fields to exclude
* @param {object} [options={}]
* @param {string[]} [options.include] List of fields to include
* @param {string[]} [options.exclude] List of fields to exclude
* @param {Function} [options.filter] Callback allowing to filter the molecules
* @param {boolean} [options.dynamicTyping] Dynamically type the data
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
* @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
*/
export function parse(sdf, options = {}) {
const {
include,
exclude,
filter,
modifiers = {},
forEach = {},
dynamicTyping = true,
} = options;
options = { ...options };
if (options.modifiers === undefined) options.modifiers = {};
if (options.forEach === undefined) options.forEach = {};
if (options.dynamicTyping === undefined) options.dynamicTyping = true;

@@ -29,17 +28,23 @@ sdf = ensureString(sdf);

let eol = '\n';
if (options.mixedEOL) {
sdf = sdf.replace(/\r\n/g, '\n');
sdf = sdf.replace(/\r/g, '\n');
} else {
// we will find the delimiter in order to be much faster and not use regular expression
let header = sdf.substr(0, 1000);
if (header.indexOf('\r\n') > -1) {
eol = '\r\n';
} else if (header.indexOf('\r') > -1) {
eol = '\r';
if (options.eol === undefined) {
options.eol = '\n';
if (options.mixedEOL) {
sdf = sdf.replace(/\r\n/g, '\n');
sdf = sdf.replace(/\r/g, '\n');
} else {
// we will find the delimiter in order to be much faster and not use regular expression
let header = sdf.substr(0, 1000);
if (header.indexOf('\r\n') > -1) {
options.eol = '\r\n';
} else if (header.indexOf('\r') > -1) {
options.eol = '\r';
}
}
}
let entriesBoundaries = getEntriesBoundaries(sdf, `${eol}$$$$`, eol);
let entriesBoundaries = getEntriesBoundaries(
sdf,
`${options.eol}$$$$`,
options.eol,
);
let molecules = [];

@@ -52,68 +57,14 @@ let labels = {};

let sdfPart = sdf.substring(...entriesBoundaries[i]);
let parts = sdfPart.split(`${eol}>`);
if (parts.length > 0 && parts[0].length > 5) {
let molecule = {};
let currentLabels = [];
molecule.molfile = parts[0] + eol;
for (let j = 1; j < parts.length; j++) {
let lines = parts[j].split(eol);
let from = lines[0].indexOf('<');
let to = lines[0].indexOf('>');
let label = lines[0].substring(from + 1, to);
currentLabels.push(label);
if (!labels[label]) {
labels[label] = {
counter: 0,
isNumeric: dynamicTyping,
keep: false,
};
if (
(!exclude || exclude.indexOf(label) === -1) &&
(!include || include.indexOf(label) > -1)
) {
labels[label].keep = true;
if (modifiers[label]) {
labels[label].modifier = modifiers[label];
}
if (forEach[label]) {
labels[label].forEach = forEach[label];
}
}
}
if (labels[label].keep) {
for (let k = 1; k < lines.length - 1; k++) {
if (molecule[label]) {
molecule[label] += eol + lines[k];
} else {
molecule[label] = lines[k];
}
}
if (labels[label].modifier) {
let modifiedValue = labels[label].modifier(molecule[label]);
if (modifiedValue === undefined || modifiedValue === null) {
delete molecule[label];
} else {
molecule[label] = modifiedValue;
}
}
if (labels[label].isNumeric) {
if (
!isFinite(molecule[label]) ||
molecule[label].match(/^0[0-9]/)
) {
labels[label].isNumeric = false;
}
}
}
let currentLabels = [];
const molecule = getMolecule(sdfPart, labels, currentLabels, options);
if (!molecule) continue;
if (!options.filter || options.filter(molecule)) {
molecules.push(molecule);
// only now we can increase the counter
for (let j = 0; j < currentLabels.length; j++) {
labels[currentLabels[j]].counter++;
}
if (!filter || filter(molecule)) {
molecules.push(molecule);
// only now we can increase the counter
for (let j = 0; j < currentLabels.length; j++) {
labels[currentLabels[j]].counter++;
}
}
}
}
// all numeric fields should be converted to numbers

@@ -120,0 +71,0 @@ for (let label in labels) {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc