Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

@datagrok/bio

Package Overview
Dependencies
Maintainers
6
Versions
280
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@datagrok/bio - npm Package Compare versions

Comparing version 1.2.1 to 1.3.0

src/tests/detectors-test.ts

201

detectors.js

@@ -7,4 +7,203 @@ /**

* Follow this naming convention to ensure that your detectors are properly loaded.
*
* TODO: Use detectors from WebLogo pickUp.. methods
*/
class Sequences1PackageDetectors extends DG.Package {
class BioPackageDetectors extends DG.Package {
  /** Semantic type reported for columns recognised by detectMacromolecule. */
  static semType = 'MACROMOLECULE';

  /** Named units values.
   * NOTE(review): detectMacromolecule builds its units strings itself (e.g. 'fasta:SEQ.MSA:NT')
   * and does not read this map; confirm which spelling is the intended one. */
  static Units = {
    FastaSeqPt: 'fasta:SEQ:PT', FastaSeqNt: 'fasta:SEQ:NT', FastaMsaPt: 'fasta:MSA:PT', FastaMsaNt: 'fasta:MSA:NT',
  };

  /** Single-letter monomers of the peptide ('PT') alphabet candidate. */
  static AminoacidsFastaAlphabet = new Set([
    'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
    'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
  ]);

  /** Single-letter monomers of the nucleotide ('NT') alphabet candidate. */
  static NucleotidesFastaAlphabet = new Set(['A', 'C', 'G', 'T']);

  /** Classifies a column as macromolecule sequences and stores the detected
   * `<format>:<seqType>:<alphabet>` string in the column's units tag.
   *
   * Returns null, leaving the column untouched, when no category is longer than
   * the minimal length (5 symbols), i.e. when there is nothing to base a decision on.
   * Otherwise returns BioPackageDetectors.semType; the alphabet part of the units
   * is 'UN' when no candidate alphabet is similar enough.
   */
  //tags: semTypeDetector
  //input: column col
  //output: string semType
  detectMacromolecule(col) {
    // To collect alphabet freq three strategies can be used:
    // as chars, as fasta (single or within square brackets), as with the separator.
    const alphabetCandidates = [
      ['NT', BioPackageDetectors.NucleotidesFastaAlphabet],
      ['PT', BioPackageDetectors.AminoacidsFastaAlphabet],
    ];
    // TODO: Detect HELM sequence
    // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
    const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);

    // Without any collected symbol there is no evidence at all: previously this either tagged
    // the column as an 'UN' macromolecule or crashed inside detectSeparator.
    if (Object.keys(statsAsChars.freq).length === 0)
      return null;

    let units;
    if (statsAsChars.sameLength) {
      // All values have the same number of characters: treated as plain fasta, '-' being the gap.
      const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, alphabetCandidates, '-');
      units = `fasta:SEQ.MSA:${alphabet}`;
    } else {
      const sep = BioPackageDetectors.detectSeparator(statsAsChars.freq);
      // With a separator, empty monomers ('') are ignored instead of the '-' symbol.
      const gapSymbol = sep ? '' : '-';
      const splitter = sep ? BioPackageDetectors.getSplitterWithSeparator(sep) : BioPackageDetectors.splitterAsFasta;
      const stats = BioPackageDetectors.getStats(col, 5, splitter);
      const format = sep ? 'separator' : 'fasta';
      const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
      // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
      const alphabet = BioPackageDetectors.detectAlphabet(stats.freq, alphabetCandidates, gapSymbol);
      units = `${format}:${seqType}:${alphabet}`;
    }
    col.setTag(DG.TAGS.UNITS, units);
    return BioPackageDetectors.semType;
  }

  /** Picks the most frequent symbol of freq and returns it as the separator when its count,
   * divided by the summed count of all other symbols, exceeds 3.5 / (number of distinct symbols).
   * Does not use any splitting strategies, estimates just by the given frequencies.
   * @param freq map symbol -> count
   * @returns the separator symbol, or null when none qualifies or freq is empty
   * */
  static detectSeparator(freq) {
    // !!! But there is a caveat because exceptionally frequent char can be a gap symbol in MSA.
    // !!! What is the difference between the gap symbol and separator symbol in stats terms?
    const entries = Object.entries(freq);
    if (entries.length === 0)
      return null; // nothing counted, nothing to call a separator (used to throw here)

    let sep = entries[0][0];
    let sepFreq = entries[0][1];
    let totalFreq = 0;
    for (const [symbol, count] of entries) {
      totalFreq += count;
      // Strict comparison keeps the first of equally frequent symbols.
      if (count > sepFreq) {
        sep = symbol;
        sepFreq = count;
      }
    }
    const otherSumFreq = totalFreq - sepFreq;
    const freqThreshold = 3.5 * (1 / entries.length);
    return sepFreq / otherSumFreq > freqThreshold ? sep : null;
  }

  /** Stats of sequences with specified splitter func.
   * @param seqCol column; its `categories` are iterated as the sequences
   * @param minLength only sequences with MORE monomers than this contribute to freq
   * @param splitter function turning a sequence into an array of monomers
   * @returns { freq, sameLength }: monomer counts, and whether every sequence
   *          (including the short ones) was split into the same number of monomers
   * */
  static getStats(seqCol, minLength, splitter) {
    const freq = {};
    let sameLength = true;
    let firstLength = null;
    for (const seq of seqCol.categories) {
      const mSeq = splitter(seq);
      if (firstLength == null)
        firstLength = mSeq.length;
      else if (mSeq.length !== firstLength)
        sameLength = false;

      if (mSeq.length > minLength) {
        for (const m of mSeq) {
          // Own-property check: `m in freq` would also match names inherited from
          // Object.prototype (e.g. a monomer called 'constructor') and corrupt the count.
          const seen = Object.prototype.hasOwnProperty.call(freq, m);
          freq[m] = (seen ? freq[m] : 0) + 1;
        }
      }
    }
    return {freq: freq, sameLength: sameLength};
  }

  /** Detects alphabet for freq by freq similarity to alphabet monomer set.
   * @param freq frequencies of monomers in sequence set
   * @param candidates an array of pairs [name, monomer set]
   * @param gapSymbol symbol excluded from the comparison
   * @returns name of the first candidate with the highest similarity if that similarity
   *          is above 0.65, otherwise 'UN'
   * */
  static detectAlphabet(freq, candidates, gapSymbol) {
    let bestName = 'UN';
    let bestSim = 0.65; // a candidate must be strictly more similar than this
    for (const [name, monomerSet] of candidates) {
      const sim = BioPackageDetectors.getAlphabetSimilarity(freq, monomerSet, gapSymbol);
      if (sim > bestSim) {
        bestSim = sim;
        bestName = name;
      }
    }
    return bestName;
  }

  /** Cosine between the monomer counts and the 0/1 membership vector of the alphabet,
   * taken over the union of the counted monomers and the alphabet, without gapSymbol.
   * @returns the cosine, or 0 when either vector has zero length
   * */
  static getAlphabetSimilarity(freq, alphabet, gapSymbol) {
    const keys = new Set([...Object.keys(freq), ...alphabet]);
    keys.delete(gapSymbol);

    const freqA = [];
    const alphabetA = [];
    for (const m of keys) {
      freqA.push(Object.prototype.hasOwnProperty.call(freq, m) ? freq[m] : 0);
      alphabetA.push(alphabet.has(m) ? 1 : 0);
    }
    /* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
    const norm = BioPackageDetectors.vectorLength(freqA) * BioPackageDetectors.vectorLength(alphabetA);
    if (norm === 0)
      return 0; // avoid 0 / 0 = NaN when nothing was counted
    return BioPackageDetectors.vectorDotProduct(freqA, alphabetA) / norm;
  }

  /** Euclidean length of a numeric vector. */
  static vectorLength(v) {
    let sqrSum = 0;
    for (const x of v)
      sqrSum += x * x;
    return Math.sqrt(sqrSum);
  }

  /** Dot product of two numeric vectors; throws if their lengths differ. */
  static vectorDotProduct(v1, v2) {
    if (v1.length !== v2.length)
      throw new Error('The dimensionality of the vectors must match');

    let prod = 0;
    for (let i = 0; i < v1.length; i++)
      prod += v1[i] * v2[i];
    return prod;
  }

  /** For trivial checks split by single chars*/
  static splitterAsChars(seq) {
    return seq.split('');
  }

  /** Returns a splitter that cuts a sequence at every occurrence of sep. */
  static getSplitterWithSeparator(sep) {
    return function(seq) {
      return seq.split(sep);
    };
  }

  // Multichar monomer names in square brackets, single char monomers or gap symbol
  static monomerRe = /\[(\w+)\]|(\w)|(-)/g;

  /** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol.
   * A bracketed name is replaced with its single-letter synonym from aaSynonyms;
   * a name without a synonym becomes '' and is reported with console.debug.
   * */
  static splitterAsFasta(seq) {
    // Array.from maps the match iterator directly, no iterator helper library is needed.
    return Array.from(seq.toString().matchAll(BioPackageDetectors.monomerRe), (ma) => {
      const m = ma[0];
      if (m.length <= 1)
        return m;
      if (Object.prototype.hasOwnProperty.call(BioPackageDetectors.aaSynonyms, m))
        return BioPackageDetectors.aaSynonyms[m];
      console.debug(`Long monomer '${m}' has not a short synonym.`);
      return '';
    });
  }

  /** Only some of the synonyms. These were obtained from the clustered oligopeptide dataset. */
  static aaSynonyms = {
    '[MeNle]': 'L', // Nle - norleucine
    '[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
  };
}

4

package.json

@@ -5,3 +5,3 @@ {

"friendlyName": "Bio",
"version": "1.2.1",
"version": "1.3.0",
"description": "Bio is a [package](https://datagrok.ai/help/develop/develop#packages) for the [Datagrok](https://datagrok.ai) platform",

@@ -14,3 +14,3 @@ "repository": {

"dependencies": {
"@datagrok-libraries/bio": "^2.1.0",
"@datagrok-libraries/bio": "^2.1.1",
"@datagrok-libraries/utils": "^0.4.2",

@@ -17,0 +17,0 @@ "cash-dom": "latest",

@@ -5,3 +5,5 @@ import * as DG from 'datagrok-api/dg';

import './tests/WebLogo.test';
import './tests/WebLogo-test';
import './tests/Palettes-test';
import './tests/detectors-test';

@@ -11,10 +13,11 @@ export const _packageTest = new DG.Package();

/** For the 'test' function, the argument names are fixed as 'category' and 'test' because of the way it is called. */
//name: test
//input: string category {optional: true}
//input: string t {optional: true}
//input: string test {optional: true}
//output: dataframe result
//top-menu: Tools | Dev | JS API Tests
export async function test(category: string, t: string): Promise<DG.DataFrame> {
const data = await runTests({category, test: t});
export async function test(category: string, test: string): Promise<DG.DataFrame> {
  // Forward both arguments to runTests and wrap the objects it resolves to into a dataframe.
  const results = await runTests({category: category, test: test});
  return DG.DataFrame.fromObjects(results)!;
}

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc