@@ -7,4 +7,203 @@ /**
		* Follow this naming convention to ensure that your detectors are properly loaded.
		*
		* TODO: Use detectors from WebLogo pickUp.. methods
		*/
		class Sequences1PackageDetectors extends DG.Package {


		class BioPackageDetectors extends DG.Package {

		static semType = 'MACROMOLECULE';

		static Units = {
		FastaSeqPt: 'fasta:SEQ:PT', FastaSeqNt: 'fasta:SEQ:NT', FastaMsaPt: 'fasta:MSA:PT', FastaMsaNt: 'fasta:MSA:NT',
		};

		static AminoacidsFastaAlphabet = new Set([
		'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
		'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
		]);

		static NucleotidesFastaAlphabet = new Set(['A', 'C', 'G', 'T']);

		//tags: semTypeDetector
		//input: column col
		//output: string semType
		detectMacromolecule(col) {
		// To collect alphabet freq three strategies can be used:
		// as chars, as fasta (single or within square brackets), as with the separator.

		const alphabetCandidates = [
		['NT', BioPackageDetectors.NucleotidesFastaAlphabet],
		['PT', BioPackageDetectors.AminoacidsFastaAlphabet],
		];

		// TODO: Detect HELM sequence
		// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
		const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
		if (statsAsChars.sameLength) {
		const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, alphabetCandidates, '-');
		const units = `fasta:SEQ.MSA:${alphabet}`;
		col.setTag(DG.TAGS.UNITS, units);
		return BioPackageDetectors.semType;
		} else {
		const sep = BioPackageDetectors.detectSeparator(statsAsChars.freq);
		const gapSymbol = sep ? '' : '-';
		const splitter = sep ? BioPackageDetectors.getSplitterWithSeparator(sep) : BioPackageDetectors.splitterAsFasta;
		const stats = BioPackageDetectors.getStats(col, 5, splitter);

		const format = sep ? 'separator' : 'fasta';
		const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';

		// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
		const alphabet = BioPackageDetectors.detectAlphabet(stats.freq, alphabetCandidates, gapSymbol);

		const units = `${format}:${seqType}:${alphabet}`;
		col.setTag(DG.TAGS.UNITS, units);
		return BioPackageDetectors.semType;
		}
		}

		/** Detects the most frequent char with a rate of at least 0.15 of others in sum.
		* Does not use any splitting strategies, estimates just by single characters.
		* */
		static detectSeparator(freq) {
		// To detect a separator we analyse col's sequences character frequencies.
		// If there is an exceptionally frequent symbol, then we will call it the separator.
		// The most frequent symbol should occur with a rate of at least 0.15
		// of all other symbols in sum to be called the separator.

		// !!! But there is a caveat because exceptionally frequent char can be a gap symbol in MSA.
		// !!! What is the difference between the gap symbol and separator symbol in stats terms?

		const maxFreq = Math.max(...Object.values(freq));
		const sep = Object.entries(freq).find((kv) => kv[1] == maxFreq)[0];
		const sepFreq = freq[sep];
		const otherSumFreq = Object.entries(freq).filter((kv) => kv[0] !== sep)
		.map((kv) => kv[1]).reduce((pSum, a) => pSum + a, 0);
		const freqThreshold = 3.5 * (1 / Object.keys(freq).length);
		return sepFreq / otherSumFreq > freqThreshold ? sep : null;
		}

		/** Stats of sequences with specified splitter func, returns { freq, sameLength } */
		static getStats(seqCol, minLength, splitter) {
		const freq = {};
		let sameLength = true;
		let firstLength = null;

		for (const seq of seqCol.categories) {
		const mSeq = splitter(seq);

		if (firstLength == null) {
		firstLength = mSeq.length;
		} else if (mSeq.length !== firstLength) {
		sameLength = false;
		}

		if (mSeq.length > minLength) {
		for (const m of mSeq) {
		if (!(m in freq)) {
		freq[m] = 0;
		}
		freq[m] += 1;
		}
		}
		}
		return {freq: freq, sameLength: sameLength};
		}

		/** Detects alphabet for freq by freq similarity to alphabet monomer set.
		* @param freq frequencies of monomers in sequence set
		* @param candidates an array of pairs [name, monomer set]
		* */
		static detectAlphabet(freq, candidates, gapSymbol) {
		const candidatesSims = candidates.map((c) => {
		const sim = BioPackageDetectors.getAlphabetSimilarity(freq, c[1], gapSymbol);
		return [c[0], c[1], freq, sim];
		});

		let alphabetName;
		const maxSim = Math.max(...candidatesSims.map((cs) => cs[3]));
		if (maxSim > 0.65) {
		const sim = candidatesSims.find((cs) => cs[3] == maxSim);
		alphabetName = sim[0];
		} else {
		alphabetName = 'UN';
		}
		return alphabetName;
		}

		static getAlphabetSimilarity(freq, alphabet, gapSymbol) {
		const keys = new Set([...new Set(Object.keys(freq)), ...alphabet]);
		keys.delete(gapSymbol);

		const freqA = [];
		const alphabetA = [];
		for (const m of keys) {
		freqA.push(m in freq ? freq[m] : 0);
		alphabetA.push(alphabet.has(m) ? 1 : 0);
		}
		/* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
		const cos = BioPackageDetectors.vectorDotProduct(freqA, alphabetA) / (BioPackageDetectors.vectorLength(freqA) * BioPackageDetectors.vectorLength(alphabetA));
		return cos;
		}

		static vectorLength(v) {
		let sqrSum = 0;
		for (let i = 0; i < v.length; i++) {
		sqrSum += v[i] * v[i];
		}
		return Math.sqrt(sqrSum);
		}

		static vectorDotProduct(v1, v2) {
		if (v1.length != v2.length) {
		throw Error('The dimensionality of the vectors must match');
		}
		let prod = 0;
		for (let i = 0; i < v1.length; i++) {
		prod += v1[i] * v2[i];
		}
		return prod;
		}

		/** For trivial checks split by single chars*/
		static splitterAsChars(seq) {
		return seq.split('');
		}

		static getSplitterWithSeparator(sep) {
		return function(seq) {
		return seq.split(sep);
		};
		}

		// Multichar monomer names in square brackets, single char monomers or gap symbol
		static monomerRe = /\[(\w+)\]\|(\w)\|(-)/g;

		/** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
		static splitterAsFasta(seq) {
		const res = wu(seq.toString().matchAll(BioPackageDetectors.monomerRe)).map((ma) => {
		let mRes;
		const m = ma[0];
		if (m.length > 1) {
		if (m in BioPackageDetectors.aaSynonyms) {
		mRes = BioPackageDetectors.aaSynonyms[m];
		} else {
		mRes = '';
		console.debug(`Long monomer '${m}' has not a short synonym.`);
		}
		} else {
		mRes = m;
		}
		return mRes;
		}).toArray();

		return res;
		}

		/** Only some of the synonyms. These were obtained from the clustered oligopeptide dataset. */
		static aaSynonyms = {
		'[MeNle]': 'L', // Nle - norleucine
		'[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
		};
		}

package.json

		@@ -5,3 +5,3 @@ {
		"friendlyName": "Bio",
		"version": "1.2.1",
		"version": "1.3.0",
		"description": "Bio is a [package](https://datagrok.ai/help/develop/develop#packages) for the [Datagrok](https://datagrok.ai) platform",
		@@ -14,3 +14,3 @@ "repository": {
		"dependencies": {
		"@datagrok-libraries/bio": "^2.1.0",
		"@datagrok-libraries/bio": "^2.1.1",
		"@datagrok-libraries/utils": "^0.4.2",
		@@ -17,0 +17,0 @@ "cash-dom": "latest",

src/package-test.ts

		@@ -5,3 +5,5 @@ import * as DG from 'datagrok-api/dg';

		import './tests/WebLogo.test';
		import './tests/WebLogo-test';
		import './tests/Palettes-test';
		import './tests/detectors-test';

		@@ -11,10 +13,11 @@ export const _packageTest = new DG.Package();

		/** For the 'test' function argument names are fixed as 'category' and 'test' because of way it is called. */
		//name: test
		//input: string category {optional: true}
		//input: string t {optional: true}
		//input: string test {optional: true}
		//output: dataframe result
		//top-menu: Tools \| Dev \| JS API Tests
		export async function test(category: string, t: string): Promise<DG.DataFrame> {
		const data = await runTests({category, test: t});
		export async function test(category: string, test: string): Promise<DG.DataFrame> {
		const data = await runTests({category, test});
		return DG.DataFrame.fromObjects(data)!;
		}

src/tests/Palettes.test.ts

src/tests/WebLogo.test.ts

test-Bio-3109311545e4-270baeb2.html

dist/package-test.js

Sorry, the diff of this file is too big to display

dist/package.js

Sorry, the diff of this file is too big to display

@datagrok/bio - npm Package Compare versions

Improved metrics

Dependency changes