Comparing version 0.1.23 to 0.1.25
{ | ||
"name": "qminer", | ||
"version": "0.1.23", | ||
"version": "0.1.25", | ||
"description": "A C++ based data analytics platform for processing large-scale real-time streams containing structured and unstructured data", | ||
@@ -32,5 +32,6 @@ "author": "Blaz Fortuna <blaz@blazfortuna.com>", | ||
"engines": { | ||
"node": "0.11.14" | ||
"node": "0.12.0" | ||
}, | ||
"dependencies": { | ||
"sget": "~0.1.5", | ||
"bindings": "~1.2.1" | ||
@@ -37,0 +38,0 @@ }, |
@@ -11,3 +11,7 @@ // typical use case: pathPrefix = 'Release' or pathPrefix = 'Debug'. Empty argument is supported as well (the first binary that the bindings finds will be used) | ||
var qm_util = require(__dirname + '/qm_util.js'); | ||
var sget = require('sget'); | ||
/**
 * Returns `arg` unless it is missing, in which case `defaultval` is returned.
 *
 * "Missing" means `undefined` or `null` (the same set of values matched by the
 * loose `arg == undefined` comparison); other falsy values such as `0`, `false`
 * and `""` are treated as real arguments and returned unchanged.
 *
 * @param {*} arg - the value supplied by the caller (may be undefined/null)
 * @param {*} defaultval - the fallback used when `arg` is undefined or null
 * @returns {*} `arg` when it is neither undefined nor null, otherwise `defaultval`
 */
function defarg(arg, defaultval) {
    if (arg === undefined || arg === null) {
        return defaultval;
    }
    return arg;
}
@@ -23,3 +27,2 @@ function createBatchModel(featureSpace, models) { | ||
// save list | ||
debugger | ||
sout.writeLine(this.models); | ||
@@ -197,3 +200,258 @@ // save feature space | ||
//#- `alModel = analytics.newActiveLearner(query, qRecSet, fRecSet, ftrSpace, settings)` -- initializes the | ||
//# active learning. The algorithm is run by calling `model.startLoop()`. The algorithm has two stages: query mode, where the algorithm suggests potential
//# positive and negative examples based on the query text, and SVM mode, where the algorithm keeps | ||
//# selecting examples that are closest to the SVM margin (every time an example is labeled, the SVM
//# is retrained).
//# The inputs are: query (text), record set `qRecSet`, record set `fRecSet`, the feature space `ftrSpace` and a | ||
//# `settings` JSON object. The settings object specifies: `textField` (string), which is the name
//# of the field in records that is used to create feature vectors, `nPos` (integer) and `nNeg` (integer) set the number of positive and negative | ||
//# examples that have to be identified in the query mode before the program enters SVM mode. | ||
//# We can set two additional parameters `querySampleSize` and `randomSampleSize` which specify the sizes of subsamples of qRecSet and fRecSet, where the rest of the data is ignored in the active learning. | ||
//# Final parameters are all SVM parameters (c, j, batchSize, maxIterations, maxTime, minDiff, verbose). | ||
exports.newActiveLearner = function (query, qRecSet, fRecSet, ftrSpace, stts) { | ||
return new exports.ActiveLearner(query, qRecSet, fRecSet, ftrSpace, stts); | ||
} | ||
exports.ActiveLearner = function (query, qRecSet, fRecSet, ftrSpace, stts) { | ||
var settings = defarg(stts, {}); | ||
settings.nPos = defarg(stts.nPos, 2); | ||
settings.nNeg = defarg(stts.nNeg, 2); | ||
settings.textField = defarg(stts.textField, "Text"); | ||
settings.querySampleSize = defarg(stts.querySampleSize, -1); | ||
settings.randomSampleSize = defarg(stts.randomSampleSize, -1); | ||
settings.c = defarg(stts.c, 1.0); | ||
settings.j = defarg(stts.j, 1.0); | ||
settings.batchSize = defarg(stts.batchSize, 100); | ||
settings.maxIterations = defarg(stts.maxIterations, 100000); | ||
settings.maxTime = defarg(stts.maxTime, 1); // 1 second for computation by default | ||
settings.minDiff = defarg(stts.minDiff, 1e-6); | ||
settings.verbose = defarg(stts.verbose, false); | ||
// compute features or provide them | ||
settings.extractFeatures = defarg(stts.extractFeatures, true); | ||
if (!settings.extractFeatures) { | ||
if (stts.uMat == null) { throw 'settings uMat not provided, extractFeatures = false'; } | ||
if (stts.uRecSet == null) { throw 'settings uRecSet not provided, extractFeatures = false'; } | ||
if (stts.querySpVec == null) { throw 'settings querySpVec not provided, extractFeatures = false'; } | ||
} | ||
// QUERY MODE | ||
var queryMode = true; | ||
// bow similarity between query and training set | ||
var querySpVec; | ||
var uRecSet; | ||
var uMat; | ||
if (settings.extractFeatures) { | ||
var temp = {}; temp[settings.textField] = query; | ||
var queryRec = qRecSet.store.newRec(temp); // record | ||
querySpVec = ftrSpace.ftrSpVec(queryRec); | ||
// use sampling? | ||
var sq = qRecSet; | ||
if (settings.querySampleSize >= 0 && qRecSet != undefined) { | ||
sq = qRecSet.sample(settings.querySampleSize); | ||
} | ||
var sf = fRecSet; | ||
if (settings.randomSampleSize >= 0 && fRecSet != undefined) { | ||
sf = fRecSet.sample(settings.randomSampleSize); | ||
} | ||
// take a union or just qset or just fset if some are undefined | ||
uRecSet = (sq != undefined) ? ((sf != undefined) ? sq.setunion(sf) : sq) : sf; | ||
if (uRecSet == undefined) { throw 'undefined record set for active learning!';} | ||
uMat = ftrSpace.ftrSpColMat(uRecSet); | ||
} else { | ||
querySpVec = stts.querySpVec; | ||
uRecSet = stts.uRecSet; | ||
uMat = stts.uMat; | ||
} | ||
querySpVec.normalize(); | ||
uMat.normalizeCols(); | ||
var X = new la.SparseMatrix(); | ||
var y = new la.Vector(); | ||
var simV = uMat.multiplyT(querySpVec); //similarities (q, recSet) | ||
var sortedSimV = simV.sortPerm(); //ascending sort | ||
var simVs = sortedSimV.vec; //sorted similarities (q, recSet) | ||
var simVp = sortedSimV.perm; //permutation of sorted similarities (q, recSet) | ||
//// counters for questions in query mode | ||
var nPosQ = 0; //for traversing simVp from the end | ||
var nNegQ = 0; //for traversing simVp from the start | ||
// SVM MODE | ||
var svm; | ||
var posIdxV = new la.IntVector(); //indices in recordSet | ||
var negIdxV = new la.IntVector(); //indices in recordSet | ||
var posRecIdV = new la.IntVector(); //record IDs | ||
var negRecIdV = new la.IntVector(); //record IDs | ||
var classVec = new la.Vector({ "vals": uRecSet.length }); //svm scores for record set | ||
var resultVec = new la.Vector({ "vals": uRecSet.length }); // non-absolute svm scores for record set | ||
//# - `rs = alModel.getRecSet()` -- returns the record set that is being used (result of sampling) | ||
this.getRecSet = function () { return uRecSet }; | ||
//# - `idx = alModel.selectedQuestionIdx()` -- returns the index of the last selected question in alModel.getRecSet() | ||
this.selectedQuestionIdx = -1; | ||
//# - `bool = alModel.getQueryMode()` -- returns true if in query mode, false otherwise (SVM mode) | ||
this.getQueryMode = function () { return queryMode; }; | ||
//# - `numArr = alModel.getPos(thresh)` -- given a `threshold` (number) return the indexes of records classified above it as a javascript array of numbers. Must be in SVM mode. | ||
this.getPos = function (threshold) { | ||
if (this.queryMode) { return null; } // must be in SVM mode to return results | ||
if (!threshold) { threshold = 0; } | ||
var posIdxArray = []; | ||
for (var recN = 0; recN < uRecSet.length; recN++) { | ||
if (resultVec[recN] >= threshold) { | ||
posIdxArray.push(recN); | ||
} | ||
} | ||
return posIdxArray; | ||
}; | ||
this.debug = function () { debugger; } | ||
this.getTop = function (limit) { | ||
if (this.queryMode) { return null; } // must be in SVM mode to return results | ||
if (!limit) { limit = 20; } | ||
var idxArray = []; | ||
var marginArray = []; | ||
var sorted = resultVec.sortPerm(false); | ||
for (var recN = 0; recN < uRecSet.length && recN < limit; recN++) { | ||
idxArray.push(sorted.perm[recN]); | ||
var val = sorted.vec[recN]; | ||
val = val == Number.POSITIVE_INFINITY ? Number.MAX_VALUE : val; | ||
val = val == Number.NEGATIVE_INFINITY ? -Number.MAX_VALUE : val; | ||
marginArray.push(val); | ||
} | ||
return { posIdx: idxArray, margins: marginArray }; | ||
}; | ||
//# - `objJSON = alModel.getSettings()` -- returns the settings object | ||
this.getSettings = function () { return settings; } | ||
// returns record set index of the unlabeled record that is closest to the margin | ||
//# - `recSetIdx = alModel.selectQuestion()` -- returns `recSetIdx` - the index of the record in `recSet`, whose class is unknonw and requires user input | ||
this.selectQuestion = function () { | ||
if (posRecIdV.length >= settings.nPos && negRecIdV.length >= settings.nNeg) { queryMode = false; } | ||
if (queryMode) { | ||
if (posRecIdV.length < settings.nPos && nPosQ + 1 < uRecSet.length) { | ||
nPosQ = nPosQ + 1; | ||
console.log("query mode, try to get pos"); | ||
this.selectedQuestionIdx = simVp[simVp.length - 1 - (nPosQ - 1)]; | ||
return this.selectedQuestionIdx; | ||
} | ||
if (negRecIdV.length < settings.nNeg && nNegQ + 1 < uRecSet.length) { | ||
nNegQ = nNegQ + 1; | ||
// TODO if nNegQ == rRecSet.length, find a new sample | ||
console.log("query mode, try to get neg"); | ||
this.selectedQuestionIdx = simVp[nNegQ - 1]; | ||
return this.selectedQuestionIdx; | ||
} | ||
} | ||
else { | ||
////call svm, get record closest to the margin | ||
svm = new exports.SVC(settings); | ||
svm.fit(X, y);//column examples, y float vector of +1/-1, default svm paramvals | ||
// mark positives | ||
for (var i = 0; i < posIdxV.length; i++) { | ||
classVec[posIdxV[i]] = Number.POSITIVE_INFINITY; | ||
resultVec[posIdxV[i]] = Number.POSITIVE_INFINITY; | ||
} | ||
// mark negatives | ||
for (var i = 0; i < negIdxV.length; i++) { | ||
classVec[negIdxV[i]] = Number.POSITIVE_INFINITY; | ||
resultVec[negIdxV[i]] = Number.NEGATIVE_INFINITY; | ||
} | ||
var posCount = posIdxV.length; | ||
var negCount = negIdxV.length; | ||
// classify unlabeled | ||
for (var recN = 0; recN < uRecSet.length; recN++) { | ||
if (classVec[recN] !== Number.POSITIVE_INFINITY) { | ||
var svmMargin = svm.predict(uMat.getCol(recN)); | ||
if (svmMargin > 0) { | ||
posCount++; | ||
} else { | ||
negCount++; | ||
} | ||
classVec[recN] = Math.abs(svmMargin); | ||
resultVec[recN] = svmMargin; | ||
} | ||
} | ||
var sorted = classVec.sortPerm(); | ||
console.log("svm mode, margin: " + sorted.vec[0] + ", npos: " + posCount + ", nneg: " + negCount); | ||
this.selectedQuestionIdx = sorted.perm[0]; | ||
return this.selectedQuestionIdx; | ||
} | ||
}; | ||
// asks the user for class label given a record set index | ||
//# - `alModel.getAnswer(ALAnswer, recSetIdx)` -- given user input `ALAnswer` (string) and `recSetIdx` (integer, result of model.selectQuestion) the training set is updated. | ||
//# The user input should be either "y" (indicating that recSet[recSetIdx] is a positive example), "n" (negative example). | ||
this.getAnswer = function (ALanswer, recSetIdx) { | ||
//todo options: ?newQuery | ||
if (ALanswer === "y") { | ||
posIdxV.push(recSetIdx); | ||
posRecIdV.push(uRecSet[recSetIdx].$id); | ||
//X.push(ftrSpace.ftrSpVec(uRecSet[recSetIdx])); | ||
X.push(uMat.getCol(recSetIdx)); | ||
y.push(1.0); | ||
} else { | ||
negIdxV.push(recSetIdx); | ||
negRecIdV.push(uRecSet[recSetIdx].$id); | ||
//X.push(ftrSpace.ftrSpVec(uRecSet[recSetIdx])); | ||
X.push(uMat.getCol(recSetIdx)); | ||
y.push(-1.0); | ||
} | ||
// +k query // rank unlabeled according to query, ask for k most similar | ||
// -k query // rank unlabeled according to query, ask for k least similar | ||
}; | ||
//# - `alModel.startLoop()` -- starts the active learning loop in console | ||
this.startLoop = function () { | ||
while (true) { | ||
var recSetIdx = this.selectQuestion(); | ||
var ALanswer = sget(uRecSet[recSetIdx].Text + ": y/(n)/s? Command s stops the process").trim(); | ||
if (ALanswer == "s") { break; } | ||
if (posIdxV.length + negIdxV.length == uRecSet.length) { break; } | ||
this.getAnswer(ALanswer, recSetIdx); | ||
} | ||
}; | ||
//# - `alModel.saveSvmModel(fout)` -- saves the binary SVM model to an output stream `fout`. The algorithm must be in SVM mode. | ||
this.saveSvmModel = function (outputStream) { | ||
// must be in SVM mode | ||
if (queryMode) { | ||
console.log("AL.save: Must be in svm mode"); | ||
return; | ||
} | ||
svm.save(outputStream); | ||
}; | ||
this.getWeights = function () { | ||
return svm.weights; | ||
} | ||
//this.saveLabeled | ||
//this.loadLabeled | ||
}; | ||
return exports; | ||
} |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Network access
Supply chain risk: This module accesses the network.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
47740290
790
69328
2
9
24
+ Added sget@~0.1.5
+ Added sget@0.1.5 (transitive)