Comparing version 0.2.5 to 0.2.6
77
index.js
@@ -128,4 +128,4 @@ var Tree = require("./lib/BTree"); | ||
}, | ||
extractSLP: function(TFTrheshold, SETreshold) { | ||
extractSLP: function(TFTrheshold, SETreshold, verbose) { | ||
var owner = this; | ||
@@ -135,6 +135,10 @@ var totalFrequency = this.tree.root.totalFrequency; | ||
var result = []; | ||
var sistrings = []; | ||
if(verbose) { | ||
console.log("collecting internal nodes"); | ||
} | ||
this.preOrderTraverse(function(node) { | ||
if(node.type == owner.INTERNAL) { | ||
var sistring = owner._restorePrefix(node); | ||
if(sistring != "" && node.totalFrequency > TFTrheshold) { | ||
if(node.type == owner.INTERNAL && node.totalFrequency > TFTrheshold) { | ||
var sistring = owner._restorePrefix(node); | ||
if(sistring != "" && sistrings.indexOf(sistring) == -1) { | ||
var map = {}; | ||
@@ -146,37 +150,43 @@ map.sistring = sistring; | ||
lexicalPatters.push(map); | ||
sistrings.push(sistring); | ||
if(verbose && lexicalPatters.length % 1000 == 0) { | ||
console.log("done collecting No." + lexicalPatters.length + " node"); | ||
} | ||
} | ||
} | ||
}) | ||
}); | ||
if(verbose) { | ||
console.log("collection completed, number of nodes: " + lexicalPatters.length + ", sorting nodes"); | ||
} | ||
lexicalPatters.sort(function(item1, item2) { | ||
return item2.sistring.length - item1.sistring.length; | ||
}); | ||
sistrings.sort(function(item1, item2) { | ||
return item2.length - item1.length; | ||
}); | ||
if(verbose) { | ||
console.log("start marking candidates") | ||
} | ||
for(var i = 0; i < lexicalPatters.length; i++) { | ||
var sistring = lexicalPatters[i].sistring; | ||
var fstOverlapIndex = -1; | ||
var sndOverlapIndex = -1; | ||
if(lexicalPatters[i].sistring != sistrings[i]) { | ||
throw "internal error, sistrings not aligned."; | ||
} | ||
var map = lexicalPatters[i]; | ||
var sistring = map.sistring; | ||
var fstOverlapString = sistring.slice(0, sistring.length - 1); | ||
var sndOverlapString = sistring.slice(1, sistring.length); | ||
for(var j = i; j < lexicalPatters.length; j++) { | ||
if(lexicalPatters[j].sistring == fstOverlapString) { | ||
fstOverlapIndex = j; | ||
} | ||
if(lexicalPatters[j].sistring == sndOverlapString) { | ||
sndOverlapIndex = j; | ||
} | ||
} | ||
var fstOverlap = lexicalPatters[sistrings.indexOf(fstOverlapString)]; | ||
var sndOverlap = lexicalPatters[sistrings.indexOf(sndOverlapString)]; | ||
var map = lexicalPatters[i]; | ||
var fstOverlap; | ||
var sndOverlap; | ||
if(fstOverlapIndex != -1 && sndOverlapIndex != -1) { | ||
fstOverlap = lexicalPatters[fstOverlapIndex]; | ||
sndOverlap = lexicalPatters[sndOverlapIndex]; | ||
if(fstOverlap && sndOverlap) { | ||
map.se = map.frequency / (fstOverlap.frequency + sndOverlap.frequency - map.frequency); | ||
} else if(fstOverlapIndex != -1) { | ||
fstOverlap = lexicalPatters[fstOverlapIndex]; | ||
} else if(fstOverlap) { | ||
map.se = map.frequency / (fstOverlap.frequency - map.frequency); | ||
} else if(sndOverlapIndex != -1) { | ||
sndOverlap = lexicalPatters[sndOverlapIndex]; | ||
} else if(sndOverlap) { | ||
map.se = map.frequency / (sndOverlap.frequency - map.frequency); | ||
@@ -186,6 +196,6 @@ } | ||
if(map.se > SETreshold) { | ||
if(fstOverlapIndex != -1) { | ||
if(fstOverlap) { | ||
fstOverlap.candidate = false; | ||
} | ||
if(sndOverlapIndex != -1) { | ||
if(sndOverlap) { | ||
sndOverlap.candidate = false; | ||
@@ -195,6 +205,13 @@ } | ||
if(map.candidate && result.indexOf(map.sistring) == -1) { | ||
if(map.candidate) { | ||
result.push(map.sistring); | ||
if(verbose && result.length % 1000 == 0) { | ||
console.log("done processing No." + result.length + " item"); | ||
} | ||
} | ||
} | ||
if(verbose) { | ||
console.log("extracting SLP completes, total " + result.length + " SLPs") | ||
} | ||
return result; | ||
@@ -201,0 +218,0 @@ }, |
{ | ||
"name": "pat-tree", | ||
"version": "0.2.5", | ||
"version": "0.2.6", | ||
"description": "PAT tree construction for Chinese documents, keyword extraction and text segmentation", | ||
@@ -8,3 +8,3 @@ "main": "index.js", | ||
"test": "make test" | ||
}, | ||
}, | ||
"repository": { | ||
@@ -11,0 +11,0 @@ "type": "git", |
@@ -29,4 +29,4 @@ pat-tree | ||
# Usage | ||
### Instanitiate | ||
@@ -48,3 +48,3 @@ | ||
```javascript | ||
var SLPs = tree.extractSLP(TFThreshold, SEThreshold); | ||
var SLPs = tree.extractSLP(TFThreshold, SEThreshold, verbose); | ||
// SLPs: array of strings, which are signifiant lexical patterns. | ||
@@ -56,2 +56,4 @@ ``` | ||
`verbose`: optional, if set to true, then will print out progress on console. | ||
`TFTreshold` should be integer, `SEThreshold` should be float between 0 and 1. | ||
@@ -256,2 +258,3 @@ | ||
* 0.2.6 Greatly improve performance of `extractSLP()` | ||
* 0.2.5 Greatly improve performance of `addDocument()` | ||
@@ -258,0 +261,0 @@ * 0.2.4 Fix bug in `reborn()` |
32657
864
270