Comparing version 0.1.2 to 0.1.3
164
index.js
@@ -17,2 +17,19 @@ var Tree = require("./lib/BTree"); | ||
traverse: function(preCallback, inCallback, postCallback) { | ||
this.tree.traverse(preCallback, inCallback, postCallback); | ||
}, | ||
preOrderTraverse: function(callback) { | ||
this.tree.preOrderTraverse(callback); | ||
}, | ||
inOrderTraverse: function(callback) { | ||
this.tree.inOrderTraverse(callback); | ||
}, | ||
postOrderTraverse: function(callback) { | ||
this.tree.postOrderTraverse(callback); | ||
}, | ||
addDocument: function(doc) { | ||
@@ -53,2 +70,3 @@ var sentenses = this._splitDocument(doc); | ||
this._insert(tree, tree.root.id, sistring, index); | ||
this._updateParents(); | ||
}, | ||
@@ -58,8 +76,9 @@ | ||
var node = tree.getNode(nodeId); | ||
var indexes = []; | ||
indexes.push(index); | ||
if(tree.isRoot(nodeId) && tree.root.data == null) { | ||
tree.setNodeData(nodeId, { | ||
type: this.EXTERNAL, | ||
index: index, | ||
sistring: sistring, | ||
count: 1 | ||
sistring: sistring, | ||
indexes: indexes | ||
}); | ||
@@ -83,4 +102,3 @@ } else if(node.data.type == this.INTERNAL) { | ||
sistring: sistring, | ||
index: index, | ||
count: 1 | ||
indexes: indexes | ||
}; | ||
@@ -97,4 +115,3 @@ tree.appendLeftChild(nodeId, leftChild); | ||
sistring: sistring, | ||
index: index, | ||
count: 1 | ||
indexes: indexes | ||
} | ||
@@ -113,3 +130,3 @@ tree.appendRightChild(nodeId, rightChild); | ||
if(node.data.sistring == sistring) { | ||
node.data.count++; | ||
node.data.indexes.push(index); | ||
} else { | ||
@@ -129,11 +146,12 @@ this._rebuildInternalSubtree(tree, node, sistring, index); | ||
*/ | ||
var nodeString; | ||
var sistrings = []; | ||
var indexes = []; | ||
indexes.push(index); | ||
if(node.data.type == this.INTERNAL) { | ||
nodeString = node.data.prefix; | ||
sistrings = sistrings.concat(node.data.sistrings); | ||
} else if(node.data.type == this.EXTERNAL) { | ||
nodeString = node.data.sistring; | ||
sistrings.push(node); | ||
} | ||
@@ -148,7 +166,5 @@ var branchBit = this._findBranchPosition(nodeString, sistring); | ||
sistring: sistring, | ||
index: index, | ||
count: 1 | ||
indexes: indexes | ||
}; | ||
sistrings.push(externalNode); | ||
@@ -160,3 +176,5 @@ var subtreeRoot = subtree.getRoot(); | ||
prefix: sistring.slice(0, branchBit), | ||
sistrings: sistrings | ||
externalNodeNum: 0, | ||
totalFrequency: 0, | ||
sistrings: [] | ||
}; | ||
@@ -195,2 +213,3 @@ | ||
} | ||
/* | ||
@@ -203,2 +222,47 @@ if(!this._checkConnections()) { | ||
_updateParents: function() { | ||
var owner = this; | ||
this.postOrderTraverse(function(node) { | ||
if(node.data.type == owner.INTERNAL) { | ||
var sistrings = []; | ||
var left = node.left; | ||
var right = node.right; | ||
var externalNodeNum = 0; | ||
var totalFrequency = 0; | ||
if(left && right) { | ||
if(left.data.type == owner.INTERNAL) { | ||
externalNodeNum += left.data.externalNodeNum; | ||
totalFrequency += left.data.totalFrequency; | ||
sistrings = sistrings.concat(left.data.sistrings); | ||
} else if(left.data.type == owner.EXTERNAL) { | ||
externalNodeNum += 1; | ||
totalFrequency += left.data.indexes.length; | ||
sistrings.push(left); | ||
} else { | ||
console.trace(); | ||
throw "unknown node type (neither internal nor external)" | ||
} | ||
if(right.data.type == owner.INTERNAL) { | ||
externalNodeNum += right.data.externalNodeNum; | ||
totalFrequency += right.data.totalFrequency; | ||
sistrings = sistrings.concat(right.data.sistrings); | ||
} else if(right.data.type == owner.EXTERNAL) { | ||
externalNodeNum += 1; | ||
totalFrequency += right.data.indexes.length; | ||
sistrings.push(right); | ||
} else { | ||
console.trace(); | ||
throw "unknown node type (neither internal nor external)" | ||
} | ||
} else { | ||
console.trace(); | ||
throw "internal node lost left or right child" | ||
} | ||
node.data.sistrings = sistrings; | ||
node.data.externalNodeNum = externalNodeNum; | ||
node.data.totalFrequency = totalFrequency; | ||
} | ||
}); | ||
}, | ||
_appendZeroes: function(length) { | ||
@@ -253,3 +317,5 @@ var tree = this.tree; | ||
_restoreSistring: function(sistring, index) { | ||
_restoreSistring: function(externalNode) { | ||
var sistring = externalNode.data.sistring; | ||
var index = externalNode.data.indexes[0]; | ||
var indexes = index.split("."); | ||
@@ -280,2 +346,32 @@ var docIndex = indexes[0]; | ||
_restorePrefix: function(internalNode) { | ||
var prefix = internalNode.data.prefix; | ||
var externalNode = internalNode.data.sistrings[0]; | ||
var index = externalNode.data.indexes[0]; | ||
var indexes = index.split("."); | ||
var docIndex = indexes[0]; | ||
var sentenseIndex = indexes[1]; | ||
var wordIndex = indexes[2]; | ||
var output = ""; | ||
var sentense = this.documents[docIndex][sentenseIndex]; | ||
//console.log(" sentense: " + sentense); | ||
var comparison = ""; | ||
for(var i = wordIndex; i < sentense.length && comparison.length < prefix.length; i++) { | ||
var arr = []; | ||
var word = sentense[i]; | ||
arr.push(sentense[i]); | ||
//console.log(" word: " + word); | ||
var binaryWord = this._toBinary(arr); | ||
if(comparison.length + binaryWord.length >= prefix.length) { | ||
break; | ||
} else { | ||
comparison += binaryWord; | ||
output += word; | ||
} | ||
} | ||
return output; | ||
}, | ||
_toString: function(b) { | ||
@@ -299,6 +395,6 @@ var output = ""; | ||
_printTreeContent: function() { | ||
printTreeContent: function(printExternalNodes, printDocuments) { | ||
var tree = this.tree; | ||
var owner = this; | ||
tree.preOrderTraverse(function(node) { | ||
this.preOrderTraverse(function(node) { | ||
console.log("id: " + node.id); | ||
@@ -313,16 +409,26 @@ var type = tree.getNodeType(node.id); | ||
console.log("position: " + node.data.position); | ||
console.log("prefix: " + node.data.prefix); | ||
console.log("sistrings: "); | ||
for(var i = 0; i < node.data.sistrings.length; i++) { | ||
var externalNodeData = node.data.sistrings[i].data; | ||
console.log(" sistring: " + owner._restoreSistring(externalNodeData.sistring, externalNodeData.index)); | ||
console.log(" count: " + externalNodeData.count); | ||
} | ||
console.log("prefix: " + owner._restorePrefix(node)); | ||
console.log("externalNodeNum: " + node.data.externalNodeNum); | ||
console.log("totalFrequency: " + node.data.totalFrequency); | ||
if(printExternalNodes) | ||
{ | ||
console.log("sistrings: "); | ||
for(var i = 0; i < node.data.sistrings.length; i++) { | ||
var externalNode = node.data.sistrings[i]; | ||
console.log(" sistring: " + owner._restoreSistring(externalNode)); | ||
console.log(" indexes: " + externalNode.data.indexes); | ||
} | ||
} | ||
} else if(node.data.type == owner.EXTERNAL) { | ||
console.log("sistring: " + owner._restoreSistring(node.data.sistring, node.data.index)); | ||
console.log("index: " + node.data.index); | ||
console.log("count: " + node.data.count); | ||
console.log("sistring: " + owner._restoreSistring(node)); | ||
console.log("indexes: " + node.data.indexes); | ||
} | ||
console.log(); | ||
}); | ||
}); | ||
if(printDocuments) | ||
{ | ||
console.log("documents:"); | ||
console.log(this.documents); | ||
} | ||
}, | ||
@@ -329,0 +435,0 @@ |
{ | ||
"name": "pat-tree", | ||
"version": "0.1.2", | ||
"version": "0.1.3", | ||
"description": "PAT tree construction for Chinese documents", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
@@ -9,13 +9,103 @@ pat-tree | ||
## Installation | ||
# Installation | ||
npm install pat-tree --save | ||
## Test | ||
# Test | ||
npm test | ||
## Release History | ||
# Usage | ||
* 0.1.2 Initial release | ||
## Init | ||
var PATtree = require("pat-tree"); | ||
var tree = new PATtree(); | ||
## Add document | ||
tree.addDocument(input); | ||
## Print tree content | ||
tree.printTreeContent(printExternalNodes, printDocuments); | ||
If **printExternalNodes** is set to true, print out all external nodes for each internal node. | ||
If **printDocuments** is set to true, print out the whole collection of the tree. | ||
## Traversal | ||
tree.traverse(preCallback, inCallback, postCallback); | ||
For convenient, there are functions for each order of traversal | ||
tree.preOrderTraverse(callback); | ||
tree.inOrderTraverse(callback); | ||
tree.postOrderTraverse(callback); | ||
For example | ||
tree.preOrderTraverse(function(node) { | ||
console.log("node id: " + node.id); | ||
}) | ||
# Data type | ||
## Node | ||
Every nodes has some common informaitons, an node has the following structure: | ||
node = { | ||
id: 3, // the id of this node, data type: JSON, auto generated. | ||
parent: 1, // the parent id of this node, data type: integer | ||
left: leftChildNode, // data type: Node | ||
right: rightChildNode, // data type: Node | ||
data: {} // payload for this node, data type : JSON | ||
} | ||
Data is different for internal nodes and external nodes, | ||
Internal nodes has following structure: | ||
## Internal nodes | ||
internalNode.data = { | ||
type: "internal", // indicates this is an internal node | ||
position: 13, // the branch position of external nodes, data type: integer | ||
prefix: "00101", // the sharing prefix of external nodes, data type: string of 0s and 1s | ||
externalNodeNum: 87, // number of external nodes contained in subtree of this node, data type: integer | ||
totalFrequency: 89, // number of the total frequency of the external nodes in the collection, data type: integer | ||
sistrings: ArrayOfExternalNodes // array of external nodes, data type: array | ||
} | ||
## External nodes | ||
External nodes has following structure: | ||
externalNode.data = { | ||
type: "external", // indicates this is an external node, | ||
sistring: "00101100110101", // binary representation of the character, data type: string | ||
indexes: ["0.1,3", "1.2.5"] // the positions where the sistring appears in the collection, data type: array | ||
} | ||
# Collection | ||
The whole collection consists of documents, which consists of sentenses, which consists of words. | ||
An example could be this: | ||
[ [ '嗨你好', | ||
'這是測試文件1' ], | ||
[ '你好', | ||
'這是測試文件2' ] ] | ||
An index is in following structure: | ||
DocumentPosition.SentensePosition.wordPosition | ||
For example, **"0.1.2"** is the index of the character "測". | ||
# Release History | ||
* 0.1.1 First release | ||
* 0.1.2 Construction complete | ||
* 0.1.3 Able to restore Chinese characters | ||
* 0.1.4 Add external node number and term frequency to internal nodes |
Sorry, the diff of this file is not supported yet
24804
738
111