// MIT License
// Random Forest Trees (binary classification only)
// Andrej Karpathy
// @blab+
// https://github.com/karpathy/forestjs

var RandomForest = function(options) {
  var L = {};
  return L;
};

RandomForest.code = {

  /*
  data is a 2D array of size N x D of examples.
  labels is a 1D array of labels (only -1 or 1 for now; multiclass, or maybe even regression, may be supported in the future).
  options.numTrees can be used to customize the number of trees to train (default = 100)
  options.maxDepth is the maximum depth of each tree in the forest (default = 4)
  options.numTries is the number of random hypotheses generated at each node during training (default = 10)
  options.trainFun is a function with signature "function myWeakTrain(data, labels, ix, options)". Here, ix is a list of
       indices into data of the instances that should be paid attention to. Everything not in the list
       should be ignored. This is done for efficiency. The function should return a model object where you store
       variables (i.e. model = {}; model.myvar = 5;). This object will be passed to testFun.
  options.testFun is a function with signature "function myWeakTest(inst, model)", where inst is a 1D array specifying an example
       and model is the same object that you return from options.trainFun. For example, model.myvar will be 5.
  See decisionStumpTrain() and decisionStumpTest() below for an example, and the usage sketch after RandomForest.code.
  */
  train: function(L, data, labels, options) {
    options = options || {};
    L.options = options;

    L.numTrees = options.numTrees || 100;

    // initialize many trees and train them all independently
    L.trees = new Array(L.numTrees);
    for(var i = 0; i < L.numTrees; i++) {
      L.trees[i] = DecisionTree();
      DecisionTree.code.train(L.trees[i], data, labels, options);
    }
  },

  /*
  inst is a 1D array of length D describing a single example.
  Returns the probability of label 1, i.e. a number in the range [0, 1].
  */
  predictOne: function(L, inst) {
    // have each tree predict and average out all votes
    var dec = 0;
    for(var i = 0; i < L.numTrees; i++) {
      dec += DecisionTree.code.predictOne(L.trees[i], inst);
    }
    dec /= L.numTrees;
    return dec;
  },

  // convenience function. Here, data is an NxD array.
  // returns the probability of label 1 for each row of data, as an array.
  predict: function(L, data) {
    var probabilities = new Array(data.length);
    for(var i = 0; i < data.length; i++) {
      probabilities[i] = RandomForest.code.predictOne(L, data[i]);
    }
    return probabilities;
  }
};
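
/*
Usage sketch: a minimal example of the API above. The data and labels below
are a hypothetical toy dataset, made up purely for illustration:

  var forest = RandomForest();
  var data = [[0.1, 0.2], [0.9, 0.8], [0.2, 0.1], [0.8, 0.9]]; // N x D examples
  var labels = [-1, 1, -1, 1];                                 // -1 or 1 only
  RandomForest.code.train(forest, data, labels, {numTrees: 50, maxDepth: 4});
  var probs = RandomForest.code.predict(forest, data); // probabilities of label 1
  var preds = probs.map(function(p) { return p > 0.5 ? 1 : -1; });
*/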

// represents a single decision tree
var DecisionTree = function(options) {
  var L = {};
  return L;
};

DecisionTree.code = {

  train: function(L, data, labels, options) {
    options = options || {};
    var maxDepth = options.maxDepth || 4;
    var weakType = options.type || 0;

    // default weak learner: 1D decision stumps
    var trainFun = decisionStumpTrain;
    var testFun = decisionStumpTest;
    if(weakType) {
      // random-projection (2D) stumps
      trainFun = decision2DStumpTrain;
      testFun = decision2DStumpTest;
    }
    // user-supplied weak learners take precedence over both built-ins
    if(options.trainFun) trainFun = options.trainFun;
    if(options.testFun) testFun = options.testFun;
    L.testFun = testFun; // back it up: predictOne() needs the test function later

    // initialize various helper variables
    var numInternals = Math.pow(2, maxDepth) - 1;
    var numNodes = Math.pow(2, maxDepth + 1) - 1;
    var ixs = new Array(numNodes);
    for(var i = 1; i < ixs.length; i++) ixs[i] = [];
    ixs[0] = new Array(labels.length);
    for(var i = 0; i < labels.length; i++) ixs[0][i] = i; // root node starts out with all examples as relevant
    var models = new Array(numInternals);

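    // Node layout note: the tree is stored implicitly as a complete binary
    // tree in flat arrays, so node n has children 2n+1 (left) and 2n+2
    // (right). For example, maxDepth = 2 gives 2^2 - 1 = 3 internal nodes
    // (0, 1, 2) and 2^3 - 1 = 7 nodes total, so nodes 3..6 are the leaves.
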
    // train
    for(var n = 0; n < numInternals; n++) {

      // a few base cases
      var ixhere = ixs[n];
      if(ixhere.length === 0) { continue; }
      if(ixhere.length === 1) { ixs[n*2+1] = [ixhere[0]]; continue; } // arbitrarily send it down the left branch

      // learn a weak model on the relevant data for this node,
      // forwarding options so that e.g. numTries reaches the weak learner
      var model = trainFun(data, labels, ixhere, options);
      models[n] = model; // back up the model

      // split the data according to the learned model
      var ixleft = [];
      var ixright = [];
      for(var i = 0; i < ixhere.length; i++) {
        var label = testFun(data[ixhere[i]], model);
        if(label === 1) ixleft.push(ixhere[i]);
        else ixright.push(ixhere[i]);
      }
      ixs[n*2+1] = ixleft;
      ixs[n*2+2] = ixright;
    }

    // compute data distributions at the leaves
    var leafPositives = new Array(numNodes);
    var leafNegatives = new Array(numNodes);
    for(var n = numInternals; n < numNodes; n++) {
      var numones = 0;
      for(var i = 0; i < ixs[n].length; i++) {
        if(labels[ixs[n][i]] === 1) numones += 1;
      }
      leafPositives[n] = numones;
      leafNegatives[n] = ixs[n].length - numones;
    }

    // back up important prediction variables for predicting later
    // (L.testFun was already stored above)
    L.models = models;
    L.leafPositives = leafPositives;
    L.leafNegatives = leafNegatives;
    L.maxDepth = maxDepth;
  },

  // returns the probability that example inst has label 1.
  predictOne: function(L, inst) {
    var testFun = L.testFun || decisionStumpTest;
    var n = 0;
    for(var i = 0; i < L.maxDepth; i++) {
      var dir = testFun(inst, L.models[n]);
      if(dir === 1) n = n*2 + 1; // descend left
      else n = n*2 + 2; // descend right
    }
    // bayesian (Laplace) smoothing over the counts stored at this leaf
    return (L.leafPositives[n] + 0.5) / (L.leafPositives[n] + L.leafNegatives[n] + 1.0);
  }
};
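
/*
Custom weak learner sketch, following the documented options.trainFun /
options.testFun signatures (myWeakTrain/myWeakTest are hypothetical names):

  function myWeakTrain(data, labels, ix, options) {
    // split on the first feature at its mean over the relevant instances
    var model = {};
    model.dim = 0;
    var mean = 0;
    for(var i = 0; i < ix.length; i++) mean += data[ix[i]][model.dim];
    model.thr = mean / ix.length;
    return model;
  }

  function myWeakTest(inst, model) {
    if(!model) return 1; // leaf that never received any data
    return inst[model.dim] < model.thr ? 1 : -1;
  }

  RandomForest.code.train(forest, data, labels,
                          {trainFun: myWeakTrain, testFun: myWeakTest});
*/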

// returns a model
function decisionStumpTrain(data, labels, ix, options) {
  options = options || {};
  var numtries = options.numTries || 10;

  // choose a dimension at random and pick the best split
  var ri = randi(0, data[0].length);
  var N = ix.length;

  // evaluate class entropy of incoming data
  var H = entropy(labels, ix);
  var bestGain = 0;
  var bestThr = 0;
  for(var i = 0; i < numtries; i++) {

    // pick a random splitting threshold between the values of two distinct random examples
    var ix1 = ix[randi(0, N)];
    var ix2 = ix[randi(0, N)];
    while(ix2 === ix1) ix2 = ix[randi(0, N)]; // enforce distinctness of ix2
    var a = Math.random();
    var thr = data[ix1][ri]*a + data[ix2][ri]*(1-a);

    // measure the information gain we'd get from a split at thr
    var l1 = 1, r1 = 1, lm1 = 1, rm1 = 1; // smoothed counts: left/right x label 1/label -1
    for(var j = 0; j < ix.length; j++) {
      if(data[ix[j]][ri] < thr) {
        if(labels[ix[j]] === 1) l1++;
        else lm1++;
      } else {
        if(labels[ix[j]] === 1) r1++;
        else rm1++;
      }
    }
    var t = l1 + lm1; // normalize the counts to obtain probability estimates
    l1 = l1/t;
    lm1 = lm1/t;
    t = r1 + rm1;
    r1 = r1/t;
    rm1 = rm1/t;

    var LH = -l1*Math.log(l1) - lm1*Math.log(lm1); // left and right entropy
    var RH = -r1*Math.log(r1) - rm1*Math.log(rm1);

    var informationGain = H - LH - RH;
    //console.log("Considering split %f, entropy %f -> %f, %f. Gain %f", thr, H, LH, RH, informationGain);
    if(informationGain > bestGain || i === 0) {
      bestGain = informationGain;
      bestThr = thr;
    }
  }

  var model = {};
  model.thr = bestThr;
  model.ri = ri;
  return model;
}
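
// Worked example of the criterion above: a perfect split (say 4 positives on
// the left, 4 negatives on the right) gives, with the +1 pseudo-counts,
// l1 = 5/6 and lm1 = 1/6, so LH = RH ≈ 0.45; a 50/50 mix on both sides gives
// LH = RH = ln(2) ≈ 0.693, hence a lower gain H - LH - RH.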

// returns a decision for a single data instance
function decisionStumpTest(inst, model) {
  if(!model) {
    // this is a leaf that never received any data...
    return 1;
  }
  return inst[model.ri] < model.thr ? 1 : -1;
}

// returns a model. Code duplication with decisionStumpTrain :(
function decision2DStumpTrain(data, labels, ix, options) {
  options = options || {};
  var numtries = options.numTries || 10;

  var N = ix.length;

  // choose two distinct dimensions at random and pick the best split
  var ri1 = 0;
  var ri2 = 1;
  if(data[0].length > 2) {
    // more than 2D data. Pick 2 random dimensions
    ri1 = randi(0, data[0].length);
    ri2 = randi(0, data[0].length);
    while(ri2 === ri1) ri2 = randi(0, data[0].length); // must be distinct!
  }

  // evaluate class entropy of incoming data
  var H = entropy(labels, ix);
  var bestGain = 0;
  var bestw1, bestw2, bestthr;
  var dots = new Array(ix.length);
  for(var i = 0; i < numtries; i++) {

    // pick random line parameters
    var alpha = randf(0, 2*Math.PI);
    var w1 = Math.cos(alpha);
    var w2 = Math.sin(alpha);

    // project data on this line and get the dot products
    for(var j = 0; j < ix.length; j++) {
      dots[j] = w1*data[ix[j]][ri1] + w2*data[ix[j]][ri2];
    }

    // We are in a tricky situation because the dot product distribution can
    // be skewed, so we don't want to pick a threshold uniformly between the
    // min and max. But we also don't want to sort, as that is too expensive.
    // Instead, pick two distinct random projections and place the threshold
    // somewhere between them. For skewed datasets the selected points will,
    // with relatively high likelihood, lie in the high-density regions, so
    // the thresholds will make sense.
    var i1 = randi(0, N); // positions within dots (which runs over 0..N-1)
    var i2 = randi(0, N);
    while(i2 === i1) i2 = randi(0, N); // enforce distinctness
    var a = Math.random();
    var dotthr = dots[i1]*a + dots[i2]*(1-a);

    // measure the information gain we'd get from a split at dotthr
    var l1 = 1, r1 = 1, lm1 = 1, rm1 = 1; // smoothed counts: left/right x label 1/label -1
    for(var j = 0; j < ix.length; j++) {
      if(dots[j] < dotthr) {
        if(labels[ix[j]] === 1) l1++;
        else lm1++;
      } else {
        if(labels[ix[j]] === 1) r1++;
        else rm1++;
      }
    }
    var t = l1 + lm1; // normalize the counts to obtain probability estimates
    l1 = l1/t;
    lm1 = lm1/t;
    t = r1 + rm1;
    r1 = r1/t;
    rm1 = rm1/t;

    var LH = -l1*Math.log(l1) - lm1*Math.log(lm1); // left and right entropy
    var RH = -r1*Math.log(r1) - rm1*Math.log(rm1);

    var informationGain = H - LH - RH;
    //console.log("Considering split %f, entropy %f -> %f, %f. Gain %f", dotthr, H, LH, RH, informationGain);
    if(informationGain > bestGain || i === 0) {
      bestGain = informationGain;
      bestw1 = w1;
      bestw2 = w2;
      bestthr = dotthr;
    }
  }

  var model = {};
  model.w1 = bestw1;
  model.w2 = bestw2;
  model.dotthr = bestthr;
  model.ri1 = ri1; // store the projected dimensions so the test uses the same ones
  model.ri2 = ri2;
  return model;
}

// returns a label for a single data instance
function decision2DStumpTest(inst, model) {
  if(!model) {
    // this is a leaf that never received any data...
    return 1;
  }
  return inst[model.ri1]*model.w1 + inst[model.ri2]*model.w2 < model.dotthr ? 1 : -1;
}
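
// Projection example: with alpha = PI/4, w1 = w2 = cos(PI/4) ≈ 0.7071, an
// instance whose chosen coordinates are [1.0, 2.0] projects to
// 0.7071*1.0 + 0.7071*2.0 ≈ 2.121, and it is sent left iff 2.121 < dotthr.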

// Misc utility functions
function entropy(labels, ix) {
  var N = ix.length;
  var numones = 0;
  for(var i = 0; i < N; i++) {
    if(labels[ix[i]] === 1) numones += 1;
  }
  var p = (1 + numones) / (N + 2); // let's be bayesian about this (Laplace smoothing)
  var q = 1 - p; // smoothed probability of label -1
  return -p*Math.log(p) - q*Math.log(q);
}
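
// e.g. for 3 examples all labeled 1: p = (1+3)/(3+2) = 0.8, q = 0.2, and the
// entropy is -0.8*ln(0.8) - 0.2*ln(0.2) ≈ 0.50 instead of 0; the smoothing
// keeps both log arguments strictly positive.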

// generate a random floating point number between a and b
function randf(a, b) {
  return Math.random()*(b-a) + a;
}

// generate a random integer between a and b (b excluded)
function randi(a, b) {
  return Math.floor(Math.random()*(b-a) + a);
}

module.exports = RandomForest;