diff --git a/js/ml/rf.js b/js/ml/rf.js new file mode 100644 index 0000000..8772342 --- /dev/null +++ b/js/ml/rf.js @@ -0,0 +1,357 @@ +// MIT License +// Random Forest Trees (only binary classifier) +// Andrej Karpathy +// @blab+ +// https://github.com/karpathy/forestjs + + +var RandomForest = function(options) { + var L = {}; + return L +} + +RandomForest.code = { + + /* + data is 2D array of size N x D of examples + labels is a 1D array of labels (only -1 or 1 for now). In future will support multiclass or maybe even regression + options.numTrees can be used to customize number of trees to train (default = 100) + options.maxDepth is the maximum depth of each tree in the forest (default = 4) + options.numTries is the number of random hypotheses generated at each node during training (default = 10) + options.trainFun is a function with signature "function myWeakTrain(data, labels, ix, options)". Here, ix is a list of + indices into data of the instances that should be paid attention to. Everything not in the list + should be ignored. This is done for efficiency. The function should return a model where you store + variables. (i.e. model = {}; model.myvar = 5;) This will be passed to testFun. + options.testFun is a function with signature "function myWeakTest(inst, model)" where inst is 1D array specifying an example, + and model will be the same model that you return in options.trainFun. For example, model.myvar will be 5. + see decisionStumpTrain() and decisionStumpTest() below for an example. + */ + train: function(L, data, labels, options) { + options = options || {}; + L.options = options; + + L.numTrees = options.numTrees || 100; + + // initialize many trees and train them all independently + L.trees= new Array(L.numTrees); + for(var i=0;i %f, %f. 
Gain %f", thr, H, LH, RH, informationGain); + if(informationGain > bestGain || i === 0) { + bestGain= informationGain; + bestThr= thr; + } + } + + model= {}; + model.thr= bestThr; + model.ri= ri; + return model; +} + +// returns a decision for a single data instance +function decisionStumpTest(inst, model) { + if(!model) { + // this is a leaf that never received any data... + return 1; + } + return inst[model.ri] < model.thr ? 1 : -1; + +} + +// returns model. Code duplication with decisionStumpTrain :( +function decision2DStumpTrain(data, labels, ix, options) { + + options = options || {}; + var numtries = options.numTries || 10; + + // choose a dimension at random and pick a best split + var N= ix.length; + + var ri1= 0; + var ri2= 1; + if(data[0].length > 2) { + // more than 2D data. Pick 2 random dimensions + ri1= randi(0, data[0].length); + ri2= randi(0, data[0].length); + while(ri2 == ri1) ri2= randi(0, data[0].length); // must be distinct! + } + + // evaluate class entropy of incoming data + var H= entropy(labels, ix); + var bestGain=0; + var bestw1, bestw2, bestthr; + var dots= new Array(ix.length); + for(var i=0;i %f, %f. Gain %f", thr, H, LH, RH, informationGain); + if(informationGain > bestGain || i === 0) { + bestGain= informationGain; + bestw1= w1; + bestw2= w2; + bestthr= dotthr; + } + } + + model= {}; + model.w1= bestw1; + model.w2= bestw2; + model.dotthr= bestthr; + return model; +} + +// returns label (1 or -1) for a single data instance: 1 when the projection of (inst[0], inst[1]) onto (model.w1, model.w2) is below model.dotthr, else -1. NOTE(review): only inst[0] and inst[1] are read, while decision2DStumpTrain picks random dimensions ri1/ri2 when data has more than 2 columns and stores only w1, w2, dotthr on the model -- confirm whether the chosen dimensions should be stored and used here +function decision2DStumpTest(inst, model) { + if(!model) { + // this is a leaf that never received any data... fall back to label 1 + return 1; + } + return inst[0]*model.w1 + inst[1]*model.w2 < model.dotthr ? 1 : -1; + +} + +// Misc utility functions +function entropy(labels, ix) { + var N= ix.length; + var p=0.0; + for(var i=0;i