commit 6eefc84356 (parent ad2194f533)
Author: sbosse
Date: 2025-07-21 23:07:23 +02:00

js/ml/rf.js (new file, 357 lines)
@@ -0,0 +1,357 @@
// MIT License
// Random Forest Trees (only binary classifier)
// Andrej Karpathy
// @blab+
// https://github.com/karpathy/forestjs
// The constructor returns a plain state object L; the actual methods live on
// RandomForest.code (and DecisionTree.code below) and take L as their first argument.
var RandomForest = function(options) {
var L = {};
return L;
};
RandomForest.code = {
/*
data is a 2D array of size N x D of examples
labels is a 1D array of labels (only -1 or 1 for now; multiclass or even regression may come later)
options.numTrees can be used to customize the number of trees to train (default = 100)
options.maxDepth is the maximum depth of each tree in the forest (default = 4)
options.numTries is the number of random hypotheses generated at each node during training (default = 10)
options.trainFun is a function with signature "function myWeakTrain(data, labels, ix, options)". Here, ix is a list of
indices into data of the instances that should be paid attention to. Everything not in the list
should be ignored. This is done for efficiency. The function should return a model in which you store
variables (i.e. model = {}; model.myvar = 5;). This will be passed to testFun.
options.testFun is a function with signature "function myWeakTest(inst, model)" where inst is a 1D array specifying an example,
and model will be the same model that you return in options.trainFun. For example, model.myvar will be 5.
See decisionStumpTrain() and decisionStumpTest() below for examples, and the usage sketch after this object.
*/
train: function(L, data, labels, options) {
options = options || {};
L.options = options;
L.numTrees = options.numTrees || 100;
// initialize many trees and train them all independently
L.trees= new Array(L.numTrees);
for(var i=0;i<L.numTrees;i++) {
L.trees[i] = DecisionTree();
DecisionTree.code.train(L.trees[i],data, labels, options);
}
},
/*
inst is a 1D array of length D of an example.
returns the probability of label 1, i.e. a number in range [0, 1]
*/
predictOne: function(L, inst) {
// have each tree predict and average out all votes
var dec=0;
for(var i=0;i<L.numTrees;i++) {
dec += DecisionTree.code.predictOne(L.trees[i],inst);
}
dec /= L.numTrees;
return dec;
},
// convenience function. Here, data is NxD array.
// returns probabilities of being 1 for all data in an array.
predict: function(L, data) {
var probabilities= new Array(data.length);
for(var i=0;i<data.length;i++) {
probabilities[i]= RandomForest.code.predictOne(L,data[i]);
}
return probabilities;
}
}
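// Usage sketch (the toy data and option values below are illustrative only,
// not part of the library):
//
//   var forest = RandomForest();
//   var data = [[0,0],[0,1],[1,0],[1,1]]; // N x D examples
//   var labels = [-1, 1, 1, -1];          // binary labels in {-1, 1}
//   RandomForest.code.train(forest, data, labels, {numTrees: 50, maxDepth: 4});
//   var probs = RandomForest.code.predict(forest, data); // per-example P(label = 1)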
// represents a single decision tree
var DecisionTree = function(options) {
var L = {};
return L;
};
DecisionTree.code = {
train: function(L, data, labels, options) {
options = options || {};
var maxDepth = options.maxDepth || 4;
var weakType = options.type || 0;
// pick the weak learner: type 0 (default) uses 1D decision stumps,
// any other type uses 2D projection stumps
var trainFun= decisionStumpTrain;
var testFun= decisionStumpTest;
if(weakType) {
trainFun = decision2DStumpTrain;
testFun = decision2DStumpTest;
}
// user-supplied weak learners take precedence over the built-in ones
if(options.trainFun) trainFun = options.trainFun;
if(options.testFun) testFun = options.testFun;
L.testFun = testFun; // predictOne needs the matching test function
// initialize various helper variables
var numInternals= Math.pow(2, maxDepth)-1;
var numNodes= Math.pow(2, maxDepth + 1)-1;
var ixs= new Array(numNodes);
for(var i=1;i<ixs.length;i++) ixs[i]=[];
ixs[0]= new Array(labels.length);
for(var i=0;i<labels.length;i++) ixs[0][i]= i; // root node starts out with all examples as relevant
var models = new Array(numInternals);
// train
for(var n=0; n < numInternals; n++) {
// few base cases
var ixhere= ixs[n];
if(ixhere.length == 0) { continue; }
if(ixhere.length == 1) { ixs[n*2+1] = [ixhere[0]]; continue; } // arbitrarily send it down the left
// learn a weak model on the relevant data for this node
var model= trainFun(data, labels, ixhere, options);
models[n]= model; // store the model for prediction
// split the data according to the learned model
var ixleft=[];
var ixright=[];
for(var i=0; i<ixhere.length;i++) {
var label= testFun(data[ixhere[i]], model);
if(label === 1) ixleft.push(ixhere[i]);
else ixright.push(ixhere[i]);
}
ixs[n*2+1]= ixleft;
ixs[n*2+2]= ixright;
}
// compute data distributions at the leaves
var leafPositives = new Array(numNodes);
var leafNegatives = new Array(numNodes);
for(var n=numInternals; n < numNodes; n++) {
var numones= 0;
for(var i=0;i<ixs[n].length;i++) {
if(labels[ixs[n][i]] === 1) numones+=1;
}
leafPositives[n]= numones;
leafNegatives[n]= ixs[n].length-numones;
}
// back up important prediction variables for predicting later
L.models= models;
L.leafPositives = leafPositives;
L.leafNegatives = leafNegatives;
L.maxDepth= maxDepth;
// L.trainFun= trainFun;
// L.testFun= testFun;
},
// returns probability that example inst is 1.
predictOne: function(L, inst) {
var testFun = L.testFun||decisionStumpTest;
var n=0;
for(var i=0;i<L.maxDepth;i++) {
var dir= testFun(inst, L.models[n]);
if(dir === 1) n= n*2+1; // descend left
else n= n*2+2; // descend right
}
return (L.leafPositives[n] + 0.5) / (L.leafPositives[n] + L.leafNegatives[n] + 1.0); // bayesian smoothing keeps the estimate strictly inside (0, 1)
}
}
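// Node layout sketch: the tree is stored as an implicit binary heap, where node n
// has children 2n+1 and 2n+2. For maxDepth = 2 there are 2^2-1 = 3 internal
// nodes (0..2) holding weak models and 2^3-1 = 7 nodes total, so 3..6 are the leaves.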
// returns model
function decisionStumpTrain(data, labels, ix, options) {
options = options || {};
var numtries = options.numTries || 10;
// choose a dimension at random and pick a best split
var ri= randi(0, data[0].length);
var N= ix.length;
// evaluate class entropy of incoming data
var H= entropy(labels, ix);
var bestGain=0;
var bestThr= 0;
for(var i=0;i<numtries;i++) {
// pick a random splitting threshold
var ix1= ix[randi(0, N)];
var ix2= ix[randi(0, N)];
while(ix2==ix1) ix2= ix[randi(0, N)]; // enforce distinctness of ix2
var a= Math.random();
var thr= data[ix1][ri]*a + data[ix2][ri]*(1-a);
// measure information gain we'd get from split with thr
var l1=1, r1=1, lm1=1, rm1=1; //counts for Left and label 1, right and label 1, left and minus 1, right and minus 1
for(var j=0;j<ix.length;j++) {
if(data[ix[j]][ri] < thr) {
if(labels[ix[j]]==1) l1++;
else lm1++;
} else {
if(labels[ix[j]]==1) r1++;
else rm1++;
}
}
var t= l1+lm1; // normalize the counts to obtain probability estimates
l1=l1/t;
lm1=lm1/t;
t= r1+rm1;
r1=r1/t;
rm1= rm1/t;
var LH= -l1*Math.log(l1) -lm1*Math.log(lm1); // left and right entropy
var RH= -r1*Math.log(r1) -rm1*Math.log(rm1);
var informationGain= H - LH - RH; // note: the branch entropies are not weighted by branch size here
//console.log("Considering split %f, entropy %f -> %f, %f. Gain %f", thr, H, LH, RH, informationGain);
if(informationGain > bestGain || i === 0) {
bestGain= informationGain;
bestThr= thr;
}
}
var model= {};
model.thr= bestThr;
model.ri= ri;
return model;
}
// returns a decision for a single data instance
function decisionStumpTest(inst, model) {
if(!model) {
// this is a leaf that never received any data...
return 1;
}
return inst[model.ri] < model.thr ? 1 : -1;
}
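// A custom weak learner matching the trainFun/testFun signatures documented
// above could look like this (an illustrative sketch, not part of the library;
// it thresholds dimension 0 at its mean over the relevant instances):
//
//   function meanStumpTrain(data, labels, ix, options) {
//     var sum= 0;
//     for(var i=0;i<ix.length;i++) sum += data[ix[i]][0];
//     return { thr: sum / ix.length };
//   }
//   function meanStumpTest(inst, model) {
//     if(!model) return 1; // leaf that never received any data
//     return inst[0] < model.thr ? 1 : -1;
//   }
//   // passed via options: { trainFun: meanStumpTrain, testFun: meanStumpTest }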
// returns model. Code duplication with decisionStumpTrain :(
function decision2DStumpTrain(data, labels, ix, options) {
options = options || {};
var numtries = options.numTries || 10;
// choose a dimension at random and pick a best split
var N= ix.length;
var ri1= 0;
var ri2= 1;
if(data[0].length > 2) {
// more than 2D data. Pick 2 random dimensions
ri1= randi(0, data[0].length);
ri2= randi(0, data[0].length);
while(ri2 == ri1) ri2= randi(0, data[0].length); // must be distinct!
}
// evaluate class entropy of incoming data
var H= entropy(labels, ix);
var bestGain=0;
var bestw1, bestw2, bestthr;
var dots= new Array(ix.length);
for(var i=0;i<numtries;i++) {
// pick random line parameters
var alpha= randf(0, 2*Math.PI);
var w1= Math.cos(alpha);
var w2= Math.sin(alpha);
// project data on this line and get the dot products
for(var j=0;j<ix.length;j++) {
dots[j]= w1*data[ix[j]][ri1] + w2*data[ix[j]][ri2];
}
// we are in a tricky situation because data dot product distribution
// can be skewed. So we don't want to select just randomly between
// min and max. But we also don't want to sort as that is too expensive
// let's pick two random points and make the threshold be somewhere between them.
// for skewed datasets, the selected points will with relatively high likelihood
// be in the high-density regions, so the thresholds will make sense
// (dots is indexed 0..N-1, so pick positions into dots, not indices into data)
var i1= randi(0, N);
var i2= randi(0, N);
while(i2==i1) i2= randi(0, N); // enforce distinctness of i2
var a= Math.random();
var dotthr= dots[i1]*a + dots[i2]*(1-a);
// measure information gain we'd get from a split at dotthr
var l1=1, r1=1, lm1=1, rm1=1; //counts for Left and label 1, right and label 1, left and minus 1, right and minus 1
for(var j=0;j<ix.length;j++) {
if(dots[j] < dotthr) {
if(labels[ix[j]]==1) l1++;
else lm1++;
} else {
if(labels[ix[j]]==1) r1++;
else rm1++;
}
}
var t= l1+lm1;
l1=l1/t;
lm1=lm1/t;
t= r1+rm1;
r1=r1/t;
rm1= rm1/t;
var LH= -l1*Math.log(l1) -lm1*Math.log(lm1); // left and right entropy
var RH= -r1*Math.log(r1) -rm1*Math.log(rm1);
var informationGain= H - LH - RH; // note: the branch entropies are not weighted by branch size here
//console.log("Considering split %f, entropy %f -> %f, %f. Gain %f", dotthr, H, LH, RH, informationGain);
if(informationGain > bestGain || i === 0) {
bestGain= informationGain;
bestw1= w1;
bestw2= w2;
bestthr= dotthr;
}
}
var model= {};
model.w1= bestw1;
model.w2= bestw2;
model.dotthr= bestthr;
model.ri1= ri1; // store the projected dimensions so the test function can use them
model.ri2= ri2;
return model;
}
// returns label for a single data instance
function decision2DStumpTest(inst, model) {
if(!model) {
// this is a leaf that never received any data...
return 1;
}
return inst[model.ri1]*model.w1 + inst[model.ri2]*model.w2 < model.dotthr ? 1 : -1;
}
// Misc utility functions
function entropy(labels, ix) {
var N= ix.length;
var ones= 0;
for(var i=0;i<N;i++) {
if(labels[ix[i]]==1) ones+=1;
}
var p= (1+ones)/(N+2); // let's be bayesian about this (Laplace smoothing)
var q= (1+N-ones)/(N+2); // = 1 - p
return (-p*Math.log(p) -q*Math.log(q));
}
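// e.g. labels [1,-1,1,-1] over ix [0,1,2,3]: p = (1+2)/(4+2) = 0.5, q = 0.5,
// so the smoothed entropy is -2 * 0.5 * Math.log(0.5) = Math.log(2) ~ 0.693 nats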
// generate random floating point number between a and b
function randf(a, b) {
return Math.random()*(b-a)+a;
}
// generate random integer between a and b (b excluded)
function randi(a, b) {
return Math.floor(Math.random()*(b-a)+a);
}
module.exports = RandomForest;