/**
** ==============================
** O O O OOOO
** O O O O O O
** O O O O O O
** OOOO OOOO O OOO OOOO
** O O O O O O O
** O O O O O O O
** OOOO OOOO O O OOOO
** ==============================
** Dr. Stefan Bosse http://www.bsslab.de
**
** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
** BY THE AUTHOR(S).
** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
** MODIFIED, OR OTHERWISE USED IN A CONTEXT
** OUTSIDE OF THE SOFTWARE SYSTEM.
**
** $AUTHORS: Stefan Bosse
** $INITIAL: (C) 2006-2018 bLAB
** $CREATED: 03-03-16 by sbosse.
** $VERSION: 1.4.2
**
** $INFO:
**
** Interval Decision Tree Learner
**
** Modified ID3-based Decision Tree Algorithm that wraps all data with 2-eps intervals and uses
** interval arithmetic instead of single-value arithmetic for entropy calculation and feature
** selection. The classification is based on a nearest-neighbourhood look-up of the best
** matching results.
**
** Two different algorithms are supported:
**
** 1. Static (using learn), the DTI learner using attribute selection based on entropy.
**    The training data must be available in advance.
** 2. Dynamic (using update), the DTI learner using attribute selection based on significance.
**    The training data is applied sequentially (stream learning), updating the model.
**
** Although in principle both algorithms can be mixed (first static, then dynamic updating),
** the resulting model will have poor classification quality. Use either static or dynamic
** (stream) learning, not both.
** ** Portable model ** ** $ENDOFINFO */ var Io = Require('com/io'); var Comp = Require('com/compat'); var current=none; var Aios=none; var min = Comp.pervasives.min; var max = Comp.pervasives.max; /** * Map of valid tree node types * @constant * @static */ var NODE_TYPES = { RESULT: 'result', FEATURE: 'feature', FEATURE_VALUE: 'feature_value' }; function Result(key) { return { type:NODE_TYPES.RESULT, name:key } } function Feature(name,vals) { return { type:NODE_TYPES.FEATURE, name:name, vals:vals } } // A value can be a scalar or a range {a,b} object function Value(val,child) { return { type:NODE_TYPES.FEATURE_VALUE, val:val, child:child } } /** Add a new training set with optional data set merging and value interval expansion. * */ function add_training_set(data,set,merge) { if (merge) { // Merge a data set with an existing for a specific key; create value ranges } else data.push(set); } /** * Computes Log with base-2 * @private */ function log2(n) { return Math.log(n) / Math.log(2); } function results(model) { var line='',sep; if (!model) return ''; switch (model.type) { case NODE_TYPES.RESULT: return model.name; case NODE_TYPES.FEATURE: sep=''; line=''; Comp.array.iter(model.vals,function (v) { line += sep+results(v); sep=','; }); return line; case NODE_TYPES.FEATURE_VALUE: return results(model.child); } return 'result?'; } /** * Finds element with highest occurrence in a list * @private */ function mostCommon(list) { var elementFrequencyMap = {}; var largestFrequency = -1; var mostCommonElement = null; list.forEach(function(element) { var elementFrequency = (elementFrequencyMap[element] || 0) + 1; elementFrequencyMap[element] = elementFrequency; if (largestFrequency < elementFrequency) { mostCommonElement = element; largestFrequency = elementFrequency; } }); return mostCommonElement; } function addVal(v1,v2) { if (v1.a!=undefined) { if (v2.a!=undefined) return {a:v1.a+v2.a,b:v1.b+v2.b}; else return {a:v1.a+v2,b:v2.b+v2}; } else if (v2.a!=undefined) return 
{a:v2.a+v1,b:v2.b+v1}; else return v1+v2; } function lowerBound(v) { if (v.a==undefined) return v; else return v.a; } function upperBound(v) { if (v.b==undefined) return v; else return v.b; } function equal(v1,v2) { return (v1==v2 || (upperBound(v1) == upperBound(v2) && (lowerBound(v1) == lowerBound(v2)))) } function overlap(v1,v2) { return (upperBound(v1) >= lowerBound(v2) && upperBound(v1) <= upperBound(v2)) || (upperBound(v2) >= lowerBound(v1) && upperBound(v2) <= upperBound(v1)) } function containsVal(vl,v) { for (var i in vl) { var v2=vl[i]; if (overlap(v,v2)) return true; } return false; } function centerVal(v) { if (v.a==undefined) return v; else return (v.a+v.b)/2; } function distanceVal (v1,v2) { return Math.abs(centerVal(v1)-centerVal(v2)); } function Bounds(vl,v) { if (vl.length==0) return {a:v,b:v}; else if (v==undefined) return {a:Min(vl),b:Max(vl)}; else return {a:Min([Min(vl),v]),b:Max([Max(vl),v])}; } function Min(vals) { var min=none; Comp.array.iter(vals, function (val) { if (min==none) min=(val.a==undefined?val:val.a); else min=val.a==undefined?(valmax?val:max):(val.b>max?val.a:max)); }); return max; } // Return interval of a value x with a<=x_center-eps, b>=x_center+eps function epsVal(x,eps) { if (x.a == undefined) return {a:x-eps,b:x+eps}; else if ((x.b-x.a) < 2*eps) return {a:centerVal(x)-eps,b:centerVal(x)+eps}; else return x; } /** Filter out unique values that are spaced at least by eps * */ function uniqueEps(data,eps) { var results=[]; Comp.array.iter(data,function (x) { var found; if (!results.length) results.push(x); else { Comp.array.iter(results,function (y,i) { if (found) return; found = Math.abs(centerVal(x)-centerVal(y)) lowerBound(_vals[i+1].val)) { if (_vals[i].val.b) _vals[i].val.b=lowerBound(_vals[i+1].val)-1; else _vals[i+1].val.a=upperBound(_vals[i].val)+1; } } } } model.vals=_vals; return model; break; case NODE_TYPES.FEATURE_VALUE: return model; break; } } /** Creates a new tree from training data (data) * * data is 
{x1:v1,x2:v2,..,y:vn} [] * target is classification key name * features is ['x1','x2,',..] w/o target variable * eps is interval applied to all data values * */ function createTree(data, target, features, options) { var _newS,child_node,bounds; var targets = Comp.array.unique(Comp.array.pluck(data, target)); // console.log(targets) if (options.maxdepth==undefined) options.maxdepth=1; if (options.maxdepth==0) return Result('-'); // console.log(data); // console.log(features); //Aios.aios.log('createTree:'+targets.length); //try {Aios.aios.CP();} catch (e) {throw 'DTI.createTree: '+options.maxdepth }; if (Aios) Aios.aios.CP(); if (targets.length == 1) return Result(targets[0]); if (features.length == 0) { var topTarget = mostCommon(targets); return Result(topTarget) } var bestFeatures = getBestFeatures(data, target, features, options.eps); var bestFeature = bestFeatures[0]; var remainingFeatures = Comp.array.filtermap(bestFeatures,function (feat) { if (feat.name!=bestFeature.name) return feat.name; else return none; }); /* var possibleValues = Comp.array.sort(Comp.array.pluck(data, bestFeature.name), function (x,y) { if (upperBound(x) < lowerBound(y)) return -1; else return 1; // increasing value order }); */ var possibleValues = getPossibleVals(data,bestFeature.name); var vals=[]; //console.log(bestFeatures); //console.log(possibleValues); var partitions=partitionVals(possibleValues,options.eps); // Aios.aios.log(partitions); //console.log(bestFeatures); //console.log(possibleValues); if (partitions.length==1) { // no further 2*eps separation possible, find best feature by largest distance // resort best feature list with respect to value deviation bestFeatures.sort(function (ef1,ef2) { if (ef1.d > ef2.d) return -1; else return 1; }); bestFeature = bestFeatures[0]; possibleValues = getPossibleVals(data,bestFeature.name); Comp.array.iter(mergeVals(possibleValues),function (val,i) { _newS = data.filter(function(x) { // 
console.log(x[bestFeature.name],val,overlap(val,x[bestFeature.name])) return overlap(val,x[bestFeature.name]); }); child_node = Value(val); options.maxdepth--; child_node.child = createTree(_newS, target, remainingFeatures, options); //console.log(_newS); vals.push(child_node); }) } else Comp.array.iter(partitions,function (partition,i) { _newS = data.filter(function(x) { // console.log(x[bestFeature.name],v,overlap(x[bestFeature.name],v)) return containsVal(partition,x[bestFeature.name]); }); bounds = Bounds(partition); child_node = Value(options.eps==0?{a:bounds.a,b:bounds.b}:{a:bounds.a-options.eps,b:bounds.b+options.eps}); options.maxdepth--; child_node.child = createTree(_newS, target, remainingFeatures, options); //console.log(_newS); vals.push(child_node); }); return Feature(bestFeature.name,vals); } /** Return the depth of the tree * */ function depth(model) { switch (model.type) { case NODE_TYPES.RESULT: return 0; case NODE_TYPES.FEATURE: return 1+Comp.array.max(model.vals,function (val) { return depth(val); }); case NODE_TYPES.FEATURE_VALUE: return depth(model.child); } return 0; } /** Computes entropy of a list with 2-epsilon intervals * */ function entropyEps(vals,eps) { // TODO: overlapping value intervals var uniqueVals = Comp.array.unique(vals); var probs = uniqueVals.map(function(x) { return probEps(x, vals, eps) }); var logVals = probs.map(function(p) { return -p * log2(p) }); return logVals.reduce(function(a, b) { return a + b }, 0); } function entropyEps2(vals,eps) { // TODO: overlapping value intervals var uniqueVals = uniqueEps(vals,eps); var probs = uniqueVals.map(function(x) { return probEps2(x, vals, eps) }); var logVals = probs.map(function(p) { return -p * log2(p) }); return logVals.reduce(function(a, b) { return a + b }, 0); } function getBestFeatures(data,target,features,eps) { var bestfeatures=[]; function deviation(vals) { var n = vals.length; var mu=Comp.array.sum(vals,function (val) { return (lowerBound(val)+upperBound(val))/2; })/n; 
var dev=Comp.array.sum(vals,function (val) { return Math.pow(((lowerBound(val)+upperBound(val))/2)-mu,2); })/n; return dev; } for (var feature in features) { if (features[feature]==undefined) throw 'DTI.getBestFeatures: invalid feature vector'; var vals=Comp.array.pluck(data, features[feature]).map(function (val) {return val==undefined?0:val}); var e = entropyEps(vals,eps); var d = deviation(vals); var min = Min(vals); var max = Max(vals); bestfeatures.push({e:e,d:d,range:{a:min,b:max},name:features[feature]}); } bestfeatures.sort(function (ef1,ef2) { if (ef1.e > ef2.e) return -1; else return 1; }); return bestfeatures; } /** Find in one data set the most significant feature variable (i.e., with highest value) */ function getSignificantFeature(data,features) { var f,sig; for (f in features) { if (sig==undefined || sig.val < data[features[f]]) sig={name:features[f],val:data[features[f]]}; } return sig; } function getPossibleVals(data,feature) { return Comp.array.sort(Comp.array.pluck(data, feature), function (x,y) { if (upperBound(x) < lowerBound(y)) return -1; else return 1; // increasing value order }); } /** Merge values and intervals */ function mergeVals(vals) { var _vals, merged,i,j; for (i in vals) { var vi = vals[i]; if (!_vals) _vals=[vi]; else { // Find overlapping values and merge merged=false; loopj: for (j in _vals) { var vj = _vals[j]; if (equal(vi,vj)) { merged=true; break loopj; } else if (overlap(vi,vj)) { merged=true; _vals[j]={a:Min([vi,vj]),b:Max([vi,vj])}; break loopj; } } if (!merged) _vals.push(vi); } } //Aios.aios.log(_vals); return _vals||[]; } /** * Predicts class for sample */ function nearestVal(vals,sample,fun) { var best=none; for (var v in vals) { var d=fun?distanceVal(fun(vals[v]),sample):distanceVal(vals[v],sample); if (best==none) best={v:vals[v],d:d}; else if (best.d > d) best={v:vals[v],d:d}; } if (best) return best.v; else return none; } /** Parttition an ordered set of values * Each partition of values has at least 2*eps 
distance to the next partition. * */ function partitionVals(vals,eps) { var last=none; var partitions=[]; var partition=[]; for(var i in vals) { var val0=vals[i]; var val1=vals[i-1]; if (val1==undefined) partition.push(val0); else if ( upperBound(val0) < upperBound(addVal(val1,2*eps))) partition.push(val0); else { partitions.push(partition); partition=[val0]; } } if (partition.length>0) partitions.push(partition); return partitions; } /** Make a predicition with sample data * */ function predict(model,sample) { var root = model; while (root && root.type !== NODE_TYPES.RESULT) { var attr = root.name; var sampleVal = sample[attr]; var childNode = nearestVal(root.vals,sampleVal,function (node) { return node.val; }); if (childNode){ root = childNode.child; } else { root = none; } } if (root) return root.name||root.val; else return none; }; /** Print the tree * */ function print(model,indent, compact) { var line='',sep; if (compact) return results(model); if (indent==undefined) indent=0; if (!model) return ''; var sp = function () {return Comp.string.create(indent);}; switch (model.type) { case NODE_TYPES.RESULT: return sp()+'-> '+model.name+NL; case NODE_TYPES.FEATURE: line=sp()+'$'+model.name+'?'+NL; Comp.array.iter(model.vals,function (v) { line += print(v,indent+2); }); return line; case NODE_TYPES.FEATURE_VALUE: line=sp()+'='+(model.val.a==undefined?model.val:'['+model.val.a+','+model.val.b+']')+NL; return line+print(model.child,indent+2); } return 'model?'; } /** * Computes probability of of a given value existing in a given list * with additional 2*epsilon interval, only applicable to numerical values. 
*/ function probEps(value, list, eps) { // TODO: ranges var occurrences = Comp.array.filter(list, function(element) { return (element >= (value-eps)) && (element <= (value+eps)); }); var numOccurrences = occurrences.length; var numElements = list.length; return numOccurrences / numElements; } function probEps2(value, list, eps) { // TODO: ranges var occurrences = Comp.array.filter(list, function(element) { return overlap(epsVal(value), epsVal(element)); }); var numOccurrences = occurrences.length; var numElements = list.length; return numOccurrences / numElements; } /** Incremental update of the model with new training set(s). Can be executed with an empty model. * The current tree can be week for a new training set (new target). * This can result in a classification of the new target with insignificant variables. * Therefore, the last tree node must be exapnded with an additional strong (most significant) * variable of the new data set (but it is still a heuristic for future updates). */ function updateTree(model,data, target, features, options) { var eps = options.eps, maxdepth = options.maxdepth, verbose = options.verbose; var featuresINm={}, // All current tree feature variables and their value interval results=[], // All current tree result leafs set,i,v,feature,remainingFeatures,exists,sigFeature; // 1. 
Analysis of existing model var analyze = function (model,feature) { var feature2; if (!model) return; switch (model.type) { case NODE_TYPES.RESULT: if (!Comp.array.contains(results,model.name)) results.push(model.name); break; case NODE_TYPES.FEATURE: feature2={name:model.name}; if (!featuresINm[model.name]) featuresINm[model.name]=feature2; Comp.array.iter(model.vals,function (v) { analyze(v,featuresINm[model.name]) }); break; case NODE_TYPES.FEATURE_VALUE: if (!feature.val) feature.val={ a:(model.val.a==undefined?model.val:model.val.a), b:(model.val.a==undefined?model.val:model.val.b) }; else { feature.val.a=min(feature.val.a, (model.val.a==undefined?model.val:model.val.a)); feature.val.b=max(feature.val.b, (model.val.a==undefined?model.val:model.val.b)); } analyze(model.child); break; } } analyze(model); // console.log(featuresINm); // console.log(results); exists=Comp.array.contains(results,data[target]); // 2a. Empty model, add first training set with two significant feature variable nodes function init(set) { set=data[i]; sigFeature1=getSignificantFeature(set,features); remainingFeatures=Comp.array.filter(features,function (feat) { return sigFeature1.name!=feat; }); sigFeature2=getSignificantFeature(set,remainingFeatures); featuresINm[sigFeature1.name]={name:sigFeature1.name, val:{a:sigFeature1.val-eps,b:sigFeature1.val+eps}}; featuresINm[sigFeature2.name]={name:sigFeature2.name, val:{a:sigFeature2.val-eps,b:sigFeature2.val+eps}}; results.push(set[target]); model=Feature(sigFeature1.name,[ Value({a:set[sigFeature1.name]-eps,b:set[sigFeature1.name]+eps}, Feature(sigFeature2.name,[ Value({a:sigFeature2.val-eps,b:sigFeature2.val+eps}, Result(set[target])) ]))]); return model; } remainingFeatures=Comp.array.filter(features,function (feat) { return !featuresINm[feat]; }); // 2b. 
Update the tree with the new training set var update = function (model,set,feature) { var feature2,p; if (!model) return; switch (model.type) { case NODE_TYPES.RESULT: if (model.name != set[target] && verbose) console.log('Cannot insert new training set '+set[target]+' in tree. No more separating variables!'); break; case NODE_TYPES.FEATURE: // console.log(set[target]+': '+ model.name+'='+set[model.name]); if (set[model.name]<(featuresINm[model.name].val.a-eps) || set[model.name]>(featuresINm[model.name].val.b+eps)) { // add new training set; done // the current decision tree can be week, thus add another strong variable node, too! sigFeature=getSignificantFeature(set,remainingFeatures); featuresINm[sigFeature.name]={name:sigFeature.name, val:{a:sigFeature.val-eps,b:sigFeature.val+eps}}; featuresINm[model.name].val.a=min(featuresINm[model.name].val.a,set[model.name]-eps); featuresINm[model.name].val.b=max(featuresINm[model.name].val.b,set[model.name]+eps); if (!Comp.array.contains(results,set[target])) results.push(set[target]); model.vals.push(Value({a:set[model.name]-eps,b:set[model.name]+eps}, Feature(sigFeature.name,[ Value({a:sigFeature.val-eps,b:sigFeature.val+eps}, Result(set[target])) ]))); model.vals=Comp.array.sort(model.vals,function (v1,v2) {return (lowerBound(v1.val)