diff --git a/js/ml/dti.js b/js/ml/dti.js new file mode 100644 index 0000000..e80f67e --- /dev/null +++ b/js/ml/dti.js @@ -0,0 +1,805 @@ +/** + ** ============================== + ** O O O OOOO + ** O O O O O O + ** O O O O O O + ** OOOO OOOO O OOO OOOO + ** O O O O O O O + ** O O O O O O O + ** OOOO OOOO O O OOOO + ** ============================== + ** Dr. Stefan Bosse http://www.bsslab.de + ** + ** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED + ** BY THE AUTHOR(S). + ** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED, + ** MODIFIED, OR OTHERWISE USED IN A CONTEXT + ** OUTSIDE OF THE SOFTWARE SYSTEM. + ** + ** $AUTHORS: Stefan Bosse + ** $INITIAL: (C) 2006-2018 bLAB + ** $CREATED: 03-03-16 by sbosse. + ** $VERSION: 1.4.2 + ** + ** $INFO: + ** + ** Interval Decision Tree Learner + ** + ** Modified ID3-based Decision Tree Algorithm that wraps all data with 2-eps intervals and uses + ** interval instead of single-value arithmetic for entropy calculation and feature selection. + ** The classification is based on a nearest-neighbourhood look-up of best matching results. + ** + ** Two different algorithms are supported: + ** + ** 1. Static (using learn), the DTI learner using attribute selection based on entropy. + ** The training data must be available in advance. + ** 2. Dynamic (using update), the DTI learner using attribute selection based on significance. + ** The training data is applied sequentially (stream learning) updating the model. + ** + ** Though in principle both algorithms can be mixed (first static, then dynamic updating), + ** the resulting model will have poor classification quality. Either use static or only dynamic + ** (stream) learning. 
/*
 **
 ** Portable model
 **
 ** $ENDOFINFO
 */
var Io = Require('com/io');
var Comp = Require('com/compat');
var current=none;
var Aios=none;
var min = Comp.pervasives.min;
var max = Comp.pervasives.max;

/**
 * Map of valid tree node types
 * @constant
 * @static
 */
var NODE_TYPES = {
  RESULT: 'result',
  FEATURE: 'feature',
  FEATURE_VALUE: 'feature_value'
};


/** Build a leaf node; `key` is stored as the node's name (the class label).
 */
function Result(key) {
  var leaf = {};
  leaf.type = NODE_TYPES.RESULT;
  leaf.name = key;
  return leaf;
}

/** Build a feature (decision) node named `name`; `vals` is stored as-is and is
 *  iterated elsewhere as the list of value nodes below this feature.
 */
function Feature(name,vals) {
  var node = {};
  node.type = NODE_TYPES.FEATURE;
  node.name = name;
  node.vals = vals;
  return node;
}

/** Build a value node linking a feature value to a sub-tree.
 *  A value can be a scalar or a range {a,b} object.
 */
function Value(val,child) {
  var node = {};
  node.type = NODE_TYPES.FEATURE_VALUE;
  node.val = val;
  node.child = child;
  return node;
}

/** Add a new training set with optional data set merging and value interval expansion.
 *
 *  Without `merge` the set is appended to `data`.
 *  NOTE(review): the merge path (merge a set into an existing one for a specific
 *  key, creating value ranges) has no implementation; with `merge` set the call
 *  leaves `data` unchanged.
 */
function add_training_set(data,set,merge) {
  if (!merge) data.push(set);
}


/**
 * Computes the logarithm of n to base 2
 * @private
 */
function log2(n) {
  var ln2 = Math.log(2);
  return Math.log(n) / ln2;
}


/** Collect the names of all result leaves below `model` as one comma separated
 *  string. An empty model yields '', an unknown node type yields 'result?'.
 */
function results(model) {
  var joined,delim;
  if (!model) return '';
  if (model.type === NODE_TYPES.RESULT) return model.name;
  if (model.type === NODE_TYPES.FEATURE_VALUE) return results(model.child);
  if (model.type === NODE_TYPES.FEATURE) {
    joined='';
    delim='';
    Comp.array.iter(model.vals,function (node) {
      joined = joined + delim + results(node);
      delim=',';
    });
    return joined;
  }
  return 'result?';
}


/**
 * Finds the element with the highest occurrence in a list.
 * Occurrences are counted in an object keyed by the element, i.e. elements are
 * distinguished by their property-key (string) form. On a tie the element that
 * reached the top count first is returned; an empty list yields null.
 * @private
 */
function mostCommon(list) {
  var counts = {};
  var winner = null;
  var winnerCount = -1;

  list.forEach(function (item) {
    var seen = (counts[item] || 0) + 1;
    counts[item] = seen;

    if (seen > winnerCount) {
      winner = item;
      winnerCount = seen;
    }
  });

  return winner;
}
/** Add two values. Each operand is either a scalar or an interval {a,b}.
 *  interval+interval adds the bounds pairwise, interval+scalar shifts both
 *  bounds by the scalar, scalar+scalar is a plain addition.
 */
function addVal(v1,v2) {
  var range1 = v1.a!=undefined,
      range2 = v2.a!=undefined;
  if (range1 && range2) return {a:v1.a+v2.a,b:v1.b+v2.b};
  // interval + scalar: the upper bound has to come from v1. It used to be read
  // from the scalar v2 (which has no .b), so the resulting b was NaN.
  if (range1) return {a:v1.a+v2,b:v1.b+v2};
  if (range2) return {a:v2.a+v1,b:v2.b+v1};
  return v1+v2;
}

/** Lower bound of a value: v.a for an interval, the value itself for a scalar. */
function lowerBound(v) {
  if (v.a==undefined) return v; else return v.a;
}

/** Upper bound of a value: v.b for an interval, the value itself for a scalar. */
function upperBound(v) {
  if (v.b==undefined) return v; else return v.b;
}

/** Two values are equal if they compare equal (==) or have the same lower and
 *  upper bounds.
 */
function equal(v1,v2) {
  return (v1==v2 ||
          (upperBound(v1) == upperBound(v2) &&
          (lowerBound(v1) == lowerBound(v2))))
}

/** Two values overlap if the upper bound of one of them lies within the bounds
 *  of the other (scalars count as zero-width intervals).
 */
function overlap(v1,v2) {
  return (upperBound(v1) >= lowerBound(v2) && upperBound(v1) <= upperBound(v2)) ||
         (upperBound(v2) >= lowerBound(v1) && upperBound(v2) <= upperBound(v1))
}

/** True if v overlaps at least one value of the list vl.
 *  The list is walked by index (not for..in), so only its elements are tested;
 *  a missing list contains nothing.
 */
function containsVal(vl,v) {
  var n = vl?vl.length:0;
  for (var i=0;i<n;i++) {
    if (overlap(v,vl[i])) return true;
  }
  return false;
}

/** Center of a value: the midpoint of an interval, the value itself for a scalar. */
function centerVal(v) {
  if (v.a==undefined) return v; else return (v.a+v.b)/2;
}

/** Distance of two values, measured between their centers. */
function distanceVal (v1,v2) {
  return Math.abs(centerVal(v1)-centerVal(v2));
}

/** Bounding interval {a,b} of a value list vl, optionally extended by a value v.
 *  An empty list yields {a:v,b:v}.
 */
function Bounds(vl,v) {
  if (vl.length==0) return {a:v,b:v};
  else if (v==undefined) return {a:Min(vl),b:Max(vl)};
  else return {a:Min([Min(vl),v]),b:Max([Max(vl),v])};
}

/** Smallest lower bound of a list of scalars and/or intervals (none if empty).
 *  NOTE(review): Min and Max were fused into one unparsable function in the
 *  text under review (the text between a '<' and a '>' appears to have been
 *  lost). Min is rebuilt from its surviving opening, mirroring Max.
 */
function Min(vals) {
  var lowest=none;
  Comp.array.iter(vals, function (val) {
    var lo=lowerBound(val);
    if (lowest==none || lo<lowest) lowest=lo;
  });
  return lowest;
}

/** Largest upper bound of a list of scalars and/or intervals (none if empty).
 *  The surviving tail of the original returned val.a when an interval's upper
 *  bound exceeded the running maximum; the upper bound val.b is returned here.
 */
function Max(vals) {
  var highest=none;
  Comp.array.iter(vals, function (val) {
    var hi=upperBound(val);
    if (highest==none || hi>highest) highest=hi;
  });
  return highest;
}

// Return interval of a value x with a<=x_center-eps, b>=x_center+eps
function epsVal(x,eps) {
  if (x.a == undefined) return {a:x-eps,b:x+eps};
  else if ((x.b-x.a) < 2*eps) return {a:centerVal(x)-eps,b:centerVal(x)+eps};
  else return x;
}

/** Filter out unique values that are spaced at least by eps
 *
 *  NOTE(review): only the part up to the distance computation
 *  "found = Math.abs(centerVal(x)-centerVal(y))" survived in the text under
 *  review. The comparison against eps and the closing part are reconstructed
 *  from the description above - confirm against the upstream file.
 */
function uniqueEps(data,eps) {
  var kept=[];
  Comp.array.iter(data,function (x) {
    var found;
    if (!kept.length) kept.push(x);
    else {
      Comp.array.iter(kept,function (y,i) {
        if (found) return;
        found = Math.abs(centerVal(x)-centerVal(y))<eps;
      });
      if (!found) kept.push(x);
    }
  });
  return kept;
}
// NOTE(review): the commented text below is what is left of the definition(s)
// that precede createTree at this point. Its beginning appears to have been
// lost (text between a '<' and a '>' is missing), so it can neither be parsed
// nor repaired from what is visible. It is kept verbatim so nothing is lost;
// restore the affected function(s) from the upstream file.
//
//   Math.abs(centerVal(x)-centerVal(y)) lowerBound(_vals[i+1].val)) {
//             if (_vals[i].val.b) _vals[i].val.b=lowerBound(_vals[i+1].val)-1;
//             else _vals[i+1].val.a=upperBound(_vals[i].val)+1;
//           }
//         }
//       }
//     }
//
//     model.vals=_vals;
//     return model;
//     break;
//     case NODE_TYPES.FEATURE_VALUE:
//       return model;
//       break;
//   }
//   }


/** Creates a new tree from training data (data)
 *
 *  data is {x1:v1,x2:v2,..,y:vn} []
 *  target is classification key name
 *  features is ['x1','x2,',..] w/o target variable
 *  options.eps is the interval applied to all data values
 *  options.maxdepth is the depth budget: 0 yields the leaf '-', an undefined
 *  value is treated as 1, and a negative value never reaches 0 and therefore
 *  does not limit the depth.
 *
 *  Returns a Result leaf or a Feature node with one Value child per partition.
 *
 *  Changes against the previous version:
 *   - The depth budget is handed down per branch in a copy of options. Before,
 *     every child decremented the one shared options.maxdepth, so only the
 *     single node that happened to see 0 was cut off, all later nodes saw a
 *     negative budget (unlimited), and the caller's options were modified.
 *   - With no features left, the majority label is taken from all target
 *     values of the data; before it was taken from the de-duplicated list,
 *     which always gives the first label.
 *   - The deviation sort uses a comparator that reports equality.
 *
 *  NOTE(review): remainingFeatures is derived from the entropy-best feature
 *  even when the split falls back to the deviation-best feature; that is kept
 *  as found - confirm it is intended.
 */
function createTree(data, target, features, options) {
  var allTargets = Comp.array.pluck(data, target);
  var targets = Comp.array.unique(allTargets);
  var depthLeft = options.maxdepth==undefined?1:options.maxdepth;
  var childOptions={},key,vals=[];

  if (depthLeft==0) return Result('-');

  if (Aios) Aios.aios.CP();
  if (targets.length == 1) return Result(targets[0]);

  if (features.length == 0) return Result(mostCommon(allTargets));

  // each child works on its own budget, one below the budget of this node
  for (key in options) childOptions[key]=options[key];
  childOptions.maxdepth=depthLeft-1;

  var bestFeatures = getBestFeatures(data, target, features, options.eps);
  var bestFeature = bestFeatures[0];

  var remainingFeatures = Comp.array.filtermap(bestFeatures,function (feat) {
    if (feat.name!=bestFeature.name) return feat.name;
    else return none;
  });

  var partitions=partitionVals(getPossibleVals(data,bestFeature.name),options.eps);

  // attach the sub-tree learned from subset below a new value node
  function grow(val,subset) {
    var node = Value(val);
    node.child = createTree(subset, target, remainingFeatures, childOptions);
    vals.push(node);
  }

  if (partitions.length==1) {
    // no further 2*eps separation possible, find best feature by largest distance
    // resort best feature list with respect to value deviation
    bestFeatures.sort(function (ef1,ef2) {
      return ef2.d-ef1.d;
    });
    bestFeature = bestFeatures[0];
    Comp.array.iter(mergeVals(getPossibleVals(data,bestFeature.name)),function (val) {
      grow(val,data.filter(function (x) {
        return overlap(val,x[bestFeature.name]);
      }));
    });
  } else Comp.array.iter(partitions,function (partition) {
    var bounds = Bounds(partition);
    var eps = options.eps;
    grow(eps==0?{a:bounds.a,b:bounds.b}:{a:bounds.a-eps,b:bounds.b+eps},
         data.filter(function (x) {
           return containsVal(partition,x[bestFeature.name]);
         }));
  });

  return Feature(bestFeature.name,vals);
}
/** Return the depth of the tree
 *
 *  A result leaf has depth 0, a feature node adds one level on top of the
 *  value returned by Comp.array.max over the depths of its value nodes
 *  (presumably the largest of them - confirm against com/compat), and a value
 *  node has the depth of its child. Unknown node types count as 0.
 */
function depth(model) {
  switch (model.type) {
    case NODE_TYPES.RESULT: return 0;
    case NODE_TYPES.FEATURE:
      return 1+Comp.array.max(model.vals,function (val) {
        return depth(val);
      });
    case NODE_TYPES.FEATURE_VALUE:
      return depth(model.child);
  }
  return 0;
}

/** Computes entropy of a list with 2-epsilon intervals
 *
 *  Sums -p*log2(p) over the unique values of vals, with p = probEps(x,vals,eps).
 *  A probability of 0 contributes nothing: 0*log2(0) evaluates to NaN and
 *  would otherwise turn the whole sum into NaN.
 */
function entropyEps(vals,eps) {
  // TODO: overlapping value intervals
  var uniqueVals = Comp.array.unique(vals);
  return uniqueVals.reduce(function (sum,x) {
    var p = probEps(x, vals, eps);
    if (p == 0) return sum;
    return sum + (-p * log2(p));
  }, 0);
}
/** Entropy of a list like entropyEps, but based on the eps-unique values
 *  (uniqueEps) and the interval based probability probEps2.
 *  A probability of 0 contributes nothing (0*log2(0) would be NaN).
 */
function entropyEps2(vals,eps) {
  // TODO: overlapping value intervals
  var uniqueVals = uniqueEps(vals,eps);
  return uniqueVals.reduce(function (sum,x) {
    var p = probEps2(x, vals, eps);
    if (p == 0) return sum;
    return sum + (-p * log2(p));
  }, 0);
}


/** Rate all features of a data set.
 *
 *  For each feature name in features the column is plucked from data (missing
 *  values count as 0) and described by {e,d,range,name}: e is the entropy
 *  (entropyEps), d the mean squared deviation of the value centers from their
 *  mean, range the {a:Min,b:Max} of the column. The list is returned sorted by
 *  descending entropy; the comparator reports equality, so features with the
 *  same entropy are not reordered by an inconsistent comparison.
 *  The target argument is not used.
 *
 *  Throws the string 'DTI.getBestFeatures: invalid feature vector' if a
 *  feature name is undefined.
 */
function getBestFeatures(data,target,features,eps) {
  var bestfeatures=[];
  var n=features?features.length:0;
  function center(val) {
    return (lowerBound(val)+upperBound(val))/2;
  }
  function deviation(vals) {
    var count = vals.length;
    var mu=Comp.array.sum(vals,function (val) {
      return center(val);
    })/count;
    return Comp.array.sum(vals,function (val) {
      return Math.pow(center(val)-mu,2);
    })/count;
  }
  // walk the feature list by index; for..in would also visit any enumerable
  // property added to the array or its prototype
  for (var i=0;i<n;i++) {
    var name=features[i];
    if (name==undefined) throw 'DTI.getBestFeatures: invalid feature vector';
    var vals=Comp.array.pluck(data, name).map(function (val) {return val==undefined?0:val});
    bestfeatures.push({
      e:entropyEps(vals,eps),
      d:deviation(vals),
      range:{a:Min(vals),b:Max(vals)},
      name:name
    });
  }
  bestfeatures.sort(function (ef1,ef2) {
    return ef2.e-ef1.e;
  });
  return bestfeatures;
}

/** Find in one data set the most significant feature variable (i.e., with highest value)
 *
 *  Returns {name,val} of the first feature holding the highest value, or
 *  undefined if the feature list is empty or missing.
 */
function getSignificantFeature(data,features) {
  var sig,name;
  var n=features?features.length:0;
  for (var i=0;i<n;i++) {
    name=features[i];
    if (sig==undefined || sig.val < data[name]) sig={name:name,val:data[name]};
  }
  return sig;
}

/** All values of one feature column, passed through Comp.array.sort with a
 *  comparator that puts x before y if x lies completely below y.
 */
function getPossibleVals(data,feature) {
  return Comp.array.sort(Comp.array.pluck(data, feature), function (x,y) {
    if (upperBound(x) < lowerBound(y)) return -1; else return 1; // increasing value order
  });
}

/** Merge values and intervals
 *
 *  Each value is compared with the values collected so far, in order: a value
 *  equal to a collected one is dropped, a value overlapping a collected one
 *  replaces it by the interval spanning both, any other value is appended.
 *  Only the first match is handled. A missing list yields [].
 */
function mergeVals(vals) {
  var merged=[],absorbed,i,j,vi;
  var n=vals?vals.length:0;
  for (i=0;i<n;i++) {
    vi=vals[i];
    absorbed=false;
    // Find overlapping values and merge
    for (j=0;j<merged.length && !absorbed;j++) {
      if (equal(vi,merged[j])) absorbed=true;
      else if (overlap(vi,merged[j])) {
        merged[j]={a:Min([vi,merged[j]]),b:Max([vi,merged[j]])};
        absorbed=true;
      }
    }
    if (!absorbed) merged.push(vi);
  }
  return merged;
}
/**
 * Find the entry of vals that is nearest to sample (distance between centers).
 * If fun is given, it maps an entry to the value that is compared. The first
 * of several equally near entries wins; an empty or missing list yields none.
 */
function nearestVal(vals,sample,fun) {
  var best=none,d;
  var n=vals?vals.length:0;
  for (var i=0;i<n;i++) {
    d=distanceVal(fun?fun(vals[i]):vals[i],sample);
    if (best==none || best.d > d) best={v:vals[i],d:d};
  }
  if (best) return best.v;
  else return none;
}


/** Partition an ordered set of values
 *  Each partition of values has at least 2*eps distance to the next partition.
 *
 *  A value stays in the current partition as long as its upper bound is below
 *  the upper bound of its predecessor plus 2*eps; otherwise it opens a new one.
 *  Returns the list of partitions ([] for an empty or missing list).
 */
function partitionVals(vals,eps) {
  var partitions=[];
  var partition=[];
  var n=vals?vals.length:0;
  var cur,prev;
  // numeric index: the predecessor is addressed as i-1 without relying on the
  // string keys of a for..in loop
  for (var i=0;i<n;i++) {
    cur=vals[i];
    prev=i>0?vals[i-1]:undefined;
    if (prev==undefined ||
        upperBound(cur) < upperBound(addVal(prev,2*eps))) partition.push(cur);
    else {
      partitions.push(partition);
      partition=[cur];
    }
  }
  if (partition.length>0) partitions.push(partition);
  return partitions;
}

/** Make a prediction with sample data
 *
 *  Walks from the root to a result leaf; at each feature node the value node
 *  nearest to the sample's value of that feature is followed. Returns the name
 *  of the reached leaf (its val if no name is set) or none if no leaf is
 *  reached. A falsy leaf name such as 0 or '' is returned as it is; before,
 *  such labels were replaced by the (unset) val of the leaf.
 */
function predict(model,sample) {
  var node = model;
  var nearest;
  while (node && node.type !== NODE_TYPES.RESULT) {
    nearest = nearestVal(node.vals,sample[node.name],function (valueNode) {
      return valueNode.val;
    });
    node = nearest?nearest.child:none;
  }
  if (!node) return none;
  return node.name!=undefined?node.name:node.val;
}

/** Print the tree
 *
 *  Returns the tree as text, one node per line: '-> name' for a result,
 *  '$name?' for a feature and '=value' or '=[a,b]' for a value node, children
 *  indented by two more than their parent. With compact set, only the result
 *  names are returned (see results). Unknown node types print as 'model?'.
 *  NOTE(review): Comp.string.create(indent) presumably builds the indentation
 *  string, and NL is not defined in the part of the file visible here -
 *  confirm both.
 */
function print(model,indent, compact) {
  var line='';
  if (compact) return results(model);
  if (indent==undefined) indent=0;
  if (!model) return '';
  var sp = function () {return Comp.string.create(indent);};
  switch (model.type) {
    case NODE_TYPES.RESULT:
      return sp()+'-> '+model.name+NL;
    case NODE_TYPES.FEATURE:
      line=sp()+'$'+model.name+'?'+NL;
      Comp.array.iter(model.vals,function (v) {
        line += print(v,indent+2);
      });
      return line;
    case NODE_TYPES.FEATURE_VALUE:
      line=sp()+'='+(model.val.a==undefined?model.val:'['+model.val.a+','+model.val.b+']')+NL;
      return line+print(model.child,indent+2);
  }
  return 'model?';
}
/**
 * Computes probability of a given value existing in a given list
 * with additional 2*epsilon interval, only applicable to numerical values.
 *
 * Returns the share of list elements lying within [value-eps,value+eps].
 */
function probEps(value, list, eps) {
  // TODO: ranges
  var lo = value-eps,
      hi = value+eps;
  var occurrences = Comp.array.filter(list, function(element) {
    return (element >= lo) && (element <= hi);
  });

  return occurrences.length / list.length;
}

/**
 * Interval variant of probEps: returns the share of list elements whose
 * eps-interval overlaps the eps-interval of the given value.
 *
 * eps is handed on to epsVal. Before, epsVal was called without it, which
 * produced NaN bounds for scalars, so no element ever matched.
 */
function probEps2(value, list, eps) {
  // TODO: ranges
  var reference = epsVal(value,eps);
  var occurrences = Comp.array.filter(list, function(element) {
    return overlap(reference, epsVal(element,eps));
  });

  return occurrences.length / list.length;
}

// NOTE(review): updateTree begins at this point, but its body continues past
// the end of the text available for this review and is cut off before it
// closes, so it cannot be emitted as a complete definition. Its visible opening
// is preserved verbatim below (commented out) so that nothing is lost; rejoin
// it with the remainder of the function when working on the complete file.
//
// /** Incremental update of the model with new training set(s). Can be executed with an empty model.
//  *  The current tree can be week for a new training set (new target).
//  *  This can result in a classification of the new target with insignificant variables.
//  *  Therefore, the last tree node must be exapnded with an additional strong (most significant)
//  *  variable of the new data set (but it is still a heuristic for future updates).
//  */
// function updateTree(model,data, target, features, options) {
//   var eps = options.eps,
//       maxdepth = options.maxdepth,
//       verbose = options.verbose;
//   var featuresINm={},   // All current tree feature variables and their value interval
//       results=[],       // All current tree result leafs
//       set,i,v,feature,remainingFeatures,exists,sigFeature;
//   // 1.
Analysis of existing model + + var analyze = function (model,feature) { + var feature2; + if (!model) return; + switch (model.type) { + case NODE_TYPES.RESULT: + if (!Comp.array.contains(results,model.name)) results.push(model.name); + break; + case NODE_TYPES.FEATURE: + feature2={name:model.name}; + if (!featuresINm[model.name]) featuresINm[model.name]=feature2; + Comp.array.iter(model.vals,function (v) { analyze(v,featuresINm[model.name]) }); + break; + case NODE_TYPES.FEATURE_VALUE: + if (!feature.val) feature.val={ + a:(model.val.a==undefined?model.val:model.val.a), + b:(model.val.a==undefined?model.val:model.val.b) + }; else { + feature.val.a=min(feature.val.a, + (model.val.a==undefined?model.val:model.val.a)); + feature.val.b=max(feature.val.b, + (model.val.a==undefined?model.val:model.val.b)); + } + analyze(model.child); + break; + } + } + + + analyze(model); + // console.log(featuresINm); + // console.log(results); + + exists=Comp.array.contains(results,data[target]); + + + // 2a. Empty model, add first training set with two significant feature variable nodes + function init(set) { + set=data[i]; + sigFeature1=getSignificantFeature(set,features); + remainingFeatures=Comp.array.filter(features,function (feat) { + return sigFeature1.name!=feat; + }); + sigFeature2=getSignificantFeature(set,remainingFeatures); + + featuresINm[sigFeature1.name]={name:sigFeature1.name, + val:{a:sigFeature1.val-eps,b:sigFeature1.val+eps}}; + featuresINm[sigFeature2.name]={name:sigFeature2.name, + val:{a:sigFeature2.val-eps,b:sigFeature2.val+eps}}; + results.push(set[target]); + model=Feature(sigFeature1.name,[ + Value({a:set[sigFeature1.name]-eps,b:set[sigFeature1.name]+eps}, + Feature(sigFeature2.name,[ + Value({a:sigFeature2.val-eps,b:sigFeature2.val+eps}, + Result(set[target])) + ]))]); + return model; + } + + remainingFeatures=Comp.array.filter(features,function (feat) { + return !featuresINm[feat]; + }); + + // 2b. 
Update the tree with the new training set + var update = function (model,set,feature) { + var feature2,p; + if (!model) return; + switch (model.type) { + + case NODE_TYPES.RESULT: + if (model.name != set[target] && verbose) + console.log('Cannot insert new training set '+set[target]+' in tree. No more separating variables!'); + break; + + case NODE_TYPES.FEATURE: + // console.log(set[target]+': '+ model.name+'='+set[model.name]); + if (set[model.name]<(featuresINm[model.name].val.a-eps) || + set[model.name]>(featuresINm[model.name].val.b+eps)) { + // add new training set; done + // the current decision tree can be week, thus add another strong variable node, too! + sigFeature=getSignificantFeature(set,remainingFeatures); + featuresINm[sigFeature.name]={name:sigFeature.name, + val:{a:sigFeature.val-eps,b:sigFeature.val+eps}}; + featuresINm[model.name].val.a=min(featuresINm[model.name].val.a,set[model.name]-eps); + featuresINm[model.name].val.b=max(featuresINm[model.name].val.b,set[model.name]+eps); + if (!Comp.array.contains(results,set[target])) results.push(set[target]); + + model.vals.push(Value({a:set[model.name]-eps,b:set[model.name]+eps}, + Feature(sigFeature.name,[ + Value({a:sigFeature.val-eps,b:sigFeature.val+eps}, + Result(set[target])) + ]))); + model.vals=Comp.array.sort(model.vals,function (v1,v2) {return (lowerBound(v1.val)