Mon 21 Jul 22:43:21 CEST 2025
This commit is contained in:
parent be0958c0bb
commit c981a071e9

805 js/ml/dti.js (Normal file)

@@ -0,0 +1,805 @@
/**
 ** ==============================
 **  O O O OOOO
 **  O O O O O O
 **  O O O O O O
 **  OOOO OOOO O OOO OOOO
 **  O O O O O O O
 **  O O O O O O O
 **  OOOO OOOO O O OOOO
 ** ==============================
 ** Dr. Stefan Bosse http://www.bsslab.de
 **
 ** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
 **            BY THE AUTHOR(S).
 **            THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
 **            MODIFIED, OR OTHERWISE USED IN A CONTEXT
 **            OUTSIDE OF THE SOFTWARE SYSTEM.
 **
 ** $AUTHORS: Stefan Bosse
 ** $INITIAL: (C) 2006-2018 bLAB
 ** $CREATED: 03-03-16 by sbosse.
 ** $VERSION: 1.4.2
 **
 ** $INFO:
 **
 ** Interval Decision Tree Learner
 **
 ** Modified ID3-based decision tree algorithm that wraps all data with 2-eps intervals and uses
 ** interval arithmetic instead of single-value arithmetic for entropy calculation and feature
 ** selection. Classification is based on a nearest-neighbourhood look-up of best matching results.
 **
 ** Two different algorithms are supported:
 **
 ** 1. Static (using learn), the DTI learner using attribute selection based on entropy.
 **    The training data must be available in advance.
 ** 2. Dynamic (using update), the DTI learner using attribute selection based on significance.
 **    The training data is applied sequentially (stream learning), updating the model.
 **
 ** Though in principle both algorithms can be mixed (first static, then dynamic updating),
 ** the resulting model will have poor classification quality. Use either only static or only
 ** dynamic (stream) learning.
 **
 ** Portable model
 **
 ** $ENDOFINFO
 */
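/* Usage sketch (illustrative, not part of the module): a minimal static-learning
 * round trip with the API exported at the bottom of this file. The module path
 * 'ml/dti' and the data keys x, y, c are assumptions for this example only.
 *
 *   var DTI = Require('ml/dti');
 *   var model = DTI.create({
 *     data: [{x:1, y:2, c:'A'}, {x:5, y:1, c:'B'}],
 *     target: 'c', features: ['x','y'], eps: 0.25, maxdepth: 10
 *   });
 *   DTI.predict(model, {x:1.1, y:2.2});  // expected: 'A'
 *   var text = DTI.print(model);         // textual dump of the learned tree
 */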
var Io = Require('com/io');
var Comp = Require('com/compat');
var current=none;
var Aios=none;
var min = Comp.pervasives.min;
var max = Comp.pervasives.max;

/**
 * Map of valid tree node types
 * @constant
 * @static
 */
var NODE_TYPES = {
  RESULT: 'result',
  FEATURE: 'feature',
  FEATURE_VALUE: 'feature_value'
};


// Result leaf node (classification outcome)
function Result(key) {
  return {
    type:NODE_TYPES.RESULT,
    name:key
  }
}

// Inner feature node with a list of Value children
function Feature(name,vals) {
  return {
    type:NODE_TYPES.FEATURE,
    name:name,
    vals:vals
  }
}

// A value can be a scalar or a range {a,b} object
function Value(val,child) {
  return {
    type:NODE_TYPES.FEATURE_VALUE,
    val:val,
    child:child
  }
}

/** Add a new training set with optional data set merging and value interval expansion.
 *
 */
function add_training_set(data,set,merge) {
  if (merge) {
    // Merge a data set with an existing one for a specific key; create value ranges
    // (not implemented yet)
  } else
    data.push(set);
}


/**
 * Computes Log with base-2
 * @private
 */
function log2(n) {
  return Math.log(n) / Math.log(2);
}


// Return a comma-separated list of all result leaves of a (sub-)tree
function results(model) {
  var line='',sep;
  if (!model) return '';
  switch (model.type) {
    case NODE_TYPES.RESULT:
      return model.name;
    case NODE_TYPES.FEATURE:
      sep='';
      line='';
      Comp.array.iter(model.vals,function (v) {
        line += sep+results(v);
        sep=',';
      });
      return line;
    case NODE_TYPES.FEATURE_VALUE:
      return results(model.child);
  }
  return 'result?';
}


/**
 * Finds element with highest occurrence in a list
 * @private
 */
function mostCommon(list) {
  var elementFrequencyMap = {};
  var largestFrequency = -1;
  var mostCommonElement = null;

  list.forEach(function(element) {
    var elementFrequency = (elementFrequencyMap[element] || 0) + 1;
    elementFrequencyMap[element] = elementFrequency;

    if (largestFrequency < elementFrequency) {
      mostCommonElement = element;
      largestFrequency = elementFrequency;
    }
  });

  return mostCommonElement;
}

// Interval-aware addition of two values (scalar or {a,b} interval)
function addVal(v1,v2) {
  if (v1.a!=undefined) {
    if (v2.a!=undefined) return {a:v1.a+v2.a,b:v1.b+v2.b};
    else return {a:v1.a+v2,b:v1.b+v2};
  } else if (v2.a!=undefined) return {a:v2.a+v1,b:v2.b+v1};
  else return v1+v2;
}

function lowerBound(v) {
  if (v.a==undefined) return v; else return v.a;
}

function upperBound(v) {
  if (v.b==undefined) return v; else return v.b;
}

function equal(v1,v2) {
  return (v1==v2 ||
          (upperBound(v1) == upperBound(v2) &&
           lowerBound(v1) == lowerBound(v2)))
}

function overlap(v1,v2) {
  return (upperBound(v1) >= lowerBound(v2) && upperBound(v1) <= upperBound(v2)) ||
         (upperBound(v2) >= lowerBound(v1) && upperBound(v2) <= upperBound(v1))
}
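/* Worked example (illustrative): with the helpers above,
 *   overlap({a:1,b:3}, {a:2,b:5})  // true  (3 falls inside [2,5])
 *   overlap(4, {a:2,b:5})          // true  (scalars act as degenerate intervals)
 *   overlap({a:1,b:3}, {a:4,b:6})  // false (disjoint intervals)
 */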

function containsVal(vl,v) {
  for (var i in vl) {
    var v2=vl[i];
    if (overlap(v,v2)) return true;
  }
  return false;
}

function centerVal(v) {
  if (v.a==undefined) return v; else return (v.a+v.b)/2;
}

function distanceVal (v1,v2) {
  return Math.abs(centerVal(v1)-centerVal(v2));
}

function Bounds(vl,v) {
  if (vl.length==0) return {a:v,b:v};
  else if (v==undefined) return {a:Min(vl),b:Max(vl)};
  else return {a:Min([Min(vl),v]),b:Max([Max(vl),v])};
}

function Min(vals) {
  var min=none;
  Comp.array.iter(vals, function (val) {
    if (min==none) min=(val.a==undefined?val:val.a);
    else min=val.a==undefined?(val<min?val:min):(val.a<min?val.a:min);
  });
  return min;
}

function Max(vals) {
  var max=none;
  Comp.array.iter(vals,function (val) {
    if (max==none) max=(val.b==undefined?val:val.b);
    else max=(val.b==undefined?(val>max?val:max):(val.b>max?val.b:max));
  });
  return max;
}

// Return interval of a value x with a<=x_center-eps, b>=x_center+eps
function epsVal(x,eps) {
  if (x.a == undefined) return {a:x-eps,b:x+eps};
  else if ((x.b-x.a) < 2*eps) return {a:centerVal(x)-eps,b:centerVal(x)+eps};
  else return x;
}
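/* Worked example (illustrative): epsVal widens values to at least 2*eps width:
 *   epsVal(3, 0.5)              // {a:2.5, b:3.5}
 *   epsVal({a:2.9,b:3.1}, 0.5)  // {a:2.5, b:3.5} (too narrow, re-centered)
 *   epsVal({a:1,b:4}, 0.5)      // {a:1, b:4}     (already wide enough)
 */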
/** Filter out unique values that are spaced at least by eps
 *
 */
function uniqueEps(data,eps) {
  var results=[];
  Comp.array.iter(data,function (x) {
    var found;
    if (!results.length) results.push(x);
    else {
      Comp.array.iter(results,function (y,i) {
        if (found) return;
        found = Math.abs(centerVal(x)-centerVal(y))<eps;
        if (found) // create new overlapping value with +-eps extensions
          results[i]={a:Min([x,y])-eps,b:Max([x,y])+eps}
      });
      if (!found) results.push(x);
    }
  });
  return results;
}
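/* Worked example (illustrative):
 *   uniqueEps([1, 1.1, 5], 0.5)
 *   // -> [{a:0.5, b:1.6}, 5]
 *   // 1 and 1.1 are closer than eps and merge into one interval widened
 *   // by +-eps; 5 is far enough away and stays a scalar.
 */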

/** Compact tree: merge nodes and intervals.
 ** adjust=true: adjust overlapping feature variable value intervals!
 */
function compactTree(model,adjust) {
  var i,j,vi,vj,_vals,merged;
  // Return a canonical string for a (sub-)tree; used to detect equal children
  function target(model) {
    var line;
    switch (model.type) {
      case NODE_TYPES.RESULT:
        return model.name;
      case NODE_TYPES.FEATURE:
        line = model.name+'?';
        Comp.array.iter(model.vals,function (v) {
          line += target(v);
        });
        return line;
      case NODE_TYPES.FEATURE_VALUE:
        line='='+(model.val.a==undefined?model.val:'['+model.val.a+','+model.val.b+']')+NL;
        return line+target(model.child);
    }
  }
  if (!model) return model;
  switch (model.type) {
    case NODE_TYPES.RESULT:
      return model;
    case NODE_TYPES.FEATURE:
      _vals=[];
      // 1. Merge value nodes whose children are equal sub-trees
      for (i in model.vals) {
        vi=model.vals[i];
        assert((vi.type==NODE_TYPES.FEATURE_VALUE)||'vi.type==NODE_TYPES.FEATURE_VALUE');
        merged=false;
        loopj: for(j in _vals) {
          vj=_vals[j];
          if (target(vi.child)==target(vj.child)) {
            merged=true;
            vj.val={a:Min([vi.val,vj.val]),b:Max([vi.val,vj.val])}
            break loopj;
          }
        }
        if (!merged) {
          _vals.push(vi);
          vi.child=compactTree(vi.child,adjust); // propagate the adjust flag
        }
      }
      // 2. Adjust overlapping value intervals!
      if (adjust) {
        // TODO: approach too simple!!!!
        for (i in _vals) {
          i=Comp.pervasives.int_of_string(i);
          if (_vals[i+1]) {
            if (upperBound(_vals[i].val) > lowerBound(_vals[i+1].val)) {
              if (_vals[i].val.b) _vals[i].val.b=lowerBound(_vals[i+1].val)-1;
              else _vals[i+1].val.a=upperBound(_vals[i].val)+1;
            }
          }
        }
      }

      model.vals=_vals;
      return model;
    case NODE_TYPES.FEATURE_VALUE:
      return model;
  }
}

/** Creates a new tree from training data (data)
 *
 * data is {x1:v1,x2:v2,..,y:vn} []
 * target is the classification key name
 * features is ['x1','x2',..] w/o the target variable
 * options.eps is the interval half-width applied to all data values
 *
 */
function createTree(data, target, features, options) {
  var _newS,child_node,bounds;

  var targets = Comp.array.unique(Comp.array.pluck(data, target));
  if (options.maxdepth==undefined) options.maxdepth=1;
  if (options.maxdepth==0) return Result('-');

  //Aios.aios.log('createTree:'+targets.length);
  if (Aios) Aios.aios.CP();
  if (targets.length == 1) return Result(targets[0]);

  if (features.length == 0) {
    var topTarget = mostCommon(targets);
    return Result(topTarget)
  }
  var bestFeatures = getBestFeatures(data, target, features, options.eps);
  var bestFeature = bestFeatures[0];

  var remainingFeatures = Comp.array.filtermap(bestFeatures,function (feat) {
    if (feat.name!=bestFeature.name) return feat.name;
    else return none;
  });

  var possibleValues = getPossibleVals(data,bestFeature.name);

  var vals=[];

  var partitions=partitionVals(possibleValues,options.eps);

  if (partitions.length==1) {
    // No further 2*eps separation possible; find the best feature by largest distance:
    // re-sort the best feature list with respect to value deviation
    bestFeatures.sort(function (ef1,ef2) {
      if (ef1.d > ef2.d) return -1; else return 1;
    });
    bestFeature = bestFeatures[0];
    possibleValues = getPossibleVals(data,bestFeature.name);
    Comp.array.iter(mergeVals(possibleValues),function (val,i) {

      _newS = data.filter(function(x) {
        return overlap(val,x[bestFeature.name]);
      });
      child_node = Value(val);
      options.maxdepth--;
      child_node.child = createTree(_newS, target, remainingFeatures, options);
      vals.push(child_node);
    })

  } else Comp.array.iter(partitions,function (partition,i) {

    _newS = data.filter(function(x) {
      return containsVal(partition,x[bestFeature.name]);
    });
    bounds = Bounds(partition);
    child_node = Value(options.eps==0?{a:bounds.a,b:bounds.b}:{a:bounds.a-options.eps,b:bounds.b+options.eps});
    options.maxdepth--;
    child_node.child = createTree(_newS, target, remainingFeatures, options);
    vals.push(child_node);
  });

  return Feature(bestFeature.name,vals);
}

/** Return the depth of the tree
 *
 */
function depth(model) {
  switch (model.type) {
    case NODE_TYPES.RESULT: return 0;
    case NODE_TYPES.FEATURE:
      return 1+Comp.array.max(model.vals,function (val) {
        return depth(val);
      });
    case NODE_TYPES.FEATURE_VALUE:
      return depth(model.child);
  }
  return 0;
}

/** Computes entropy of a list with 2-epsilon intervals
 *
 */
function entropyEps(vals,eps) {
  // TODO: overlapping value intervals
  var uniqueVals = Comp.array.unique(vals);
  var probs = uniqueVals.map(function(x) {
    return probEps(x, vals, eps)
  });

  var logVals = probs.map(function(p) {
    return -p * log2(p)
  });

  return logVals.reduce(function(a, b) {
    return a + b
  }, 0);
}
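/* Worked example (illustrative): for a perfectly balanced two-class list,
 *   entropyEps([1,1,2,2], 0)
 *   // unique values [1,2], probabilities [0.5,0.5]
 *   // -> -0.5*log2(0.5) - 0.5*log2(0.5) = 1 bit
 */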

function entropyEps2(vals,eps) {
  // TODO: overlapping value intervals
  var uniqueVals = uniqueEps(vals,eps);
  var probs = uniqueVals.map(function(x) {
    return probEps2(x, vals, eps)
  });

  var logVals = probs.map(function(p) {
    return -p * log2(p)
  });

  return logVals.reduce(function(a, b) {
    return a + b
  }, 0);
}


function getBestFeatures(data,target,features,eps) {
  var bestfeatures=[];
  // Variance of the interval centers of a value list
  function deviation(vals) {
    var n = vals.length;
    var mu=Comp.array.sum(vals,function (val) {
      return (lowerBound(val)+upperBound(val))/2;
    })/n;
    var dev=Comp.array.sum(vals,function (val) {
      return Math.pow(((lowerBound(val)+upperBound(val))/2)-mu,2);
    })/n;
    return dev;
  }
  for (var feature in features) {
    if (features[feature]==undefined) throw 'DTI.getBestFeatures: invalid feature vector';
    var vals=Comp.array.pluck(data, features[feature]).map(function (val) {return val==undefined?0:val});
    var e = entropyEps(vals,eps);
    var d = deviation(vals);
    var min = Min(vals);
    var max = Max(vals);
    bestfeatures.push({e:e,d:d,range:{a:min,b:max},name:features[feature]});
  }
  // Sort by decreasing entropy: highest-entropy feature first
  bestfeatures.sort(function (ef1,ef2) {
    if (ef1.e > ef2.e) return -1; else return 1;
  });
  return bestfeatures;
}

/** Find in one data set the most significant feature variable (i.e., the one with the highest value)
 */
function getSignificantFeature(data,features) {
  var f,sig;
  for (f in features) {
    if (sig==undefined || sig.val < data[features[f]]) sig={name:features[f],val:data[features[f]]};
  }
  return sig;
}
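/* Worked example (illustrative): the most significant variable is simply the
 * one with the largest value in the given record:
 *   getSignificantFeature({x:1, y:5, c:'A'}, ['x','y'])
 *   // -> {name:'y', val:5}
 */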

function getPossibleVals(data,feature) {
  return Comp.array.sort(Comp.array.pluck(data, feature), function (x,y) {
    if (upperBound(x) < lowerBound(y)) return -1; else return 1; // increasing value order
  });
}

/** Merge values and intervals
 */
function mergeVals(vals) {
  var _vals,
      merged,i,j;
  for (i in vals) {
    var vi = vals[i];
    if (!_vals) _vals=[vi];
    else {
      // Find overlapping values and merge
      merged=false;
      loopj: for (j in _vals) {
        var vj = _vals[j];
        if (equal(vi,vj)) {
          merged=true;
          break loopj;
        }
        else if (overlap(vi,vj)) {
          merged=true;
          _vals[j]={a:Min([vi,vj]),b:Max([vi,vj])};
          break loopj;
        }
      }
      if (!merged) _vals.push(vi);
    }
  }
  //Aios.aios.log(_vals);
  return _vals||[];
}

/**
 * Find the entry of vals nearest to sample (optionally projected through fun)
 */
function nearestVal(vals,sample,fun) {
  var best=none;
  for (var v in vals) {
    var d=fun?distanceVal(fun(vals[v]),sample):distanceVal(vals[v],sample);
    if (best==none)
      best={v:vals[v],d:d};
    else if (best.d > d)
      best={v:vals[v],d:d};
  }
  if (best) return best.v;
  else return none;
}


/** Partition an ordered set of values.
 * Each partition of values has at least 2*eps distance to the next partition.
 *
 */
function partitionVals(vals,eps) {
  var partitions=[];
  var partition=[];
  for(var i in vals) {
    var val0=vals[i];
    var val1=vals[i-1];

    if (val1==undefined) partition.push(val0);
    else if ( upperBound(val0) < upperBound(addVal(val1,2*eps))) partition.push(val0);
    else {
      partitions.push(partition);
      partition=[val0];
    }
  }
  if (partition.length>0) partitions.push(partition);
  return partitions;
}
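/* Worked example (illustrative): values closer than 2*eps stay in one partition,
 * a gap of at least 2*eps starts a new one:
 *   partitionVals([1, 2, 10], 1)
 *   // -> [[1, 2], [10]]   (2 < 1+2*1 keeps 2 with 1; 10 >= 2+2*1 splits)
 */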

/** Make a prediction with sample data
 *
 */
function predict(model,sample) {
  var root = model;
  while (root && root.type !== NODE_TYPES.RESULT) {
    var attr = root.name;
    var sampleVal = sample[attr];
    var childNode = nearestVal(root.vals,sampleVal,function (node) {
      return node.val;
    });

    if (childNode){
      root = childNode.child;
    } else {
      root = none;
    }
  }
  if (root) return root.name||root.val;
  else return none;
}

/** Print the tree
 *
 */
function print(model,indent, compact) {
  var line='',sep;
  if (compact) return results(model);
  if (indent==undefined) indent=0;
  if (!model) return '';
  var sp = function () {return Comp.string.create(indent);};
  switch (model.type) {
    case NODE_TYPES.RESULT:
      return sp()+'-> '+model.name+NL;
    case NODE_TYPES.FEATURE:
      line=sp()+'$'+model.name+'?'+NL;
      Comp.array.iter(model.vals,function (v) {
        line += print(v,indent+2);
      });
      return line;
    case NODE_TYPES.FEATURE_VALUE:
      line=sp()+'='+(model.val.a==undefined?model.val:'['+model.val.a+','+model.val.b+']')+NL;
      return line+print(model.child,indent+2);
  }
  return 'model?';
}

/**
 * Computes the probability of a given value existing in a given list
 * with an additional 2*epsilon interval; only applicable to numerical values.
 */
function probEps(value, list, eps) {
  // TODO: ranges
  var occurrences = Comp.array.filter(list, function(element) {
    return (element >= (value-eps)) && (element <= (value+eps));
  });

  var numOccurrences = occurrences.length;
  var numElements = list.length;
  return numOccurrences / numElements;
}
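/* Worked example (illustrative):
 *   probEps(2, [1, 2, 3, 10], 1)
 *   // elements falling into [2-1, 2+1] = [1,3]: 1, 2, 3
 *   // -> 3/4 = 0.75
 */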

function probEps2(value, list, eps) {
  // TODO: ranges
  var occurrences = Comp.array.filter(list, function(element) {
    return overlap(epsVal(value,eps), epsVal(element,eps));
  });

  var numOccurrences = occurrences.length;
  var numElements = list.length;
  return numOccurrences / numElements;
}

/** Incremental update of the model with new training set(s). Can be executed with an empty model.
 * The current tree can be weak for a new training set (new target).
 * This can result in a classification of the new target with insignificant variables.
 * Therefore, the last tree node must be expanded with an additional strong (most significant)
 * variable of the new data set (but it is still a heuristic for future updates).
 */
function updateTree(model,data, target, features, options) {
  var eps = options.eps,
      maxdepth = options.maxdepth,
      verbose = options.verbose;
  var featuresINm={}, // All current tree feature variables and their value interval
      results=[],     // All current tree result leaves
      set,i,v,feature,remainingFeatures,exists,sigFeature,sigFeature1,sigFeature2;

  // 1. Analysis of the existing model
  var analyze = function (model,feature) {
    var feature2;
    if (!model) return;
    switch (model.type) {
      case NODE_TYPES.RESULT:
        if (!Comp.array.contains(results,model.name)) results.push(model.name);
        break;
      case NODE_TYPES.FEATURE:
        feature2={name:model.name};
        if (!featuresINm[model.name]) featuresINm[model.name]=feature2;
        Comp.array.iter(model.vals,function (v) { analyze(v,featuresINm[model.name]) });
        break;
      case NODE_TYPES.FEATURE_VALUE:
        if (!feature.val) feature.val={
          a:(model.val.a==undefined?model.val:model.val.a),
          b:(model.val.a==undefined?model.val:model.val.b)
        }; else {
          feature.val.a=min(feature.val.a,
                            (model.val.a==undefined?model.val:model.val.a));
          feature.val.b=max(feature.val.b,
                            (model.val.a==undefined?model.val:model.val.b));
        }
        analyze(model.child);
        break;
    }
  }

  analyze(model);

  exists=Comp.array.contains(results,data[target]);

  // 2a. Empty model: add the first training set with two significant feature variable nodes
  function init(set) {
    sigFeature1=getSignificantFeature(set,features);
    remainingFeatures=Comp.array.filter(features,function (feat) {
      return sigFeature1.name!=feat;
    });
    sigFeature2=getSignificantFeature(set,remainingFeatures);

    featuresINm[sigFeature1.name]={name:sigFeature1.name,
                                   val:{a:sigFeature1.val-eps,b:sigFeature1.val+eps}};
    featuresINm[sigFeature2.name]={name:sigFeature2.name,
                                   val:{a:sigFeature2.val-eps,b:sigFeature2.val+eps}};
    results.push(set[target]);
    model=Feature(sigFeature1.name,[
      Value({a:set[sigFeature1.name]-eps,b:set[sigFeature1.name]+eps},
        Feature(sigFeature2.name,[
          Value({a:sigFeature2.val-eps,b:sigFeature2.val+eps},
            Result(set[target]))
        ]))]);
    return model;
  }

  remainingFeatures=Comp.array.filter(features,function (feat) {
    return !featuresINm[feat];
  });

  // 2b. Update the tree with the new training set
  var update = function (model,set,feature) {
    var feature2;
    if (!model) return;
    switch (model.type) {

      case NODE_TYPES.RESULT:
        if (model.name != set[target] && verbose)
          console.log('Cannot insert new training set '+set[target]+' in tree. No more separating variables!');
        break;

      case NODE_TYPES.FEATURE:
        if (set[model.name]<(featuresINm[model.name].val.a-eps) ||
            set[model.name]>(featuresINm[model.name].val.b+eps)) {
          // Add new training set; done.
          // The current decision tree can be weak, thus add another strong variable node, too!
          sigFeature=getSignificantFeature(set,remainingFeatures);
          featuresINm[sigFeature.name]={name:sigFeature.name,
                                        val:{a:sigFeature.val-eps,b:sigFeature.val+eps}};
          featuresINm[model.name].val.a=min(featuresINm[model.name].val.a,set[model.name]-eps);
          featuresINm[model.name].val.b=max(featuresINm[model.name].val.b,set[model.name]+eps);
          if (!Comp.array.contains(results,set[target])) results.push(set[target]);

          model.vals.push(Value({a:set[model.name]-eps,b:set[model.name]+eps},
            Feature(sigFeature.name,[
              Value({a:sigFeature.val-eps,b:sigFeature.val+eps},
                Result(set[target]))
            ])));
          model.vals=Comp.array.sort(model.vals,function (v1,v2) {return (lowerBound(v1.val)<lowerBound(v2.val))?-1:1});
        } else {
          // Go deeper, but extend the interval of the best matching child node with the new data variable
          Comp.array.iter_break(model.vals,function (fv) {
            if (overlap(fv.val,{a:set[model.name]-eps,b:set[model.name]+eps})) {
              fv.val.a=min(lowerBound(fv.val),set[model.name]-eps);
              fv.val.b=max(upperBound(fv.val),set[model.name]+eps);
              update(fv,set,model.name);
              return true;
            } else return false;
          });
        }
        break;

      case NODE_TYPES.FEATURE_VALUE:
        update(model.child,set);
        break;
    }
  }

  for (i in data) {
    set=data[i];
    if (model==undefined || model.type==undefined)
      model=init(set);
    else
      update(model,set);
  }
  return model;
}

module.exports = {
  NODE_TYPES:NODE_TYPES,
  compactTree:compactTree,
  create:function (options) {
    // type options = {data: number [][], target: string, features: string [], eps: number, maxdepth: number}
    return createTree(options.data,options.target,options.features,options)
  },
  depth:depth,
  entropy:entropyEps,
  evaluate:function evaluate(model,target,samples){}, // TODO: not implemented yet
  predict:predict,
  print:print,
  results:results,
  update:function (model,options) {
    // type options = {data: number [][], target: string, features: string [], eps: number, maxdepth: number}
    return updateTree(model,options.data,options.target,options.features,options)
  },
  current:function (module) { current=module.current; Aios=module;}
};
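/* Usage sketch (illustrative, not part of the module): incremental (stream)
 * learning with update(); the module path and data keys are assumptions.
 * Starting from an undefined model, the first set initializes the tree:
 *
 *   var DTI = Require('ml/dti');
 *   var m;
 *   m = DTI.update(m, {data:[{x:1, y:2, c:'A'}], target:'c', features:['x','y'], eps:0.25});
 *   m = DTI.update(m, {data:[{x:5, y:1, c:'B'}], target:'c', features:['x','y'], eps:0.25});
 *   DTI.predict(m, {x:5.1, y:0.9});  // expected: 'B'
 */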