From e1d7ab15f6358cf2067753b1ced4a5d9186e4dcc Mon Sep 17 00:00:00 2001 From: sbosse Date: Mon, 21 Jul 2025 23:07:11 +0200 Subject: [PATCH] Mon 21 Jul 22:43:21 CEST 2025 --- js/ml/dts.js | 477 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 477 insertions(+) create mode 100644 js/ml/dts.js diff --git a/js/ml/dts.js b/js/ml/dts.js new file mode 100644 index 0000000..51ad6aa --- /dev/null +++ b/js/ml/dts.js @@ -0,0 +1,477 @@ +/** + ** ================================== + ** OOOO OOOO OOOO O O OOOO + ** O O O O O O O O O + ** O O O O O O O O O + ** OOOO OOOO OOOO O OOO OOOO + ** O O O O O O O O O + ** O O O O O O O O O + ** OOOO OOOO OOOO OOOO O O OOOO + ** ================================== + ** BSSLAB, Dr. Stefan Bosse http://www.bsslab.de + ** + ** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED + ** BY THE AUTHOR. + ** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED, + ** MODIFIED, OR OTHERWISE USED IN A CONTEXT + ** OUTSIDE OF THE SOFTWARE SYSTEM. + ** + ** $AUTHORS: Stefan Bosse + ** $INITIAL: (C) 2006-2016 BSSLAB + ** $CREATED: 22/04/16 by sbosse. + ** $VERSION: 1.1.1 + ** + ** $INFO: + ** + ** ID3 eps-Entropy-sigma Interval Decision Tree Algorithm for Strings + ** + ** String ::= {c} + ** c ::= 'a'-'z''A'-'Z''-''*''!' + ** + ** $ENDOFINFO + */ +var _ = Require('ml/lodash'); +var Io = Require('com/io'); +var Comp = Require('com/compat'); +var current=none; +var Aios=none; + +/** + * Map of valid tree node types + * @constant + * @static + */ +var NODE_TYPES = { + RESULT: 'result', + FEATURE: 'feature', + FEATURE_VALUE: 'feature_value' +}; + + +function Result(key) { + return { + type:NODE_TYPES.RESULT, + name:key + } +} + +function Feature(name,vals) { + return { + type:NODE_TYPES.FEATURE, + name:name, + vals:vals + } +} + +// A value can be a scalar or a range {a,b} object +function Value(val,child) { + return { + type:NODE_TYPES.FEATURE_VALUE, + val:val, + child:child + } +} + + +function decA(c) { + var c,cv; + if (c==undefined || c=='*' || c=='0' || c=='-') return 0; + if (c=='!') return 100; + cv=c.charCodeAt(); + if (c>='a' && c<='z') cv=cv-96; // 1..26 + else if (c>='A' && c<='Z') cv=cv-38; // 27..52 + else cv=0; + return cv; +} + +/** Naive Hammer(ing) Distance of two strings (based on alg. from Mitch Anderson). + * 0: equal + * >0: not equal + * + */ +function Distance(str1, str2) { + var dist = 0; + if (str1.length= (value-eps)) && (d <= (value+eps)); + }); + + var numOccurrences = occurrences.length; + var numElements = list.length; + return numOccurrences / numElements; +} + +/** + * Computes Log with base-2 + * @private + */ +function log2(n) { + return Math.log(n) / Math.log(2); +} + +function depth(model) { + switch (model.type) { + case NODE_TYPES.RESULT: return 0; + case NODE_TYPES.FEATURE: + return 1+Comp.array.max(model.vals,function (val) { + return depth(val); + }); + case NODE_TYPES.FEATURE_VALUE: + return depth(model.child); + } + return 0; +} + + +function print(model) { + var line='',sep; + if (!model) return ''; + switch (model.type) { + case NODE_TYPES.RESULT: + return ' -> '+model.name; + case NODE_TYPES.FEATURE: + line='('+model.name+'?'; + sep=''; + Comp.array.iter(model.vals,function (v) { + line += sep+print(v); + sep=','; + }); + return line+')'; + case NODE_TYPES.FEATURE_VALUE: + return ' '+(model.val.a==undefined?model.val:'['+model.val.a+','+model.val.b+']')+ + ':'+print(model.child); + } + return 'model?'; +} + +function results(model) { + var line='',sep; + if (!model) return ''; + switch (model.type) { + case NODE_TYPES.RESULT: + return model.name; + case NODE_TYPES.FEATURE: + sep=''; + line=''; + Comp.array.iter(model.vals,function (v) { + line += sep+results(v); + sep=','; + }); + return line; + case NODE_TYPES.FEATURE_VALUE: + return results(model.child); + } + return 'result?'; +} + + +/** + * Finds element with highest occurrence in a list + * @private + */ +function mostCommon(list) { + var elementFrequencyMap = {}; + var largestFrequency = -1; + var mostCommonElement = null; + + list.forEach(function(element) { + var elementFrequency = (elementFrequencyMap[element] || 0) + 1; + elementFrequencyMap[element] = elementFrequency; + + if (largestFrequency < elementFrequency) { + mostCommonElement = element; + largestFrequency = elementFrequency; + } + }); + + return mostCommonElement; +} + +function addVal(v1,v2) { + if (v1.a!=undefined) { + if (v2.a!=undefined) return {a:v1.a+v2.a,b:v1.b+v2.b}; + else return {a:v1.a+v2,b:v2.b+v2}; + } else if (v2.a!=undefined) return {a:v2.a+v1,b:v2.b+v1}; + else return v1+v2; +} + +function lowerBound(v) { + if (v.a==undefined) return v; else return v.a; +} + +function upperBound(v) { + if (v.b==undefined) return v; else return v.b; +} + +function overlap(v1,v2) { + return (upperBound(v1) >= lowerBound(v2) && upperBound(v1) <= upperBound(v2)) || + (upperBound(v2) >= lowerBound(v1) && upperBound(v2) <= upperBound(v1)) +} + +function containsVal(vl,v) { + for (var i in vl) { + var v2=vl[i]; + if (overlap(v,v2)) return true; + } + return false; +} + +function centerVal(v) { + if (v.a==undefined) return v; else return (v.a+v.b)/2; +} + +function distanceVal (v1,v2) { + return Math.abs(centerVal(v1)-centerVal(v2)); +} + +function Bounds(vl,v) { + if (vl.length==0) return {a:v,b:v}; + else if (v==undefined) return {a:Min(vl),b:Max(vl)}; + else return {a:Min([Min(vl),v]),b:Max([Max(vl),v])}; +} + +function Min(vals) { + var min=none; + Comp.array.iter(vals, function (val) { + if (min==none) min=(val.a==undefined?val:val.a); + else min=val.a==undefined?(valmax?val:max):(val.b>max?val.a:max)); + }); + return max; +} + +function getBestFeatures(data,target,features,eps) { + var bestfeatures=[]; + function deviation(vals) { + var n = vals.length; + var mu=Comp.array.sum(vals,function (val) { + return (lowerBound(val)+upperBound(val))/2; + })/n; + var dev=Comp.array.sum(vals,function (val) { + return Math.pow(((lowerBound(val)+upperBound(val))/2)-mu,2); + })/n; + return dev; + } + for (var feature in features) { + var vals=_.pluck(data, features[feature]); + var e = entropyEps(vals,eps); + var d = deviation(vals); + var min = Min(vals); + var max = Max(vals); + bestfeatures.push({e:e,d:d,range:{a:min,b:max},name:features[feature]}); + } + bestfeatures.sort(function (ef1,ef2) { + if (ef1.e > ef2.e) return -1; else return 1; + }); + return bestfeatures; +} + +/** Parttition an ordered set of values + * Each partition of values has at least 2*eps distance to the next partition. + * + */ +function partitionVals(vals,eps) { + var last=none; + var partitions=[]; + var partition=[]; + for(var i in vals) { + var val0=vals[i]; + var val1=vals[i-1]; + + if (val1==undefined) partition.push(val0); + else if ( upperBound(val0) < upperBound(addVal(val1,2*eps))) partition.push(val0); + else { + partitions.push(partition); + partition=[val0]; + } + } + if (partition.length>0) partitions.push(partition); + return partitions; +} + +function getPossibleVals(data,feature) { + return Comp.array.sort(_.pluck(data, feature), function (x,y) { + if (upperBound(x) < lowerBound(y)) return -1; else return 1; // increasing value order + }); +} + +/** + * Creates a new tree + */ +function createTree(data, target, features, eps) { + var _newS,child_node,bounds; + var targets = _.unique(_.pluck(data, target)); + + //console.log(data); + //console.log(features); + + // Aios.aios.log('createTree:'+targets.length); + if (targets.length == 1) return Result(targets[0]); + + if (features.length == 0) { + var topTarget = mostCommon(targets); + return Result(topTarget) + } + var bestFeatures = getBestFeatures(data, target, features, eps); + var bestFeature = bestFeatures[0]; + var remainingFeatures = Comp.array.filtermap(bestFeatures,function (feat) { + if (feat.name!=bestFeature.name) return feat.name; + else return none; + }); + var possibleValues = Comp.array.sort(_.pluck(data, bestFeature.name), function (x,y) { + if (upperBound(x) < lowerBound(y)) return -1; else return 1; // increasing value order + }); + var vals=[]; + + //console.log(bestFeatures); + //console.log(possibleValues); + var partitions=partitionVals(possibleValues,eps); + // Aios.aios.log(partitions); + //console.log(bestFeatures); + //console.log(possibleValues); + if (partitions.length==1) { + // no further 2*eps separation possible, find best feature by largest distance + // resort beat feature list with respect to value deviation + bestFeatures.sort(function (ef1,ef2) { + if (ef1.d > ef2.d) return -1; else return 1; + }); + bestFeature = bestFeatures[0]; + possibleValues = getPossibleVals(data,bestFeature.name); + Comp.array.iter(possibleValues,function (val,i) { + + _newS = data.filter(function(x) { + // console.log(x[bestFeature.name],val,overlap(val,x[bestFeature.name])) + + return overlap(val,x[bestFeature.name]); + }); + child_node = Value(val); + child_node.child = createTree(_newS, target, remainingFeatures, eps); + //console.log(_newS); + vals.push(child_node); + }) + + } else Comp.array.iter(partitions,function (partition,i) { + + _newS = data.filter(function(x) { + // console.log(x[bestFeature.name],v,overlap(x[bestFeature.name],v)) + return containsVal(partition,x[bestFeature.name]); + }); + bounds = Bounds(partition); + child_node = Value(eps==0?v:{a:bounds.a-eps,b:bounds.b+eps}); + child_node.child = createTree(_newS, target, remainingFeatures, eps); + //console.log(_newS); + vals.push(child_node); + }); + + return Feature(bestFeature.name,vals); +} + +/** + * Predicts class for sample + */ +function nearestVal(vals,sample,fun) { + var best=none; + for (var v in vals) { + var d=fun?distanceVal(fun(vals[v]),sample):distanceVal(vals[v],sample); + if (best==none) + best={v:vals[v],d:d}; + else if (best.d > d) + best={v:vals[v],d:d}; + } + if (best) return best.v; + else return none; +} + +function predict(model,sample) { + var root = model; + while (root && root.type !== NODE_TYPES.RESULT) { + var attr = root.name; + var sampleVal = sample[attr]; + var childNode = nearestVal(root.vals,sampleVal,function (node) { + return node.val; + }); + + if (childNode){ + root = childNode.child; + } else { + root = none; + } + } + if (root) return root.name||root.val; + else return none; +}; + +module.exports = { + NODE_TYPES:NODE_TYPES, + createTree:createTree, + depth:depth, + entropy:entropyEps, + evaluate:function evaluate(model,target,samples){}, + predict:predict, + print:print, + results:results, + current:function (module) { current=module.current; Aios=module;} +}; + +