Mon 21 Jul 22:43:21 CEST 2025

This commit is contained in:
sbosse 2025-07-21 23:07:42 +02:00
parent c2742ffb2c
commit 7e75307ca2

412
js/ml/C45.js Normal file
View File

@ -0,0 +1,412 @@
/**
** ==============================
** O O O OOOO
** O O O O O O
** O O O O O O
** OOOO OOOO O OOO OOOO
** O O O O O O O
** O O O O O O O
** OOOO OOOO O O OOOO
** ==============================
** Dr. Stefan Bosse http://www.bsslab.de
**
** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
** BY THE AUTHOR(S).
** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
** MODIFIED, OR OTHERWISE USED IN A CONTEXT
** OUTSIDE OF THE SOFTWARE SYSTEM.
**
** $AUTHORS: ?, Stefan Bosse
** $INITIAL: (C) ?
** $MODIFIED: (C) 2006-2018 bLAB by sbosse
** $VERSION: 1.1.6
**
** $INFO:
**
** C45 Decision Tree ML Algorithm
**
** Portable model
**
** $ENDOFINFO
*/
'use strict';
var Io = Require('com/io');
var Comp = Require('com/compat');
// Module-level references, both initialised to `none` (presumably a global
// placeholder supplied by the host framework - confirm). They are assigned by
// the exported `current(module)` function: `Aios` receives the passed module
// object and `current` receives its `current` property.
var current=none;
var Aios=none;
/**
 * Tags stored in the `type` field of every node of a trained tree.
 */
var NODE_TYPES = {
  // Leaf carrying the predicted class
  RESULT:           'result',
  // Inner node splitting a numeric feature at a cut value
  FEATURE_NUMBER:   'feature_number',
  // Branch wrapper below a split, pointing to the sub tree in `child`
  FEATURE_VALUE:    'feature_value',
  // Inner node splitting a symbolic (categorical) feature
  FEATURE_CATEGORY: 'feature_category'
};
/**
 * Return the distinct items of an array-like collection in first-seen order.
 *
 * Items are compared by their string form (they are used as object keys), so
 * 1 and '1' count as the same value and the first one seen is kept.
 *
 * @param {array} col - collection to de-duplicate
 * @returns {array} new array holding each distinct item once
 */
function unique(col) {
  // A prototype-less map is required here: with a plain {} the values
  // 'hasOwnProperty' (overwrites the lookup method -> TypeError) and
  // '__proto__' (never recorded -> duplicates kept) break the bookkeeping.
  var seen = Object.create(null), result = [];
  for (var i = 0, l = col.length; i < l; ++i) {
    var item = col[i];
    if (item in seen) {
      continue;
    }
    result.push(item);
    seen[item] = 1;
  }
  return result;
}
/**
 * Search a collection with a predicate.
 *
 * Every item is visited (there is no early exit), so when several items
 * satisfy the predicate the LAST one is returned.
 *
 * @param {array} col - collection providing forEach
 * @param {function} pred - called with each item; a truthy result is a match
 * @returns {*} the last matching item, or undefined if nothing matched
 */
function find(col, pred) {
  var match;
  col.forEach(function (candidate) {
    if (pred(candidate)) match = candidate;
  });
  return match;
}
/**
 * Pick the element of an array with the highest score.
 *
 * Ties are resolved in favour of the later element (comparison is >=).
 * If no element qualifies (empty array, or all scores are NaN) the number
 * -Infinity is returned instead of an element.
 *
 * @param {array} array - candidates
 * @param {function} fn - maps a candidate to its numeric score
 * @returns {*} the best candidate, or -Infinity when there is none
 */
function max(array, fn) {
  var bestScore = -Infinity;
  var bestAt = -1;
  var pos = 0;
  while (pos < array.length) {
    var score = fn(array[pos]);
    if (score >= bestScore) {
      bestAt = pos;
      bestScore = score;
    }
    pos++;
  }
  if (bestAt < 0) {
    return bestScore;
  }
  return array[bestAt];
}
/**
 * Return a sorted shallow copy of an array-like collection; the input is
 * left untouched.
 *
 * Note: despite the name, fn is handed to Array.prototype.sort unchanged,
 * i.e. it is a two-argument comparator, not a key function.
 *
 * @param {array} col - collection to copy and sort
 * @param {function} fn - comparator (a, b) -> number
 * @returns {array} the sorted copy
 */
function sortBy(col, fn) {
  var copy = Array.prototype.slice.call(col);
  copy.sort(fn);
  return copy;
}
/**
 * C4.5 decision tree learner.
 *
 * All functions work on a plain model record ({features, targets, model})
 * and reference each other through `C45.` (never `this`), so single
 * functions can be exported and called detached.
 *
 * Data layout: every training row is an array whose LAST element is the
 * class label; the value of a feature is found at the position of the
 * feature name inside model.features.
 */
var C45 = {
  /**
   * Create an empty (untrained) model record.
   *
   * @returns {object} {features: [], targets: [], model: null}
   */
  create: function () {
    return {
      features : [],
      targets: [],
      model: null
    };
  },

  /**
   * train
   *
   * Builds the tree and stores it in model.model; model.features and
   * model.targets are overwritten as well. Nothing is returned.
   *
   * @param {object} model - record returned by create()
   * @param {object} options
   * @param {array} options.data - training data (rows, class label in the last column)
   * @param {string} options.target - class label name (only passed through)
   * @param {array} options.features - features names, in column order
   * @param {array} options.featureTypes - features type (ie 'category', 'number')
   * @throws {Error} if a feature type is neither 'number' nor 'category'
   */
  train: function(model,options) {
    var data = options.data,
        target = options.target,
        features = options.features,
        featureTypes = options.featureTypes;
    featureTypes.forEach(function(f) {
      if (['number','category'].indexOf(f) === -1) {
        throw new Error('C4.5: Unrecognized option!');
      }
    });
    model.features = features;
    model.targets = unique(C45._targetColumn(data));
    // model is the generated tree structure
    model.model = C45._c45(model, data, target, features, featureTypes, 0);
  },

  /**
   * Extract the class label (last element) of every row.
   *
   * @param {array} data - rows
   * @returns {array} one label per row, duplicates included
   */
  _targetColumn: function(data) {
    return data.map(function(d) {
      return d[d.length-1];
    });
  },

  /**
   * Recursively build the (sub) tree for a set of rows.
   *
   * @param {object} model - model record; model.features maps names to columns
   * @param {array} data - rows of the current partition
   * @param {string} target - class label name (only passed through)
   * @param {array} features - feature names still available for splitting
   * @param {array} featureTypes - types, indexed like model.features
   * @param {number} depth - recursion depth (only passed through)
   * @returns {object} tree node
   */
  _c45: function(model, data, target, features, featureTypes, depth) {
    var labels = C45._targetColumn(data);
    var targets = unique(labels);
    if (!targets.length) {
      // empty partition
      return {
        type: NODE_TYPES.RESULT,
        value: 'none data',
        name: 'none data'
      };
    }
    if (targets.length === 1) {
      // pure partition
      return {
        type: NODE_TYPES.RESULT,
        value: targets[0],
        name: targets[0]
      };
    }
    if (!features.length) {
      // Nothing left to split on: majority vote. The vote must run over the
      // raw label column - in the de-duplicated list every label occurs once.
      var topTarget = C45.mostCommon(labels);
      return {
        type: NODE_TYPES.RESULT,
        value: topTarget,
        name: topTarget
      };
    }
    var bestFeatureData = C45.maxGain(model, data, target, features, featureTypes);
    var bestFeature = bestFeatureData.feature;
    var column = model.features.indexOf(bestFeature);
    var remainingFeatures = features.slice(0);
    remainingFeatures.splice(features.indexOf(bestFeature), 1);
    var node;
    if (featureTypes[column] === 'category') {
      var possibleValues = unique(data.map(function(d) {
        return d[column];
      }));
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_CATEGORY,
        // one branch per value seen in this partition
        values: possibleValues.map(function(v) {
          var newData = data.filter(function(x) {
            return x[column] === v;
          });
          return {
            name: v,
            type: NODE_TYPES.FEATURE_VALUE,
            child: C45._c45(model, newData, target, remainingFeatures, featureTypes, depth+1)
          };
        })
      };
    } else if (featureTypes[column] === 'number') {
      var cut = bestFeatureData.cut;
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_NUMBER,
        cut: cut,
        values: []
      };
      // values[0]: rows above the cut
      var newDataRight = data.filter(function(x) {
        return parseFloat(x[column]) > cut;
      });
      node.values.push({
        name: cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, newDataRight, target, remainingFeatures, featureTypes, depth+1)
      });
      // values[1]: rows at or below the cut
      var newDataLeft = data.filter(function(x) {
        return parseFloat(x[column]) <= cut;
      });
      node.values.push({
        name: cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, newDataLeft, target, remainingFeatures, featureTypes, depth+1)
      });
    }
    return node;
  },

  /**
   * Classify one sample with a trained model.
   *
   * @param {object} model - trained model record
   * @param {object|array} sample - feature values keyed by feature name; an
   *        array laid out like a training row (indexed like model.features)
   *        is accepted as well
   * @returns {*} predicted class, or 'unknown' if a categorical value of the
   *          sample has no branch in the tree
   * @throws {Error} if the model holds no tree
   */
  classify: function (model,sample) {
    // root is feature (attribute) containing all sub values
    var childNode, featureName, sampleVal;
    var root = model.model;
    // create() initialises the tree with null, so both null and undefined
    // mean "not trained".
    if (root === null || typeof root === 'undefined') {
      throw new Error('model is undefined');
    }
    // Look a feature up by name first; fall back to the column position
    // for row-style (array) samples.
    var readFeature = function (name) {
      var value = sample[name];
      if (typeof value === 'undefined' && Array.isArray(sample)) {
        value = sample[model.features.indexOf(name)];
      }
      return value;
    };
    while (root.type !== NODE_TYPES.RESULT) {
      // never reuse the branch chosen in the previous iteration
      childNode = undefined;
      if (root.type === NODE_TYPES.FEATURE_NUMBER) {
        // feature number attribute
        featureName = root.name;
        sampleVal = parseFloat(readFeature(featureName));
        if (sampleVal <= root.cut) {
          childNode = root.values[1];
        } else {
          childNode = root.values[0];
        }
      } else if (root.type === NODE_TYPES.FEATURE_CATEGORY) {
        // feature category attribute
        featureName = root.name;
        sampleVal = readFeature(featureName);
        // sub value , containing n childs
        childNode = find(root.values, function(x) {
          return x.name === sampleVal;
        });
      }
      // non trained feature
      if (typeof childNode === 'undefined') {
        return 'unknown';
      }
      root = childNode.child;
    }
    return root.value;
  },

  /**
   * Weighted label entropy after splitting the rows of a numeric feature
   * into "<= cut" and "> cut".
   *
   * @param {object} model - model record (feature name -> column)
   * @param {array} data - rows
   * @param {string} feature - numeric feature name
   * @param {number} cut - split value
   * @param {string} target - unused
   * @returns {number} conditional entropy in bits
   */
  conditionalEntropy: function(model, data, feature, cut, target) {
    var column = model.features.indexOf(feature);
    var subset1 = data.filter(function(x) {
      return parseFloat(x[column]) <= cut;
    });
    var subset2 = data.filter(function(x) {
      return parseFloat(x[column]) > cut;
    });
    var setSize = data.length;
    return subset1.length/setSize * C45.entropy(model, C45._targetColumn(subset1)) +
           subset2.length/setSize * C45.entropy(model, C45._targetColumn(subset2));
  },

  /**
   * Count the occurrences (===) of a value in a list.
   *
   * @param {*} target - value to look for
   * @param {array} targets - list to scan
   * @returns {number} number of occurrences
   */
  count: function(target, targets) {
    return targets.filter(function(t) {
      return t === target;
    }).length;
  },

  /**
   * Shannon entropy (base 2) of a list of values.
   *
   * @param {object} model - unused
   * @param {array} vals - values, duplicates included
   * @returns {number} entropy in bits; 0 for an empty list
   */
  entropy: function(model, vals) {
    var uniqueVals = unique(vals);
    var probs = uniqueVals.map(function(x) {
      return C45.prob(x, vals);
    });
    var logVals = probs.map(function(p) {
      return -p * C45.log2(p);
    });
    return logVals.reduce(function(a, b) {
      return a + b;
    }, 0);
  },

  /**
   * Information gain of splitting the rows on one feature.
   *
   * For a numeric feature every distinct value is tried as cut and the best
   * one is reported.
   *
   * @param {object} model - model record (feature name -> column)
   * @param {array} data - rows
   * @param {string} target - class label name (only passed through)
   * @param {array} features - unused
   * @param {string} feature - feature to evaluate
   * @param {array} featureTypes - types, indexed like model.features
   * @returns {object} {feature, gain, cut}; cut is 0 for categorical features
   */
  gain: function(model, data, target, features, feature, featureTypes) {
    var column = model.features.indexOf(feature);
    var setEntropy = C45.entropy(model, C45._targetColumn(data));
    var attrVals = unique(data.map(function(d) {
      return d[column];
    }));
    if (featureTypes[column] === 'category') {
      var setSize = data.length;
      var entropies = attrVals.map(function(n) {
        // Rows are arrays, so the value has to be read by column index
        // (a lookup by feature name yields undefined for every row).
        var subset = data.filter(function(x) {
          return x[column] === n;
        });
        return (subset.length/setSize) * C45.entropy(model, C45._targetColumn(subset));
      });
      var sumOfEntropies = entropies.reduce(function(a, b) {
        return a + b;
      }, 0);
      return {
        feature: feature,
        gain: setEntropy - sumOfEntropies,
        cut: 0
      };
    } else if (featureTypes[column] === 'number') {
      var gainVals = attrVals.map(function(cut) {
        var cutf = parseFloat(cut);
        return {
          feature: feature,
          gain: setEntropy - C45.conditionalEntropy(model, data, feature, cutf, target),
          cut: cutf
        };
      });
      return max(gainVals, function(e) {
        return e.gain;
      });
    }
  },

  /**
   * Logarithm to base 2.
   *
   * @param {number} n
   * @returns {number}
   */
  log2: function(n) {
    return Math.log(n) / Math.log(2);
  },

  /**
   * Evaluate all given features and return the gain record of the best one.
   *
   * @param {object} model - model record
   * @param {array} data - rows
   * @param {string} target - class label name (only passed through)
   * @param {array} features - candidate feature names
   * @param {array} featureTypes - types, indexed like model.features
   * @returns {object} {feature, gain, cut} of the feature with the highest gain
   */
  maxGain: function(model, data, target, features, featureTypes) {
    var g45 = features.map(function(feature) {
      return C45.gain(model, data, target, features, feature, featureTypes);
    });
    return max(g45, function(e) {
      return e.gain;
    });
  },

  /**
   * Return the most frequent value of a list.
   *
   * Values are tallied by their string form (same convention as unique()).
   * On a tie the value that reaches the top count first wins.
   *
   * @param {array} targets - values, duplicates included
   * @returns {*} most frequent value; undefined for an empty list
   */
  mostCommon: function(targets) {
    var counts = Object.create(null);
    var best, bestCount = 0;
    for (var i = 0; i < targets.length; i++) {
      var value = targets[i];
      var n = (counts[value] || 0) + 1;
      counts[value] = n;
      if (n > bestCount) {
        bestCount = n;
        best = value;
      }
    }
    return best;
  },

  /** Print the tree
   *
   * @param {object} model - tree node (not the model record)
   * @param {number} [indent=0] - indentation of this node
   * @returns {string} multi-line text; '' for a missing node and 'model?'
   *          for an unknown node type
   */
  print: function (model,indent) {
    var NL = '\n',
        line = '';
    if (indent==undefined) indent=0;
    if (!model) return '';
    // presumably yields a string of `indent` blanks - see com/compat
    var sp = function () {return Comp.string.create(indent);};
    // Print the sub tree of one numeric branch; a feature_value wrapper is
    // skipped so that only its child is shown. Each branch is tested on its
    // own node.
    var branch = function (v) {
      if (v.type==NODE_TYPES.FEATURE_VALUE)
        return C45.print(v.child,indent+2);
      return C45.print(v,indent+2);
    };
    switch (model.type) {
      case NODE_TYPES.RESULT:
        return sp()+'-> '+model.name+NL;
      case NODE_TYPES.FEATURE_CATEGORY:
        line=sp()+'$'+model.name+'?'+NL;
        Comp.array.iter(model.values,function (v) {
          line += C45.print(v,indent+2);
        });
        return line;
      case NODE_TYPES.FEATURE_NUMBER:
        // values[0] holds the "> cut" branch, values[1] the "<= cut" branch
        line = sp()+'$'+model.name+'>'+model.cut+'?'+NL;
        line = line+branch(model.values[0]);
        line = line+sp()+'$'+model.name+'<='+model.cut+'?'+NL;
        line = line+branch(model.values[1]);
        return line;
      case NODE_TYPES.FEATURE_VALUE:
        line=sp()+''+model.name+NL;
        line += C45.print(model.child,indent+2);
        return line;
    }
    return 'model?';
  },

  /**
   * Relative frequency of a value in a list.
   *
   * @param {*} target - value to look for
   * @param {array} targets - list to scan
   * @returns {number} count(target)/targets.length
   */
  prob: function(target, targets) {
    return C45.count(target,targets)/targets.length;
  }
};
module.exports = {
classify:C45.classify,
create:C45.create,
entropy:C45.entropy,
log2:C45.log2,
print:function (model,indent) { return C45.print(model.model,indent) },
unique:unique,
train:C45.train,
current:function (module) { current=module.current; Aios=module;}
}