Mon 21 Jul 22:43:21 CEST 2025
This commit is contained in:
parent
c2742ffb2c
commit
7e75307ca2
412
js/ml/C45.js
Normal file
412
js/ml/C45.js
Normal file
|
@ -0,0 +1,412 @@
|
|||
/**
|
||||
** ==============================
|
||||
** O O O OOOO
|
||||
** O O O O O O
|
||||
** O O O O O O
|
||||
** OOOO OOOO O OOO OOOO
|
||||
** O O O O O O O
|
||||
** O O O O O O O
|
||||
** OOOO OOOO O O OOOO
|
||||
** ==============================
|
||||
** Dr. Stefan Bosse http://www.bsslab.de
|
||||
**
|
||||
** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
|
||||
** BY THE AUTHOR(S).
|
||||
** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
|
||||
** MODIFIED, OR OTHERWISE USED IN A CONTEXT
|
||||
** OUTSIDE OF THE SOFTWARE SYSTEM.
|
||||
**
|
||||
** $AUTHORS: ?, Stefan Bosse
|
||||
** $INITIAL: (C) ?
|
||||
** $MODIFIED: (C) 2006-2018 bLAB by sbosse
|
||||
** $VERSION: 1.1.6
|
||||
**
|
||||
** $INFO:
|
||||
**
|
||||
** C45 Decision Tree ML Algorithm
|
||||
**
|
||||
** Portable model
|
||||
**
|
||||
** $ENDOFINFO
|
||||
*/
|
||||
'use strict';
|
||||
var Io = Require('com/io');
|
||||
var Comp = Require('com/compat');
|
||||
// Module-level handles that the exported `current` hook overwrites; until
// then both hold the `none` placeholder.
// NOTE(review): `none` is not declared in this file - presumably a global
// supplied by the host runtime; confirm before running this standalone.
var current = none,
    Aios = none;
|
||||
|
||||
/**
 * Tags stored in the `type` field of decision-tree nodes.
 */
var NODE_TYPES = {
  // Leaf node carrying the predicted class
  RESULT: 'result',
  // Number value node (cut split)
  FEATURE_NUMBER: 'feature_number',
  // Category value
  FEATURE_VALUE: 'feature_value',
  // Symbolic variable node (split)
  FEATURE_CATEGORY: 'feature_category'
};
|
||||
|
||||
/**
 * Return the distinct values of a collection in order of first appearance.
 *
 * Values are told apart by their property-key form, so `1` and `'1'` count as
 * the same value and only the first one encountered is kept.
 *
 * @param {Array} col - values to de-duplicate (read by index up to `length`)
 * @returns {Array} new array holding the first occurrence of each value
 */
function unique(col) {
  // Prototype-less lookup table: values such as 'hasOwnProperty' or
  // '__proto__' become ordinary entries instead of clashing with members
  // inherited from Object.prototype.
  var seen = Object.create(null);
  var result = [];
  for (var i = 0, n = col.length; i < n; ++i) {
    var value = col[i];
    if (value in seen) {
      continue;
    }
    seen[value] = true;
    result.push(value);
  }
  return result;
}
|
||||
|
||||
/**
 * Return the first item of a collection that satisfies a predicate.
 *
 * Scanning stops at the first hit, so `pred` is not called for later items.
 *
 * @param {Array} col - items to search
 * @param {function} pred - called with each item; a truthy result selects it
 * @returns {*} the first matching item, or `undefined` when nothing matches
 */
function find(col, pred) {
  var match;
  // `some` ends the iteration as soon as the callback returns true.
  col.some(function (item) {
    if (pred(item)) {
      match = item;
      return true;
    }
    return false;
  });
  return match;
}
|
||||
|
||||
/**
 * Pick the element of `array` whose score `fn(element)` is largest.
 *
 * On equal scores the later element wins. If no element is selected (empty
 * array, or no score compares `>=` against the running best, e.g. all NaN)
 * the initial score `-Infinity` is returned instead of an element.
 *
 * @param {Array} array - candidates
 * @param {function} fn - maps a candidate to its numeric score
 * @returns {*} the winning element, or `-Infinity` when none was selected
 */
function max(array, fn) {
  var bestScore = -Infinity;
  var bestAt = -1;
  var pos = 0;
  while (pos < array.length) {
    var score = fn(array[pos]);
    if (score >= bestScore) {
      bestScore = score;
      bestAt = pos;
    }
    pos += 1;
  }
  if (bestAt === -1) {
    return bestScore;
  }
  return array[bestAt];
}
|
||||
|
||||
/**
 * Return a sorted copy of a collection; the input is left untouched.
 *
 * @param {Array} col - array or array-like to copy
 * @param {function} fn - handed to `Array.prototype.sort` as its compare function
 * @returns {Array} the sorted copy
 */
function sortBy(col, fn) {
  var copy = Array.prototype.slice.call(col);
  copy.sort(fn);
  return copy;
}
|
||||
|
||||
/**
 * C4.5 decision-tree learner and classifier.
 *
 * Training rows are arrays whose LAST element is the class label; the column
 * of a feature is `model.features.indexOf(featureName)`, and `featureTypes`
 * is indexed the same way. Samples handed to `classify` are read by feature
 * name (`sample[featureName]`).
 *
 * Tree nodes are plain objects:
 *   { type: 'result', value, name }                       leaf
 *   { type: 'feature_category', name, values: [branch] }  one branch per value
 *   { type: 'feature_number', name, cut, values: [gt, le] }
 *   { type: 'feature_value', name, child }                branch wrapper
 *
 * Methods call each other through `C45.` rather than `this`, so they can be
 * invoked without a receiver.
 */
var C45 = {
  /**
   * Create an empty, untrained model container.
   *
   * @returns {{features: Array, targets: Array, model: null}}
   */
  create: function () {
    return {
      features: [],
      targets: [],
      model: null
    };
  },

  /**
   * Build the decision tree and store it, together with the feature names
   * and the distinct class labels, in `model`.
   *
   * @param {object} model - container from `create()`; updated in place
   * @param {object} options
   * @param {array} options.data - training rows (arrays, class label last)
   * @param {string} options.target - class label name (passed through; the
   *        label itself is always read from the last column of each row)
   * @param {array} options.features - feature names, in column order
   * @param {array} options.featureTypes - 'category' or 'number' per feature
   * @throws {Error} if a feature type is neither 'number' nor 'category'
   */
  train: function (model, options) {
    var data = options.data,
        target = options.target,
        features = options.features,
        featureTypes = options.featureTypes;

    featureTypes.forEach(function (type) {
      if (['number', 'category'].indexOf(type) === -1) {
        throw new Error('C4.5: Unrecognized option!');
      }
    });

    var targets = unique(data.map(function (row) {
      return row[row.length - 1];
    }));

    model.features = features;
    model.targets = targets;
    // model.model is the generated tree structure
    model.model = C45._c45(model, data, target, features, featureTypes, 0);
  },

  /**
   * Recursively build the (sub)tree for `data` using the still unused
   * `features`.
   *
   * @param {object} model - supplies `model.features` for column lookup
   * @param {array} data - rows reaching this node
   * @param {string} target - class label name (passed through)
   * @param {array} features - feature names not yet used on this path
   * @param {array} featureTypes - types aligned with `model.features`
   * @param {number} depth - recursion depth (bookkeeping only)
   * @returns {object} tree node
   */
  _c45: function (model, data, target, features, featureTypes, depth) {
    var labels = data.map(function (row) {
      return row[row.length - 1];
    });
    var targets = unique(labels);

    if (!targets.length) {
      return {
        type: NODE_TYPES.RESULT,
        value: 'none data',
        name: 'none data'
      };
    }

    if (targets.length === 1) {
      return {
        type: NODE_TYPES.RESULT,
        value: targets[0],
        name: targets[0]
      };
    }

    if (!features.length) {
      // Majority vote over ALL row labels; voting over the de-duplicated
      // list would give every class a count of one.
      var topTarget = C45.mostCommon(labels);
      return {
        type: NODE_TYPES.RESULT,
        value: topTarget,
        name: topTarget
      };
    }

    var bestFeatureData = C45.maxGain(model, data, target, features, featureTypes);
    var bestFeature = bestFeatureData.feature;
    var column = model.features.indexOf(bestFeature);
    var featureType = featureTypes[column];

    // The chosen feature is not offered again further down this path.
    var remainingFeatures = features.slice(0);
    remainingFeatures.splice(features.indexOf(bestFeature), 1);

    var node;
    if (featureType === 'category') {
      var possibleValues = unique(data.map(function (row) {
        return row[column];
      }));
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_CATEGORY,
        values: possibleValues.map(function (v) {
          var subset = data.filter(function (row) {
            return row[column] === v;
          });
          return {
            name: v,
            type: NODE_TYPES.FEATURE_VALUE,
            child: C45._c45(model, subset, target, remainingFeatures, featureTypes, depth + 1)
          };
        })
      };
    } else if (featureType === 'number') {
      var cut = bestFeatureData.cut;
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_NUMBER,
        cut: cut,
        values: []
      };

      // Branch order matters: classify() reads values[0] for "> cut" and
      // values[1] for "<= cut".
      var rowsAbove = data.filter(function (row) {
        return parseFloat(row[column]) > cut;
      });
      node.values.push({
        name: cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, rowsAbove, target, remainingFeatures, featureTypes, depth + 1)
      });

      var rowsAtOrBelow = data.filter(function (row) {
        return parseFloat(row[column]) <= cut;
      });
      node.values.push({
        name: cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, rowsAtOrBelow, target, remainingFeatures, featureTypes, depth + 1)
      });
    }
    return node;
  },

  /**
   * Walk the trained tree for one sample and return the predicted class.
   *
   * @param {object} model - trained model container
   * @param {object} sample - values keyed by feature name
   * @returns {*} the leaf value, or 'unknown' when the sample carries a
   *          category value (or reaches a node type) the tree has no branch for
   * @throws {Error} if the model holds no tree (never trained)
   */
  classify: function (model, sample) {
    // root is feature (attribute) containing all sub values
    var root = model && model.model;
    var childNode, featureName, sampleVal;

    if (!root) {
      throw new Error('model is undefined');
    }

    while (root.type !== NODE_TYPES.RESULT) {
      // Start every step without a branch so an unrecognised node type ends
      // in 'unknown' instead of re-following the previous branch forever.
      childNode = undefined;

      if (root.type === NODE_TYPES.FEATURE_NUMBER) {
        // feature number attribute
        featureName = root.name;
        sampleVal = parseFloat(sample[featureName]);
        childNode = sampleVal <= root.cut ? root.values[1] : root.values[0];
      } else if (root.type === NODE_TYPES.FEATURE_CATEGORY) {
        // feature category attribute
        featureName = root.name;
        sampleVal = sample[featureName];
        // sub value, containing n children
        childNode = find(root.values, function (branch) {
          return branch.name === sampleVal;
        });
      }

      // non trained feature
      if (typeof childNode === 'undefined') {
        return 'unknown';
      }
      root = childNode.child;
    }
    return root.value;
  },

  /**
   * Weighted class entropy after splitting `data` on a numeric feature at
   * `cut` (rows `<= cut` versus rows `> cut`).
   *
   * @param {object} model - supplies `model.features` for column lookup
   * @param {array} data - rows to split
   * @param {string} feature - feature name
   * @param {number} cut - split threshold
   * @param {string} target - unused
   * @returns {number} weighted sum of the two partition entropies
   */
  conditionalEntropy: function (model, data, feature, cut, target) {
    var column = model.features.indexOf(feature);
    var labelOf = function (row) {
      return row[row.length - 1];
    };
    var atOrBelow = data.filter(function (row) {
      return parseFloat(row[column]) <= cut;
    });
    var above = data.filter(function (row) {
      return parseFloat(row[column]) > cut;
    });
    var setSize = data.length;
    return atOrBelow.length / setSize * C45.entropy(model, atOrBelow.map(labelOf)) +
           above.length / setSize * C45.entropy(model, above.map(labelOf));
  },

  /**
   * Count the entries of `targets` strictly equal to `target`.
   *
   * @param {*} target - value to look for
   * @param {array} targets - values to scan
   * @returns {number} number of occurrences
   */
  count: function (target, targets) {
    return targets.filter(function (t) {
      return t === target;
    }).length;
  },

  /**
   * Shannon entropy (base 2) of a list of values.
   *
   * @param {object} model - unused
   * @param {array} vals - values (class labels)
   * @returns {number} entropy in bits; 0 for an empty list
   */
  entropy: function (model, vals) {
    return unique(vals).map(function (v) {
      var p = C45.prob(v, vals);
      return -p * C45.log2(p);
    }).reduce(function (sum, term) {
      return sum + term;
    }, 0);
  },

  /**
   * Information gain of splitting `data` on `feature`.
   *
   * Category features are split once per distinct value; number features are
   * evaluated at every distinct value as threshold and the best one is kept.
   *
   * @param {object} model - supplies `model.features` for column lookup
   * @param {array} data - rows at the current node
   * @param {string} target - class label name (passed through)
   * @param {array} features - candidate feature names (unused here)
   * @param {string} feature - feature to evaluate
   * @param {array} featureTypes - types aligned with `model.features`
   * @returns {{feature: string, gain: number, cut: number}} `cut` is 0 for
   *          category features; `undefined` is returned for any other type
   */
  gain: function (model, data, target, features, feature, featureTypes) {
    var column = model.features.indexOf(feature);
    var labelOf = function (row) {
      return row[row.length - 1];
    };
    var setEntropy = C45.entropy(model, data.map(labelOf));
    var attrVals;

    if (featureTypes[column] === 'category') {
      attrVals = unique(data.map(function (row) {
        return row[column];
      }));
      var setSize = data.length;
      var sumOfEntropies = attrVals.map(function (v) {
        // Rows are arrays, so the value sits at the feature's column index
        // (indexing by feature name would match nothing).
        var subset = data.filter(function (row) {
          return row[column] === v;
        });
        return (subset.length / setSize) * C45.entropy(model, subset.map(labelOf));
      }).reduce(function (sum, term) {
        return sum + term;
      }, 0);
      return {
        feature: feature,
        gain: setEntropy - sumOfEntropies,
        cut: 0
      };
    } else if (featureTypes[column] === 'number') {
      attrVals = unique(data.map(function (row) {
        return row[column];
      }));
      var gainVals = attrVals.map(function (candidate) {
        var cutf = parseFloat(candidate);
        return {
          feature: feature,
          gain: setEntropy - C45.conditionalEntropy(model, data, feature, cutf, target),
          cut: cutf
        };
      });
      return max(gainVals, function (e) {
        return e.gain;
      });
    }
  },

  /**
   * Base-2 logarithm.
   *
   * @param {number} n
   * @returns {number}
   */
  log2: function (n) {
    return Math.log(n) / Math.log(2);
  },

  /**
   * Evaluate every candidate feature and return the best gain record.
   *
   * @param {object} model - supplies `model.features` for column lookup
   * @param {array} data - rows at the current node
   * @param {string} target - class label name (passed through)
   * @param {array} features - candidate feature names
   * @param {array} featureTypes - types aligned with `model.features`
   * @returns {{feature: string, gain: number, cut: number}}
   */
  maxGain: function (model, data, target, features, featureTypes) {
    var candidates = features.map(function (feature) {
      return C45.gain(model, data, target, features, feature, featureTypes);
    });
    return max(candidates, function (e) {
      return e.gain;
    });
  },

  /**
   * Return the value that occurs most often in `targets`.
   *
   * @param {array} targets - values, duplicates included
   * @returns {*} the most frequent value (the one seen first on a tie), or
   *          `undefined` for an empty list
   */
  mostCommon: function (targets) {
    var best;
    var bestCount = 0;
    unique(targets).forEach(function (candidate) {
      var occurrences = C45.count(candidate, targets);
      // Strict '>' keeps the earliest candidate when counts are equal.
      if (occurrences > bestCount) {
        best = candidate;
        bestCount = occurrences;
      }
    });
    return best;
  },

  /**
   * Render a tree node and everything below it as indented text.
   *
   * @param {object} model - tree NODE (not the model container)
   * @param {number} [indent=0] - indentation passed to `Comp.string.create`
   *        (presumably yields that many blanks - confirm in com/compat)
   * @returns {string} text representation, '' for a missing node, 'model?'
   *          for an unrecognised node type
   */
  print: function (model, indent) {
    var NL = '\n',
        line = '';
    if (indent == undefined) indent = 0;
    if (!model) return '';
    var sp = function () { return Comp.string.create(indent); };
    // A number node's branch is normally a 'feature_value' wrapper; print
    // its child directly so the "> cut" / "<= cut" header is not repeated.
    var branch = function (entry) {
      if (entry.type == NODE_TYPES.FEATURE_VALUE) {
        return C45.print(entry.child, indent + 2);
      }
      return C45.print(entry, indent + 2);
    };
    switch (model.type) {
      case NODE_TYPES.RESULT:
        return sp() + '-> ' + model.name + NL;
      case NODE_TYPES.FEATURE_CATEGORY:
        line = sp() + '$' + model.name + '?' + NL;
        Comp.array.iter(model.values, function (v) {
          line += C45.print(v, indent + 2);
        });
        return line;
      case NODE_TYPES.FEATURE_NUMBER:
        line = sp() + '$' + model.name + '>' + model.cut + '?' + NL;
        line = line + branch(model.values[0]);
        line = line + sp() + '$' + model.name + '<=' + model.cut + '?' + NL;
        line = line + branch(model.values[1]);
        return line;
      case NODE_TYPES.FEATURE_VALUE:
        line = sp() + '' + model.name + NL;
        line += C45.print(model.child, indent + 2);
        return line;
    }
    return 'model?';
  },

  /**
   * Relative frequency of `target` within `targets`.
   *
   * @param {*} target - value to look for
   * @param {array} targets - values to scan
   * @returns {number} occurrences divided by list length
   */
  prob: function (target, targets) {
    return C45.count(target, targets) / targets.length;
  }
};
|
||||
|
||||
module.exports = {
|
||||
classify:C45.classify,
|
||||
create:C45.create,
|
||||
entropy:C45.entropy,
|
||||
log2:C45.log2,
|
||||
print:function (model,indent) { return C45.print(model.model,indent) },
|
||||
unique:unique,
|
||||
train:C45.train,
|
||||
current:function (module) { current=module.current; Aios=module;}
|
||||
}
|
Loading…
Reference in New Issue
Block a user