/** ** ============================== ** O O O OOOO ** O O O O O O ** O O O O O O ** OOOO OOOO O OOO OOOO ** O O O O O O O ** O O O O O O O ** OOOO OOOO O O OOOO ** ============================== ** Dr. Stefan Bosse http://www.bsslab.de ** ** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED ** BY THE AUTHOR(S). ** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED, ** MODIFIED, OR OTHERWISE USED IN A CONTEXT ** OUTSIDE OF THE SOFTWARE SYSTEM. ** ** $AUTHORS: ?, Stefan Bosse ** $INITIAL: (C) ? ** $MODIFIED: (C) 2006-2018 bLAB by sbosse ** $VERSION: 1.1.6 ** ** $INFO: ** ** C45 Decision Tree ML Algorithm ** ** Portable model ** ** $ENDOFINFO */ 'use strict'; var Io = Require('com/io'); var Comp = Require('com/compat'); var current=none; var Aios=none; var NODE_TYPES = { RESULT: 'result', FEATURE_NUMBER: 'feature_number', // Number value node (cut split) FEATURE_VALUE: 'feature_value', // Category value FEATURE_CATEGORY: 'feature_category' // Symbolic variable node (split) }; function unique(col) { var u = {}, a = []; for(var i = 0, l = col.length; i < l; ++i){ if(u.hasOwnProperty(col[i])) { continue; } a.push(col[i]); u[col[i]] = 1; } return a; } function find(col, pred) { var value; col.forEach(function(item) { var result = pred(item); if (result) { value = item; } }); return value; } function max(array, fn) { var max = -Infinity; var index; for (var i = 0; i < array.length; i++) { var result = fn(array[i]); if (result >= max) { max = result; index = i; } } return typeof index !== 'undefined' ? array[index] : max; } function sortBy(col, fn) { col = [].slice.call(col); return col.sort(fn); } var C45 = { create: function () { return { features : [], targets: [], model: null } }, /** * train * * @param {object} options * @param {array} options.data - training data * @param {string} options.target - class label * @param {array} options.features - features names * @param {array} options.featureTypes - features type (ie 'category', 'number') */ train: function(model,options) { var data = options.data, target = options.target, features = options.features, featureTypes = options.featureTypes; featureTypes.forEach(function(f) { if (['number','category'].indexOf(f) === -1) { throw new Error('C4.5: Unrecognized option!'); } }); var targets = unique(data.map(function(d) { return d[d.length-1]; })); model.features = features; model.targets = targets; // model is the generated tree structure model.model = C45._c45(model, data, target, features, featureTypes, 0); }, _c45: function(model, data, target, features, featureTypes, depth) { var targets = unique(data.map(function(d) { return d[d.length-1]; })); if (!targets.length) { return { type: 'result', value: 'none data', name: 'none data' }; } if (targets.length === 1) { return { type: 'result', value: targets[0], name: targets[0] }; } if (!features.length) { var topTarget = C45.mostCommon(targets); return { type: 'result', value: topTarget, name: topTarget }; } var bestFeatureData = C45.maxGain(model, data, target, features, featureTypes); var bestFeature = bestFeatureData.feature; var remainingFeatures = features.slice(0); remainingFeatures.splice(features.indexOf(bestFeature), 1); if (featureTypes[model.features.indexOf(bestFeature)] === 'category') { var possibleValues = unique(data.map(function(d) { return d[model.features.indexOf(bestFeature)]; })); var node = { name: bestFeature, type: 'feature_category', values: possibleValues.map(function(v) { var newData = data.filter(function(x) { return x[model.features.indexOf(bestFeature)] === v; }); var childNode = { name: v, type: 'feature_value', child: C45._c45(model, newData, target, remainingFeatures, featureTypes, depth+1) }; return childNode; }) }; } else if (featureTypes[model.features.indexOf(bestFeature)] === 'number') { var possibleValues = unique(data.map(function(d) { return d[model.features.indexOf(bestFeature)]; })); var node = { name: bestFeature, type: 'feature_number', cut: bestFeatureData.cut, values: [] }; var newDataRight = data.filter(function(x) { return parseFloat(x[model.features.indexOf(bestFeature)]) > bestFeatureData.cut; }); var childNodeRight = { name: bestFeatureData.cut.toString(), type: 'feature_value', child: C45._c45(model, newDataRight, target, remainingFeatures, featureTypes, depth+1) }; node.values.push(childNodeRight); var newDataLeft = data.filter(function(x) { return parseFloat(x[model.features.indexOf(bestFeature)]) <= bestFeatureData.cut; }); var childNodeLeft = { name: bestFeatureData.cut.toString(), type: 'feature_value', child: C45._c45(model, newDataLeft, target, remainingFeatures, featureTypes, depth+1), }; node.values.push(childNodeLeft); } return node; }, classify: function (model,sample) { // root is feature (attribute) containing all sub values var childNode, featureName, sampleVal; var root = model.model; if (typeof root === 'undefined') { callback(new Error('model is undefined')); } while (root.type != NODE_TYPES.RESULT) { if (root.type == NODE_TYPES.FEATURE_NUMBER) { // feature number attribute featureName = root.name; sampleVal = parseFloat(sample[featureName]); if (sampleVal <= root.cut) { childNode = root.values[1]; } else { childNode = root.values[0]; } } else if (root.type == NODE_TYPES.FEATURE_CATEGORY) { // feature category attribute featureName = root.name; sampleVal = sample[featureName]; // sub value , containing n childs childNode = find(root.values, function(x) { return x.name === sampleVal; }); } // non trained feature if (typeof childNode === 'undefined') { return 'unknown'; } root = childNode.child; } return root.value; }, conditionalEntropy: function(model, data, feature, cut, target) { var subset1 = data.filter(function(x) { return parseFloat(x[model.features.indexOf(feature)]) <= cut; }); var subset2 = data.filter(function(x) { return parseFloat(x[model.features.indexOf(feature)]) > cut; }); var setSize = data.length; return subset1.length/setSize * C45.entropy(model, subset1.map(function(d) { return d[d.length-1]; }) ) + subset2.length/setSize*C45.entropy(model, subset2.map(function(d) { return d[d.length-1]; }) ); }, count: function(target, targets) { return targets.filter(function(t) { return t === target; }).length; }, entropy: function(model, vals) { var uniqueVals = unique(vals); var probs = uniqueVals.map(function(x) { return C45.prob(x, vals); }); var logVals = probs.map(function(p) { return -p * C45.log2(p); }); return logVals.reduce(function(a, b) { return a + b; }, 0); }, gain: function(model, data, target, features, feature, featureTypes) { var setEntropy = C45.entropy(model, data.map(function(d) { return d[d.length-1]; })); if (featureTypes[model.features.indexOf(feature)] === 'category') { var attrVals = unique(data.map(function(d) { return d[model.features.indexOf(feature)]; })); var setSize = data.length; var entropies = attrVals.map(function(n) { var subset = data.filter(function(x) { return x[feature] === n; }); return (subset.length/setSize) * C45.entropy(model, subset.map(function(d) { return d[d.length-1]; }) ); }); var sumOfEntropies = entropies.reduce(function(a, b) { return a + b; }, 0); return { feature: feature, gain: setEntropy - sumOfEntropies, cut: 0 }; } else if (featureTypes[model.features.indexOf(feature)] === 'number') { var attrVals = unique(data.map(function(d) { return d[model.features.indexOf(feature)]; })); var gainVals = attrVals.map(function(cut) { var cutf = parseFloat(cut); var gain = setEntropy - C45.conditionalEntropy(model, data, feature, cutf, target); return { feature: feature, gain: gain, cut: cutf }; }); var maxgain = max(gainVals, function(e) { return e.gain; }); return maxgain; } }, log2: function(n) { return Math.log(n) / Math.log(2); }, maxGain: function(model, data, target, features, featureTypes) { var g45 = features.map(function(feature) { return C45.gain(model, data, target, features, feature, featureTypes); }); return max(g45, function(e) { return e.gain; }); }, mostCommon: function(targets) { return sortBy(targets, function(target) { return C45.count(target, targets); }).reverse()[0]; }, /** Print the tree * */ print: function (model,indent) { var NL = '\n', line='',sep; if (indent==undefined) indent=0; if (!model) return ''; var sp = function () {return Comp.string.create(indent);}; switch (model.type) { case NODE_TYPES.RESULT: return sp()+'-> '+model.name+NL; case NODE_TYPES.FEATURE_CATEGORY: line=sp()+'$'+model.name+'?'+NL; Comp.array.iter(model.values,function (v) { line += C45.print(v,indent+2); }); return line; case NODE_TYPES.FEATURE_NUMBER: line = sp()+'$'+model.name+'>'+model.cut+'?'+NL; if (model.values[0].type==NODE_TYPES.FEATURE_VALUE) line = line+C45.print(model.values[0].child,indent+2); else line = line+C45.print(model.values[0],indent+2); line = line+sp()+'$'+model.name+'<='+model.cut+'?'+NL; if (model.values[0].type==NODE_TYPES.FEATURE_VALUE) line = line+C45.print(model.values[1].child,indent+2); else line = line+C45.print(model.values[1],indent+2); return line; case NODE_TYPES.FEATURE_VALUE: line=sp()+''+model.name+NL; line += C45.print(model.child,indent+2); return line; } return 'model?'; }, prob: function(target, targets) { return C45.count(target,targets)/targets.length; }, }; module.exports = { classify:C45.classify, create:C45.create, entropy:C45.entropy, log2:C45.log2, print:function (model,indent) { return C45.print(model.model,indent) }, unique:unique, train:C45.train, current:function (module) { current=module.current; Aios=module;} }