/**
 **      ==============================
 **       O O O OOOO
 **       O O O O O O
 **       O O O O O O
 **       OOOO OOOO O OOO OOOO
 **       O O O O O O O
 **       O O O O O O O
 **       OOOO OOOO O O OOOO
 **      ==============================
 **      Dr. Stefan Bosse http://www.bsslab.de
 **
 **      COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
 **                 BY THE AUTHOR(S).
 **                 THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
 **                 MODIFIED, OR OTHERWISE USED IN A CONTEXT
 **                 OUTSIDE OF THE SOFTWARE SYSTEM.
 **
 **    $AUTHORS:     ?, Stefan Bosse
 **    $INITIAL:     (C) ?
 **    $MODIFIED:    (C) 2006-2018 bLAB by sbosse
 **    $VERSION:     1.1.6
 **
 **    $INFO:
 **
 **  C45 Decision Tree ML Algorithm
 **
 **  Portable model
 **
 **    $ENDOFINFO
 */
'use strict';

// Project-local modules, loaded through the host environment's Require().
var Io = Require('com/io');
var Comp = Require('com/compat');

// Module-level bindings; both are assigned by the exported current() hook.
// NOTE(review): `none` is not defined in this file - presumably a global
// provided by the host environment; confirm.
var current=none;
var Aios=none;
/**
 * Values of the `type` field carried by every node of the decision tree.
 */
var NODE_TYPES = {
  RESULT: 'result',                     // Leaf holding the predicted class
  FEATURE_NUMBER: 'feature_number',     // Number value node (cut split)
  FEATURE_VALUE: 'feature_value',       // Category value
  FEATURE_CATEGORY: 'feature_category'  // Symbolic variable node (split)
};
/**
 * Return the distinct values of a collection in first-seen order.
 *
 * Values are compared by their property-key (string) form, so 1 and '1'
 * count as the same value and the first one encountered is kept.
 *
 * @param {array} col - array-like collection (needs `length` and indices)
 * @returns {array} new array holding the first occurrence of each value
 */
function unique(col) {
  // A prototype-less dictionary: values such as 'hasOwnProperty' or
  // '__proto__' must not collide with inherited Object.prototype members.
  var seen = Object.create(null), result = [];
  for (var i = 0, l = col.length; i < l; ++i) {
    var item = col[i];
    if (item in seen) {
      continue;
    }
    result.push(item);
    seen[item] = 1;
  }
  return result;
}
/**
 * Return the first item of an array for which the predicate is truthy.
 *
 * @param {array} col - array to search
 * @param {function} pred - called with each item; truthy result means match
 * @returns {*} the first matching item, or undefined when nothing matches
 */
function find(col, pred) {
  var value;
  // some() stops iterating as soon as the callback returns true, so the
  // scan ends at the first match instead of visiting every item.
  col.some(function(item) {
    if (pred(item)) {
      value = item;
      return true;
    }
    return false;
  });
  return value;
}
/**
 * Return the element of `array` whose score `fn(element)` is largest.
 *
 * Ties go to the element that appears last (the comparison is `>=`).
 * When no element qualifies (empty array, or every score is NaN) the
 * number -Infinity is returned instead of an element.
 *
 * @param {array} array - candidates
 * @param {function} fn - maps a candidate to a numeric score
 * @returns {*} the best candidate, or -Infinity if there is none
 */
function max(array, fn) {
  var best = -Infinity;
  var bestIndex;
  for (var i = 0; i < array.length; i++) {
    var score = fn(array[i]);
    if (score >= best) {
      best = score;
      bestIndex = i;
    }
  }
  return typeof bestIndex !== 'undefined' ? array[bestIndex] : best;
}
/**
 * Return a sorted copy of a collection; the input is never modified.
 *
 * A function declared with exactly one parameter is treated as a key
 * function: items are ordered ascending by `fn(item)` and items with equal
 * keys keep their original relative order. Anything else (a two-argument
 * comparator, or undefined) is handed to Array.prototype.sort unchanged.
 *
 * @param {array} col - array-like collection to sort
 * @param {function} [fn] - key function (one parameter) or comparator
 * @returns {array} new sorted array
 */
function sortBy(col, fn) {
  var copy = [].slice.call(col);
  if (typeof fn !== 'function' || fn.length !== 1) {
    return copy.sort(fn);
  }
  // Decorate-sort-undecorate: compute each key once and use the original
  // position as tie-breaker so the result does not depend on sort stability.
  return copy.map(function (item, index) {
    return { item: item, key: fn(item), index: index };
  }).sort(function (a, b) {
    if (a.key < b.key) return -1;
    if (a.key > b.key) return 1;
    return a.index - b.index;
  }).map(function (entry) {
    return entry.item;
  });
}
/**
 * C4.5 decision tree learner operating on a plain ("portable") model object.
 *
 * Training rows are arrays: the columns follow the order of the feature
 * names given to train(), and the last element of each row is the class
 * label. All methods reference `C45` directly (never `this`), so they can be
 * exported and called detached from the object.
 */
var C45 = {
  /**
   * Create an empty, untrained model container.
   *
   * @returns {object} {features: [], targets: [], model: null}
   */
  create: function () {
    return {
      features : [],
      targets: [],
      model: null
    };
  },

  /**
   * train
   *
   * Builds the tree from the training data and stores it, together with
   * the feature names and the distinct class labels, in `model`.
   *
   * @param {object} model - container from create(); updated in place
   * @param {object} options
   * @param {array} options.data - training data (rows; last element = label)
   * @param {string} options.target - class label
   * @param {array} options.features - features names
   * @param {array} options.featureTypes - features type (ie 'category', 'number')
   * @throws {Error} if a feature type is neither 'number' nor 'category'
   */
  train: function(model,options) {
    var data = options.data,
        target = options.target,
        features = options.features,
        featureTypes = options.featureTypes;

    featureTypes.forEach(function(f) {
      if (['number','category'].indexOf(f) === -1) {
        throw new Error('C4.5: Unrecognized option!');
      }
    });

    var targets = unique(data.map(function(d) {
      return d[d.length-1];
    }));

    model.features = features;
    model.targets = targets;
    // model is the generated tree structure
    model.model = C45._c45(model, data, target, features, featureTypes, 0);
  },

  /**
   * Recursively build the (sub)tree for `data` using the still unused
   * `features`.
   *
   * @param {object} model - container; model.features maps names to columns
   * @param {array} data - rows reaching this node
   * @param {string} target - class label name (passed through)
   * @param {array} features - feature names not yet used on this path
   * @param {array} featureTypes - types, indexed like model.features
   * @param {number} depth - recursion depth (passed through)
   * @returns {object} tree node; undefined if the chosen feature has an
   *                   unrecognized type
   */
  _c45: function(model, data, target, features, featureTypes, depth) {
    var labels = data.map(function(d) {
      return d[d.length-1];
    });
    var targets = unique(labels);

    // No rows reached this branch.
    if (!targets.length) {
      return {
        type: NODE_TYPES.RESULT,
        value: 'none data',
        name: 'none data'
      };
    }

    // All rows agree on the class: leaf.
    if (targets.length === 1) {
      return {
        type: NODE_TYPES.RESULT,
        value: targets[0],
        name: targets[0]
      };
    }

    // Nothing left to split on: majority vote. The vote must run over the
    // raw labels; in the de-duplicated list every label occurs exactly once.
    if (!features.length) {
      var topTarget = C45.mostCommon(labels);
      return {
        type: NODE_TYPES.RESULT,
        value: topTarget,
        name: topTarget
      };
    }

    var bestFeatureData = C45.maxGain(model, data, target, features, featureTypes);
    var bestFeature = bestFeatureData.feature;
    var column = model.features.indexOf(bestFeature);

    var remainingFeatures = features.slice(0);
    remainingFeatures.splice(features.indexOf(bestFeature), 1);

    var node;
    if (featureTypes[column] === 'category') {
      var possibleValues = unique(data.map(function(d) {
        return d[column];
      }));
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_CATEGORY,
        // one child per distinct value seen in the data
        values: possibleValues.map(function(v) {
          var newData = data.filter(function(x) {
            return x[column] === v;
          });
          return {
            name: v,
            type: NODE_TYPES.FEATURE_VALUE,
            child: C45._c45(model, newData, target, remainingFeatures, featureTypes, depth+1)
          };
        })
      };
    } else if (featureTypes[column] === 'number') {
      var cut = bestFeatureData.cut;
      node = {
        name: bestFeature,
        type: NODE_TYPES.FEATURE_NUMBER,
        cut: cut,
        values: []
      };

      // values[0]: rows with value > cut ("right" branch)
      var newDataRight = data.filter(function(x) {
        return parseFloat(x[column]) > cut;
      });
      node.values.push({
        name: cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, newDataRight, target, remainingFeatures, featureTypes, depth+1)
      });

      // values[1]: rows with value <= cut ("left" branch)
      var newDataLeft = data.filter(function(x) {
        return parseFloat(x[column]) <= cut;
      });
      node.values.push({
        name: cut.toString(),
        type: NODE_TYPES.FEATURE_VALUE,
        child: C45._c45(model, newDataLeft, target, remainingFeatures, featureTypes, depth+1)
      });
    }
    return node;
  },

  /**
   * Read the value of a feature from a sample.
   *
   * The sample is first indexed by feature name. If that yields undefined
   * and the sample is an array, the column position of the feature in
   * model.features is used instead, so rows laid out like the training data
   * can be classified as well.
   *
   * @param {object} model - trained model container
   * @param {object|array} sample
   * @param {string} featureName
   * @returns {*} the feature value, or undefined if it cannot be found
   */
  _sampleValue: function (model, sample, featureName) {
    var value = sample[featureName];
    if (typeof value === 'undefined' && Array.isArray(sample)) {
      value = sample[model.features.indexOf(featureName)];
    }
    return value;
  },

  /**
   * Classify one sample by walking the trained tree.
   *
   * @param {object} model - container filled by train()
   * @param {object|array} sample - feature values, keyed by feature name or
   *                                (arrays) in training column order
   * @returns {*} the predicted class label, or 'unknown' when the sample
   *              carries a category value that was not seen during training
   * @throws {Error} if the model has not been trained
   */
  classify: function (model,sample) {
    // root is feature (attribute) containing all sub values
    var childNode, featureName, sampleVal;
    var root = model.model;

    // create() initialises model.model to null, so test for both.
    if (root == null) {
      throw new Error('model is undefined');
    }

    while (root.type != NODE_TYPES.RESULT) {
      // Reset on every step: a stale child from the previous iteration
      // would otherwise be followed again for a node of unexpected type.
      childNode = undefined;

      if (root.type == NODE_TYPES.FEATURE_NUMBER) {
        // feature number attribute
        featureName = root.name;
        sampleVal = parseFloat(C45._sampleValue(model, sample, featureName));
        if (sampleVal <= root.cut) {
          childNode = root.values[1];
        } else {
          childNode = root.values[0];
        }
      } else if (root.type == NODE_TYPES.FEATURE_CATEGORY) {
        // feature category attribute
        featureName = root.name;
        sampleVal = C45._sampleValue(model, sample, featureName);

        // sub value , containing n childs
        childNode = find(root.values, function(x) {
          return x.name === sampleVal;
        });
      }

      // non trained feature
      if (typeof childNode === 'undefined') {
        return 'unknown';
      }
      root = childNode.child;
    }
    return root.value;
  },

  /**
   * Weighted class entropy after splitting `data` on a numeric feature at
   * `cut` (rows <= cut versus rows > cut).
   *
   * @param {object} model - container; model.features maps names to columns
   * @param {array} data - rows
   * @param {string} feature - feature name
   * @param {number} cut - split threshold
   * @param {string} target - class label name (unused)
   * @returns {number}
   */
  conditionalEntropy: function(model, data, feature, cut, target) {
    var column = model.features.indexOf(feature);
    var label = function(d) {
      return d[d.length-1];
    };
    var subset1 = data.filter(function(x) {
      return parseFloat(x[column]) <= cut;
    });
    var subset2 = data.filter(function(x) {
      return parseFloat(x[column]) > cut;
    });
    var setSize = data.length;
    return subset1.length/setSize * C45.entropy(model, subset1.map(label)) +
           subset2.length/setSize * C45.entropy(model, subset2.map(label));
  },

  /**
   * Number of entries of `targets` strictly equal to `target`.
   */
  count: function(target, targets) {
    return targets.filter(function(t) {
      return t === target;
    }).length;
  },

  /**
   * Shannon entropy (base 2) of a list of values; 0 for an empty list.
   *
   * @param {object} model - unused
   * @param {array} vals
   * @returns {number}
   */
  entropy: function(model, vals) {
    var uniqueVals = unique(vals);
    var probs = uniqueVals.map(function(x) {
      return C45.prob(x, vals);
    });
    var logVals = probs.map(function(p) {
      return -p * C45.log2(p);
    });
    return logVals.reduce(function(a, b) {
      return a + b;
    }, 0);
  },

  /**
   * Information gain obtained by splitting `data` on `feature`.
   *
   * For a 'number' feature every distinct value is tried as cut and the
   * best one is reported.
   *
   * @param {object} model - container; model.features maps names to columns
   * @param {array} data - rows
   * @param {string} target - class label name (passed through)
   * @param {array} features - candidate features (unused)
   * @param {string} feature - feature to evaluate
   * @param {array} featureTypes - types, indexed like model.features
   * @returns {object} {feature, gain, cut}; cut is 0 for category features.
   *                   undefined if the feature type is not recognized.
   */
  gain: function(model, data, target, features, feature, featureTypes) {
    var column = model.features.indexOf(feature);
    var label = function(d) {
      return d[d.length-1];
    };
    var setEntropy = C45.entropy(model, data.map(label));
    var attrVals = unique(data.map(function(d) {
      return d[column];
    }));

    if (featureTypes[column] === 'category') {
      var setSize = data.length;
      var entropies = attrVals.map(function(n) {
        // Rows are arrays, so the value lives at the feature's column
        // index, not under the feature name.
        var subset = data.filter(function(x) {
          return x[column] === n;
        });
        return (subset.length/setSize) * C45.entropy(model, subset.map(label));
      });
      var sumOfEntropies = entropies.reduce(function(a, b) {
        return a + b;
      }, 0);
      return {
        feature: feature,
        gain: setEntropy - sumOfEntropies,
        cut: 0
      };
    } else if (featureTypes[column] === 'number') {
      var gainVals = attrVals.map(function(cut) {
        var cutf = parseFloat(cut);
        var gain = setEntropy - C45.conditionalEntropy(model, data, feature, cutf, target);
        return {
          feature: feature,
          gain: gain,
          cut: cutf
        };
      });
      return max(gainVals, function(e) {
        return e.gain;
      });
    }
  },

  /**
   * Base-2 logarithm.
   */
  log2: function(n) {
    return Math.log(n) / Math.log(2);
  },

  /**
   * Evaluate gain() for every candidate feature and return the best result.
   *
   * @returns {object} the {feature, gain, cut} record with the largest gain
   */
  maxGain: function(model, data, target, features, featureTypes) {
    var g45 = features.map(function(feature) {
      return C45.gain(model, data, target, features, feature, featureTypes);
    });
    return max(g45, function(e) {
      return e.gain;
    });
  },

  /**
   * Most frequent value of a list. Ties go to the value seen first.
   * Values are tallied by their string form, matching unique().
   *
   * @param {array} targets - list of labels (duplicates expected)
   * @returns {*} the most frequent label, or undefined for an empty list
   */
  mostCommon: function(targets) {
    var counts = Object.create(null);
    var i;
    for (i = 0; i < targets.length; i++) {
      counts[targets[i]] = (counts[targets[i]] || 0) + 1;
    }
    var best, bestCount = 0;
    for (i = 0; i < targets.length; i++) {
      if (counts[targets[i]] > bestCount) {
        bestCount = counts[targets[i]];
        best = targets[i];
      }
    }
    return best;
  },

  /** Print the tree
   *
   * @param {object} model - tree node (not the model container)
   * @param {number} [indent=0] - indentation passed to Comp.string.create
   *        (presumably yields that many spaces - confirm against com/compat)
   * @returns {string} textual rendering; '' for a missing node and
   *                   'model?' for a node of unknown type
   */
  print: function (model,indent) {
    var NL = '\n',
        line='';
    if (indent==undefined) indent=0;
    if (!model) return '';
    var sp = function () {return Comp.string.create(indent);};
    // A numeric branch is normally a feature_value wrapper; print its child
    // directly so the cut is not repeated.
    var branch = function (v) {
      if (v.type==NODE_TYPES.FEATURE_VALUE)
        return C45.print(v.child,indent+2);
      return C45.print(v,indent+2);
    };
    switch (model.type) {
      case NODE_TYPES.RESULT:
        return sp()+'-> '+model.name+NL;
      case NODE_TYPES.FEATURE_CATEGORY:
        line=sp()+'$'+model.name+'?'+NL;
        Comp.array.iter(model.values,function (v) {
          line += C45.print(v,indent+2);
        });
        return line;
      case NODE_TYPES.FEATURE_NUMBER:
        line = sp()+'$'+model.name+'>'+model.cut+'?'+NL;
        line = line+branch(model.values[0]);
        line = line+sp()+'$'+model.name+'<='+model.cut+'?'+NL;
        // each branch is inspected on its own node (values[1] here)
        line = line+branch(model.values[1]);
        return line;
      case NODE_TYPES.FEATURE_VALUE:
        line=sp()+''+model.name+NL;
        line += C45.print(model.child,indent+2);
        return line;
    }
    return 'model?';
  },

  /**
   * Relative frequency of `target` within `targets`.
   */
  prob: function(target, targets) {
    return C45.count(target,targets)/targets.length;
  }
};
module.exports = {
|
|
classify:C45.classify,
|
|
create:C45.create,
|
|
entropy:C45.entropy,
|
|
log2:C45.log2,
|
|
print:function (model,indent) { return C45.print(model.model,indent) },
|
|
unique:unique,
|
|
train:C45.train,
|
|
current:function (module) { current=module.current; Aios=module;}
|
|
}