/**
 **      ==============================
 **       O O O OOOO
 **       O O O O O O
 **       O O O O O O
 **       OOOO OOOO O OOO OOOO
 **       O O O O O O O
 **       O O O O O O O
 **       OOOO OOOO O O OOOO
 **      ==============================
 **      Dr. Stefan Bosse http://www.bsslab.de
 **
 **      COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
 **                 BY THE AUTHOR(S).
 **                 THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
 **                 MODIFIED, OR OTHERWISE USED IN A CONTEXT
 **                 OUTSIDE OF THE SOFTWARE SYSTEM.
 **
 **    $AUTHORS:     Stefan Bosse
 **    $CREATED:     (C) 2006-2020 bLAB by sbosse
 **    $VERSION:     1.1.8
 **
 **    $INFO:
 **
 **  ML Data Statistics and Utils
 **
 **  New:
 **    type eps = number | number []
 **
 **    $ENDOFINFO
 */
var Io = Require('com/io');
var Comp = Require('com/compat');

///////// UTILS ////////////

/**
 * Descriptive statistics over arrays of numbers.
 * Every member takes one array argument; members call each other through
 * the `stat` variable (not `this`), so they can be detached and passed around.
 */
var stat = {
  /** Largest element. An empty array yields -Infinity (Math.max with no arguments). */
  max: function(array) {
    return Math.max.apply(null, array);
  },
  /** Smallest element. An empty array yields Infinity (Math.min with no arguments). */
  min: function(array) {
    return Math.min.apply(null, array);
  },
  /** Difference between the largest and the smallest element. */
  range: function(array) {
    return stat.max(array) - stat.min(array);
  },
  /**
   * Half of the range, i.e. (max - min) / 2.
   * NOTE(review): the mid-range is conventionally (max + min) / 2; this
   * returns the half-width instead - confirm which one callers expect.
   */
  midrange: function(array) {
    return stat.range(array) / 2;
  },
  /** Sum of all elements; 0 for an empty array. */
  sum: function(array) {
    var num = 0;
    for (var i = 0, l = array.length; i < l; i++) num += array[i];
    return num;
  },
  /** Arithmetic mean; an empty array yields NaN (0 / 0). */
  mean: function(array) {
    return stat.sum(array) / array.length;
  },
  /**
   * Median: the middle element for odd lengths, the mean of the two middle
   * elements for even lengths.
   * NOTE(review): sorts `array` in place, so the caller's array is reordered.
   */
  median: function(array) {
    array.sort(function(a, b) {
      return a - b;
    });
    var mid = array.length / 2;
    // mid has a fractional part (.5) exactly when the length is odd
    return mid % 1 ? array[mid - 0.5] : (array[mid - 1] + array[mid]) / 2;
  },
  /**
   * All values that occur most often, in order of first reaching the top count.
   * Counts are kept in a plain object, so values are distinguished by their
   * property-key (string) form. Returns [] for an empty array.
   */
  modes: function(array) {
    if (!array.length) return [];
    var modeMap = {},
      maxCount = 0,
      modes = [];

    array.forEach(function(val) {
      if (!modeMap[val]) modeMap[val] = 1;
      else modeMap[val]++;

      if (modeMap[val] > maxCount) {
        // new leader: restart the result list
        modes = [val];
        maxCount = modeMap[val];
      }
      else if (modeMap[val] === maxCount) {
        // tie with the current leader(s)
        modes.push(val);
        maxCount = modeMap[val];
      }
    });
    return modes;
  },
  /** Mean of the squared deviations from the mean (divides by array.length). */
  variance: function(array) {
    var mean = stat.mean(array);
    return stat.mean(array.map(function(num) {
      return Math.pow(num - mean, 2);
    }));
  },
  /** Square root of `variance`. */
  standardDeviation: function(array) {
    return Math.sqrt(stat.variance(array));
  },
  /** Mean of the absolute deviations from the mean. */
  meanAbsoluteDeviation: function(array) {
    var mean = stat.mean(array);
    return stat.mean(array.map(function(num) {
      return Math.abs(num - mean);
    }));
  },
  /**
   * (value - mean) / standardDeviation for every element.
   * If all elements are equal the standard deviation is 0 and every score
   * is 0 / 0 = NaN.
   */
  zScores: function(array) {
    var mean = stat.mean(array);
    var standardDeviation = stat.standardDeviation(array);
    return array.map(function(num) {
      return (num - mean) / standardDeviation;
    });
  }
};

// Function aliases:
stat.average = stat.mean;

/**
 * Pick the entry with the highest score.
 *
 * function ({$x:number}|{value:*,prob:number}[]|number [], normalize?:number)
 *   -> {value:*,prob:number}|{index:number|string, prob:number}|undefined
 *
 * Input shapes (decided by Comp.obj.isArray / Comp.obj.isObj and typeof o[0]):
 *   number []                  -> {index, prob}  index is the for-in key of the maximum
 *   {value:*,prob:number} []   -> {value, prob}  copied from the entry with the highest prob
 *   {$x:number}                -> {value, prob}  value is the key of the highest number
 * Anything else (including an empty array, where typeof o[0] is 'undefined'
 * and the array branches are skipped) falls through to the isObj test;
 * undefined is returned if no branch produced a result.
 *
 * normalize=1: prob is divided by the sum of all scores
 * normalize=2: prob is multiplied by (prob / sum), i.e. scaled and weighted
 *
 * NOTE(review): `for (p in o)` yields string keys, so `index` is a string
 * such as "2" whenever a maximum is found; it is the number 0 only when no
 * element exceeded -Infinity.
 */
function best(o,normalize) {
  var p,max,pos=0,sum=0,res;
  if (Comp.obj.isArray(o) && typeof o[0]=='number') {
    max=-Infinity;
    for(p in o) {
      sum += o[p];
      if (o[p] > max) max=o[p],pos=p;
    }
    res = {index:pos,prob:max}
  } else if (Comp.obj.isArray(o) && typeof o[0]=='object') {
    // here `max` holds the best entry object, not a number
    for(p in o) {
      sum += o[p].prob;
      if (!max || o[p].prob>max.prob) max=o[p];
    }
    res = {value:max.value,prob:max.prob}
  } else if (Comp.obj.isObj(o)) {
    max=-Infinity;
    for(p in o) {
      sum += o[p];
      if (o[p]>max) max=o[p],pos=p;
    }
    res = {value:pos,prob:max}
  }
  if (!res) return;
  switch (normalize) {
    case 1: res.prob=res.prob/sum; break;
    case 2: res.prob=res.prob*(res.prob/sum); break;
    default:
  }
  return res;
}

/** Shorthand for best(o,1): the best entry with prob divided by the score sum. */
function bestNormalize(o) { return best(o,1) }

/** Base-2 logarithm. */
function log2(n) {
  return Math.log(n) / Math.log(2);
}

// Select the maximal element of an array, ranked by the values
// returned by an optional function applied to the array elements.
// fun is called as fun(element, key), where key is the for-in key (a string
// for arrays). The ELEMENT is returned, not its score; on equal scores the
// earlier element is kept (strict >). Returns undefined for an empty array.
function max(array,fun) {
  var res,max,num;
  for(var i in array) {
    if (fun) num=fun(array[i],i);
    else num=array[i];
    // max==undefined means "no maximum recorded yet"
    if (max==undefined) { max=num; res=array[i] }
    else if (num > max) { max=num; res=array[i] }
  }
  return res;
}

/**
 * Finds the element with the highest occurrence in a list.
 * Frequencies are kept in a plain object keyed by the element, so elements
 * are distinguished by their property-key (string) form. The first element
 * to reach the highest count wins ties; null is returned for an empty list.
 * @private
 */
function mostCommon(list) {
  var elementFrequencyMap = {};
  var largestFrequency = -1;
  var mostCommonElement = null;

  list.forEach(function(element) {
    var elementFrequency = (elementFrequencyMap[element] || 0) + 1;
    elementFrequencyMap[element] = elementFrequency;

    if (largestFrequency < elementFrequency) {
      mostCommonElement = element;
      largestFrequency = elementFrequency;
    }
  });

  return mostCommonElement;
}

/**
 * Extract one column: maps every entry of `collection` to entry[key].
 * null/undefined entries map to undefined instead of throwing.
 */
function pluck(collection, key) {
  return collection.map(function(object) {
    return object == null ? undefined : object[key];
  });
}

/**
 * Relative frequency of `value` in `list` (strict === comparison).
 * An empty list yields NaN (0 / 0).
 */
function prob(value, list) {
  var occurrences = list.filter(function(element) {
    return element === value
  });

  var numOccurrences = occurrences.length;
  var numElements = list.length;
  return numOccurrences / numElements;
}

// NOTE(review): the text of this file is damaged from here to the end of the
// next closing brace. The comparator of `sort` breaks off after `return a`,
// and what follows reads like the tail of a different function: it uses `v`,
// `v2`, `eps`, `cn`, `frac`, `en`, `row`, `data`, `target` and `targets`, none
// of which are declared in the visible text. The definitions of `sum`,
// `unique`, `without`, `entropy`, `entropyN`, `entropyEps`, `entropyT` and
// `entropyTEps`, which other parts of this file call or export, are not
// visible anywhere and presumably sat in the lost span. The residue is kept
// verbatim (it does not parse) so that nothing is invented; restore this
// region from the upstream source.
function sort(array) {
  return array.sort(function (a,b) { return a=v-eps && v2<=v+eps) cn++,frac[targets.indexOf(data[row][target])]++;
    })
    var p = cn/data.length;
    en += (p*entropyN(frac,frac.reduce(sum)))
    // print(frac,p,frac.reduce(sum))
  })
  return en;
}

/**
 * List the feature (column) names of a data set, derived from its first row:
 * the own keys of data[0] when Comp.obj.isObj accepts it, the stringified
 * column indices ("0","1",...) when Comp.obj.isArray accepts it, otherwise
 * undefined.
 *
 * NOTE(review): `delete f[target]` deletes the array PROPERTY named `target`.
 * For a key list from Object.keys a non-numeric target name is not an index,
 * so nothing is removed; for a numeric target the slot becomes a hole and
 * the length stays the same; target 0 is skipped by the truthiness test.
 * Presumably the intent was to exclude the target column - confirm and
 * filter instead.
 */
function features (data,target) {
  var f;
  if (Comp.obj.isObj(data[0]))
    f=Object.keys(data[0]);
  else if (Comp.obj.isArray(data[0]))
    f=data[0].map(function (x,i) { return String(i) });
  if (f && target) delete f[target];
  return f;
}

/**
 * Gain figure of one feature: entropy of the target column divided by
 * entropyTEps(data,feature,target,targets,eps).
 * NOTE(review): neither `entropy` nor `entropyTEps` is intact in the visible
 * text of this file, so their exact contract cannot be stated here. Also
 * note that this is a ratio, not a difference of entropies - confirm that
 * this is intended.
 */
function gainEps(data,feature,target,targets,eps) {
  var et = entropy(pluck(data,target));
  return et/entropyTEps(data,feature,target,targets,eps)
}

/**
 * Return the feature with the largest gainEps value.
 * eps may be one number for all features or one value per feature; the value
 * for a feature is resolved by selectEps(eps,index) using the feature's key
 * in the `features` array. The feature itself is returned, not its gain.
 */
function maxGainEps(data,features,target,targets,eps) {
  var maxgain=max(features, function(feature,index) {
    var g = gainEps(data,feature,target,targets,selectEps(eps,index));
    return g;
  });
  return maxgain;
}
/**
 * Group the values of one feature column by target value.
 * Returns {target: value[]} with one (possibly empty) list per entry of
 * `targets`. Only `targets` get a bucket, so a row whose target value is not
 * listed makes the push fail on undefined.
 */
function partition(data,feature,target,targets) {
  var parts={};
  targets.forEach(function (t) {parts[t]=[]});
  data.forEach(function (row) {
    parts[row[target]].push(row[feature]);
  })
  return parts
}

/**
 * Like partition, but each bucket is a record:
 *   values : the feature values of the rows with that target
 *   range  : [min,max] of those values (stays [MAX_VALUE,-MAX_VALUE] if empty)
 *   unique : uniqueEps(values,eps)
 *   noise  : 2 * standard deviation of values (NaN for an empty bucket,
 *            because the mean of an empty array is 0 / 0)
 */
function partitionEps(data,feature,target,targets,eps) {
  var p,parts={}
  targets.forEach(function (t) {parts[t]={range:[Number.MAX_VALUE,-Number.MAX_VALUE],values:[]}});
  data.forEach(function (row) {
    parts[row[target]].values.push(row[feature]);
    parts[row[target]].range[0]=Math.min(parts[row[target]].range[0],row[feature]);
    parts[row[target]].range[1]=Math.max(parts[row[target]].range[1],row[feature]);
  })
  for(p in parts) {
    parts[p].unique=uniqueEps(parts[p].values,eps)
    parts[p].noise=2*stat.standardDeviation(parts[p].values);
  }
  return parts
}

// Return only eps-not-overlapping partitions - the most significant are selected
// (with the lowest unique column values).
// Buckets are built as in partitionEps (values, range, unique; no noise), then
// for every pair of overlapping buckets one of the two is deleted from the result.
function partitionUniqueEps(data,feature,target,targets,eps) {
  var p, q, parts={}
  // 1. Create all partitions
  targets.forEach(function (t) {parts[t]={range:[Number.MAX_VALUE,-Number.MAX_VALUE],values:[]}});
  data.forEach(function (row) {
    parts[row[target]].values.push(row[feature]);
    parts[row[target]].range[0]=Math.min(parts[row[target]].range[0],row[feature]);
    parts[row[target]].range[1]=Math.max(parts[row[target]].range[1],row[feature]);
  })
  for(p in parts) {
    parts[p].unique=uniqueEps(parts[p].values,eps)
  }
  // 2. Remove overlapping partitions
  for(p in parts) {
    // entries deleted earlier in this pass may still be enumerated or referenced
    if (!parts[p]) continue;
    for (q in parts) {
      if (!parts[p]) break;            // p itself was deleted in a previous q step
      if (p==q || !parts[q]) continue;
      // NOTE(review): the condition below is damaged in this file - text is
      // missing between `(parts[p].range[0]-eps)` and `parts[q].range[0])`,
      // so the line does not parse. Presumably it was an eps-widened
      // range-overlap test between parts[p] and parts[q]; the exact operands
      // cannot be recovered from the visible text. Restore from upstream.
      if ((parts[p].range[0]-eps)parts[q].range[0]) {
        // overlapping, select the part with best unique column values
        // (the lower unique/values ratio survives)
        if ((parts[p].unique.length/parts[p].values.length)<
            (parts[q].unique.length/parts[q].values.length)) {
          //print('delete '+q)
          delete parts[q];
        } else {
          //print('delete '+p)
          delete parts[p];
        }
      }
    }
  }
  return parts
}

/**
 * Select a column range from every row.
 * `what` = [c0,c1] returns row.slice(c0,c1+1) for each row (c1 inclusive).
 * Any other `what` falls through and returns undefined.
 */
function select (data,what) {
  if (Comp.obj.isArray(what) && what.length==2) {
    var c0=what[0],c1=what[1];
    return data.map(function (row) {
      return row.slice(c0,c1+1);
    })
  }
}

/**
 * Resolve the eps for one feature: a number applies to all features,
 * anything else is indexed with `index`.
 */
function selectEps (eps,index) {
  if (typeof eps == 'number') return eps;
  else return eps[index]
}

/** Split a data set by finding the best feature (column)
 *  based on maximal gain/entropy calculation of columns.
 *  type eps = number | number []
 *
 *  Returns {feature, choices, possibleValues, remainingFeatures}:
 *    feature           : result of maxGainEps
 *    possibleValues    : sorted uniqueEps representatives of that column
 *    choices           : [{val, data}] - for each representative the rows
 *                        whose feature value lies within eps of it
 *    remainingFeatures : without(features, feature)
 */
function splitEps (data,features,target,targets,eps) {
  var bestFeature = maxGainEps(data,features,target,targets,eps);
  var index = features.indexOf(bestFeature);
  // from here on eps is the single value belonging to the chosen feature
  eps = selectEps(eps,index);
  var remainingFeatures = without(features, bestFeature);
  var possibleValues = sort(uniqueEps(pluck(data, bestFeature),eps));
  var choices = possibleValues.map( function(v) {
    var dataS = data.filter(function(x) {
      return Math.abs(x[bestFeature] - v) <= eps
    });
    return {
      val:v,
      data:dataS,
    }
  });
  return {
    feature:bestFeature,
    choices:choices,
    possibleValues:possibleValues,
    remainingFeatures:remainingFeatures
  };
}

/**
 * Reduce an array to representatives that differ by more than eps.
 * Elements are visited in order; an element is kept only if it is not within
 * eps (|x-y| <= eps) of any representative kept so far, so the result
 * depends on the input order. The first element is always kept.
 */
function uniqueEps(array,eps) {
  var result=[];
  array.forEach(function (x) {
    var found;
    if (!result.length) result.push(x);
    else {
      result.forEach(function (y) {
        if (found) return;             // already matched, skip remaining comparisons
        found = Math.abs(x-y)<=eps;
      });
      if (!found) result.push(x);
    }
  });
  return result;
}

module.exports = {
  /**
   * Analyze how well each feature separates the target values.
   * Returns:
   *   features   : the `features` argument
   *   partitions : per feature, the partitionEps result
   *   diversity  : per feature, the number of partitions that
   *                partitionUniqueEps left in place
   *   noise      : mean of all partition noise values over all features
   * A falsy eps is replaced by 0.
   * NOTE(review): eps is passed on unchanged and used arithmetically by the
   * partition helpers, so presumably a single number is expected here rather
   * than the per-feature array form - confirm.
   */
  analyze : function (data,features,target,eps) {
    var noise=[];
    if (!eps) eps=0;
    var targets = unique(pluck(data,target));
    var parts = {}, partsUnique = {},diversity={}
    features.forEach(function (feature) {
      partsUnique[feature]=partitionUniqueEps(data,feature,target,targets,eps);
      parts[feature]=partitionEps(data,feature,target,targets,eps);
      for(var p in parts[feature]) noise.push(parts[feature][p].noise);
    })
    features.forEach(function (feature) {
      diversity[feature]=Object.keys(partsUnique[feature]).length;
    })
    return {
      features:features,
      partitions:parts, // for each data column
      diversity:diversity,
      noise:stat.mean(noise)
    }
  },
  entropy:entropy,
  entropyN:entropyN,
  entropyEps:entropyEps,
  entropyTEps:entropyTEps,
  entropyT:entropyT,
  features:features,
  gainEps:gainEps,
  maxGainEps:maxGainEps,
  mostCommon:mostCommon,
  partition:partition,
  partitionEps:partitionEps,
  partitionUniqueEps:partitionUniqueEps,
  splitEps:splitEps,
  unique:unique,
  uniqueEps:uniqueEps,
  utils : {
    best:best,
    bestNormalize:bestNormalize,
    // return column by key of a matrix (array array|record array)
    column:pluck,
    log2:log2,
    prob:prob,
    // transform [v][] -> v[]
    // (only when Comp.obj.isMatrix accepts mat and the first row has exactly
    // one element; otherwise mat is returned unchanged)
    relax: function (mat) {
      if (Comp.obj.isMatrix(mat) && mat[0].length==1) return mat.map(function (row) { return row[0]})
      else return mat;
    },
    select:select,
    selectEps:selectEps,
    sort:sort,
    stat:stat,
    without:without,
    // transform v[] -> [v][]
    // (only when Comp.obj.isMatrix rejects mat; otherwise mat is returned unchanged)
    wrap: function (mat) {
      if (!Comp.obj.isMatrix(mat)) return mat.map(function (v) { return [v]})
      else return mat
    },
  },
};