Mon 21 Jul 22:43:21 CEST 2025

This commit is contained in:
sbosse 2025-07-21 23:07:55 +02:00
parent 6900ccf657
commit a3866d172a

523
js/ml/stats.js.bak Normal file
View File

@ -0,0 +1,523 @@
/**
** ==============================
** O O O OOOO
** O O O O O O
** O O O O O O
** OOOO OOOO O OOO OOOO
** O O O O O O O
** O O O O O O O
** OOOO OOOO O O OOOO
** ==============================
** Dr. Stefan Bosse http://www.bsslab.de
**
** COPYRIGHT: THIS SOFTWARE, EXECUTABLE AND SOURCE CODE IS OWNED
** BY THE AUTHOR(S).
** THIS SOURCE CODE MAY NOT BE COPIED, EXTRACTED,
** MODIFIED, OR OTHERWISE USED IN A CONTEXT
** OUTSIDE OF THE SOFTWARE SYSTEM.
**
** $AUTHORS: Stefan Bosse
** $CREATED: (C) 2006-2020 bLAB by sbosse
** $VERSION: 1.1.7
**
** $INFO:
**
** ML Data Statistics and Utils
**
** New:
** type eps = number | number []
**
** $ENDOFINFO
*/
var Io = Require('com/io');
var Comp = Require('com/compat');
///////// UTILS ////////////
/**
 * Basic descriptive statistics over arrays of numbers.
 * None of the helpers modifies the array it is given.
 */
var stat = {
  /** Largest element (Math.max semantics: -Infinity for an empty array). */
  max: function(array) {
    return Math.max.apply(null, array);
  },
  /** Smallest element (Math.min semantics: Infinity for an empty array). */
  min: function(array) {
    return Math.min.apply(null, array);
  },
  /** Difference between the largest and the smallest element. */
  range: function(array) {
    return stat.max(array) - stat.min(array);
  },
  /** Midrange: arithmetic mean of the smallest and the largest element. */
  midrange: function(array) {
    return (stat.max(array) + stat.min(array)) / 2;
  },
  /** Sum of all elements (0 for an empty array). */
  sum: function(array) {
    var num = 0;
    for (var i = 0, l = array.length; i < l; i++) num += array[i];
    return num;
  },
  /** Arithmetic mean (NaN for an empty array, because of 0/0). */
  mean: function(array) {
    return stat.sum(array) / array.length;
  },
  /** Median; the mean of the two middle elements for even lengths. */
  median: function(array) {
    // Array.prototype.sort works in place, so sort a copy and leave the
    // caller's array in its original order.
    var sorted = array.slice().sort(function(a, b) {
      return a - b;
    });
    var mid = sorted.length / 2;
    // Odd length: mid is x.5, so mid-0.5 is the middle index.
    return mid % 1 ? sorted[mid - 0.5] : (sorted[mid - 1] + sorted[mid]) / 2;
  },
  /** All values sharing the highest occurrence count, in order of reaching it. */
  modes: function(array) {
    if (!array.length) return [];
    // Prototype-less map: values such as 'constructor' must not collide
    // with properties inherited from Object.prototype.
    var modeMap = Object.create(null),
      maxCount = 0,
      modes = [];
    array.forEach(function(val) {
      if (!modeMap[val]) modeMap[val] = 1;
      else modeMap[val]++;
      if (modeMap[val] > maxCount) {
        // New record holder: restart the list.
        modes = [val];
        maxCount = modeMap[val];
      }
      else if (modeMap[val] === maxCount) {
        modes.push(val);
      }
    });
    return modes;
  },
  /** Population variance (divides by n, not n-1). */
  variance: function(array) {
    var mean = stat.mean(array);
    return stat.mean(array.map(function(num) {
      return Math.pow(num - mean, 2);
    }));
  },
  /** Square root of the population variance. */
  standardDeviation: function(array) {
    return Math.sqrt(stat.variance(array));
  },
  /** Mean of the absolute distances to the mean. */
  meanAbsoluteDeviation: function(array) {
    var mean = stat.mean(array);
    return stat.mean(array.map(function(num) {
      return Math.abs(num - mean);
    }));
  },
  /** (x - mean) / standardDeviation per element; NaN/Infinity if the deviation is 0. */
  zScores: function(array) {
    var mean = stat.mean(array);
    var standardDeviation = stat.standardDeviation(array);
    return array.map(function(num) {
      return (num - mean) / standardDeviation;
    });
  }
};
// Function aliases:
stat.average = stat.mean;
// function ({$x:number}|{value:*,prob:number}[]|number [],boolean)
// -> {value:*,prob:number}|{index:number, prob:number}
// Picks the entry with the highest probability/score.
//   number[]                  -> {index:<numeric position>, prob:<max>}
//   {value:*,prob:number}[]   -> {value, prob} of the entry with the largest prob
//   {key:number}              -> {value:<key>, prob:<max>}
// Any other input returns undefined.
// normalize=1: scale output max=[0,1]          (prob / sum of all entries)
// normalize=2: scale and weight output max*[0,1] (prob * prob / sum)
function best(o,normalize) {
  var i,key,top,pos=0,sum=0,res;
  if (Comp.obj.isArray(o) && typeof o[0]=='number') {
    top=-Infinity;
    // Index loop instead of for..in so that the reported index is a number.
    for(i=0;i<o.length;i++) {
      if (!(i in o)) continue; // holes are skipped, as for..in did
      sum += o[i];
      if (o[i] > top) { top=o[i]; pos=i; }
    }
    res = {index:pos,prob:top};
  } else if (Comp.obj.isArray(o) && typeof o[0]=='object') {
    for(i=0;i<o.length;i++) {
      if (!(i in o)) continue;
      sum += o[i].prob;
      if (!top || o[i].prob>top.prob) top=o[i];
    }
    res = {value:top.value,prob:top.prob};
  } else if (Comp.obj.isObj(o)) {
    top=-Infinity;
    for(key in o) {
      sum += o[key];
      if (o[key]>top) { top=o[key]; pos=key; }
    }
    res = {value:pos,prob:top};
  }
  if (!res) return;
  switch (normalize) {
    case 1: res.prob=res.prob/sum; break;
    case 2: res.prob=res.prob*(res.prob/sum); break;
    default:
  }
  return res;
}
// Shorthand for best(o,1): the winning entry with its probability divided
// by the sum over all entries.
function bestNormalize(o) {
  var SCALE_BY_SUM = 1;
  return best(o, SCALE_BY_SUM);
}
// Base-2 logarithm via change of base: ln(n) / ln(2).
function log2(n) {
  var lnOfTwo = Math.log(2);
  var lnOfN = Math.log(n);
  return lnOfN / lnOfTwo;
}
// Select the maximal element of an array, ranked by the values
// returned by an optional function applied to each element.
//   fun(element, index) -> comparable score; without fun the element itself
//   is the score. Arrays are walked by numeric index (fun receives a number,
//   holes are skipped); any other object is walked key by key.
// Returns the winning element (not its score), or undefined for empty input.
function max(array,fun) {
  var res,top;
  // Keep the first candidate, then every candidate with a strictly larger score.
  function consider(item,index) {
    var score = fun ? fun(item,index) : item;
    if (top==undefined || score > top) { top=score; res=item; }
  }
  if (Array.isArray(array)) {
    for(var i=0;i<array.length;i++) {
      if (i in array) consider(array[i],i);
    }
  } else {
    for(var key in array) consider(array[key],key);
  }
  return res;
}
/**
 * Finds the element with the highest occurrence in a list.
 * Elements are counted by their string form (they are used as map keys);
 * on a tie the element that reached the top count first wins.
 * Returns null for an empty list.
 * @private
 */
function mostCommon(list) {
  // Prototype-less map, so elements named like inherited Object.prototype
  // members ('constructor', 'toString', ...) start counting at zero.
  var elementFrequencyMap = Object.create(null);
  var largestFrequency = -1;
  var mostCommonElement = null;
  list.forEach(function(element) {
    var elementFrequency = (elementFrequencyMap[element] || 0) + 1;
    elementFrequencyMap[element] = elementFrequency;
    if (largestFrequency < elementFrequency) {
      mostCommonElement = element;
      largestFrequency = elementFrequency;
    }
  });
  return mostCommonElement;
}
// Extract one column: for every row return row[key]; null/undefined rows
// yield undefined instead of throwing.
function pluck(collection, key) {
  function fieldOf(row) {
    if (row == null) return undefined;
    return row[key];
  }
  return collection.map(fieldOf);
}
// Relative frequency of value in list: number of strictly equal (===)
// elements divided by the list length (NaN for an empty list).
function prob(value, list) {
  var hits = list.reduce(function(count, element) {
    return element === value ? count + 1 : count;
  }, 0);
  return hits / list.length;
}
// Sort an array ascending using the < and > operators.
// The array is sorted in place and returned (same reference).
function sort(array) {
  return array.sort(function (a,b) {
    if (a<b) return -1;
    if (a>b) return 1;
    // Equal (or incomparable) elements must compare as 0; a comparator that
    // never returns 0 is inconsistent and leaves the order engine-defined.
    return 0;
  });
}
// Binary +, used as a reducer callback, e.g. list.reduce(sum,0).
function sum (a,b) {
  var total = a+b;
  return total;
}
// Return the distinct elements of array in order of first appearance.
// Elements are compared with strict equality; a missing or empty input
// yields a new empty array.
function unique(array) {
  if (!array || !array.length) return [];
  var distinct = [];
  for (var i = 0; i < array.length; i++) {
    var candidate = array[i];
    // indexOf compares with ===, the same test the pairwise scan used.
    if (distinct.indexOf(candidate) === -1) distinct.push(candidate);
  }
  return distinct;
}
// without(array, v1, v2, ...) -> new array holding the elements of array
// that are none of the listed values (compared via indexOf).
function without () {
  var source = arguments[0];
  var excluded = Array.prototype.slice.call(arguments, 1);
  return source.filter(function (item) {
    return excluded.indexOf(item) < 0;
  });
}
////////////////////////////////////////
// Entropy (in bits) of the value distribution in vals:
// sum over the distinct values v of -p(v)*log2(p(v)), p(v) = prob(v, vals).
// Returns 0 for an empty list.
function entropy(vals) {
  var total = 0;
  unique(vals).forEach(function(value) {
    var p = prob(value, vals);
    total = sum(total, -p * log2(p));
  });
  return total;
}
// Entropy (in bits) of a count distribution.
//   dist: array or object of counts, N: divisor turning a count into a
//   probability. Zero probabilities contribute 0.
function entropyN(dist,N) {
  var total = 0, key, p;
  for(key in dist) {
    p = dist[key]/N;
    total = sum(total, p==0 ? 0 : -p * log2(p));
  }
  return total;
}
// Entropy (in bits) of numeric values where values within eps of each other
// count as the same value.
//   vals: number[], eps: number (tolerance)
// For every representative returned by uniqueEps(vals,eps) the probability
// is the fraction of vals lying within eps of it.
// NOTE(review): a value within eps of two representatives is counted for
// both, so the probabilities need not sum to exactly 1 - confirm this is
// acceptable for the callers.
function entropyEps(vals,eps) {
  // Fraction of vals within eps of center. Computed locally: the probEps
  // helper the previous version called is not defined in this file.
  function probWithinEps(center) {
    var hits = vals.filter(function (v) {
      return Math.abs(v-center) <= eps;
    });
    return hits.length/vals.length;
  }
  var probs = uniqueEps(vals,eps).map(probWithinEps);
  var logVals = probs.map(function(p) {
    // p==0 only happens for a non-numeric eps; avoid 0*-Infinity = NaN.
    return p==0?0:-p * log2(p)
  });
  return logVals.reduce(sum, 0);
}
// Weighted target entropy of one feature column under an eps tolerance.
//   data: rows (record or array), feature/target: column keys,
//   targets: list of target labels, eps: number
// For every eps-distinct value v of the feature column, the rows whose
// feature value lies in [v-eps, v+eps] form a bucket; the result is the sum
// over buckets of (bucket size / number of rows) * entropy of the target
// labels inside the bucket.
function entropyTEps(data,feature,target,targets,eps) {
  var en = 0;
  var col = pluck(data,feature);
  var uniqueVals = uniqueEps(col,eps);
  uniqueVals.forEach(function (v) {
    var frac = targets.map(function () { return 0 }),
        cn=0;
    col.forEach (function (v2,row) {
      if (v2>=v-eps && v2<=v+eps) {
        cn++;
        var cls = targets.indexOf(data[row][target]);
        // Labels missing from targets are not tallied; writing to frac[-1]
        // would turn the whole result into NaN.
        if (cls >= 0) frac[cls]++;
      }
    })
    var p = cn/data.length;
    // Initial value 0 keeps reduce from throwing when targets is empty.
    var tallied = frac.reduce(sum,0);
    // A bucket without any tallied label carries no target information.
    if (tallied > 0) en += (p*entropyN(frac,tallied));
  })
  return en;
}
// Derive the list of feature (column) names from the first data row.
//   record rows -> the record keys, without the target key
//   array rows  -> column indices as strings; the slot of the target column
//                  is deleted, which leaves a hole at that position
//   otherwise   -> undefined
// NOTE(review): the hole for array rows is kept from the original code,
// presumably so list positions keep matching column indices - confirm.
function features (data,target) {
  var f;
  // Explicit null check: column 0 is a valid target but is falsy.
  var hasTarget = target !== undefined && target !== null;
  if (Comp.obj.isObj(data[0])) {
    f=Object.keys(data[0]);
    // f holds key names, so the target must be removed by value;
    // delete f[target] would look for an array property of that name.
    if (hasTarget) f=f.filter(function (name) { return name !== String(target) });
  } else if (Comp.obj.isArray(data[0])) {
    f=data[0].map(function (x,i) { return String(i) });
    if (hasTarget) delete f[target];
  }
  return f;
}
// Gain score of a feature: entropy of the target column divided by the
// eps-bucketed target entropy of the feature (entropyTEps).
// NOTE(review): this is a ratio, not a difference; a zero denominator gives
// Infinity (or NaN when the numerator is 0 as well).
function gainEps(data,feature,target,targets,eps) {
  var targetColumn = pluck(data,target);
  var targetEntropy = entropy(targetColumn);
  var featureEntropy = entropyTEps(data,feature,target,targets,eps);
  return targetEntropy/featureEntropy;
}
// Return the feature with the highest gainEps score.
// eps may be a number or a per-feature array; selectEps picks the entry
// matching the feature's position in the features list.
function maxGainEps(data,features,target,targets,eps) {
  function scoreOf(feature,index) {
    return gainEps(data,feature,target,targets,selectEps(eps,index));
  }
  return max(features,scoreOf);
}
// Group the values of one feature column by target label.
// Returns {label: value[]} with one (possibly empty) list per entry of
// targets; a row whose label is not in targets makes the push throw.
function partition(data,feature,target,targets) {
  var parts = targets.reduce(function (acc,label) {
    acc[label] = [];
    return acc;
  }, {});
  return data.reduce(function (acc,row) {
    acc[row[target]].push(row[feature]);
    return acc;
  }, parts);
}
// Group the values of one feature column by target label and describe
// each group. Returns {label: {range, values, unique, noise}} where
//   range  = [min, max] of the group's values
//            ([Number.MAX_VALUE, -Number.MAX_VALUE] for an empty group)
//   values = the group's feature values in row order
//   unique = uniqueEps(values, eps)
//   noise  = 2 * standard deviation of values
function partitionEps(data,feature,target,targets,eps) {
  var parts = targets.reduce(function (acc,label) {
    acc[label] = {range:[Number.MAX_VALUE,-Number.MAX_VALUE],values:[]};
    return acc;
  }, {});
  data.forEach(function (row) {
    var part = parts[row[target]],
        value = row[feature];
    part.values.push(value);
    part.range[0] = Math.min(part.range[0],value);
    part.range[1] = Math.max(part.range[1],value);
  });
  Object.keys(parts).forEach(function (label) {
    var part = parts[label];
    part.unique = uniqueEps(part.values,eps);
    part.noise = 2*stat.standardDeviation(part.values);
  });
  return parts;
}
// Return only partitions whose value ranges do not overlap within eps.
// Of two overlapping partitions the one with the lower ratio of
// eps-unique values to values is kept, the other one is removed.
// Returns {label: {range:[min,max], values, unique}} for the survivors.
function partitionUniqueEps(data,feature,target,targets,eps) {
  var p, q, parts={}
  // 1. Create all partitions
  targets.forEach(function (t) {parts[t]={range:[Number.MAX_VALUE,-Number.MAX_VALUE],values:[]}});
  data.forEach(function (row) {
    parts[row[target]].values.push(row[feature]);
    parts[row[target]].range[0]=Math.min(parts[row[target]].range[0],row[feature]);
    parts[row[target]].range[1]=Math.max(parts[row[target]].range[1],row[feature]);
  })
  for(p in parts) {
    parts[p].unique=uniqueEps(parts[p].values,eps)
  }
  // 2. Remove overlapping partitions
  for(p in parts) {
    if (!parts[p]) continue;   // removed in an earlier round
    for (q in parts) {
      if (!parts[p]) break;    // p itself lost against a previous q
      if (p==q || !parts[q]) continue;
      // Two ranges overlap only if BOTH hold: p starts before q ends AND
      // p ends after q starts (each side widened by eps). With || the test
      // is true for any pair, so every comparison removed a partition.
      if ((parts[p].range[0]-eps)<parts[q].range[1] &&
          (parts[p].range[1]+eps)>parts[q].range[0]) {
        // overlapping, select the part with best unique column values
        if ((parts[p].unique.length/parts[p].values.length)<
            (parts[q].unique.length/parts[q].values.length)) {
          delete parts[q];
        } else {
          delete parts[p];
        }
      }
    }
  }
  return parts
}
// Cut a column range out of every row.
//   what = [c0,c1] selects the columns c0..c1 (both inclusive) via row.slice.
// Any other form of what returns undefined.
function select (data,what) {
  if (!Comp.obj.isArray(what) || what.length!=2) return;
  var firstCol = what[0],
      lastCol = what[1];
  return data.map(function (row) {
    return row.slice(firstCol,lastCol+1);
  });
}
// Resolve eps (type eps = number | number []): a plain number applies to
// every column, otherwise the entry at position index is returned.
function selectEps (eps,index) {
  return typeof eps == 'number' ? eps : eps[index];
}
/** Split a data set on its best feature (column).
 * The best feature is the one maxGainEps selects. Its eps-distinct values,
 * sorted ascending, become the choices; each choice holds the rows whose
 * feature value lies within eps of the choice value.
 * type eps = number | number [] (entry picked by the feature's position)
 * Returns {feature, choices:{val,data}[], possibleValues, remainingFeatures}.
 */
function splitEps (data,features,target,targets,eps) {
  var winner = maxGainEps(data,features,target,targets,eps);
  var tolerance = selectEps(eps,features.indexOf(winner));
  var others = without(features, winner);
  var column = pluck(data, winner);
  var values = sort(uniqueEps(column,tolerance));
  // Rows whose winner-column value is within the tolerance of center.
  function rowsNear(center) {
    return data.filter(function (row) {
      return Math.abs(row[winner] - center) <= tolerance;
    });
  }
  var choices = values.map(function (center) {
    return { val:center, data:rowsNear(center) };
  });
  return {
    feature:winner,
    choices:choices,
    possibleValues:values,
    remainingFeatures:others
  };
}
// Reduce a list of numbers to representatives that are pairwise more than
// eps apart: a value is kept only if no representative kept so far lies
// within eps of it. Order of first appearance is preserved.
function uniqueEps(array,eps) {
  var kept = [];
  array.forEach(function (x) {
    var covered = kept.some(function (y) {
      return Math.abs(x-y)<=eps;
    });
    if (!covered) kept.push(x);
  });
  return kept;
}
module.exports = {
analyze : function (data,features,target,eps) {
var noise=[];
if (!eps) eps=0;
var targets = unique(pluck(data,target));
var parts = {}, partsUnique = {},diversity={}
features.forEach(function (feature) {
partsUnique[feature]=partitionUniqueEps(data,feature,target,targets,eps);
parts[feature]=partitionEps(data,feature,target,targets,eps);
for(var p in parts[feature]) noise.push(parts[feature][p].noise);
})
features.forEach(function (feature) {
diversity[feature]=Object.keys(partsUnique[feature]).length;
})
return {
features:features,
partitions:parts, // for each data column
diversity:diversity,
noise:stat.mean(noise)
}
},
entropy:entropy,
entropyN:entropyN,
entropyEps:entropyEps,
entropyTEps:entropyTEps,
features:features,
gainEps:gainEps,
maxGainEps:maxGainEps,
mostCommon:mostCommon,
partition:partition,
partitionEps:partitionEps,
partitionUniqueEps:partitionUniqueEps,
splitEps:splitEps,
unique:unique,
uniqueEps:uniqueEps,
utils : {
// return column by key of a matrix (array array|record array)
best:best,
bestNormalize:bestNormalize,
column:pluck,
log2:log2,
prob:prob,
// transform [v][] -> v[]
relax: function (mat) {
if (Comp.obj.isMatrix(mat) && mat[0].length==1) return mat.map(function (row) { return row[0]})
else return mat;
},
select:select,
selectEps:selectEps,
sort:sort,
stat:stat,
without:without,
// transform v[] -> [v][]
wrap: function (mat) {
if (!Comp.obj.isMatrix(mat)) return mat.map(function (v) { return [v]})
else return mat
},
},
};