Skip to content

Commit

Permalink
Merge pull request #10 from KIT-HYD/rebuild_tests
Browse files Browse the repository at this point in the history
Rebuild tests
  • Loading branch information
AlexDo1 authored Oct 29, 2021
2 parents f410da0 + 78fd6ad commit 3c083a6
Show file tree
Hide file tree
Showing 38 changed files with 2,890 additions and 121 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,9 @@ venv.bak/

# coverage files
cover

# MacOS DS_Store
.DS_Store

# Matlab autosave file
skinfo/test/matlab_test_metrics/scikit_info_test.asv
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
numpy
scipy
nose
nose
pytest
pandas
18 changes: 15 additions & 3 deletions skinfo/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def entropy(x, bins, normalize=False, use_probs=False):
# get number of bins
nbins = len(p)
# maximal entropy: uniform distribution
normalizer = np.log2(len(nbins))
normalizer = np.log2(nbins)

return - p.dot(np.log2(p)) / normalizer
else:
Expand Down Expand Up @@ -363,10 +363,22 @@ def kullback_leibler(x, y, bins, use_probs=False):
# if y does not sum up to 1, raise an error
if not np.isclose(sum(y),1,atol=0.0001):
raise ValueError('Probabilities in vector y do not sum up to 1.')

px = x
py = y
else:
# get the bins
bins = np.histogram_bin_edges([x, y], bins)
# calculate unconditioned histograms
hist_x = np.histogram(x, bins=bins)[0]
hist_y = np.histogram(y, bins=bins)[0]
#calculate probabilities
px = (hist_x / np.sum(hist_x))
py = (hist_y / np.sum(hist_y))

# calculte the cross entropy and unconditioned entropy of y
hcross = cross_entropy(x, y, bins, use_probs=use_probs)
hx = entropy(x, bins, use_probs=use_probs)
hcross = cross_entropy(px, py, bins, use_probs=True)
hx = entropy(px, bins, use_probs=True)

return hcross - hx

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
function A = allcomb_singleinput(bla)
% ALLCOMB_SINGLEINPUT  All combinations of the arrays stored in one cell array
%
% Uwe Ehret, 30.7.2017
% This is a modified version of function 'allcomb' by Jos van der Geest, downloaded from Matlab
% Fileexchange in version 4.1. Version 4.2 of allcomb is available at
% https://de.mathworks.com/matlabcentral/fileexchange/10064-allcomb-varargin
%
% allcomb_singleinput does the same thing as allcomb, but
% input is not provided as collection of individual arrays (A, B, C)
% but as a single [1,n] cell array, where n is the number of formerly individual arrays
% so bla(1,1) = A, bla(1,2) = B etc.
% changes from the original code are indicated by 'UE'
%
% Input
% - bla: cell array holding the arrays to combine. A cell of any shape is
%        accepted and read in linear order. If the last element is the
%        char 'matlab' or 'john' (case-insensitive) it is treated as a
%        flag: the first array then changes fastest instead of the last.
% Output
% - A: [num_combis,n] array (or cell array, if all inputs are cell arrays)
%      with one combination per row.
%      - zeros(0,n) plus a warning if any input array is empty
%      - zeros(0,0) if bla is empty or contains only the flag
% Errors
% - ALLCOMB:InvalidInput      if bla is not a cell array
% - ALLCOMB:InvalidCellInput  if cell and non-cell arrays are mixed
%
% I acknowledge the work by Jos van der Geest. Please also see the
% following copyright notices for allcomb:
% Copyright (c) 2016, Jos (10584)
% All rights reserved.
%
% Redistribution and use in source and binary forms, with or without
% modification, are permitted provided that the following conditions are
% met:
%
% * Redistributions of source code must retain the above copyright
% notice, this list of conditions and the following disclaimer.
% * Redistributions in binary form must reproduce the above copyright
% notice, this list of conditions and the following disclaimer in
% the documentation and/or other materials provided with the distribution
%
% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
% POSSIBILITY OF SUCH DAMAGE.

% UE: check the argument count before the input is touched
narginchk(1,1) ;

% UE: fail early with a clear message instead of a brace-indexing error
if ~iscell(bla)
    error('ALLCOMB:InvalidInput', ...
        'Input must be a cell array containing the arrays to combine.') ;
end

% UE: take the single cell input as the list of former individual inputs;
% force a row so that column cells are not silently truncated
inputs = bla(:).' ;

% UE
%NC = nargin ;
NC = numel(inputs) ; % get the number of individual arrays in the input

% UE: an empty cell has no last element to inspect for the order flag,
% so return the same result as the 'flag only' case below
if NC == 0
    A = zeros(0,0) ; % nothing
    return
end

% check if we should flip the order
if ischar(inputs{end}) && (strcmpi(inputs{end},'matlab') || strcmpi(inputs{end},'john'))
    % based on a suggestion by JD on the FEX
    NC = NC-1 ;
    ii = 1:NC ; % now first argument will change fastest
else
    % default: enter arguments backwards, so last one (AN) is changing fastest
    ii = NC:-1:1 ;
end

args = inputs(1:NC) ;

% check for empty inputs
if any(cellfun('isempty',args))
    warning('ALLCOMB:EmptyInput','One of more empty inputs result in an empty output.') ;
    A = zeros(0,NC) ;
elseif NC > 1
    isCellInput = cellfun(@iscell,args) ;
    if any(isCellInput)
        if ~all(isCellInput)
            error('ALLCOMB:InvalidCellInput', ...
                'For cell input, all arguments should be cell arrays.') ;
        end
        % for cell input, we use indices to get all combinations
        ix = cellfun(@(c) 1:numel(c), args, 'UniformOutput', false) ;

        % flip using ii if last column is changing fastest
        [ix{ii}] = ndgrid(ix{ii}) ;

        A = cell(numel(ix{1}),NC) ; % pre-allocate the output
        for k = 1:NC
            % combine
            A(:,k) = reshape(args{k}(ix{k}),[],1) ;
        end
    else
        % non-cell input, assuming all numerical values or strings
        % flip using ii if last column is changing fastest
        grids = cell(1,NC) ;
        [grids{ii}] = ndgrid(args{ii}) ;
        % concatenate along a new trailing dimension, then flatten to rows
        A = reshape(cat(NC+1,grids{:}),[],NC) ;
    end
elseif NC == 1
    A = args{1}(:) ; % nothing to combine
else % NC==0, there was only the 'matlab' flag argument
    A = zeros(0,0) ; % nothing
end

end
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
function [pdf_nonzero] = f_NonZeroPDF_anyd(data_histcounts)
% Turns a binned frequency distribution of any dimensionality into a pdf
% in which every bin is meant to carry a non-zero probability
% Method
% - The counts are flattened and passed to binofit together with the total
%   number of counts. For each bin, the probability estimate is the midpoint
%   of the confidence interval returned by binofit (its default level).
% - The midpoints are rescaled so that they sum to one and are then put
%   back into the shape of the input.
% Input
% - data_histcounts: [num_bins of dim 1, ... , num_bins of dim end] array
%   with the count of occurrences for each bin combination
%   Note: must be NaN-free, otherwise an error is raised
% Output
% - pdf_nonzero: array of the same size as data_histcounts holding the
%   rescaled bin occupation probabilities
% Version
% 2018/11/14 Uwe Ehret, initial version

% refuse input that contains NaN
if any(isnan(data_histcounts(:)))
    error('data_histcounts contains NaNs')
end

% flatten the counts into one column vector
counts_col = data_histcounts(:);

% total number of counts over all bins
total_counts = sum(counts_col);

% confidence bounds (lower, upper) of the occupation probability per bin
[~, conf_bounds] = binofit(counts_col, total_counts);

% midpoint of each interval serves as the raw non-zero probability
bin_probs = mean(conf_bounds, 2);

% the midpoints do not sum to one by themselves, so normalize them
bin_probs = bin_probs / sum(bin_probs);

% restore the original dimensions
pdf_nonzero = reshape(bin_probs, size(data_histcounts));

end
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
function [combis] = f_all_bincombs(num_bins)
% Enumerates every possible combination of bin numbers
% Input
% - num_bins: [1,n] array holding the number of bins of each dimension
% Output
% - combis: result of allcomb_singleinput for the bin-number ranges
%   1:num_bins(d) of all dimensions, i.e. one bin combination per row
% Version
% - 2017/07/15 Uwe Ehret: initial version

% one entry per column of num_bins
dims = 1 : size(num_bins,2);

% for every dimension, list all of its bin numbers (1 ... number of bins)
bin_ranges = arrayfun(@(d) 1:num_bins(d), dims, 'UniformOutput', false);

% combine the bin numbers of all dimensions
combis = allcomb_singleinput(bin_ranges);

end

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
function [combis] = f_all_predictor_bincombs(num_bins)
% Enumerates every possible combination of predictor bin numbers
% Input
% - num_bins: [1,n] array holding the number of bins of each dimension
%   Note: The first entry belongs to the target; it is dropped because
%   only the predictor combinations are wanted
% Output
% - combis: result of allcomb_singleinput for the bin-number ranges of all
%   predictors (dimensions 2 ... n), i.e. one bin combination per row
% Version
% - 2017/10/22 Uwe Ehret: initial version

% number of dimensions of the target-predictor matrix
num_dim = size(num_bins,2);

% at least one predictor is required next to the target
if num_dim < 2
    error('num_dim too small');
end

% bin numbers (1 ... number of bins) for the target and every predictor
all_ranges = arrayfun(@(d) 1:num_bins(d), 1:num_dim, 'UniformOutput', false);

% leave out the first entry, which holds the target bins
predictor_ranges = all_ranges(2:end);

% combine the bin numbers of all predictors
combis = allcomb_singleinput(predictor_ranges);

end

Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
function [ combis ] = f_all_predictor_bincombs_indataset( data, edges )
% Lists the unique predictor bin combinations that occur in a dataset
% Input
% - data: [num_data,num_dim] array, each row being one set of related
%   target (col=1) and predictor (col=2:end) values
%   Note:
%   - data must be NaN-free
%   - num_dim must be >=2 (1 target plus at least one predictor)
% - edges: [1,num_dim] cell array holding, for each dimension, an array of
%   bin edges
%   Note: For each dimension, the edges must span the complete value range
%   of the respective data column, otherwise an error is raised
% Output
% - combis: [num_combis,num_dim-1] array with the unique bin number
%   combinations across all predictors, sorted by unique(...,'rows')
% Version
% - 2018/07/24 Uwe Ehret: initial version

[num_data, num_dim] = size(data);

% --- input checks ---

% NaN is not allowed anywhere in the data
if any(isnan(data(:)))
    error('input data contain NaN');
end

% a target column alone is not enough
if num_dim < 2
    error('need at least two columns in input data');
end

% value range of every column
col_min = min(data,[],1);
col_max = max(data,[],1);

% every column (target included) must lie inside its outermost edges
for d = 1 : num_dim
    dim_edges = edges{d};
    if col_min(d) < dim_edges(1) % smallest value is < leftmost edge
        error('input data < leftmost edge');
    end
    if col_max(d) > dim_edges(end) % largest value is > rightmost edge
        error('input data > rightmost edge');
    end
end

% --- classification ---

% bin number of each value; one column per predictor (target is skipped)
bin_numbers = NaN(num_data,num_dim-1);
for d = 2 : num_dim
    [~,~,bin_numbers(:,d-1)] = histcounts(data(:,d),edges{d});
end

% keep each combination only once
combis = unique(bin_numbers,'rows');

end

Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
function [edges, centers] = f_binlevels2edges(indx_binlevels, indx_datatypes, binlevels, datatypes)
% Builds equally spaced bin edges and bin centers for a list of dimensions
% Input
% - indx_binlevels: [1,num_dim] array; for each dimension, the index into
%   binlevels that selects the number of bins
% - indx_datatypes: [1,num_dim] array; for each dimension, the index into
%   datatypes that selects the value range
% - binlevels: array of bin counts; the range [min, max] of a datatype is
%   split into that many equal-sized bins
% - datatypes: struct array with fields 'min' and 'max'
% Output
% - edges: [1,num_dim] cell array with the bin edges of each dimension
% - centers: [1,num_dim] cell array with the bin centers of each dimension
% Version
% - 2018/07/13 Uwe Ehret: initial version

% one set of edges/centers per column of indx_binlevels
num_dim = size(indx_binlevels,2);

edges = cell(1,num_dim);
centers = cell(1,num_dim);

for i = 1 : num_dim

    % value range and bin count for this dimension
    dtype = datatypes(indx_datatypes(i));
    lower_bound = dtype.min;
    upper_bound = dtype.max;
    bin_count = binlevels(indx_binlevels(i));

    % bin_count bins need bin_count+1 edges
    dim_edges = linspace(lower_bound, upper_bound, bin_count+1);

    % each center sits half a bin width to the right of its left edge
    edges{i} = dim_edges;
    centers{i} = dim_edges(1:end-1) + diff(dim_edges) / 2;

end

end

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
function [ ] = f_check_pdf(pdf)
% Verifies that a pdf sums to 1 within a tolerance of 0.00001 and raises
% an error otherwise
% Input
% - pdf: [1,n] or [n,1] array with a probability density function
%   Note: pdf must be NaN-free, otherwise an error is raised
% Output
% - none (returns silently if the check passes)
% Version
% - 2017/10/25 Uwe Ehret: initial version

% largest accepted deviation of the sum from one
tolerance = .00001;

% a pdf containing NaN cannot be checked
if any(isnan(pdf(:)))
    error('pdf contains NaNs')
end

% distance of the sum from one
deviation = abs(sum(pdf) - 1);

if deviation > tolerance
    error('Probablities dont sum to 1.')
end

end

Loading

0 comments on commit 3c083a6

Please sign in to comment.