Merge pull request #10 from KIT-HYD/rebuild_tests

Rebuild tests
KIT-HYD · Oct 29, 2021 · 3c083a6 · 3c083a6
2 parents f410da0 + 78fd6ad
commit 3c083a6
Show file tree

Hide file tree

Showing 38 changed files with 2,890 additions and 121 deletions.
diff --git a/.gitignore b/.gitignore
@@ -111,3 +111,9 @@ venv.bak/
 
 # coverage files
 cover
+
+# MacOS DS_Store
+.DS_Store
+
+# Matlab autosave file
+skinfo/test/matlab_test_metrics/scikit_info_test.asv
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 numpy
 scipy
-nose
+nose
+pytest
+pandas
diff --git a/skinfo/metrics.py b/skinfo/metrics.py
@@ -71,7 +71,7 @@ def entropy(x, bins, normalize=False, use_probs=False):
         # get number of bins
         nbins = len(p)
         # maximal entropy: uniform distribution
-        normalizer = np.log2(len(nbins)) 
+        normalizer = np.log2(nbins) 
 
         return - p.dot(np.log2(p)) / normalizer
     else:
@@ -363,10 +363,22 @@ def kullback_leibler(x, y, bins, use_probs=False):
         # if y does not sum up to 1, raise an error
         if not np.isclose(sum(y),1,atol=0.0001):
             raise ValueError('Probabilities in vector y do not sum up to 1.')
+
+        px = x
+        py = y
+    else:
+        # get the bins
+        bins = np.histogram_bin_edges([x, y], bins)
+        # calculate unconditioned histograms
+        hist_x = np.histogram(x, bins=bins)[0]
+        hist_y = np.histogram(y, bins=bins)[0]
+        #calculate probabilities
+        px = (hist_x / np.sum(hist_x))
+        py = (hist_y / np.sum(hist_y))
 
     # calculte the cross entropy and unconditioned entropy of y
-    hcross = cross_entropy(x, y, bins, use_probs=use_probs)
-    hx = entropy(x, bins, use_probs=use_probs)
+    hcross = cross_entropy(px, py, bins, use_probs=True)
+    hx = entropy(px, bins, use_probs=True)
 
     return hcross - hx
 

diff --git a/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/allcomb_singleinput.m b/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/allcomb_singleinput.m
@@ -0,0 +1,95 @@
+function A = allcomb_singleinput(bla)
+
+% Uwe Ehret, 30.7.2017
+% This is a modified version of the function 'allcomb' by Jos van der Geest, downloaded from the Matlab
+% File Exchange in version 4.1. Version 4.2 of allcomb is available at 
+% https://de.mathworks.com/matlabcentral/fileexchange/10064-allcomb-varargin
+
+% allcomb_singleinput does the same thing as allcomb, but the input
+% is not provided as a collection of individual arrays (A, B, C)
+% but as a single [1,n] cell array, where n is the number of formerly individual arrays,
+% so bla{1,1} = A, bla{1,2} = B etc.
+% Changes from the original code are indicated by 'UE'.
+
+% I acknowledge the work by Jos van der Geest. Please also see the
+% following copyright notices for allcomb:
+    % Copyright (c) 2016, Jos (10584)
+    % All rights reserved.
+    % 
+    % Redistribution and use in source and binary forms, with or without
+    % modification, are permitted provided that the following conditions are
+    % met:
+    % 
+    %     * Redistributions of source code must retain the above copyright
+    %       notice, this list of conditions and the following disclaimer.
+    %     * Redistributions in binary form must reproduce the above copyright
+    %       notice, this list of conditions and the following disclaimer in
+    %       the documentation and/or other materials provided with the distribution
+    % 
+    % THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    % AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    % IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    % ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    % LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    % CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    % SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    % INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    % CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    % ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    % POSSIBILITY OF SUCH DAMAGE.
+
+% UE: 
+varargin = bla; % assign the input to the former collective input variable 'varargin'
+
+narginchk(1,Inf) ;
+
+% UE: the original line below is replaced by the size() call that follows
+%NC = nargin ; 
+NC = size(varargin,2); % get the number of individual arrays in the input
+
+% check if we should flip the order (last cell entry is the flag 'matlab' or 'john')
+if ischar(varargin{end}) && (strcmpi(varargin{end},'matlab') || strcmpi(varargin{end},'john')),
+    % based on a suggestion by JD on the FEX
+    NC = NC-1 ;
+    ii = 1:NC ; % now first argument will change fastest
+else
+    % default: enter arguments backwards, so last one (AN) is changing fastest
+    ii = NC:-1:1 ;
+end
+
+args = varargin(1:NC) ;
+% check for empty inputs
+if any(cellfun('isempty',args)),
+    warning('ALLCOMB:EmptyInput','One of more empty inputs result in an empty output.') ;
+    A = zeros(0,NC) ;
+elseif NC > 1
+    isCellInput = cellfun(@iscell,args) ;
+    if any(isCellInput)
+        if ~all(isCellInput)
+            error('ALLCOMB:InvalidCellInput', ...
+                'For cell input, all arguments should be cell arrays.') ;
+        end
+        % for cell input, we use indices to get all combinations
+        ix = cellfun(@(c) 1:numel(c), args,'un',0) ;
+
+        % flip using ii if last column is changing fastest
+        [ix{ii}] = ndgrid(ix{ii}) ;
+
+        A = cell(numel(ix{1}),NC) ; % pre-allocate the output
+        for k=1:NC,
+            % pick the elements of input k via its index grid and store them as column k
+            A(:,k) = reshape(args{k}(ix{k}),[],1) ;
+        end
+    else
+        % non-cell input, assuming all numerical values or strings
+        % flip using ii if last column is changing fastest
+        [A{ii}] = ndgrid(args{ii}) ;
+        % concatenate the grids along a new trailing dimension, then flatten to NC columns
+        A = reshape(cat(NC+1,A{:}),[],NC) ;
+    end
+elseif NC==1,
+    A = args{1}(:) ; % nothing to combine
+
+else % NC==0, there was only the 'matlab' flag argument
+    A = zeros(0,0) ; % empty result
+end
diff --git a/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_NonZeroPDF_anyd.m b/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_NonZeroPDF_anyd.m
@@ -0,0 +1,41 @@
+function [pdf_nonzero] = f_NonZeroPDF_anyd(data_histcounts)
+% Returns a pdf of an any-dimensional discrete (binned) frequency distribution where all bins have non-zero probabilities
+% Method
+% - For each bin, its nonzero bin occupation probability is estimated 
+%   as the mean of the confidence interval for p_i based on the binomial distribution.
+%   These confidence intervals will become narrower the larger the total counts in data_histcounts
+% Input
+% - data_histcounts: [num_bins of dim 1, num_bins of dim 2, ... , num_bins of dim end]
+%   matrix, with counts of occurrence of all possible bin combinations
+%   across dimensions
+% Note: data_histcounts must be NaN-free (an error is raised otherwise)
+% Output
+% - pdf_nonzero: [num_bins of dim 1, num_bins of dim 2, ... , num_bins of dim end]
+%   matrix with nonzero bin occupation probability (strictly positive)
+% Version
+% 2018/11/14 Uwe Ehret, initial version
+
+% check if data_histcounts is NaN-free
+if ~isempty(find(isnan(data_histcounts)))
+    error('data_histcounts contains NaNs')
+end
+
+% reshape data_histcounts to one long 1-d (column) array
+data_histcounts_1d = reshape(data_histcounts,[numel(data_histcounts),1]);
+
+% get the total number of counts in the 1-d histogram
+num_counts = sum(data_histcounts_1d);
+
+% for each bin, compute the confidence interval of its bin occupation probability, provided as upper and lower value of 95% confidence interval 
+[~,CI] = binofit(data_histcounts_1d,num_counts); 
+
+% the non-zero bin occupation probability is the mean of the confidence interval
+pdf_nonzero_1d = mean(CI,2);
+
+% as mean(CI,2) does not assure sum(pdf_nonzero_1d)=1, do so by dividing by the sum (the ' also transposes the array)
+pdf_nonzero_1d = pdf_nonzero_1d'/sum(pdf_nonzero_1d);    
+
+% reshape pdf_nonzero_1d back to the original dimensions
+pdf_nonzero = reshape(pdf_nonzero_1d,size(data_histcounts));     
+
+end
diff --git a/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_all_bincombs.m b/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_all_bincombs.m
@@ -0,0 +1,26 @@
+function [combis] = f_all_bincombs(num_bins)
+% Creates an array of all possible bin combinations across all dimensions
+% Input
+% - num_bins: [1,n] array, where for each dimension the number of bins is given
+% Output
+% - combis: [num_combis,num_dim] array with all possible bin number combinations
+% Version
+% - 2017/07/15 Uwe Ehret: initial version
+
+% number of dimensions of the matrix (= number of columns of num_bins)
+num_dim = size(num_bins,2); 
+
+% initialize cell array with all possible bin numbers for each dimension
+mycell = cell(1,num_dim);
+
+% loop over all dimensions of the matrix
+for d = 1 : num_dim
+    % write an array with all possible bin numbers (1 ... num_bins(d))
+    mycell{d} = (1:num_bins(d)); 
+end
+
+% create all possible combinations of bin numbers
+combis = allcomb_singleinput(mycell); 
+
+end
+
diff --git a/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_all_predictor_bincombs.m b/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_all_predictor_bincombs.m
@@ -0,0 +1,37 @@
+function [combis] = f_all_predictor_bincombs(num_bins)
+% Returns an array of all possible predictor bin combinations
+% Input
+% - num_bins: [1,n] array, where for each dimension the number of bins is given
+%   Note: The first entry is for the target, which is dropped below,
+%   as we just need the predictor combinations
+% Output
+% - combis: [num_combis,num_dim-1] array with all possible bin number
+%   combinations across all predictors
+% Version
+% - 2017/10/22 Uwe Ehret: initial version
+
+% number of dimensions of the target-predictor matrix
+num_dim = size(num_bins,2); 
+
+% check the number of dimensions (at least one predictor, i.e. num_dim min 2)
+if num_dim < 2
+    error('num_dim too small');
+end
+
+% initialize cell array with all possible bin numbers for each dimension
+mycell = cell(1,num_dim);
+
+% loop over all dimensions of the target-predictor matrix
+for d = 1 : num_dim
+    % write an array with all possible bin numbers for target and all predictors
+    mycell{d} = (1:num_bins(d)); 
+end
+
+% Delete the first entry of the cell (the target bins), leaving only the predictors
+mycell(1) = []; 
+
+% create all possible combinations of predictor bin numbers
+combis = allcomb_singleinput(mycell); 
+
+end
+
diff --git a/...est/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_all_predictor_bincombs_indataset.m b/...est/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_all_predictor_bincombs_indataset.m
@@ -0,0 +1,63 @@
+function [ combis ] = f_all_predictor_bincombs_indataset( data, edges )
+% Returns an array of all unique predictor bin combinations that occur in a dataset
+% Input
+% - data: [num_data,num_dim] array, where each row is a set of related target
+%   (col=1) and predictors (col= 2:end) values
+%   Note: 
+%   - data must be NaN-free
+%   - num_dim must be >=2 (1 target plus at least one predictor)
+% - edges: [1,num_dim] cell array, with a [1,num_edges] array of bin edges
+%   for each dimension inside
+%   Note: For each dimension, the edges must completely cover the entire
+%   value range of the respective data (an error is raised otherwise)
+% Output
+% - combis: [num_combis,num_dim-1] array with all unique bin number
+%   combinations across all predictors in the dataset
+% Version
+% - 2018/07/24 Uwe Ehret: initial version
+
+% get dimensionality of data set
+    [num_data, num_dim] = size(data);
+
+% check input
+    % check input data for NaN
+    if ~isempty(find(isnan(data)))
+        error('input data contain NaN');
+    end
+
+    % check for at least one predictor
+    if num_dim < 2
+        error('need at least two columns in input data');
+    end
+
+    % check if input data fall outside the bin edges
+    mins = min(data,[],1);   % smallest value in each dimension
+    maxs = max(data,[],1);   % largest value in each dimension
+
+    % loop over all dimensions (including the target in column 1)
+    for d = 1 : num_dim 
+        if mins(d) < edges{d}(1) % smallest value is < leftmost edge
+            error('input data < leftmost edge');
+        elseif maxs(d) > edges{d}(end)  % largest value is > rightmost edge
+            error('input data > rightmost edge');
+        end
+    end
+
+% initialize output variables
+% Note: Here for convenience still with the first (=target) column, which is erased later
+combis = NaN(num_data,num_dim);    
+
+% loop over all dimensions (except the first = target)
+for d = 2 : num_dim
+    % classify the data in each dimension into bins (third output of histcounts = bin index per value)
+    [~,~,combis(:,d)] = histcounts(data(:,d),edges{d});
+end
+
+% erase the first column (=target)
+combis = combis(:,2:end);
+
+% erase all redundant rows, keeping each bin combination once
+combis = unique(combis,'rows');
+
+end
+
diff --git a/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_binlevels2edges.m b/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_binlevels2edges.m
@@ -0,0 +1,44 @@
+function [edges, centers] = f_binlevels2edges(indx_binlevels, indx_datatypes, binlevels, datatypes)
+% For an array of datatypes, returns bin edges and bin centers for a desired
+% binlevel number 
+% Input
+% - indx_binlevels: [1,num_dim] array with desired binlevel for each
+%       dimension. The binlevel is used as index in binlevels to retrieve
+%       the desired number of bins
+% - indx_datatypes: [1,num_dim] :double. Datatype index for each dimension (used as index in datatypes).
+% - binlevels: [m,1] :double. Specifies the number of equal-sized bins 
+%       the datatype range [min, max] should be subdivided into
+% - datatypes [n,1] :struct. Specifies the datatype; the fields .min and .max are read here
+% Output
+% - edges: [1,num_dim] cell array, with arrays of bin edges for each dimension
+% - centers: [1,num_dim] cell array, with arrays of bin centers for each dimension
+% Version
+% - 2018/07/13 Uwe Ehret: initial version
+
+% get dimensionality (= number of edge-arrays to create)
+num_dim = size(indx_binlevels,2);
+
+% initialize output variables
+edges = cell(1,num_dim);
+centers = cell(1,num_dim);
+
+% loop over all dimensions 
+for i = 1 : num_dim
+
+    % get parameters for binning
+    min = datatypes(indx_datatypes(i)).min;     % min value for given datatype (NOTE(review): local 'min' shadows the built-in min here)
+    max = datatypes(indx_datatypes(i)).max;     % max value for given datatype (NOTE(review): local 'max' shadows the built-in max here)
+    numbins = binlevels(indx_binlevels(i));     % number of bins for desired binlevel
+
+    % create the bins and centers
+    var_edges = linspace(min, max, numbins+1);          % create the bin edges (numbins+1 equally spaced values)
+    var_centers = var_edges(1:end-1) + diff(var_edges) / 2;     % create the bin centers (left edge plus half the bin width)
+
+    % write edges and centers to output
+    edges{i} = var_edges;
+    centers{i} = var_centers;
+
+end
+
+end
+
diff --git a/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_check_pdf.m b/skinfo/test/matlab_test_metrics/211019_matlab_code_uwe_ehret/f_check_pdf.m
@@ -0,0 +1,22 @@
+function [  ] = f_check_pdf(pdf)
+% Checks that a pdf sums to 1 within a tolerance of 0.00001. Throws an error if not
+% Input
+% - pdf: [1,n] or [n,1] array with a probability density function
+%   Note: pdf must be NaN-free (an error is thrown otherwise)
+% Output
+% - none (returns silently if all checks pass)
+% Version
+% - 2017/10/25 Uwe Ehret: initial version
+
+% check if pdf is NaN-free
+if ~isempty(find(isnan(pdf)))
+    error('pdf contains NaNs')
+end
+
+% check if pdf sums up to almost one (absolute deviation of at most 0.00001)
+if abs(sum(pdf) - 1) > .00001
+    error('Probablities dont sum to 1.')
+end
+
+end
+
-Original file line number
+Diff line change
@@ -1,3 +1,5 @@
     numpy
     scipy
-    nose
+    nose
+    pytest
+    pandas