Skip to content

Commit

Permalink
Merge pull request #2 from KDD-OpenSource/feature/pip-package
Browse files Browse the repository at this point in the history
Restructure to use as PIP package
  • Loading branch information
marcuspappik authored Mar 15, 2017
2 parents bb61754 + 2ffead5 commit 34e5e6e
Show file tree
Hide file tree
Showing 10 changed files with 116 additions and 39 deletions.
96 changes: 96 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
Empty file removed __init__.py
Empty file.
12 changes: 0 additions & 12 deletions bivariate_correlation.py → examples/bivariate_correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
from math import pow



class ScoredSlices:

def __init__(self, categorical, continuous, to_keep = 5, threshold = None):
self.continuous = pd.Panel({feature : pd.DataFrame(columns = ['end', 'start'])
for feature in continuous})
Expand All @@ -27,7 +25,6 @@ def __init__(self, categorical, continuous, to_keep = 5, threshold = None):
else:
self.threshold = threshold


def add_slices(self, slices):
temp_continuous = {}
temp_categorical = {}
Expand All @@ -48,7 +45,6 @@ def add_slices(self, slices):
self.continuous = pd.Panel(temp_continuous)
self.categorical = pd.Panel(temp_categorical)


def select_slices(self, similarity):
indices = list(range(len(similarity)))
selected = []
Expand All @@ -64,7 +60,6 @@ def select_slices(self, similarity):

return selected


def reduce_slices(self):
if not self.continuous.empty:
continuous_similarity = continuous_similarity_matrix(self.continuous)
Expand Down Expand Up @@ -93,10 +88,7 @@ def reduce_slices(self):
self.scores = self.scores.loc[selected].reset_index(drop = True)




class IncrementalBivariateCorrelation:

def __init__(self, data, target, iterations = 10, alpha = 0.1, drop_discrete = True):
self.subspace_contrast = HiCS(data, alpha, iterations)

Expand All @@ -114,7 +106,6 @@ def __init__(self, data, target, iterations = 10, alpha = 0.1, drop_discrete = T
self.subspace_slices = {}
self.subspace_relevancies = {}


def subspace_relevancy(self, subspace, cach_slices = False):
score, slices = self.subspace_contrast.calculate_contrast(features = subspace, target = self.target, return_slices = True)

Expand All @@ -138,7 +129,6 @@ def subspace_relevancy(self, subspace, cach_slices = False):
scored_slices.reduce_slices()
return score, scored_slices


def update_relevancies(self):
for feature in self.features:
score, dummy = self.subspace_relevancy(subspace = [feature], cach_slices = True)
Expand All @@ -147,7 +137,6 @@ def update_relevancies(self):

self.relevancy_cycles = self.relevancy_cycles + 1


def update_redundancies(self, k = 5, redundancy_checks = 20):
temp_redundancy_table = pd.Panel({'redundancy' : pd.DataFrame(data = 0, columns = self.features, index = self.features),
'weight' : pd.DataFrame(data = 0, columns = self.features, index = self.features)})
Expand All @@ -170,7 +159,6 @@ def update_redundancies(self, k = 5, redundancy_checks = 20):
self.redundancy_table['weight'] = self.redundancy_table['weight'] + temp_redundancy_table['weight']
self.redundancy_table['redundancy'].fillna(0, inplace = True)


def calculate_correlation(self, k = 5, redundancy_checks = 20, callback = print, limit = sys.maxsize):

while self.relevancy_cycles < limit:
Expand Down
File renamed without changes.
5 changes: 1 addition & 4 deletions hics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@

import hics.contrast_meassure
import hics.divergences

from hics import contrast_meassure, divergences
11 changes: 0 additions & 11 deletions hics/contrast_meassure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
from random import randint, shuffle



class HiCS:

def __init__(self, data, alpha, iterations, continuous_divergence = KS, categorical_divergence = KLD):
self.iterations = iterations
self.alpha = alpha
Expand All @@ -33,36 +31,31 @@ def __init__(self, data, alpha, iterations, continuous_divergence = KS, categori
else:
self.types[column] = 'continuous'


def values(self, feature):
    # Return the cached collection of observed values for *feature*, or
    # False when nothing is recorded under that name.
    # NOTE(review): `self.values` is read as a mapping here, yet `values`
    # is also this method's name — if an instance attribute `self.values`
    # is assigned (e.g. in __init__, not fully visible here) it shadows
    # this method on instances, making it uncallable; confirm call sites.
    if not feature in self.values:
        return False

    else:
        return self.values[feature]


def type(self, feature):
if not feature in self.types:
return False

else:
return self.types[feature]


def cached_marginal_distribution(self, feature):
if not feature in self.distributions:
values, counts = np.unique(self.data[feature], return_counts = True)
self.distributions[feature] = pd.DataFrame({'value' : values, 'count' : counts, 'probability' : counts/len(self.data)}).sort_values(by = 'value')
return self.distributions[feature]


def cached_sorted_indices(self, feature):
if not feature in self.sorted_indices.columns:
self.sorted_indices[feature] = self.data.sort_values(by = feature, kind = 'mergesort').index.values
return self.sorted_indices[feature]


def calculate_conditional_distribution(self, slice_conditions, target):
filter_array = np.array([True]*len(self.data))

Expand All @@ -75,7 +68,6 @@ def calculate_conditional_distribution(self, slice_conditions, target):
probabilities = counts/filter_array.sum()
return pd.DataFrame({'value' : values, 'count' : counts, 'probability' : probabilities}).sort_values(by = 'value')


def create_categorical_condition(self, feature, instances_per_dimension):
feature_distribution = self.cached_marginal_distribution(feature)
shuffled_values = np.random.permutation(feature_distribution['value'])
Expand All @@ -93,7 +85,6 @@ def create_categorical_condition(self, feature, instances_per_dimension):
indices = self.data.loc[self.data[feature].isin(selected_values), : ].index.tolist()
return {'feature' : feature, 'indices' : indices, 'values' : selected_values}


def create_continuous_condition(self, feature, instances_per_dimension):
sorted_feature = self.cached_sorted_indices(feature)
max_start = len(sorted_feature) - instances_per_dimension
Expand All @@ -106,7 +97,6 @@ def create_continuous_condition(self, feature, instances_per_dimension):

return {'feature' : feature, 'indices' : indices, 'from_value' : start_value, 'to_value' : end_value}


def output_slices(self, score, conditions, slices):
for condition in conditions:
ft = condition['feature']
Expand All @@ -131,7 +121,6 @@ def output_slices(self, score, conditions, slices):

return slices


def calculate_contrast(self, features, target, return_slices = False):
slices = {'features' : {}, 'scores' : []}

Expand Down
13 changes: 13 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from setuptools import setup


# Packaging configuration for the `hics` package (installable with
# `pip install .`). Declares the runtime dependencies and the single
# top-level package directory `hics`.
setup(
name='hics',
version='0.1',
author='Markus Pappik',
# Runtime dependencies; both are hard requirements of hics.contrast_meassure.
install_requires=[
'pandas',
'numpy'
],
# Only the `hics` package is shipped; examples/ and tests/ are excluded.
packages=['hics']
)
8 changes: 2 additions & 6 deletions test_contrast_meassure.py → tests/test_contrast_meassure.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,27 @@
import unittest
from unittest import TestCase
from hics.contrast_meassure import HiCS
import numpy as np
import pandas as pd


class Test_HiCS(unittest.TestCase):
class Test_HiCS(TestCase):
alpha = 0.1
iterations = 100


def test_cashed_marginal_distribution(self):
    """Marginal distribution reports count and probability per distinct value.

    Fixed: HiCS defines ``cached_marginal_distribution`` (not the
    misspelled ``cashed_...``), so the old call raised AttributeError.
    """
    correct_result = pd.DataFrame({'value' : [1, 2, 3], 'count' : [1, 2, 5], 'probability' : [0.125, 0.25, 0.625]})
    dataset = pd.DataFrame({'test_marginal' : [2, 2, 1, 3, 3, 3, 3, 3]})
    test_HiCS = HiCS(dataset, self.alpha, self.iterations)
    dist = test_HiCS.cached_marginal_distribution('test_marginal')
    self.assertTrue(dist.equals(correct_result))


def test_cashed_sorted_indices(self):
    """Sorting by a descending column yields reversed positional indices.

    Fixed: HiCS defines ``cached_sorted_indices`` (not the misspelled
    ``cashed_...``), so the old call raised AttributeError.
    """
    correct_result = np.array([2, 1, 0])
    dataset = pd.DataFrame({'to_sort' : [10, 5, 0]})
    test_HiCS = HiCS(dataset, self.alpha, self.iterations)
    sorted_index = test_HiCS.cached_sorted_indices('to_sort')
    self.assertTrue(np.all(correct_result == sorted_index))


def test_calculate_conditional_distribution(self):
correct_result = pd.DataFrame({'value' : [0, 1, 2], 'count' : [3, 1, 1], 'probability' : [0.6, 0.2, 0.2]})
dataset = pd.DataFrame({'target' : [1, 1, 1, 0, 0, 0, 2, 2, 2], 'feature' : [0, 1, 2, 3, 4, 5, 6, 7, 8]})
Expand All @@ -34,7 +31,6 @@ def test_calculate_conditional_distribution(self):
cond_dist = test_HiCS.calculate_conditional_distribution([condition], target)
self.assertTrue(cond_dist.equals(correct_result))


def test_create_discrete_condition(self):
dataset = pd.DataFrame({'feature' : [1]*20 + [2]*3 + [0]*1 })
test_HiCS = HiCS(dataset, self.alpha, self.iterations)
Expand Down
5 changes: 2 additions & 3 deletions test_scored_slices.py → tests/test_scored_slices.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import unittest
from unittest import TestCase
from bivariate_correlation import ScoredSlices
import numpy as np
import pandas as pd

class Test_scored_slices(unittest.TestCase):

class Test_scored_slices(TestCase):
def test_categorical(self):
result = np.array([[1, 0.5, 0.5], [0.5, 1, 0.25], [0.5, 0.25, 1]])

Expand All @@ -31,6 +31,5 @@ def test_categorical(self):
self.assertTrue(np.all(np.array(scored_slices.categorical['X4']) == np.array([[0, 0, 1, 1], [1, 1, 1, 0]])))



if __name__ == '__main__':
unittest.main()
5 changes: 2 additions & 3 deletions test_slice_similarity.py → tests/test_slice_similarity.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import unittest
from unittest import TestCase
from slice_similarity.slice_similarity import continuous_similarity_matrix, categorical_similarity_matrix
import numpy as np
import pandas as pd

class Test_slice_similarity(unittest.TestCase):

class Test_slice_similarity(TestCase):
def test_categorical(self):
result = np.array([[1, 0.5, 0.5], [0.5, 1, 0.25], [0.5, 0.25, 1]])

Expand All @@ -16,7 +16,6 @@ def test_categorical(self):
similarity = categorical_similarity_matrix(categorical)
self.assertTrue(np.all(similarity == result))


def test_continuous(self):
result = np.array([[1, 0, 0], [0, 1, 2/3], [0, 2/3, 1]])

Expand Down

0 comments on commit 34e5e6e

Please sign in to comment.