Skip to content

Commit

Permalink
Fix runtime errors from columns with no values
Browse files Browse the repository at this point in the history
Also adds tests to prevent it in the future.
  • Loading branch information
qubixes authored Oct 10, 2022
1 parent 764c9bb commit d638f83
Show file tree
Hide file tree
Showing 7 changed files with 1,402 additions and 1,300 deletions.
849 changes: 455 additions & 394 deletions examples/advanced_tutorial.ipynb

Large diffs are not rendered by default.

1,784 changes: 892 additions & 892 deletions examples/demonstration.csv

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions examples/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from metasynth.distribution.datetime import UniformDateTimeDistribution, UniformTimeDistribution
from metasynth.distribution.datetime import UniformDateDistribution
import wget
Expand All @@ -7,27 +8,30 @@

def get_demonstration_fp():
demonstration_fp = Path("demonstration.csv")
titanic_fp = Path("titanic.csv")
if demonstration_fp.is_file():
return demonstration_fp
wget.download("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
df = pd.read_csv("titanic.csv")
if not titanic_fp.is_file():
wget.download("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
df = pd.read_csv(titanic_fp)

# Convert Age to a nullable integer.
df["Age"] = df["Age"].round().astype("Int64")

# Add a date column.
date_dist = UniformDateDistribution._example_distribution()
df["Birthday"] = [date_dist.draw() for _ in range(len(df))]
df["Birthday"] = [date_dist.draw() if np.random.rand() < 0.9 else pd.NA for _ in range(len(df))]

# Add a time column.

time_dist = UniformTimeDistribution._example_distribution()
df["Board time"] = [time_dist.draw() for _ in range(len(df))]
df["Board time"] = [time_dist.draw() if np.random.rand() < 0.9 else pd.NA for _ in range(len(df))]

# Add a datetime column
time_dist = UniformDateTimeDistribution._example_distribution()
df["Married since"] = [time_dist.draw() for _ in range(len(df))]
df["Married since"] = [time_dist.draw() if np.random.rand() < 0.9 else pd.NA for _ in range(len(df))]

df["all_NA"] = [pd.NA for _ in range(len(df))]
# Remove some columns for brevity and write to a file.
df = df.drop(["SibSp", "Pclass", "Ticket", "Survived"], axis=1)
df.to_csv(demonstration_fp, index=False)
Expand Down
11 changes: 9 additions & 2 deletions metasynth/distribution/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def fit(cls, series: Sequence, *args, **kwargs) -> BaseDistribution:
Fitted distribution.
"""
pd_series = cls._to_series(series)
if len(pd_series) == 0:
return cls._example_distribution()
distribution = cls._fit(pd_series, *args, **kwargs)
return distribution

Expand Down Expand Up @@ -203,7 +205,10 @@ def __getattr__(self, attr: str):

@classmethod
def _fit(cls, values):
param = cls.dist_class.fit(values[~np.isnan(values)])
if len(values) == 0:
return cls._example_distribution()
values = pandas.to_numeric(values)
param = cls.dist_class.fit(values.values)
return cls(*param)

def to_dict(self):
Expand All @@ -216,5 +221,7 @@ def draw(self):
return self.dist.rvs()

def information_criterion(self, values):
vals = values[~np.isnan(values)]
vals = pandas.to_numeric(self._to_series(values))
if len(vals) == 0:
return 2*self.n_par
return 2*self.n_par - 2*np.sum(self.dist.logpdf(vals))
16 changes: 11 additions & 5 deletions metasynth/distribution/continuous.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Implemented floating point distributions."""

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import uniform, norm, lognorm, truncnorm, expon
from scipy.stats._continuous_distns import FitDataError
Expand Down Expand Up @@ -31,12 +32,14 @@ def __init__(self, min_val: float, max_val: float):

@classmethod
def _fit(cls, values):
vals = values[~np.isnan(values)]
return cls(vals.min(), vals.max())
values = pd.to_numeric(values)
return cls(values.min(), values.max())

def information_criterion(self, values):
vals = values[~np.isnan(values)]
if np.any(np.array(values) < self.min_val) or np.any(np.array(values) > self.max_val):
vals = self._to_series(values).values
if len(vals) == 0:
return 2*self.n_par
if np.any(vals < self.min_val) or np.any(vals > self.max_val):
return 2*self.n_par + 100*len(vals)
return 2*self.n_par - 2*len(vals)*np.log((self.max_val-self.min_val)**-1)

Expand Down Expand Up @@ -94,6 +97,7 @@ def __init__(self, mu: float, sigma: float): # pylint: disable=invalid-name

@classmethod
def _fit(cls, values):
values = pd.to_numeric(values)
try:
sigma, _, scale = cls.dist_class.fit(values, floc=0)
except FitDataError:
Expand Down Expand Up @@ -133,6 +137,7 @@ def __init__(self, lower_bound: float, upper_bound: float,

@classmethod
def _fit(cls, values):
values = pd.to_numeric(values)
lower_bound = np.min(values) - 1e-8
upper_bound = np.max(values) + 1e-8
return cls._fit_with_bounds(values, lower_bound, upper_bound)
Expand Down Expand Up @@ -177,10 +182,11 @@ def __init__(self, rate: float):

@classmethod
def _fit(cls, values):
values = pd.to_numeric(values)
values = values[values > 0]
if len(values) == 0:
return cls._example_distribution()
return cls(rate=1/expon.fit(values[~np.isnan(values)], floc=0)[1])
return cls(rate=1/expon.fit(values, floc=0)[1])

@classmethod
def _example_distribution(cls):
Expand Down
2 changes: 2 additions & 0 deletions metasynth/var.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def get_var_type(pandas_dtype: str) -> str:
"string": "string",
"integer": "discrete",
"floating": "continuous",
"mixed-integer-float": "continuous",
"empty": "continuous",
"date": "date",
"datetime64": "datetime",
"time": "time",
Expand Down
26 changes: 24 additions & 2 deletions tests/test_var.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from pathlib import Path
import json

import pandas as pd
Expand Down Expand Up @@ -35,7 +34,6 @@ def check_similar(series_a, series_b):
check_similar(series, new_series)
assert var.var_type == var_type
assert var.distribution.var_type == var_type
# assert isinstance(var.distribution, dist_class)

new_var = MetaVar.from_dict(var.to_dict())
with raises(ValueError):
Expand Down Expand Up @@ -147,6 +145,30 @@ def test_manual_fit():
var.fit(10)


def test_na_zero():
    """Detect and fit a variable whose column holds no values at all.

    Every entry of the series is ``pd.NA``. The test checks that detection
    and fitting complete without raising, that the variable is typed as
    ``"continuous"``, and that the whole column is reported as missing.
    """
    all_missing = pd.Series([pd.NA] * 10)
    meta_var = MetaVar.detect(all_missing)
    meta_var.fit()
    assert meta_var.var_type == "continuous"
    assert meta_var.prop_missing == 1.0


def test_na_one():
    """Detect and fit a variable with a single non-missing value.

    The series holds one float (``1.0`` at index 0) followed by nine
    ``pd.NA`` entries, so nine tenths of the column are missing. The test
    checks that detection and fitting complete without raising, that the
    variable is typed as ``"continuous"``, and that ``prop_missing`` is 0.9.
    """
    series = pd.Series([pd.NA if i != 0 else 1.0 for i in range(10)])
    var = MetaVar.detect(series)
    var.fit()
    assert var.var_type == "continuous"
    # The tolerance must be tiny (1e-7); a tolerance of 1e7 would accept
    # any value and make this assertion meaningless.
    assert abs(var.prop_missing - 0.9) < 1e-7


def test_na_two():
    """Detect and fit a float column that contains a couple of NaNs.

    The series is built from a NumPy array in which indices 0 and 1 are
    ``np.nan`` and the remaining eight entries are ``0.123 * i``. Two out of
    ten values are therefore missing. The test checks that detection and
    fitting complete without raising, that the variable is typed as
    ``"continuous"``, and that ``prop_missing`` is 0.2.
    """
    series = pd.Series(np.array([np.nan if i < 2 else 0.123*i for i in range(10)]))
    var = MetaVar.detect(series)
    var.fit()
    assert var.var_type == "continuous"
    # 2 of the 10 entries are NaN, so the missing proportion is 0.2 (0.8 is
    # the proportion that is *present*). The tolerance must be 1e-7: with
    # 1e7 the comparison accepts any value and hides a wrong expectation.
    assert abs(var.prop_missing - 0.2) < 1e-7


def test_manual_unique():
series = pd.Series(np.random.randint(0, 100000, size=10))
var = MetaVar.detect(series)
Expand Down

0 comments on commit d638f83

Please sign in to comment.