Decouple notebooks and notebook tests.
- Display no progress bar for disabled modules (e.g. individual correlations).
- Disable missing plots in minimal mode
- Bump `coverage` requirement
- One test hangs due to an issue in nbval:
computationalmodelling/nbval#136
sbrugman committed Feb 1, 2020
1 parent ea26631 commit 023bd70
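The first bullet describes a pattern applied in both correlations.py and describe.py below: collect the enabled work items up front and open a tqdm progress bar only when that set is non-empty. A minimal standalone sketch of the pattern (the `run_enabled` helper, the module names, and the config dict are illustrative, not the package's API):

```python
from tqdm import tqdm


def run_enabled(modules: dict, config: dict) -> dict:
    """Run only the modules enabled in `config`; show no progress bar if none are."""
    enabled = {name: fn for name, fn in modules.items() if config.get(name, False)}
    results = {}
    if enabled:  # disabled modules no longer produce an (empty) progress bar
        with tqdm(total=len(enabled), desc="modules") as pbar:
            for name, fn in enabled.items():
                pbar.set_description_str(f"modules [{name}]")
                results[name] = fn()
                pbar.update()
    return results


# Hypothetical usage: only "pearson" is enabled, so the bar covers a single step.
print(run_enabled({"pearson": lambda: 1.0, "cramers": lambda: 0.5},
                  {"pearson": True, "cramers": False}))
```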
Showing 12 changed files with 67,828 additions and 13,507 deletions.
7 changes: 1 addition & 6 deletions .travis.yml
@@ -17,11 +17,6 @@ env:
- TEST=examples
- TEST=lint

jobs:
exclude:
- python: "3.5"
env: TEST=examples

install:
- pip install --upgrade pip six
- pip install -r requirements.txt
@@ -31,7 +26,7 @@ install:
script:
- if [ $TEST == 'unit' ]; then pytest --cov=. tests/unit/; fi
- if [ $TEST == 'issue' ]; then pytest --cov=. tests/issues/; fi
- if [ $TEST == 'examples' ]; then pytest --cov=. --nbval --sanitize-with tests/sanitize-notebook.cfg examples/; fi
- if [ $TEST == 'examples' ]; then pytest --cov=. --nbval tests/notebooks/; fi
- if [ $TEST == 'console' ]; then pandas_profiling -h; fi
- if [ $TEST == 'lint' ]; then pytest --black -m black src/; flake8 . --select=E9,F63,F7,F82 --show-source --statistics; fi

5 changes: 3 additions & 2 deletions Makefile
@@ -4,8 +4,9 @@ docs:
rmdir docs/pandas_profiling

test:
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/unit/
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/issues/
pytest --black tests/unit/
pytest --black tests/issues/
pytest --nbval tests/notebooks/
flake8 . --select=E9,F63,F7,F82 --show-source --statistics

install:
7,177 changes: 3,597 additions & 3,580 deletions examples/meteorites/meteorites.ipynb

Large diffs are not rendered by default.

19,588 changes: 9,786 additions & 9,802 deletions examples/titanic/titanic.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions make.bat
@@ -9,8 +9,9 @@ IF "%1%" == "docs" (
)

IF "%1" == "test" (
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/unit/
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/issues/
pytest --black tests/unit/
pytest --black tests/issues/
pytest --nbval tests/notebooks/
flake8 . --select=E9,F63,F7,F82 --show-source --statistics
ECHO "Tests completed!"
GOTO end
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -1,5 +1,5 @@
pytest
coverage<5
coverage==5.0.0
codecov
pytest-cov
pytest-black
4 changes: 2 additions & 2 deletions src/pandas_profiling/config_minimal.yaml
@@ -34,8 +34,8 @@ sort: None

# which diagrams to show
missing_diagrams:
bar: True
matrix: True
bar: False
matrix: False
heatmap: False
dendrogram: False

195 changes: 101 additions & 94 deletions src/pandas_profiling/model/correlations.py
@@ -147,103 +147,110 @@ def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:

disable_progress_bar = not config["progress_bar"].get(bool)

with tqdm(total=6, desc="correlations", disable=disable_progress_bar) as pbar:
for correlation_name in ["pearson", "spearman", "kendall"]:
pbar.set_description_str(
"correlations [{correlation_name}]".format(
correlation_name=correlation_name
correlation_names = [
correlation_name
for correlation_name in [
"pearson",
"spearman",
"kendall",
"phi_k",
"cramers",
"recoded",
]
if config["correlations"][correlation_name]["calculate"].get(bool)
]

categorical_correlations = {"cramers": cramers_matrix, "recoded": recoded_matrix}

if len(correlation_names) > 0:
with tqdm(
total=len(correlation_names), desc="correlations", disable=disable_progress_bar
) as pbar:
for correlation_name in correlation_names:
pbar.set_description_str(
"correlations [{correlation_name}]".format(
correlation_name=correlation_name
)
)
)
if config["correlations"][correlation_name]["calculate"].get(bool):
try:
correlation = df.corr(method=correlation_name)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, AssertionError) as e:
warn_correlation(correlation_name, e)
pbar.update()

pbar.set_description_str("correlations [phi_k]")
if config["correlations"]["phi_k"]["calculate"].get(bool):
import phik

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Phi_k does not filter non-numerical with high cardinality
selcols = []
intcols = []
for col in df.columns.tolist():

if correlation_name in ["pearson", "spearman", "kendall"]:
try:
tmp = (
df[col]
.value_counts(dropna=False)
.reset_index()
.dropna()
.set_index("index")
.iloc[:, 0]
)
if tmp.index.inferred_type == "mixed":
continue

if pd.api.types.is_numeric_dtype(df[col]):
intcols.append(col)
selcols.append(col)
elif df[col].nunique() <= config[
"categorical_maximum_correlation_distinct"
].get(int):
selcols.append(col)
except (TypeError, ValueError):
continue

if len(selcols) > 1:
correlation = df.corr(method=correlation_name)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, AssertionError) as e:
warn_correlation(correlation_name, e)
pbar.update()
elif correlation_name in ["phi_k"]:
import phik

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Phi_k does not filter non-numerical with high cardinality
selcols = []
intcols = []
for col in df.columns.tolist():
try:
tmp = (
df[col]
.value_counts(dropna=False)
.reset_index()
.dropna()
.set_index("index")
.iloc[:, 0]
)
if tmp.index.inferred_type == "mixed":
continue

if pd.api.types.is_numeric_dtype(df[col]):
intcols.append(col)
selcols.append(col)
elif df[col].nunique() <= config[
"categorical_maximum_correlation_distinct"
].get(int):
selcols.append(col)
except (TypeError, ValueError):
continue

if len(selcols) > 1:
try:
correlations["phi_k"] = df[selcols].phik_matrix(
interval_cols=intcols
)

# Only do this if the column_order is set
with suppress(NotFoundError):
# Get the preferred order
column_order = config["column_order"].get(list)

# Get the Phi_k sorted order
current_order = (
correlations["phi_k"]
.index.get_level_values("var1")
.tolist()
)

# Intersection (some columns are not used in correlation)
column_order = [
x for x in column_order if x in current_order
]

# Override the Phi_k sorting
correlations["phi_k"] = correlations["phi_k"].reindex(
index=column_order, columns=column_order
)
except (ValueError, DataError, IndexError, TypeError) as e:
warn_correlation("phi_k", e)
elif correlation_name in ["cramers", "recoded"]:
get_matrix = categorical_correlations[correlation_name]

try:
correlations["phi_k"] = df[selcols].phik_matrix(
interval_cols=intcols
)

# Only do this if the column_order is set
with suppress(NotFoundError):
# Get the preferred order
column_order = config["column_order"].get(list)

# Get the Phi_k sorted order
current_order = (
correlations["phi_k"]
.index.get_level_values("var1")
.tolist()
)

# Intersection (some columns are not used in correlation)
column_order = [
x for x in column_order if x in current_order
]

# Override the Phi_k sorting
correlations["phi_k"] = correlations["phi_k"].reindex(
index=column_order, columns=column_order
)
except (ValueError, DataError, IndexError, TypeError) as e:
warn_correlation("phi_k", e)
pbar.update()

categorical_correlations = {
"cramers": cramers_matrix,
"recoded": recoded_matrix,
}
for correlation_name, get_matrix in categorical_correlations.items():
pbar.set_description_str(
"correlations [{correlation_name}]".format(
correlation_name=correlation_name
)
)
if config["correlations"][correlation_name]["calculate"].get(bool):
try:
correlation = get_matrix(df, variables)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, ZeroDivisionError) as e:
warn_correlation(correlation_name, e)
pbar.update()
correlation = get_matrix(df, variables)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, ZeroDivisionError) as e:
warn_correlation(correlation_name, e)
pbar.update()

return correlations

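In the correlations diff above, the enabled correlation names are collected first and the phi_k branch keeps its column-selection step: numeric columns are used as interval columns, other columns are kept only below a cardinality cap, and mixed-type columns are skipped. A rough standalone sketch of that selection (it substitutes pd.api.types.infer_dtype for the value_counts-based check in the diff; the cap of 100 and the sample DataFrame are illustrative):

```python
import pandas as pd


def select_phik_columns(df: pd.DataFrame, max_distinct: int = 100):
    """Approximate the column filter from the diff: numeric columns are used as
    interval columns; non-numeric columns are kept only when their cardinality
    is at most `max_distinct`; mixed-type columns are skipped."""
    selcols, intcols = [], []
    for col in df.columns:
        try:
            # Stand-in for the value_counts()-based mixed-type check in the diff
            if pd.api.types.infer_dtype(df[col], skipna=True) == "mixed":
                continue
            if pd.api.types.is_numeric_dtype(df[col]):
                intcols.append(col)
                selcols.append(col)
            elif df[col].nunique() <= max_distinct:
                selcols.append(col)
        except (TypeError, ValueError):
            continue
    return selcols, intcols


# Illustrative frame: "age" is numeric (interval), "city" is a low-cardinality categorical.
df = pd.DataFrame({"age": [23, 31, 44], "city": ["a", "b", "a"]})
selcols, intcols = select_phik_columns(df)
print(selcols, intcols)  # ['age', 'city'] ['age']
# With phik installed, the matrix would then be computed as in the diff:
#   df[selcols].phik_matrix(interval_cols=intcols)
```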
26 changes: 15 additions & 11 deletions src/pandas_profiling/model/describe.py
@@ -475,16 +475,20 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
},
}

with tqdm(
total=len(missing_map), desc="missing", disable=disable_progress_bar
) as pbar:
missing = {}
for name, settings in missing_map.items():
pbar.set_description_str("missing [{name}]".format(name=name))
if (
config["missing_diagrams"][name].get(bool)
and table_stats["n_vars_with_missing"] >= settings["min_missing"]
):
missing_map = {
name: settings
for name, settings in missing_map.items()
if config["missing_diagrams"][name].get(bool)
and table_stats["n_vars_with_missing"] >= settings["min_missing"]
}
missing = {}

if len(missing_map) > 0:
with tqdm(
total=len(missing_map), desc="missing", disable=disable_progress_bar
) as pbar:
for name, settings in missing_map.items():
pbar.set_description_str("missing [{name}]".format(name=name))
try:
if name != "heatmap" or (
table_stats["n_vars_with_missing"]
@@ -497,7 +501,7 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
}
except ValueError as e:
warn_missing(name, e)
pbar.update()
pbar.update()
return missing


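get_missing_diagrams now applies the same pre-filtering: missing_map is reduced to the diagrams that are enabled in the config and meet their minimum-missing threshold, and the "missing" progress bar is created only when that reduced map is non-empty. A small sketch under assumed values (the min_missing thresholds and the all-False config mirror minimal mode but are not the package's exact defaults):

```python
# Illustrative settings, mirroring the filter in the diff above.
missing_map = {
    "bar": {"min_missing": 0},
    "matrix": {"min_missing": 0},
    "heatmap": {"min_missing": 2},
    "dendrogram": {"min_missing": 1},
}
# In minimal mode all missing diagrams are disabled (see config_minimal.yaml above).
missing_config = {"bar": False, "matrix": False, "heatmap": False, "dendrogram": False}
n_vars_with_missing = 3

enabled = {
    name: settings
    for name, settings in missing_map.items()
    if missing_config[name] and n_vars_with_missing >= settings["min_missing"]
}
print(enabled)  # {} -> nothing to compute, so no "missing" progress bar is opened
```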