Decouple notebooks and notebook tests.
- Display no progress bar for disabled modules (e.g. individual correlations).
- Disable missing plots in minimal mode
- Bump `coverage` requirement
- One test hangs due to an issue in nbval:
computationalmodelling/nbval#136
sbrugman committed Feb 1, 2020
1 parent ea26631 commit 023bd70
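The first bullet describes a pattern applied in both correlations.py and describe.py below: collect the enabled work items up front and open a tqdm progress bar only when that set is non-empty. A minimal standalone sketch of the pattern (the `run_enabled` helper, the module names, and the config dict are illustrative, not the package's API):

```python
from tqdm import tqdm


def run_enabled(modules: dict, config: dict) -> dict:
    """Run only the modules enabled in `config`; show no progress bar if none are."""
    enabled = {name: fn for name, fn in modules.items() if config.get(name, False)}
    results = {}
    if enabled:  # disabled modules no longer produce an (empty) progress bar
        with tqdm(total=len(enabled), desc="modules") as pbar:
            for name, fn in enabled.items():
                pbar.set_description_str(f"modules [{name}]")
                results[name] = fn()
                pbar.update()
    return results


# Hypothetical usage: only "pearson" is enabled, so the bar covers a single step.
print(run_enabled({"pearson": lambda: 1.0, "cramers": lambda: 0.5},
                  {"pearson": True, "cramers": False}))
```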
Showing 12 changed files with 67,828 additions and 13,507 deletions.
7 changes: 1 addition & 6 deletions .travis.yml
@@ -17,11 +17,6 @@ env:
- TEST=examples
- TEST=lint

jobs:
exclude:
- python: "3.5"
env: TEST=examples

install:
- pip install --upgrade pip six
- pip install -r requirements.txt
@@ -31,7 +26,7 @@ install:
script:
- if [ $TEST == 'unit' ]; then pytest --cov=. tests/unit/; fi
- if [ $TEST == 'issue' ]; then pytest --cov=. tests/issues/; fi
- if [ $TEST == 'examples' ]; then pytest --cov=. --nbval --sanitize-with tests/sanitize-notebook.cfg examples/; fi
- if [ $TEST == 'examples' ]; then pytest --cov=. --nbval tests/notebooks/; fi
- if [ $TEST == 'console' ]; then pandas_profiling -h; fi
- if [ $TEST == 'lint' ]; then pytest --black -m black src/; flake8 . --select=E9,F63,F7,F82 --show-source --statistics; fi

5 changes: 3 additions & 2 deletions Makefile
@@ -4,8 +4,9 @@ docs:
rmdir docs/pandas_profiling

test:
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/unit/
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/issues/
pytest --black tests/unit/
pytest --black tests/issues/
pytest --nbval tests/notebooks/
flake8 . --select=E9,F63,F7,F82 --show-source --statistics

install:
7,177 changes: 3,597 additions & 3,580 deletions examples/meteorites/meteorites.ipynb

Large diffs are not rendered by default.

19,588 changes: 9,786 additions & 9,802 deletions examples/titanic/titanic.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions make.bat
@@ -9,8 +9,9 @@ IF "%1%" == "docs" (
)

IF "%1" == "test" (
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/unit/
pytest --nbval --cov=./ --black --sanitize-with tests/sanitize-notebook.cfg tests/issues/
pytest --black tests/unit/
pytest --black tests/issues/
pytest --nbval tests/notebooks/
flake8 . --select=E9,F63,F7,F82 --show-source --statistics
ECHO "Tests completed!"
GOTO end
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -1,5 +1,5 @@
pytest
coverage<5
coverage==5.0.0
codecov
pytest-cov
pytest-black
4 changes: 2 additions & 2 deletions src/pandas_profiling/config_minimal.yaml
@@ -34,8 +34,8 @@ sort: None

# which diagrams to show
missing_diagrams:
bar: True
matrix: True
bar: False
matrix: False
heatmap: False
dendrogram: False

195 changes: 101 additions & 94 deletions src/pandas_profiling/model/correlations.py
@@ -147,103 +147,110 @@ def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:

disable_progress_bar = not config["progress_bar"].get(bool)

with tqdm(total=6, desc="correlations", disable=disable_progress_bar) as pbar:
for correlation_name in ["pearson", "spearman", "kendall"]:
pbar.set_description_str(
"correlations [{correlation_name}]".format(
correlation_name=correlation_name
correlation_names = [
correlation_name
for correlation_name in [
"pearson",
"spearman",
"kendall",
"phi_k",
"cramers",
"recoded",
]
if config["correlations"][correlation_name]["calculate"].get(bool)
]

categorical_correlations = {"cramers": cramers_matrix, "recoded": recoded_matrix}

if len(correlation_names) > 0:
with tqdm(
total=len(correlation_names), desc="correlations", disable=disable_progress_bar
) as pbar:
for correlation_name in correlation_names:
pbar.set_description_str(
"correlations [{correlation_name}]".format(
correlation_name=correlation_name
)
)
)
if config["correlations"][correlation_name]["calculate"].get(bool):
try:
correlation = df.corr(method=correlation_name)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, AssertionError) as e:
warn_correlation(correlation_name, e)
pbar.update()

pbar.set_description_str("correlations [phi_k]")
if config["correlations"]["phi_k"]["calculate"].get(bool):
import phik

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Phi_k does not filter non-numerical with high cardinality
selcols = []
intcols = []
for col in df.columns.tolist():

if correlation_name in ["pearson", "spearman", "kendall"]:
try:
tmp = (
df[col]
.value_counts(dropna=False)
.reset_index()
.dropna()
.set_index("index")
.iloc[:, 0]
)
if tmp.index.inferred_type == "mixed":
continue

if pd.api.types.is_numeric_dtype(df[col]):
intcols.append(col)
selcols.append(col)
elif df[col].nunique() <= config[
"categorical_maximum_correlation_distinct"
].get(int):
selcols.append(col)
except (TypeError, ValueError):
continue

if len(selcols) > 1:
correlation = df.corr(method=correlation_name)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, AssertionError) as e:
warn_correlation(correlation_name, e)
pbar.update()
elif correlation_name in ["phi_k"]:
import phik

with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Phi_k does not filter non-numerical with high cardinality
selcols = []
intcols = []
for col in df.columns.tolist():
try:
tmp = (
df[col]
.value_counts(dropna=False)
.reset_index()
.dropna()
.set_index("index")
.iloc[:, 0]
)
if tmp.index.inferred_type == "mixed":
continue

if pd.api.types.is_numeric_dtype(df[col]):
intcols.append(col)
selcols.append(col)
elif df[col].nunique() <= config[
"categorical_maximum_correlation_distinct"
].get(int):
selcols.append(col)
except (TypeError, ValueError):
continue

if len(selcols) > 1:
try:
correlations["phi_k"] = df[selcols].phik_matrix(
interval_cols=intcols
)

# Only do this if the column_order is set
with suppress(NotFoundError):
# Get the preferred order
column_order = config["column_order"].get(list)

# Get the Phi_k sorted order
current_order = (
correlations["phi_k"]
.index.get_level_values("var1")
.tolist()
)

# Intersection (some columns are not used in correlation)
column_order = [
x for x in column_order if x in current_order
]

# Override the Phi_k sorting
correlations["phi_k"] = correlations["phi_k"].reindex(
index=column_order, columns=column_order
)
except (ValueError, DataError, IndexError, TypeError) as e:
warn_correlation("phi_k", e)
elif correlation_name in ["cramers", "recoded"]:
get_matrix = categorical_correlations[correlation_name]

try:
correlations["phi_k"] = df[selcols].phik_matrix(
interval_cols=intcols
)

# Only do this if the column_order is set
with suppress(NotFoundError):
# Get the preferred order
column_order = config["column_order"].get(list)

# Get the Phi_k sorted order
current_order = (
correlations["phi_k"]
.index.get_level_values("var1")
.tolist()
)

# Intersection (some columns are not used in correlation)
column_order = [
x for x in column_order if x in current_order
]

# Override the Phi_k sorting
correlations["phi_k"] = correlations["phi_k"].reindex(
index=column_order, columns=column_order
)
except (ValueError, DataError, IndexError, TypeError) as e:
warn_correlation("phi_k", e)
pbar.update()

categorical_correlations = {
"cramers": cramers_matrix,
"recoded": recoded_matrix,
}
for correlation_name, get_matrix in categorical_correlations.items():
pbar.set_description_str(
"correlations [{correlation_name}]".format(
correlation_name=correlation_name
)
)
if config["correlations"][correlation_name]["calculate"].get(bool):
try:
correlation = get_matrix(df, variables)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, ZeroDivisionError) as e:
warn_correlation(correlation_name, e)
pbar.update()
correlation = get_matrix(df, variables)
if len(correlation) > 0:
correlations[correlation_name] = correlation
except (ValueError, ZeroDivisionError) as e:
warn_correlation(correlation_name, e)
pbar.update()

return correlations

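In the correlations diff above, the enabled correlation names are collected first and the phi_k branch keeps its column-selection step: numeric columns are used as interval columns, other columns are kept only below a cardinality cap, and mixed-type columns are skipped. A rough standalone sketch of that selection (it substitutes pd.api.types.infer_dtype for the value_counts-based check in the diff; the cap of 100 and the sample DataFrame are illustrative):

```python
import pandas as pd


def select_phik_columns(df: pd.DataFrame, max_distinct: int = 100):
    """Approximate the column filter from the diff: numeric columns are used as
    interval columns; non-numeric columns are kept only when their cardinality
    is at most `max_distinct`; mixed-type columns are skipped."""
    selcols, intcols = [], []
    for col in df.columns:
        try:
            # Stand-in for the value_counts()-based mixed-type check in the diff
            if pd.api.types.infer_dtype(df[col], skipna=True) == "mixed":
                continue
            if pd.api.types.is_numeric_dtype(df[col]):
                intcols.append(col)
                selcols.append(col)
            elif df[col].nunique() <= max_distinct:
                selcols.append(col)
        except (TypeError, ValueError):
            continue
    return selcols, intcols


# Illustrative frame: "age" is numeric (interval), "city" is a low-cardinality categorical.
df = pd.DataFrame({"age": [23, 31, 44], "city": ["a", "b", "a"]})
selcols, intcols = select_phik_columns(df)
print(selcols, intcols)  # ['age', 'city'] ['age']
# With phik installed, the matrix would then be computed as in the diff:
#   df[selcols].phik_matrix(interval_cols=intcols)
```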
26 changes: 15 additions & 11 deletions src/pandas_profiling/model/describe.py
@@ -475,16 +475,20 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
},
}

with tqdm(
total=len(missing_map), desc="missing", disable=disable_progress_bar
) as pbar:
missing = {}
for name, settings in missing_map.items():
pbar.set_description_str("missing [{name}]".format(name=name))
if (
config["missing_diagrams"][name].get(bool)
and table_stats["n_vars_with_missing"] >= settings["min_missing"]
):
missing_map = {
name: settings
for name, settings in missing_map.items()
if config["missing_diagrams"][name].get(bool)
and table_stats["n_vars_with_missing"] >= settings["min_missing"]
}
missing = {}

if len(missing_map) > 0:
with tqdm(
total=len(missing_map), desc="missing", disable=disable_progress_bar
) as pbar:
for name, settings in missing_map.items():
pbar.set_description_str("missing [{name}]".format(name=name))
try:
if name != "heatmap" or (
table_stats["n_vars_with_missing"]
@@ -497,7 +501,7 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
}
except ValueError as e:
warn_missing(name, e)
pbar.update()
pbar.update()
return missing


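get_missing_diagrams now applies the same pre-filtering: missing_map is reduced to the diagrams that are enabled in the config and meet their minimum-missing threshold, and the "missing" progress bar is created only when that reduced map is non-empty. A small sketch under assumed values (the min_missing thresholds and the all-False config mirror minimal mode but are not the package's exact defaults):

```python
# Illustrative settings, mirroring the filter in the diff above.
missing_map = {
    "bar": {"min_missing": 0},
    "matrix": {"min_missing": 0},
    "heatmap": {"min_missing": 2},
    "dendrogram": {"min_missing": 1},
}
# In minimal mode all missing diagrams are disabled (see config_minimal.yaml above).
missing_config = {"bar": False, "matrix": False, "heatmap": False, "dendrogram": False}
n_vars_with_missing = 3

enabled = {
    name: settings
    for name, settings in missing_map.items()
    if missing_config[name] and n_vars_with_missing >= settings["min_missing"]
}
print(enabled)  # {} -> nothing to compute, so no "missing" progress bar is opened
```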