Merge pull request #308 from pegasystems/pl_expr_serialization
polars expression serialization
yusufuyanik1 authored Jan 6, 2025
2 parents ddecffd + 8b016f6 commit 7d2e247
Showing 8 changed files with 195 additions and 206 deletions.
11 changes: 5 additions & 6 deletions python/pdstools/adm/Plots.py
@@ -137,7 +137,7 @@ def get_nonperforming_models(df: pl.LazyFrame):
else:
print(fig.data, i)
return fig
-num_models = df.select(pl.first().count()).collect().item()
+num_models = df.select(pl.first().len()).collect().item()
bottomleft = get_nonperforming_models(df)
newtext = f"{num_models} models: {bottomleft} ({round(bottomleft/num_models*100, 2)}%) at (50,0)"
fig.layout.title.text += f"<br><sup>{newtext}</sup>"
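
Note on the recurring change from count() to len() in this commit: in recent Polars releases, Expr.count() counts only non-null values (and pl.count() itself was deprecated in favor of pl.len()), so pl.first().len() reflects the true frame height even when the first column contains nulls. A minimal sketch of the difference, assuming Polars >= 1.0:

    import polars as pl

    df = pl.LazyFrame({"a": [1, None, 3], "b": [4, 5, 6]})

    # len() counts every row of the first column, nulls included
    print(df.select(pl.first().len()).collect().item())    # 3

    # count() skips nulls, so it can under-report the frame height
    print(df.select(pl.first().count()).collect().item())  # 2
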
@@ -486,7 +486,7 @@ def score_distribution(
)
).sort("BinIndex")

-if df.select(pl.first().count()).collect().item() == 0:
+if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(f"There is no data for the provided modelid {model_id}")

if return_df:
@@ -578,7 +578,7 @@ def predictor_binning(
)
).sort("BinIndex")

-if df.select(pl.first().count()).collect().item() == 0:
+if df.select(pl.first().len()).collect().item() == 0:
raise ValueError(
f"There is no data for the provided modelid {model_id} and predictor {predictor_name}"
)
@@ -1194,7 +1194,7 @@ def binning_lift(
return plot_df

fig = px.bar(
-plot_df.collect(), #.to_pandas(use_pyarrow_extension_array=False),
+plot_df.collect(), # .to_pandas(use_pyarrow_extension_array=False),
x="Lift",
y="BinSymbolAbbreviated",
color="Direction",
@@ -1260,8 +1260,7 @@ def partitioned_plot(
fig.show()
return figs

-
-# TODO I took the propensity distrib plot out of the HC as
+# TODO I took the propensity distrib plot out of the HC as
# it wasn't very clear, also didn't look great visually.

@requires(
126 changes: 6 additions & 120 deletions python/pdstools/adm/Reports.py
@@ -1,20 +1,18 @@
__all__ = ["Reports"]
import logging
-import os
-import re
import shutil
import subprocess
-import sys
from os import PathLike
from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

import polars as pl

from ..utils import cdh_utils
from ..utils.namespaces import LazyNamespace
from ..utils.types import QUERY
from ..prediction import Prediction
+from ..utils.report_utils import _serialize_query, get_quarto_with_version

if TYPE_CHECKING:
from .ADMDatamart import ADMDatamart
@@ -257,7 +255,7 @@ def health_check(
and (self.datamart.predictor_data is not None)
):
model_file_path, predictor_file_path = self.datamart.save_data(temp_dir)
-
+serialized_query = _serialize_query(query)
self.run_quarto(
qmd_file=qmd_file,
output_filename=output_filename,
@@ -267,7 +265,7 @@ def health_check(
"model_file_path": str(model_file_path),
"predictor_file_path": str(predictor_file_path),
"prediction_file_path": str(prediction_file_path),
"query": query,
"query": serialized_query,
"title": title,
"subtitle": subtitle,
},
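
The serialization step is needed because these params are written to params.yml with yaml.dump (see _write_params_files below), and a live pl.Expr does not survive that round trip. The actual _serialize_query implementation lives in pdstools.utils.report_utils and is not part of this diff; the following is only a plausible sketch, with the envelope shape inferred from the _deserialize_query counterpart added to HealthCheck.qmd further down:

    import json

    import polars as pl

    # Hypothetical reconstruction; the real _serialize_query is in
    # pdstools.utils.report_utils and is not shown in this diff.
    def _serialize_query(query):
        if query is None:
            return None
        if isinstance(query, pl.Expr):
            query = [query]
        if isinstance(query, (list, tuple)):  # one or more polars expressions
            expressions = {
                i: json.loads(expr.meta.serialize(format="json"))
                for i, expr in enumerate(query)
            }
            return {"type": "expr_list", "expressions": expressions}
        if isinstance(query, dict):  # a plain column -> allowed-values mapping
            return {"type": "dict", "data": query}
        raise ValueError(f"Unsupported query type: {type(query)}")
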
@@ -331,17 +329,6 @@ def _copy_quarto_file(qmd_file: str, temp_dir: Path) -> None:

shutil.copy(__reports__ / qmd_file, temp_dir)

-# Never used?
-# def _verify_cached_files(self, temp_dir: Path) -> None:
-#     """Verify that cached data files exist."""
-#     modeldata_files = list(temp_dir.glob("cached_modelData*"))
-#     predictordata_files = list(temp_dir.glob("cached_predictorData*"))
-
-#     if not modeldata_files:
-#         raise FileNotFoundError("No cached model data found.")
-#     if not predictordata_files:
-#         logger.warning("No cached predictor data found.")
-
@staticmethod
def _write_params_files(
temp_dir: Path,
@@ -353,15 +340,13 @@ def _write_params_files(
import yaml

# Parameters to python code
-
with open(temp_dir / "params.yml", "w") as f:
yaml.dump(
params,
f,
)

# Project/rendering options to quarto
-
with open(temp_dir / "_quarto.yml", "w") as f:
yaml.dump(
{
@@ -371,103 +356,6 @@ def _write_params_files(
f,
)

-@staticmethod
-def _find_executable(exec_name: str) -> Path:
-    """Find the executable on the system."""
-
-    # First find in path
-    exec_in_path = shutil.which(exec_name)  # pragma: no cover
-    if exec_in_path:  # pragma: no cover
-        return Path(exec_in_path)
-
-    # If not in path try find explicitly. TODO not sure this is wise
-    # maybe we should not try be smart and assume quarto/pandoc are
-    # properly installed.
-
-    if sys.platform == "win32":  # pragma: no cover
-        possible_paths = [
-            Path(
-                os.environ.get("USERPROFILE", ""),
-                "AppData",
-                "Local",
-                "Programs",
-                f"{exec_name}",  # assume windows is still case insensitive (NTFS changes this...)
-                "bin",
-                f"{exec_name}.cmd",
-            ),
-            Path(
-                os.environ.get("PROGRAMFILES", ""),
-                f"{exec_name}",
-                "bin",
-                f"{exec_name}.cmd",
-            ),
-        ]
-    else:  # pragma: no cover
-        possible_paths = [
-            Path(f"/usr/local/bin/{exec_name}"),
-            Path(f"/opt/{exec_name}/bin/{exec_name}"),
-            Path(os.environ.get("HOME", ""), ".local", "bin", exec_name),
-        ]
-
-    for path in possible_paths:
-        if path.exists():
-            return path
-
-    raise FileNotFoundError(
-        "Quarto executable not found. Please ensure Quarto is installed and in the system PATH."
-    )  # pragma: no cover
-
-# TODO not conviced about below. This isn't necessarily the same path resolution
-# as the os does. What's wrong with just assuming quarto is in the path so we can
-# just test for version w code like
-# def get_cmd_output(args):
-#     result = (
-#         subprocess.run(args, stdout=subprocess.PIPE).stdout.decode("utf-8").split("\n")
-#     )
-#     return result
-# get_version_only(get_cmd_output(["quarto", "--version"])[0])
-
-@staticmethod
-def _get_executable_with_version(
-    exec_name: str, verbose: bool = False
-) -> Tuple[Path, str]:
-    def get_version_only(versionstr):
-        return re.sub("[^.0-9]", "", versionstr)
-
-    try:
-        executable = Reports._find_executable(exec_name=exec_name)
-    except FileNotFoundError as e:  # pragma: no cover
-        logger.error(e)
-        raise
-
-    # Check version
-    try:
-        version_result = subprocess.run(
-            [str(executable), "--version"],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        version_string = get_version_only(
-            version_result.stdout.split("\n")[0].strip()
-        )
-        message = f"{exec_name} version: {version_string}"
-        logger.info(message)
-        if verbose:
-            print(message)
-    except subprocess.CalledProcessError as e:  # pragma: no cover
-        logger.warning(f"Failed to check {exec_name} version: {e}")
-
-    return (executable, version_string)
-
-@staticmethod
-def get_quarto_with_version(verbose: bool = True) -> Tuple[Path, str]:
-    return Reports._get_executable_with_version("quarto", verbose=verbose)
-
-@staticmethod
-def get_pandoc_with_version(verbose: bool = True) -> Tuple[Path, str]:
-    return Reports._get_executable_with_version("pandoc", verbose=verbose)
-
@staticmethod
def run_quarto(
qmd_file: str,
@@ -488,7 +376,7 @@ def run_quarto(
analysis=analysis,
)

-quarto_exec, _ = Reports.get_quarto_with_version(verbose)
+quarto_exec, _ = get_quarto_with_version(verbose)

command = [
str(quarto_exec),
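
The executable-discovery helpers deleted above now live in pdstools.utils.report_utils (note the new import at the top of this file). Assuming the relocated get_quarto_with_version keeps the (path, version string) return shape of the removed _get_executable_with_version, usage would look like:

    # Sketch only: assumes the relocated helper preserves the old return shape
    # shown in the deleted code above; not verified against report_utils.
    from pdstools.utils.report_utils import get_quarto_with_version

    quarto_exec, quarto_version = get_quarto_with_version(verbose=True)
    print(f"Rendering with quarto {quarto_version} at {quarto_exec}")
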
@@ -565,9 +453,7 @@ def excel_report(
}

if self.datamart.predictor_data is not None:
tabs["predictors_overview"] = (
self.datamart.aggregates.predictors_overview()
)
tabs["predictors_overview"] = self.datamart.aggregates.predictors_overview()

if predictor_binning and self.datamart.predictor_data is not None:
tabs["predictor_binning"] = self.datamart.aggregates.last(
14 changes: 10 additions & 4 deletions python/pdstools/app/health_check/pages/2_Data_Filters.py
@@ -18,9 +18,14 @@
"Upload Filters You Downloaded Earlier", type=["json"]
)
if uploaded_file:
+import io
+
imported_filters = json.load(uploaded_file)
for key, val in imported_filters.items():
-expr_list.append(pl.Expr.from_json(json.dumps(val)))
+# Convert the JSON string to a StringIO object and specify the format as 'json'
+json_str = json.dumps(val)
+str_io = io.StringIO(json_str)
+expr_list.append(pl.Expr.deserialize(str_io, format="json"))

st.session_state["filters"] = filter_dataframe(
st.session_state["dm"].model_data, queries=expr_list
@@ -33,10 +38,11 @@
filtered_modelid_count, filtered_row_count = model_and_row_counts(
_apply_query(st.session_state["dm"].model_data, st.session_state["filters"])
)
-deserialize_exprs = {}
+serialized_exprs = {}
for i, expr in enumerate(st.session_state["filters"]):
-deserialize_exprs[i] = json.loads(expr.meta.write_json())
-data = json.dumps(deserialize_exprs)
+serialized = expr.meta.serialize(format="json")
+serialized_exprs[i] = json.loads(serialized)
+data = json.dumps(serialized_exprs)
st.download_button(
label="Download Filters",
data=data,
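
This page swaps the removed expr.meta.write_json() / pl.Expr.from_json() pair for the current Polars API: Expr.meta.serialize(format="json") when building the download, and pl.Expr.deserialize(..., format="json") when reading an upload. A minimal round trip mirroring both halves of the page, assuming Polars >= 1.0:

    import io
    import json

    import polars as pl

    expr = pl.col("ResponseCount") > 100

    # Download path: each filter becomes a JSON-compatible dict
    payload = json.dumps({0: json.loads(expr.meta.serialize(format="json"))})

    # Upload path: rebuild every expression from the uploaded JSON
    restored = [
        pl.Expr.deserialize(io.StringIO(json.dumps(val)), format="json")
        for val in json.loads(payload).values()
    ]
    assert restored[0].meta.eq(expr)  # structurally the same expression
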
25 changes: 22 additions & 3 deletions python/pdstools/reports/HealthCheck.qmd
@@ -48,6 +48,8 @@ import polars as pl
import numpy as np
import math
import json
+import io
cdh_guidelines = CDHGuidelines()
@@ -73,6 +75,22 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
.update_xaxes(title=label, showticklabels=True, visible=True)
)
return fig
+def _deserialize_query(serialized_query):
+    if serialized_query is None:
+        return None
+    if serialized_query["type"] == "expr_list":
+        expr_list = []
+        for _, val in serialized_query["expressions"].items():
+            json_str = json.dumps(val)
+            str_io = io.StringIO(json_str)
+            expr_list.append(pl.Expr.deserialize(str_io, format="json"))
+        return expr_list
+    elif serialized_query["type"] == "dict":
+        return serialized_query["data"]
+    raise ValueError(f"Unknown query type: {serialized_query['type']}")
```
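
For reference, _deserialize_query consumes the envelope that _serialize_query produces on the Reports side. An illustrative call for each branch, using the function defined above, with the expr_list payload built inline rather than read from params.yml:

    import json

    import polars as pl

    # "expr_list" branch: serialized expressions keyed by position
    expr = pl.col("Channel") == "Web"
    envelope = {
        "type": "expr_list",
        "expressions": {0: json.loads(expr.meta.serialize(format="json"))},
    }
    restored = _deserialize_query(envelope)  # -> list with one pl.Expr

    # "dict" branch: plain data passes straight through
    plain = _deserialize_query({"type": "dict", "data": {"Channel": ["Web"]}})
    assert plain == {"Channel": ["Web"]}
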

```{python}
@@ -84,7 +102,7 @@ def fig_set_xaxis_modelperformance(fig, label="Model Performance"):
title = "ADM Model Overview"
subtitle = "Sample data"
-# Insert the paths to your data files here to run the notebook from your IDE.
+# Insert the paths to your data files here to run the notebook from your IDE.
# Edit the _quarto.yml to enable/disable specific sections of the quarto output.
# Parameters will be overriden by quarto when a parameters yaml is provided
@@ -116,6 +134,7 @@ if query and query == "None":
query = None
+query = _deserialize_query(query)
responsecount_analysis_query = (
pl.col("ResponseCount") > responsecount_analysis_threshold
)
@@ -1390,7 +1409,7 @@ if datamart.predictor_data is not None:
# The default of observed=False is deprecated...
fig = px.treemap(
-missing,
+missing,
path=path,
color="Percentage without responses",
template="pega",
@@ -1840,4 +1859,4 @@ except Exception as e:
report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")
-```
+```
2 changes: 1 addition & 1 deletion python/pdstools/utils/polars_ext.py
@@ -29,7 +29,7 @@ def sample_it(s: pl.Series, n) -> pl.Series:
)

def height(self):
-return self._ldf.select(pl.first().count()).collect().item()
+return self._ldf.select(pl.first().len()).collect().item()

def shape(self):
return (self.height(), len(self._ldf.columns))
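
A LazyFrame exposes no materialized .height or .shape, so this helper derives the row count from a cheap single-column scan. The same idiom outside the namespace class:

    import polars as pl

    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # Row count without collecting the whole frame: scan only the first column
    height = ldf.select(pl.first().len()).collect().item()
    shape = (height, len(ldf.columns))  # mirrors the shape() helper above
    print(shape)  # (3, 2)
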