Skip to content

Commit

Permalink
Merge pull request #125 from yusufuyanik1/master
Browse files Browse the repository at this point in the history
polars version upgrade
  • Loading branch information
yusufuyanik1 authored Sep 28, 2023
2 parents ab02b16 + 9bfd477 commit 60f40cf
Show file tree
Hide file tree
Showing 17 changed files with 100 additions and 100 deletions.
8 changes: 4 additions & 4 deletions examples/articles/ADMExplained.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
" model_id is None\n",
"):\n",
" display(\n",
" model.groupby(\"ModelID\")\n",
" model.group_by(\"ModelID\")\n",
" .agg(\n",
" number_of_predictors=pl.col(\"PredictorName\").n_unique(),\n",
" model_performance=cdh_utils.weighed_performance_polars() * 100,\n",
Expand Down Expand Up @@ -251,7 +251,7 @@
},
"outputs": [],
"source": [
"display(predictorbinning.groupby(\"PredictorName\").agg(\n",
"display(predictorbinning.group_by(\"PredictorName\").agg(\n",
" pl.first(\"ResponseCount\").cast(pl.Int64).alias(\"# Responses\"),\n",
" pl.n_unique(\"BinIndex\").alias(\"# Bins\"),\n",
" (pl.first(\"PerformanceBin\") * 100).alias(\"Predictor Performance(AUC)\"),\n",
Expand Down Expand Up @@ -665,7 +665,7 @@
"\n",
"df = (\n",
" modelpredictors.filter(pl.col(\"PredictorName\") != \"Classifier\")\n",
" .groupby(\"PredictorName\")\n",
" .group_by(\"PredictorName\")\n",
" .agg(\n",
" Value=pl.when(pl.col(\"Type\").first() == \"numeric\")\n",
" .then(\n",
Expand Down Expand Up @@ -873,7 +873,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.4"
},
"orig_nbformat": 4
},
Expand Down
2 changes: 1 addition & 1 deletion examples/articles/pdstoolsv3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.4"
},
"orig_nbformat": 4
},
Expand Down
6 changes: 3 additions & 3 deletions examples/articles/thompsonsampling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"# Convergence of the Thompson Sampled propensities\n",
"s = thompsonSamplingSimulation['positives']\n",
"thompsonSamplingSimulation2 = thompsonSamplingSimulation.hstack(s.cut(breaks=np.array(range(int(s.min()), int(s.max())+20, 20))-1, series=False).select(bin='category'))\n",
"s = thompsonSamplingSimulation2.groupby(\"p\", \"bin\").agg(\n",
"s = thompsonSamplingSimulation2.group_by(\"p\", \"bin\").agg(\n",
" n=pl.count(),\n",
" n90=(((pl.col(\"sampled_propensity\") - pl.col(\"p\")) / pl.col(\"p\")) < 0.1).sum(),\n",
" positives=pl.min(\"positives\"),\n",
Expand All @@ -98,7 +98,7 @@
").explode('sampled_propensity').with_columns(positives = pl.col('evidence')*pl.col('p'))\n",
"from scipy.stats import gaussian_kde\n",
"results = {}\n",
"for p, series in settings1.groupby('p'):\n",
"for p, series in settings1.group_by('p'):\n",
" results[str(p)] = gaussian_kde(series['sampled_propensity'], 'silverman')(np.arange(0,0.15,0.0001))\n",
"results = pl.DataFrame(results).with_columns(sampledPropensity=pl.Series(np.arange(0,0.15,0.0001))).to_pandas().set_index('sampledPropensity')\n",
"DistributionOfSampled = px.area(results, title='Distribution of the sampled propensities<br><sup>for a few combinations of model propensity and evidence</sup>', template='none', labels={'value':'', 'sampledPropensity':'Sampled Propensity', 'variable':'Propensity'}).update_yaxes({'visible':True}).update_xaxes({'tickformat':',.0%', 'tickmode':'array', 'tickvals':[0, 0.01, 0.05, 0.1]}).update_layout(showlegend=False).update_traces({'line':{'width':0.0}})#.add_annotation()\n",
Expand Down Expand Up @@ -456,7 +456,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.11.4"
},
"orig_nbformat": 4
},
Expand Down
18 changes: 9 additions & 9 deletions python/pdstools/adm/ADMDatamart.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ def discover_modelTypes(
) -> Dict: # pragma: no cover
"""Discovers the type of model embedded in the pyModelData column.
By default, we do a groupby Configuration, because a model rule can only
By default, we group by Configuration, because a model rule can only
contain one type of model. Then, for each configuration, we look into the
pyModelData blob and find the _serialClass, returning it in a dict.
Expand Down Expand Up @@ -791,7 +791,7 @@ def _getType(val):

types = (
df.filter(pl.col("Modeldata").is_not_null())
.groupby(by)
.group_by(by)
.agg(pl.col("Modeldata").last())
.collect()
.with_columns(pl.col("Modeldata").apply(lambda v: _getType(v)))
Expand Down Expand Up @@ -907,7 +907,7 @@ def _create_sign_df(
.alias("Daily_increase")
.over("ModelID")
)
.groupby_dynamic("SnapshotTime", every=every, by=by)
.group_by_dynamic("SnapshotTime", every=every, by=by)
.agg(pl.sum("Daily_increase").alias("Increase"))
)
if pivot:
Expand Down Expand Up @@ -947,7 +947,7 @@ def model_summary(
Returns
-------
pl.LazyFrame:
Groupby dataframe over all models
Grouped (aggregated) dataframe over all models
"""
df = self._apply_query(self.modelData, query)
data = self.last(df, strategy="lazy").lazy()
Expand All @@ -959,7 +959,7 @@ def model_summary(
assert required_columns.issubset(set(data.columns) | set(context_keys))

return (
data.groupby(context_keys)
data.group_by(context_keys)
.agg(
[
pl.count(by).suffix("_count"),
Expand Down Expand Up @@ -1027,7 +1027,7 @@ def pivot_df(
if top_n > 0:
top_n_xaxis = (
df.unique(subset=[by], keep="first")
.groupby(by)
.group_by(by)
.agg(
cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount")
)
Expand All @@ -1037,7 +1037,7 @@ def pivot_df(
)
df = top_n_xaxis.join(df, on=by, how="left")
if by not in ["ModelID", "Name"]:
df = df.groupby([by, "PredictorName"]).agg(
df = df.group_by([by, "PredictorName"]).agg(
cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount")
)
df = (
Expand Down Expand Up @@ -1077,7 +1077,7 @@ def response_gain_df(df: any_frame, by: str = "Channel") -> any_frame:
if isinstance(by, list):
by = by[0]
return (
df.groupby([by, "ModelID"])
df.group_by([by, "ModelID"])
.agg(pl.max("ResponseCount"))
.sort([by, "ResponseCount"], descending=True)
.with_columns(
Expand Down Expand Up @@ -1129,7 +1129,7 @@ def models_by_positives_df(
how="left",
)
.lazy()
.groupby([by, "PositivesBin", "break_point"])
.group_by([by, "PositivesBin", "break_point"])
.agg([pl.min("Positives"), pl.n_unique("ModelID").alias("ModelCount")])
.with_columns(
(pl.col("ModelCount") / (pl.sum("ModelCount").over(by))).alias(
Expand Down
6 changes: 3 additions & 3 deletions python/pdstools/adm/ADMTrees.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def getGroupedGainsPerSplit(self) -> pl.DataFrame:
the mean gains, and the number of times the split is performed.
"""
return (
self.gainsPerSplit.groupby("split", maintain_order=True)
self.gainsPerSplit.group_by("split", maintain_order=True)
.agg(
[
pl.first("predictor"),
Expand Down Expand Up @@ -548,7 +548,7 @@ def plotSplitsPerVariable(self, subset: Optional[Set] = None, show=True):
plt.figure
"""
figlist = []
for name, data in self.gainsPerSplit.groupby("predictor"):
for name, data in self.gainsPerSplit.group_by("predictor"):
if (subset is not None and name in subset) or subset is None:
fig = make_subplots()
fig.add_trace(
Expand Down Expand Up @@ -613,7 +613,7 @@ def getTreeStats(self) -> pl.DataFrame:
def getAllValuesPerSplit(self) -> Dict:
"""Generate a dictionary with the possible values for each split"""
splitvalues = {}
for name, group in self.groupedGainsPerSplit.groupby("predictor"):
for name, group in self.groupedGainsPerSplit.group_by("predictor"):
if name not in splitvalues.keys():
splitvalues[name] = set()
splitvalue = group.get_column("values").to_list()
Expand Down
20 changes: 10 additions & 10 deletions python/pdstools/adm/Tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ def _by(self):
return [
col
for col in columns
if col in self.modelData.columns
and self.modelData.schema[col] != pl.Null
if col in self.modelData.columns and self.modelData.schema[col] != pl.Null
]

@property
def AvailableTables(self):
df = pl.DataFrame(
Expand All @@ -56,7 +56,7 @@ def AvailableTables(self):
df = df.transpose().with_columns(pl.Series(df.columns))
df.columns = ["modelData", "predictorData", "Tables"]
return df.select(["Tables", "modelData", "predictorData"])

@property
def ApplicableTables(self):
df = self.AvailableTables
Expand All @@ -70,7 +70,7 @@ def ApplicableTables(self):
def model_overview(self):
return (
self.last(strategy="lazy")
.groupby(["Configuration", "Channel", "Direction"])
.group_by(["Configuration", "Channel", "Direction"])
.agg(
[
pl.col("Name").unique().count().alias("Number of Actions"),
Expand All @@ -93,7 +93,7 @@ def model_overview(self):
@cached_property
def predictors_per_configuration(self):
return (
self.combinedData.groupby("Configuration")
self.combinedData.group_by("Configuration")
.agg(
[
pl.col("PredictorName").unique().count().alias("Predictor Count"),
Expand All @@ -108,7 +108,7 @@ def predictors_per_configuration(self):
def bad_predictors(self):
return (
self.predictorData.filter(pl.col("PredictorName") != "Classifier")
.groupby("PredictorName")
.group_by("PredictorName")
.agg(
[
pl.sum("ResponseCount").alias("Response Count"),
Expand All @@ -123,7 +123,7 @@ def bad_predictors(self):

@property
def _zero_response(self):
return self.modelData.groupby(self._by).agg(
return self.modelData.group_by(self._by).agg(
[pl.sum("ResponseCount"), pl.sum("Positives"), pl.mean("Performance")]
)

Expand All @@ -139,14 +139,14 @@ def zero_positives(self):
def _last_counts(self):
return (
self.last(strategy="lazy")
.groupby(self._by)
.group_by(self._by)
.agg([pl.sum("ResponseCount"), pl.sum("Positives"), pl.mean("Performance")])
)

@cached_property
def reach(self):
def calc_reach(x=pl.col("Positives")):
return 0.02 + 0.98 * (pl.min([pl.lit(200), x]) / 200)
return 0.02 + 0.98 * (pl.min_horizontal([pl.lit(200), x]) / 200)

return (
self._last_counts.filter(
Expand All @@ -165,7 +165,7 @@ def minimum_performance(self):
@cached_property
def appendix(self):
return (
self.modelData.groupby(self._by + ["ModelID"])
self.modelData.group_by(self._by + ["ModelID"])
.agg(
[
pl.max("ResponseCount").alias("Responses"),
Expand Down
2 changes: 1 addition & 1 deletion python/pdstools/ih/IHAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _metricPerPeriod(

df = (
df.sort(OutcomeTime_col)
.groupby_dynamic(OutcomeTime_col, every=period, by=by)
.group_by_dynamic(OutcomeTime_col, every=period, by=by)
.agg(metrics)
)
if isinstance(df, pl.LazyFrame):
Expand Down
24 changes: 12 additions & 12 deletions python/pdstools/ih/legacy_IH.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ def get_total_outcome(df, outcome, rollup): # pragma: no cover
for i in outcome:
_df = (
df[df["pyOutcome"] == i]
.groupby(rollup)
.group_by(rollup)
.count()[["pxInteractionID"]]
.rename(columns={"pxInteractionID": "Count: " + i})
)
_df_all = pd.concat([_df_all, _df], axis=1)
else:
_df_all = (
df[df["pyOutcome"] == outcome]
.groupby(rollup)
.group_by(rollup)
.count()[["pxInteractionID"]]
.rename(columns={"pxInteractionID": "Count: " + outcome})
)
Expand All @@ -75,14 +75,14 @@ def get_accept_rate(df, pos, neg, rollup):

_df = (
df[df["pyOutcome"].isin(total)]
.groupby(rollup)
.group_by(rollup)
.count()[["pxInteractionID"]]
.reset_index()
.rename(columns={"pxInteractionID": "Total"})
)
_df = _df.merge(
df[df["pyOutcome"].isin(pos)]
.groupby(rollup)
.group_by(rollup)
.count()[["pxInteractionID"]]
.reset_index()
.rename(columns={"pxInteractionID": "Accepted"}),
Expand Down Expand Up @@ -134,8 +134,8 @@ def plot_daily_cumulative_accept_rate(df, pos, neg, **kwargs):
_df, rollup, hue = get_accept_rate_time(df, pos, neg, "Date", **kwargs)

if "hue" in kwargs.keys():
_df["Total_cum"] = _df.groupby(hue)["Total"].apply(lambda x: x.cumsum())
_df["Accepted_cum"] = _df.groupby(hue)["Accepted"].apply(lambda x: x.cumsum())
_df["Total_cum"] = _df.group_by(hue)["Total"].apply(lambda x: x.cumsum())
_df["Accepted_cum"] = _df.group_by(hue)["Accepted"].apply(lambda x: x.cumsum())
_df["hue"] = _df[hue].agg("__".join, axis=1)
kwargs["hue"] = "hue"
else:
Expand Down Expand Up @@ -221,7 +221,7 @@ def plot_outcome_count_time(df, outcome, time, **kwargs):
else:
rollup.append(kwargs["hue"])
hue.append(kwargs["hue"])
_df = _df.groupby(rollup).count().reset_index()
_df = _df.group_by(rollup).count().reset_index()
if len(hue) > 0:
_df["hue"] = _df[hue].agg("__".join, axis=1)
kwargs["hue"] = "hue"
Expand Down Expand Up @@ -261,7 +261,7 @@ def get_allDays_df(_df, inds_df, hue):
def get_total_outcome_share_per_level(df, outcome, level):
_df = (
df[df["pyOutcome"] == outcome]
.groupby(level)
.group_by(level)
.count()[["pxInteractionID"]]
.rename(columns={"pxInteractionID": "Count"})
.reset_index()
Expand Down Expand Up @@ -298,7 +298,7 @@ def get_outcome_share_time(df, outcome, level, time="daily"):

_df = df[df["pyOutcome"] == outcome].reset_index(drop=True)
outcome_per_gra = (
_df.groupby([gra])
_df.group_by([gra])
.count()[["pxInteractionID"]]
.rename(columns={"pxInteractionID": "total " + time + " " + outcome})
.reset_index()
Expand All @@ -309,7 +309,7 @@ def get_outcome_share_time(df, outcome, level, time="daily"):
).rename(columns={"newCol": level})

level_outcome_share_gra = (
_df.groupby([level, gra])
_df.group_by([level, gra])
.count()[["pxInteractionID"]]
.rename(columns={"pxInteractionID": level + " " + outcome + " Count"})
.reset_index()
Expand Down Expand Up @@ -368,14 +368,14 @@ def get_delta_df(df, outcome, level, dates):
total_range_outcomes = (
share_delta[["Date", "Date Range", "total daily " + outcome]]
.drop_duplicates()
.groupby("Date Range")
.group_by("Date Range")
.sum()
.reset_index()
.rename(columns={"total daily " + outcome: "total range " + outcome})
)
share_delta = (
share_delta.drop("total daily " + outcome, axis=1)
.groupby([level, "Date Range"])
.group_by([level, "Date Range"])
.sum()
.reset_index()
)
Expand Down
Loading

0 comments on commit 60f40cf

Please sign in to comment.