Commit
Added IH data generator
operdeck committed Dec 16, 2024
1 parent 28af59e commit ab90dc1
Showing 3 changed files with 187 additions and 167 deletions.
141 changes: 39 additions & 102 deletions examples/ih/Conversion_Modeling_Reporting.ipynb
@@ -8,7 +8,8 @@
"source": [
"import polars as pl\n",
"from pdstools import read_ds_export\n",
"import re\n",
"from pdstools.utils import cdh_utils\n",
"from ih_helper import ih_generator\n",
"\n",
"import plotly.io as pio\n",
"import plotly as plotly\n",
@@ -20,95 +21,26 @@
"pio.renderers.default = \"vscode\"\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def capitalize(fields: list) -> list:\n",
" \"\"\"Applies automatic capitalization.\n",
" Parameters\n",
" ----------\n",
" fields : list\n",
" A list of names\n",
"\n",
" Returns\n",
" -------\n",
" fields : list\n",
" The input list, but each value properly capitalized\n",
" \"\"\"\n",
" capitalize_end_words = [\n",
" \"ID\",\n",
" \"Key\",\n",
" \"Name\",\n",
" \"Treatment\",\n",
" \"Count\",\n",
" \"Category\",\n",
" \"Class\",\n",
" \"Time\",\n",
" \"DateTime\",\n",
" \"UpdateTime\",\n",
" \"Version\",\n",
" \"Rate\",\n",
" \"Ratio\",\n",
" \"Negatives\",\n",
" \"Positives\",\n",
" \"Threshold\",\n",
" \"Error\",\n",
" \"Importance\",\n",
" \"Type\",\n",
" \"Percentage\",\n",
" \"Index\",\n",
" \"Symbol\",\n",
" \"ResponseCount\",\n",
" \"ConfigurationName\",\n",
" \"Configuration\",\n",
" ]\n",
" if not isinstance(fields, list):\n",
" fields = [fields]\n",
" fields_new = [re.sub(\"^p([xyz])\", \"\", field) for field in fields]\n",
" seen = set(fields)\n",
" for i, item in enumerate(fields_new):\n",
" if item in seen:\n",
" fields_new[i] = fields[i]\n",
" for word in capitalize_end_words:\n",
" fields_new = [re.sub(word + '\\b', word, field, flags=re.I) for field in fields_new]\n",
" fields_new = [field[:1].upper() + field[1:] for field in fields_new]\n",
" return fields_new"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TODO: see if we can generate such data rather than shipping it"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ih = read_ds_export(\"Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\", path=\".\")\n",
"from pathlib import Path\n",
"\n",
"# we really only need a few columns\n",
"# Outcome outcomes: Conversionm, Impression, Pending\n",
"ih = ih.select([\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"])\n",
"ih.collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dframe_columns = ih.collect_schema().names()\n",
"cols = capitalize(dframe_columns)\n",
"ih = ih.rename(dict(map(lambda i, j: (i, j), dframe_columns, cols)))\n",
"ih.collect_schema()"
"ih_cols = [\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"]\n",
"\n",
"ih_export_file = Path(\"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\")\n",
"if not ih_export_file.exists():\n",
" ih = ih_generator().generate(100000).select(ih_cols)\n",
"else:\n",
" ih = read_ds_export(ih_export_file).select(ih_cols)\n",
"ih = cdh_utils._polars_capitalize(ih)\n",
"ih = ih.filter(pl.col('ExperimentGroup').is_not_null())\n",
"ih.collect().group_by(\"Outcome\").agg(pl.len())"
]
},
{
@@ -152,20 +84,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"positive_model_response = [\"Conversion\"]\n",
"all_model_response = [\"Impression\", \"Pending\"]\n",
"group_by = [\"Day\", \"Month\", \"Year\", \"Quarter\", \"Channel\", \"Issue\", \"Group\", \"Name\", \"ExperimentGroup\"]\n",
"\n",
"ih = ih.filter(pl.col('ExperimentGroup').is_not_null())"
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -220,8 +151,7 @@
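" # standard error of a proportion: sqrt(p * (1 - p) / n)\n",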
" (\n",
" (pl.col(\"ConversionRate\") * (1 - pl.col(\"ConversionRate\")))\n",
" / (pl.col(\"Positives\") + pl.col(\"Negatives\"))\n",
" )\n",
" ** 0.5\n",
" ).sqrt()\n",
" )\n",
" ).alias(\"StdErr\")\n",
" ]\n",
@@ -230,13 +160,13 @@
" .collect()\n",
" )\n",
"\n",
"gauge_data = gauge_data.to_pandas()\n",
"\n",
"cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
"rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
"\n",
"gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n",
"gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n",
"gauge_data = gauge_data.with_columns(\n",
" pl.concat_str(gauge_group_by).alias('Name'),\n",
" pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n",
" )\n",
"\n",
"fig = make_subplots(rows=rows,\n",
" cols=cols,\n",
@@ -247,8 +177,8 @@
" autosize=True,\n",
" title='[CONV] Conversion (Channel/Model Type)',\n",
" margin=dict(b=10, t=120, l=10, r=10))\n",
"\n",
"for index, row in gauge_data.iterrows():\n",
"index = 0\n",
"for row in gauge_data.iter_rows(named=True):\n",
" ref_value = reference.get(row['CName'], None)\n",
" gauge = {\n",
" 'axis': {'tickformat': ',.2%'},\n",
Expand Down Expand Up @@ -282,6 +212,8 @@
" trace1,\n",
" row=(r + 1), col=(c + 1)\n",
" )\n",
" index = index + 1\n",
"\n",
"fig.show()\n",
"gauge_data"
]
@@ -328,7 +260,7 @@
" .collect()\n",
" )\n",
"\n",
"treemap_data = treemap_data.to_pandas()\n",
"treemap_data = treemap_data\n",
"\n",
"fig = px.treemap(treemap_data, path=[px.Constant(\"ALL\")] + treemap_group_by, values='Count',\n",
" color=\"ConversionRate\",\n",
@@ -386,7 +318,7 @@
" .collect()\n",
" )\n",
"\n",
"line_data = line_data.to_pandas()\n",
"line_data = line_data\n",
"\n",
"if len(line_data[\"Day\"].unique()) < 30:\n",
" fig = px.bar(line_data,\n",
@@ -514,13 +446,16 @@
" .collect()\n",
" )\n",
"\n",
"gauge_data = gauge_data.to_pandas()\n",
"cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
"rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
"\n",
"cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
"rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
"\n",
"gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n",
"gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n",
"gauge_data = gauge_data.with_columns(\n",
" pl.concat_str(gauge_group_by).alias('Name'),\n",
" pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n",
" )\n",
"\n",
"fig = make_subplots(rows=rows,\n",
" cols=cols,\n",
Expand All @@ -529,10 +464,10 @@
"fig.update_layout(\n",
" height=270 * rows,\n",
" autosize=True,\n",
" title='[ENG] Click-through rates (Channel/Model Type)',\n",
" title='[CONV] Conversion (Channel/Model Type)',\n",
" margin=dict(b=10, t=120, l=10, r=10))\n",
"\n",
"for index, row in gauge_data.iterrows():\n",
"index = 0\n",
"for row in gauge_data.iter_rows(named=True):\n",
" ref_value = reference.get(row['CName'], None)\n",
" gauge = {\n",
" 'axis': {'tickformat': ',.2%'},\n",
Expand Down Expand Up @@ -566,6 +501,8 @@
" trace1,\n",
" row=(r + 1), col=(c + 1)\n",
" )\n",
" index = index + 1\n",
" \n",
"fig.show()\n",
"gauge_data"
]
@@ -605,7 +542,7 @@
" .collect()\n",
" )\n",
"\n",
"line_data = line_data.to_pandas()\n",
"line_data = line_data\n",
"\n",
"if len(line_data[\"Day\"].unique()) < 30:\n",
" fig = px.bar(line_data,\n",
82 changes: 82 additions & 0 deletions examples/ih/ih_helper.py
@@ -0,0 +1,82 @@
import datetime
import random
import polars as pl
from pdstools.utils import cdh_utils

# Some day this will move into a proper IH class

class ih_generator:
    interactions_period_days = 21
    accept_rate = 0.2
    accept_avg_duration_minutes = 10
    convert_over_accept_rate_test = 0.5
    convert_over_accept_rate_control = 0.3
    convert_avg_duration_days = 2
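    # With these defaults, generate(100000) yields roughly: 100k impressions
    # split evenly over test/control, ~20k accepts (20% of impressions),
    # ~5k test conversions (50% of ~10k test accepts) and ~3k control
    # conversions (30% of ~10k control accepts), minus the few events whose
    # timestamp falls past "now" and is filtered out below.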

    def generate(self, n):
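        """Return a lazy polars frame of n synthetic impressions plus the accepts and conversions derived from them."""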
        now = datetime.datetime.now()


        def _interpolate(min, max, i, n):
            return min + (max - min) * i / (n - 1)


        def to_prpc_time_str(timestamp):
            return cdh_utils.to_prpc_date_time(timestamp)[0:15]


        ih_fake_impressions = pl.DataFrame(
            {
                "InteractionID": [str(int(1e9 + i)) for i in range(n)],
                "TimeStamp": [
                    (now - datetime.timedelta(days=i * self.interactions_period_days / n))
                    for i in range(n)
                ],
                "AcceptDurationMinutes": [
                    random.uniform(0, 2 * self.accept_avg_duration_minutes) for i in range(n)
                ],
                "ConvertDurationDays": [
                    random.uniform(0, 2 * self.convert_avg_duration_days) for i in range(n)
                ],
                "pyChannel": random.choices(["Web"], k=n),  # random.choices(["Web", "Email"], k=n),
                "pyIssue": "Acquisition",
                "pyGroup": "Phones",
                "pyName": "AppleIPhone1564GB",
                "ExperimentGroup": ["Conversion-Test", "Conversion-Control"] * int(n / 2),
"pyOutcome": None,
}
).with_columns(
pyOutcome=pl.when(pl.col.pyChannel == "Web")
.then(pl.lit("Impression"))
.otherwise(pl.lit("Pending"))
)
        ih_fake_accepts = ih_fake_impressions.sample(fraction=self.accept_rate).with_columns(
            pl.col.TimeStamp + pl.duration(minutes=pl.col("AcceptDurationMinutes")),
            pyOutcome=pl.when(pl.col.pyChannel == "Web")
            .then(pl.lit("Clicked"))
            .otherwise(pl.lit("Accepted")),
        )
        ih_fake_converts_test = ih_fake_accepts.filter(
            pl.col.ExperimentGroup == "Conversion-Test"
        ).sample(fraction=self.convert_over_accept_rate_test).with_columns(
            pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
            pyOutcome=pl.lit("Conversion"),
        )
        ih_fake_converts_control = ih_fake_accepts.filter(
            pl.col.ExperimentGroup == "Conversion-Control"
        ).sample(fraction=self.convert_over_accept_rate_control).with_columns(
            pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
            pyOutcome=pl.lit("Conversion"),
        )

        ih_data = (
            pl.concat(
                [ih_fake_impressions, ih_fake_accepts, ih_fake_converts_test, ih_fake_converts_control]
            )
            .with_columns(
                pxOutcomeTime=pl.col("TimeStamp").map_elements(
                    to_prpc_time_str, return_dtype=pl.String
                ),
            )
            .filter(pl.col("TimeStamp") < pl.lit(now))
            .drop(["AcceptDurationMinutes", "ConvertDurationDays", "TimeStamp"])
            .sort("InteractionID", "pxOutcomeTime")
            .lazy()
        )

        return ih_data
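For reference, a minimal sketch of how the notebook above exercises this generator (the row count, column list and follow-up calls mirror the changed notebook cell; the final print and its comment are illustrative, not part of the commit):

import polars as pl
from pdstools.utils import cdh_utils
from ih_helper import ih_generator

ih_cols = ["pyOutcome", "pxOutcomeTime", "pyChannel", "pyIssue", "pyGroup", "pyName", "ExperimentGroup"]

# generate 100k synthetic interactions and keep only the columns the report needs
ih = ih_generator().generate(100000).select(ih_cols)
ih = cdh_utils._polars_capitalize(ih)  # normalizes names, e.g. pyOutcome -> Outcome
ih = ih.filter(pl.col("ExperimentGroup").is_not_null())

# outcome counts; with the Web-only default: Impression >> Clicked >> Conversion
print(ih.collect().group_by("Outcome").agg(pl.len()))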