From ab90dc12e818460a462addeca1c8cb4321747ce6 Mon Sep 17 00:00:00 2001 From: Otto Perdeck Date: Mon, 16 Dec 2024 15:12:30 +0100 Subject: [PATCH] Added IH data generator --- .../ih/Conversion_Modeling_Reporting.ipynb | 141 +++++------------- examples/ih/ih_helper.py | 82 ++++++++++ python/pdstools/utils/cdh_utils.py | 131 ++++++++-------- 3 files changed, 187 insertions(+), 167 deletions(-) create mode 100644 examples/ih/ih_helper.py diff --git a/examples/ih/Conversion_Modeling_Reporting.ipynb b/examples/ih/Conversion_Modeling_Reporting.ipynb index 8f436adf..3896f3c5 100644 --- a/examples/ih/Conversion_Modeling_Reporting.ipynb +++ b/examples/ih/Conversion_Modeling_Reporting.ipynb @@ -8,7 +8,8 @@ "source": [ "import polars as pl\n", "from pdstools import read_ds_export\n", - "import re\n", + "from pdstools.utils import cdh_utils\n", + "from ih_helper import ih_generator\n", "\n", "import plotly.io as pio\n", "import plotly as plotly\n", @@ -20,95 +21,26 @@ "pio.renderers.default = \"vscode\"\n" ] }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "def capitalize(fields: list) -> list:\n", - " \"\"\"Applies automatic capitalization.\n", - " Parameters\n", - " ----------\n", - " fields : list\n", - " A list of names\n", - "\n", - " Returns\n", - " -------\n", - " fields : list\n", - " The input list, but each value properly capitalized\n", - " \"\"\"\n", - " capitalize_end_words = [\n", - " \"ID\",\n", - " \"Key\",\n", - " \"Name\",\n", - " \"Treatment\",\n", - " \"Count\",\n", - " \"Category\",\n", - " \"Class\",\n", - " \"Time\",\n", - " \"DateTime\",\n", - " \"UpdateTime\",\n", - " \"Version\",\n", - " \"Rate\",\n", - " \"Ratio\",\n", - " \"Negatives\",\n", - " \"Positives\",\n", - " \"Threshold\",\n", - " \"Error\",\n", - " \"Importance\",\n", - " \"Type\",\n", - " \"Percentage\",\n", - " \"Index\",\n", - " \"Symbol\",\n", - " \"ResponseCount\",\n", - " \"ConfigurationName\",\n", - " \"Configuration\",\n", - " 
]\n", - " if not isinstance(fields, list):\n", - " fields = [fields]\n", - " fields_new = [re.sub(\"^p([xyz])\", \"\", field) for field in fields]\n", - " seen = set(fields)\n", - " for i, item in enumerate(fields_new):\n", - " if item in seen:\n", - " fields_new[i] = fields[i]\n", - " for word in capitalize_end_words:\n", - " fields_new = [re.sub(word + '\\b', word, field, flags=re.I) for field in fields_new]\n", - " fields_new = [field[:1].upper() + field[1:] for field in fields_new]\n", - " return fields_new" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "TODO: see if we can generate such data rather than shipping it" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "ih = read_ds_export(\"Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\", path=\".\")\n", + "from pathlib import Path\n", "\n", "# we really only need a few columns\n", "# Outcome outcomes: Conversionm, Impression, Pending\n", - "ih = ih.select([\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"])\n", - "ih.collect()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dframe_columns = ih.collect_schema().names()\n", - "cols = capitalize(dframe_columns)\n", - "ih = ih.rename(dict(map(lambda i, j: (i, j), dframe_columns, cols)))\n", - "ih.collect_schema()" + "ih_cols = [\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"]\n", + "\n", + "ih_export_file = Path(\"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\")\n", + "if not ih_export_file.exists():\n", + " ih = ih_generator().generate(100000).select(ih_cols)\n", + "else:\n", + " ih = read_ds_export(ih_export_file).select(ih_cols)\n", + "ih = cdh_utils._polars_capitalize(ih)\n", + "ih = ih.filter(pl.col('ExperimentGroup').is_not_null())\n", + 
"ih.collect().group_by(\"Outcome\").agg(pl.len())" ] }, { @@ -152,20 +84,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "positive_model_response = [\"Conversion\"]\n", "all_model_response = [\"Impression\", \"Pending\"]\n", "group_by = [\"Day\", \"Month\", \"Year\", \"Quarter\", \"Channel\", \"Issue\", \"Group\", \"Name\", \"ExperimentGroup\"]\n", - "\n", - "ih = ih.filter(pl.col('ExperimentGroup').is_not_null())" + "\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -220,8 +151,7 @@ " (\n", " (pl.col(\"ConversionRate\") * (1 - pl.col(\"ConversionRate\")))\n", " / (pl.col(\"Positives\") + pl.col(\"Negatives\"))\n", - " )\n", - " ** 0.5\n", + " ).sqrt()\n", " )\n", " ).alias(\"StdErr\")\n", " ]\n", @@ -230,13 +160,13 @@ " .collect()\n", " )\n", "\n", - "gauge_data = gauge_data.to_pandas()\n", - "\n", "cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n", "rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n", "\n", - "gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n", - "gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n", + "gauge_data = gauge_data.with_columns(\n", + " pl.concat_str(gauge_group_by).alias('Name'),\n", + " pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n", + " )\n", "\n", "fig = make_subplots(rows=rows,\n", " cols=cols,\n", @@ -247,8 +177,8 @@ " autosize=True,\n", " title='[CONV] Conversion (Channel/Model Type)',\n", " margin=dict(b=10, t=120, l=10, r=10))\n", - "\n", - "for index, row in gauge_data.iterrows():\n", + "index = 0\n", + "for row in gauge_data.iter_rows(named=True):\n", " ref_value = reference.get(row['CName'], None)\n", " gauge = {\n", " 'axis': {'tickformat': ',.2%'},\n", @@ -282,6 +212,8 @@ " trace1,\n", " row=(r + 1), col=(c + 1)\n", " )\n", 
+ " index = index + 1\n", + "\n", "fig.show()\n", "gauge_data" ] @@ -328,7 +260,7 @@ " .collect()\n", " )\n", "\n", - "treemap_data = treemap_data.to_pandas()\n", + "treemap_data = treemap_data\n", "\n", "fig = px.treemap(treemap_data, path=[px.Constant(\"ALL\")] + treemap_group_by, values='Count',\n", " color=\"ConversionRate\",\n", @@ -386,7 +318,7 @@ " .collect()\n", " )\n", "\n", - "line_data = line_data.to_pandas()\n", + "line_data = line_data\n", "\n", "if len(line_data[\"Day\"].unique()) < 30:\n", " fig = px.bar(line_data,\n", @@ -514,13 +446,16 @@ " .collect()\n", " )\n", "\n", - "gauge_data = gauge_data.to_pandas()\n", + "cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n", + "rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n", "\n", "cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n", "rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n", "\n", - "gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n", - "gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n", + "gauge_data = gauge_data.with_columns(\n", + " pl.concat_str(gauge_group_by).alias('Name'),\n", + " pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n", + " )\n", "\n", "fig = make_subplots(rows=rows,\n", " cols=cols,\n", @@ -529,10 +464,10 @@ "fig.update_layout(\n", " height=270 * rows,\n", " autosize=True,\n", - " title='[ENG] Click-through rates (Channel/Model Type)',\n", + " title='[CONV] Conversion (Channel/Model Type)',\n", " margin=dict(b=10, t=120, l=10, r=10))\n", - "\n", - "for index, row in gauge_data.iterrows():\n", + "index = 0\n", + "for row in gauge_data.iter_rows(named=True):\n", " ref_value = reference.get(row['CName'], None)\n", " gauge = {\n", " 'axis': {'tickformat': ',.2%'},\n", @@ -566,6 +501,8 @@ " trace1,\n", " row=(r + 1), col=(c + 1)\n", " )\n", + " index = index + 1\n", + " \n", "fig.show()\n", "gauge_data" ] @@ -605,7 +542,7 
@@ " .collect()\n", " )\n", "\n", - "line_data = line_data.to_pandas()\n", + "line_data = line_data\n", "\n", "if len(line_data[\"Day\"].unique()) < 30:\n", " fig = px.bar(line_data,\n",
diff --git a/examples/ih/ih_helper.py b/examples/ih/ih_helper.py
new file mode 100644
index 00000000..38e8c5c5
--- /dev/null
+++ b/examples/ih/ih_helper.py
@@ -0,0 +1,139 @@
+import datetime
+import random
+import polars as pl
+from pdstools.utils import cdh_utils
+
+# Some day will move into a proper IH class
+
+
+class ih_generator:
+    """Generate synthetic Interaction History (IH) data for the examples.
+
+    Every interaction starts with an impression. A random sample of the
+    impressions is followed by an accept/click, and a random sample of those
+    accepts is followed by a conversion, using a different conversion rate
+    for the test and the control experiment group.
+
+    The class attributes below are the knobs of the simulation.
+    """
+
+    # Impressions are spread evenly over this many days, counting back from now.
+    interactions_period_days = 21
+    # Fraction of the impressions that is followed by an accept/click.
+    accept_rate = 0.2
+    # Delay between impression and accept is uniform in [0, 2 * this value].
+    accept_avg_duration_minutes = 10
+    # Fraction of the accepts that converts, per experiment group.
+    convert_over_accept_rate_test = 0.5
+    convert_over_accept_rate_control = 0.3
+    # Delay between accept and conversion is uniform in [0, 2 * this value].
+    convert_avg_duration_days = 2
+
+    def generate(self, n: int) -> pl.LazyFrame:
+        """Generate fake IH data for ``n`` interactions.
+
+        Parameters
+        ----------
+        n : int
+            Number of interactions (impressions) to simulate. Odd values are
+            supported; the experiment groups simply alternate per interaction.
+
+        Returns
+        -------
+        pl.LazyFrame
+            One row per outcome event, sorted by InteractionID and
+            pxOutcomeTime, with the interaction, channel, issue, group, name,
+            experiment group, outcome and outcome time columns. Events that
+            would happen at or after the moment of generation are left out.
+        """
+        now = datetime.datetime.now()
+        experiment_groups = ["Conversion-Test", "Conversion-Control"]
+
+        def to_prpc_time_str(timestamp):
+            # Keep only the first 15 characters of the Pega date-time string
+            # (presumably the yyyyMMddTHHmmss part; confirm against
+            # cdh_utils.to_prpc_date_time).
+            return cdh_utils.to_prpc_date_time(timestamp)[0:15]
+
+        ih_fake_impressions = pl.DataFrame(
+            {
+                "InteractionID": [str(int(1e9 + i)) for i in range(n)],
+                "TimeStamp": [
+                    now - datetime.timedelta(days=i * self.interactions_period_days / n)
+                    for i in range(n)
+                ],
+                "AcceptDurationMinutes": [
+                    random.uniform(0, 2 * self.accept_avg_duration_minutes)
+                    for _ in range(n)
+                ],
+                "ConvertDurationDays": [
+                    random.uniform(0, 2 * self.convert_avg_duration_days)
+                    for _ in range(n)
+                ],
+                # Only Web for now; adding e.g. "Email" to the choices would
+                # exercise the Pending/Accepted branches below.
+                "pyChannel": random.choices(["Web"], k=n),
+                "pyIssue": "Acquisition",
+                "pyGroup": "Phones",
+                "pyName": "AppleIPhone1564GB",
+                # Alternate the groups per interaction. Unlike repeating the
+                # pair int(n / 2) times, this also yields n values for odd n,
+                # so all columns keep the same length.
+                "ExperimentGroup": [experiment_groups[i % 2] for i in range(n)],
+                "pyOutcome": None,
+            }
+        ).with_columns(
+            pyOutcome=pl.when(pl.col.pyChannel == "Web")
+            .then(pl.lit("Impression"))
+            .otherwise(pl.lit("Pending"))
+        )
+
+        # The unnamed "TimeStamp + duration" expressions keep the TimeStamp
+        # name, so they shift the event time instead of adding a column.
+        ih_fake_accepts = ih_fake_impressions.sample(
+            fraction=self.accept_rate
+        ).with_columns(
+            pl.col.TimeStamp + pl.duration(minutes=pl.col("AcceptDurationMinutes")),
+            pyOutcome=pl.when(pl.col.pyChannel == "Web")
+            .then(pl.lit("Clicked"))
+            .otherwise(pl.lit("Accepted")),
+        )
+        ih_fake_converts_test = (
+            ih_fake_accepts.filter(pl.col.ExperimentGroup == "Conversion-Test")
+            .sample(fraction=self.convert_over_accept_rate_test)
+            .with_columns(
+                pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
+                pyOutcome=pl.lit("Conversion"),
+            )
+        )
+        ih_fake_converts_control = (
+            ih_fake_accepts.filter(pl.col.ExperimentGroup == "Conversion-Control")
+            .sample(fraction=self.convert_over_accept_rate_control)
+            .with_columns(
+                pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
+                pyOutcome=pl.lit("Conversion"),
+            )
+        )
+
+        ih_data = (
+            pl.concat(
+                [
+                    ih_fake_impressions,
+                    ih_fake_accepts,
+                    ih_fake_converts_test,
+                    ih_fake_converts_control,
+                ]
+            )
+            .with_columns(
+                pxOutcomeTime=pl.col("TimeStamp").map_elements(
+                    to_prpc_time_str, return_dtype=pl.String
+                ),
+            )
+            # Drop events that have not happened yet.
+            .filter(pl.col("TimeStamp") < pl.lit(now))
+            .drop(["AcceptDurationMinutes", "ConvertDurationDays", "TimeStamp"])
+            .sort("InteractionID", "pxOutcomeTime")
+            .lazy()
+        )
+
+        return ih_data
\ No newline at end of file
diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py
index 86730a84..f26d728b 100644
--- a/python/pdstools/utils/cdh_utils.py
+++ b/python/pdstools/utils/cdh_utils.py
@@ -216,8 +216,7 @@ def _extract_keys( .alias(c) for c in overlap ] - ) - .drop([f"{c}_decoded" for c in overlap]) + ).drop([f"{c}_decoded" for c in overlap]) ) @@ -488,81 +487,81 @@ def _capitalize(fields: Union[str, Iterable[str]]) -> List[str]: The input list, but each value properly capitalized """ capitalize_endwords = [ - "ID", - "Key", - "Name", - "Treatment", - "Count", + "Active", + "BinNegatives", + "BinPositives", + "BinResponseCount", + "BinSymbol", + "Bins", + "Cap", "Category", "Class", - "Time", + "Component", + "Configuration", + "ConfigurationName", + "Context", + "Control", + "Count", + "Date", "DateTime", - "UpdateTime", - "ToClass", - "Version", - "Predictor", - "Predictors", - "Rate", - "Ratio", - "Negatives", 
- "Positives", - "Threshold", + "Description", + "Email", + "Enabled", "Error", + "Evidence", + "Execution", + "Group", + "GroupIndex", + "Hash", + "ID", + "Identifier", "Importance", - "Type", - "Percentage", "Index", - "Symbol", + "Issue", + "Key", + "Limit", "LowerBound", - "UpperBound", - "Bins", - "GroupIndex", - "ResponseCount", + "Message", + "ModelTechnique", + "Name", + "Negatives", "NegativesPercentage", - "PositivesPercentage", - "BinPositives", - "BinNegatives", - "BinResponseCount", - "BinSymbol", - "ResponseCountPercentage", - "ConfigurationName", - "Configuration", - "SMS", - "Relevant", - "Proposition", - "Active", - "Description", - "Reference", - "Date", + "Offline", + "Omni", + "Outcome", + "Paid", + "Percentage", "Performance", - "Identifier", - "Component", + "Positives", + "PositivesPercentage", "Prediction", - "Outcome", - "Hash", - "URL", - "Cap", - "Template", - "Issue", - "Group", - "Control", - "Evidence", + "Predictor", + "Predictors", "Propensity", - "Paid", - "Subject", - "Email", - "Web", - "Context", - "Limit", + "Proposition", + "Rate", + "Ratio", + "Reference", + "Relevant", + "ResponseCount", + "ResponseCountPercentage", + "SMS", "Stage", - "Omni", - "Execution", - "Enabled", - "Message", - "Offline", - "Update", "Strategy", - "ModelTechnique" + "Subject", + "Symbol", + "Template", + "Threshold", + "Time", + "ToClass", + "Treatment", + "Type", + "URL", + "Update", + "UpdateTime", + "UpperBound", + "Version", + "Web", ] if not isinstance(fields, list): fields = [fields] @@ -806,7 +805,9 @@ def lift_impl(bin_pos, bin_neg, total_pos, total_neg): # TODO not sure how polars (mis)behaves when there are no positives at all # I would hope for a NaN but base python doesn't do that. Polars perhaps. 
# Stijn: It does have proper None value support, may work like you say - bin_pos * (total_pos + total_neg) / ((bin_pos + bin_neg) * total_pos) + bin_pos + * (total_pos + total_neg) + / ((bin_pos + bin_neg) * total_pos) ).alias("Lift") return lift_impl(pos_col, neg_col, pos_col.sum(), neg_col.sum())