From ab90dc12e818460a462addeca1c8cb4321747ce6 Mon Sep 17 00:00:00 2001
From: Otto Perdeck <otto.perdeck@pega.com>
Date: Mon, 16 Dec 2024 15:12:30 +0100
Subject: [PATCH] Added IH data generator

---
 .../ih/Conversion_Modeling_Reporting.ipynb    | 141 +++++-------------
 examples/ih/ih_helper.py                      |  82 ++++++++++
 python/pdstools/utils/cdh_utils.py            | 131 ++++++++--------
 3 files changed, 187 insertions(+), 167 deletions(-)
 create mode 100644 examples/ih/ih_helper.py

diff --git a/examples/ih/Conversion_Modeling_Reporting.ipynb b/examples/ih/Conversion_Modeling_Reporting.ipynb
index 8f436adf..3896f3c5 100644
--- a/examples/ih/Conversion_Modeling_Reporting.ipynb
+++ b/examples/ih/Conversion_Modeling_Reporting.ipynb
@@ -8,7 +8,8 @@
    "source": [
     "import polars as pl\n",
     "from pdstools import read_ds_export\n",
-    "import re\n",
+    "from pdstools.utils import cdh_utils\n",
+    "from ih_helper import ih_generator\n",
     "\n",
     "import plotly.io as pio\n",
     "import plotly as plotly\n",
@@ -20,95 +21,26 @@
     "pio.renderers.default = \"vscode\"\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def capitalize(fields: list) -> list:\n",
-    "    \"\"\"Applies automatic capitalization.\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    fields : list\n",
-    "        A list of names\n",
-    "\n",
-    "    Returns\n",
-    "    -------\n",
-    "    fields : list\n",
-    "        The input list, but each value properly capitalized\n",
-    "    \"\"\"\n",
-    "    capitalize_end_words = [\n",
-    "        \"ID\",\n",
-    "        \"Key\",\n",
-    "        \"Name\",\n",
-    "        \"Treatment\",\n",
-    "        \"Count\",\n",
-    "        \"Category\",\n",
-    "        \"Class\",\n",
-    "        \"Time\",\n",
-    "        \"DateTime\",\n",
-    "        \"UpdateTime\",\n",
-    "        \"Version\",\n",
-    "        \"Rate\",\n",
-    "        \"Ratio\",\n",
-    "        \"Negatives\",\n",
-    "        \"Positives\",\n",
-    "        \"Threshold\",\n",
-    "        \"Error\",\n",
-    "        \"Importance\",\n",
-    "        \"Type\",\n",
-    "        \"Percentage\",\n",
-    "        \"Index\",\n",
-    "        \"Symbol\",\n",
-    "        \"ResponseCount\",\n",
-    "        \"ConfigurationName\",\n",
-    "        \"Configuration\",\n",
-    "    ]\n",
-    "    if not isinstance(fields, list):\n",
-    "        fields = [fields]\n",
-    "    fields_new = [re.sub(\"^p([xyz])\", \"\", field) for field in fields]\n",
-    "    seen = set(fields)\n",
-    "    for i, item in enumerate(fields_new):\n",
-    "        if item in seen:\n",
-    "            fields_new[i] = fields[i]\n",
-    "    for word in capitalize_end_words:\n",
-    "        fields_new = [re.sub(word + '\\b', word, field, flags=re.I) for field in fields_new]\n",
-    "        fields_new = [field[:1].upper() + field[1:] for field in fields_new]\n",
-    "    return fields_new"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "TODO: see if we can generate such data rather than shipping it"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "ih = read_ds_export(\"Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\", path=\".\")\n",
+    "from pathlib import Path\n",
     "\n",
     "# we really only need a few columns\n",
     "# Outcome outcomes: Conversionm, Impression, Pending\n",
-    "ih = ih.select([\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"])\n",
-    "ih.collect()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dframe_columns = ih.collect_schema().names()\n",
-    "cols = capitalize(dframe_columns)\n",
-    "ih = ih.rename(dict(map(lambda i, j: (i, j), dframe_columns, cols)))\n",
-    "ih.collect_schema()"
+    "ih_cols = [\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"]\n",
+    "\n",
+    "ih_export_file = Path(\"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\")\n",
+    "if not ih_export_file.exists():\n",
+    "    ih = ih_generator().generate(100000).select(ih_cols)\n",
+    "else:\n",
+    "    ih = read_ds_export(ih_export_file).select(ih_cols)\n",
+    "ih = cdh_utils._polars_capitalize(ih)\n",
+    "ih = ih.filter(pl.col('ExperimentGroup').is_not_null())\n",
+    "ih.collect().group_by(\"Outcome\").agg(pl.len())"
    ]
   },
   {
@@ -152,20 +84,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "positive_model_response = [\"Conversion\"]\n",
     "all_model_response = [\"Impression\", \"Pending\"]\n",
     "group_by = [\"Day\", \"Month\", \"Year\", \"Quarter\", \"Channel\", \"Issue\", \"Group\", \"Name\", \"ExperimentGroup\"]\n",
-    "\n",
-    "ih = ih.filter(pl.col('ExperimentGroup').is_not_null())"
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -220,8 +151,7 @@
     "                        (\n",
     "                            (pl.col(\"ConversionRate\") * (1 - pl.col(\"ConversionRate\")))\n",
     "                            / (pl.col(\"Positives\") + pl.col(\"Negatives\"))\n",
-    "                        )\n",
-    "                        ** 0.5\n",
+    "                        ).sqrt()\n",
     "                    )\n",
     "                ).alias(\"StdErr\")\n",
     "            ]\n",
@@ -230,13 +160,13 @@
     "        .collect()\n",
     "    )\n",
     "\n",
-    "gauge_data = gauge_data.to_pandas()\n",
-    "\n",
     "cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
     "rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
     "\n",
-    "gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n",
-    "gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n",
+    "gauge_data = gauge_data.with_columns(\n",
+    "     pl.concat_str(gauge_group_by).alias('Name'),\n",
+    "     pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n",
+    "    )\n",
     "\n",
     "fig = make_subplots(rows=rows,\n",
     "                        cols=cols,\n",
@@ -247,8 +177,8 @@
     "        autosize=True,\n",
     "        title='[CONV] Conversion (Channel/Model Type)',\n",
     "        margin=dict(b=10, t=120, l=10, r=10))\n",
-    "\n",
-    "for index, row in gauge_data.iterrows():\n",
+    "index = 0\n",
+    "for row in gauge_data.iter_rows(named=True):\n",
     "        ref_value = reference.get(row['CName'], None)\n",
     "        gauge = {\n",
     "            'axis': {'tickformat': ',.2%'},\n",
@@ -282,6 +212,8 @@
     "            trace1,\n",
     "            row=(r + 1), col=(c + 1)\n",
     "        )\n",
+    "        index = index + 1\n",
+    "\n",
     "fig.show()\n",
     "gauge_data"
    ]
@@ -328,7 +260,7 @@
     "        .collect()\n",
     "    )\n",
     "\n",
-    "treemap_data = treemap_data.to_pandas()\n",
+    "treemap_data = treemap_data\n",
     "\n",
     "fig = px.treemap(treemap_data, path=[px.Constant(\"ALL\")] + treemap_group_by, values='Count',\n",
     "                     color=\"ConversionRate\",\n",
@@ -386,7 +318,7 @@
     "        .collect()\n",
     "    )\n",
     "\n",
-    "line_data = line_data.to_pandas()\n",
+    "line_data = line_data\n",
     "\n",
     "if len(line_data[\"Day\"].unique()) < 30:\n",
     "        fig = px.bar(line_data,\n",
@@ -514,13 +446,16 @@
     "        .collect()\n",
     "    )\n",
     "\n",
-    "gauge_data = gauge_data.to_pandas()\n",
+    "cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
+    "rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
     "\n",
     "cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
     "rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
     "\n",
-    "gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n",
-    "gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n",
+    "gauge_data = gauge_data.with_columns(\n",
+    "     pl.concat_str(gauge_group_by).alias('Name'),\n",
+    "     pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n",
+    "    )\n",
     "\n",
     "fig = make_subplots(rows=rows,\n",
     "                        cols=cols,\n",
@@ -529,10 +464,10 @@
     "fig.update_layout(\n",
     "        height=270 * rows,\n",
     "        autosize=True,\n",
-    "        title='[ENG] Click-through rates (Channel/Model Type)',\n",
+    "        title='[CONV] Conversion (Channel/Model Type)',\n",
     "        margin=dict(b=10, t=120, l=10, r=10))\n",
-    "\n",
-    "for index, row in gauge_data.iterrows():\n",
+    "index = 0\n",
+    "for row in gauge_data.iter_rows(named=True):\n",
     "        ref_value = reference.get(row['CName'], None)\n",
     "        gauge = {\n",
     "            'axis': {'tickformat': ',.2%'},\n",
@@ -566,6 +501,8 @@
     "            trace1,\n",
     "            row=(r + 1), col=(c + 1)\n",
     "        )\n",
+    "        index = index + 1\n",
+    "        \n",
     "fig.show()\n",
     "gauge_data"
    ]
@@ -605,7 +542,7 @@
     "        .collect()\n",
     "    )\n",
     "\n",
-    "line_data = line_data.to_pandas()\n",
+    "line_data = line_data\n",
     "\n",
     "if len(line_data[\"Day\"].unique()) < 30:\n",
     "        fig = px.bar(line_data,\n",
diff --git a/examples/ih/ih_helper.py b/examples/ih/ih_helper.py
new file mode 100644
index 00000000..38e8c5c5
--- /dev/null
+++ b/examples/ih/ih_helper.py
@@ -0,0 +1,82 @@
+import datetime
+import random
+import polars as pl
+from pdstools.utils import cdh_utils
+
+# TODO: move this generator into a proper IH class some day
+
+class ih_generator:
+    """Generates synthetic Interaction History (IH) data for the IH examples."""
+
+    interactions_period_days = 21  # impressions are spread over this many days
+    accept_rate = 0.2  # fraction of impressions sampled as accepted/clicked
+    accept_avg_duration_minutes = 10  # accept delay is uniform in [0, 2 * this]
+    convert_over_accept_rate_test = 0.5  # fraction of test group accepts sampled
+    convert_over_accept_rate_control = 0.3  # fraction of control group accepts sampled
+    convert_avg_duration_days = 2  # conversion delay is uniform in [0, 2 * this]
+
+    def generate(self, n):
+        """Simulates n impressions plus sampled accept and conversion outcomes.
+
+        Parameters
+        ----------
+        n : int
+            The number of impressions to simulate
+
+        Returns
+        -------
+        pl.LazyFrame
+            All outcomes dated before now, sorted by InteractionID and pxOutcomeTime
+        """
+        now = datetime.datetime.now()
+
+        def to_prpc_time_str(timestamp):
+            # keeps only the first 15 characters of the formatted date-time string
+            return cdh_utils.to_prpc_date_time(timestamp)[0:15]
+
+        ih_fake_impressions = pl.DataFrame(
+            {
+                "InteractionID": [str(int(1e9 + i)) for i in range(n)],
+                "TimeStamp": [
+                    now - datetime.timedelta(days=i * self.interactions_period_days / n) for i in range(n)
+                ],
+                "AcceptDurationMinutes": [
+                    random.uniform(0, 2 * self.accept_avg_duration_minutes) for _ in range(n)
+                ],
+                "ConvertDurationDays": [
+                    random.uniform(0, 2 * self.convert_avg_duration_days) for _ in range(n)
+                ],
+                "pyChannel": random.choices(["Web"], k=n), # random.choices(["Web", "Email"], k=n),
+                "pyIssue": "Acquisition",
+                "pyGroup": "Phones",
+                "pyName": "AppleIPhone1564GB",
+                # alternate per row so the column has exactly n values, also for odd n
+                "ExperimentGroup": [("Conversion-Test", "Conversion-Control")[i % 2] for i in range(n)],
+                "pyOutcome": None,
+            }
+        ).with_columns(
+            pyOutcome=pl.when(pl.col.pyChannel == "Web")
+            .then(pl.lit("Impression"))
+            .otherwise(pl.lit("Pending"))
+        )
+        ih_fake_accepts = ih_fake_impressions.sample(fraction=self.accept_rate).with_columns(
+            pl.col.TimeStamp + pl.duration(minutes=pl.col("AcceptDurationMinutes")),
+            pyOutcome=pl.when(pl.col.pyChannel == "Web")
+            .then(pl.lit("Clicked"))
+            .otherwise(pl.lit("Accepted")),
+        )
+
+        def converts(group, rate):
+            # samples a fraction of one experiment group's accepts as conversions
+            return ih_fake_accepts.filter(pl.col.ExperimentGroup == group).sample(fraction=rate).with_columns(
+                pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
+                pyOutcome=pl.lit("Conversion"),
+            )
+
+        test = converts("Conversion-Test", self.convert_over_accept_rate_test)
+        control = converts("Conversion-Control", self.convert_over_accept_rate_control)
+        return pl.concat([ih_fake_impressions, ih_fake_accepts, test, control]).with_columns(
+            pxOutcomeTime=pl.col("TimeStamp").map_elements(to_prpc_time_str, return_dtype=pl.String),
+        ).filter(pl.col("TimeStamp") < pl.lit(now)).drop(
+            ["AcceptDurationMinutes", "ConvertDurationDays", "TimeStamp"]
+        ).sort("InteractionID", "pxOutcomeTime").lazy()
\ No newline at end of file
diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py
index 86730a84..f26d728b 100644
--- a/python/pdstools/utils/cdh_utils.py
+++ b/python/pdstools/utils/cdh_utils.py
@@ -216,8 +216,7 @@ def _extract_keys(
                 .alias(c)
                 for c in overlap
             ]
-        )
-        .drop([f"{c}_decoded" for c in overlap])
+        ).drop([f"{c}_decoded" for c in overlap])
     )
 
 
@@ -488,81 +487,81 @@ def _capitalize(fields: Union[str, Iterable[str]]) -> List[str]:
         The input list, but each value properly capitalized
     """
     capitalize_endwords = [
-        "ID",
-        "Key",
-        "Name",
-        "Treatment",
-        "Count",
+        "Active",
+        "BinNegatives",
+        "BinPositives",
+        "BinResponseCount",
+        "BinSymbol",
+        "Bins",
+        "Cap",
         "Category",
         "Class",
-        "Time",
+        "Component",
+        "Configuration",
+        "ConfigurationName",
+        "Context",
+        "Control",
+        "Count",
+        "Date",
         "DateTime",
-        "UpdateTime",
-        "ToClass",
-        "Version",
-        "Predictor",
-        "Predictors",
-        "Rate",
-        "Ratio",
-        "Negatives",
-        "Positives",
-        "Threshold",
+        "Description",
+        "Email",
+        "Enabled",
         "Error",
+        "Evidence",
+        "Execution",
+        "Group",
+        "GroupIndex",
+        "Hash",
+        "ID",
+        "Identifier",
         "Importance",
-        "Type",
-        "Percentage",
         "Index",
-        "Symbol",
+        "Issue",
+        "Key",
+        "Limit",
         "LowerBound",
-        "UpperBound",
-        "Bins",
-        "GroupIndex",
-        "ResponseCount",
+        "Message",
+        "ModelTechnique",
+        "Name",
+        "Negatives",
         "NegativesPercentage",
-        "PositivesPercentage",
-        "BinPositives",
-        "BinNegatives",
-        "BinResponseCount",
-        "BinSymbol",
-        "ResponseCountPercentage",
-        "ConfigurationName",
-        "Configuration",
-        "SMS",
-        "Relevant",
-        "Proposition",
-        "Active",
-        "Description",
-        "Reference",
-        "Date",
+        "Offline",
+        "Omni",
+        "Outcome",
+        "Paid",
+        "Percentage",
         "Performance",
-        "Identifier",
-        "Component",
+        "Positives",
+        "PositivesPercentage",
         "Prediction",
-        "Outcome",
-        "Hash",
-        "URL",
-        "Cap",
-        "Template",
-        "Issue",
-        "Group",
-        "Control",
-        "Evidence",
+        "Predictor",
+        "Predictors",
         "Propensity",
-        "Paid",
-        "Subject",
-        "Email",
-        "Web",
-        "Context",
-        "Limit",
+        "Proposition",
+        "Rate",
+        "Ratio",
+        "Reference",
+        "Relevant",
+        "ResponseCount",
+        "ResponseCountPercentage",
+        "SMS",
         "Stage",
-        "Omni",
-        "Execution",
-        "Enabled",
-        "Message",
-        "Offline",
-        "Update",
         "Strategy",
-        "ModelTechnique"
+        "Subject",
+        "Symbol",
+        "Template",
+        "Threshold",
+        "Time",
+        "ToClass",
+        "Treatment",
+        "Type",
+        "URL",
+        "Update",
+        "UpdateTime",
+        "UpperBound",
+        "Version",
+        "Web",
     ]
     if not isinstance(fields, list):
         fields = [fields]
@@ -806,7 +805,9 @@ def lift_impl(bin_pos, bin_neg, total_pos, total_neg):
             # TODO not sure how polars (mis)behaves when there are no positives at all
             # I would hope for a NaN but base python doesn't do that. Polars perhaps.
             # Stijn: It does have proper None value support, may work like you say
-            bin_pos * (total_pos + total_neg) / ((bin_pos + bin_neg) * total_pos)
+            bin_pos
+            * (total_pos + total_neg)
+            / ((bin_pos + bin_neg) * total_pos)
         ).alias("Lift")
 
     return lift_impl(pos_col, neg_col, pos_col.sum(), neg_col.sum())