Commit
Added IH data generator
operdeck committed Dec 16, 2024
1 parent 28af59e commit ab90dc1
Showing 3 changed files with 187 additions and 167 deletions.
141 changes: 39 additions & 102 deletions examples/ih/Conversion_Modeling_Reporting.ipynb
@@ -8,7 +8,8 @@
"source": [
"import polars as pl\n",
"from pdstools import read_ds_export\n",
"import re\n",
"from pdstools.utils import cdh_utils\n",
"from ih_helper import ih_generator\n",
"\n",
"import plotly.io as pio\n",
"import plotly as plotly\n",
@@ -20,95 +21,26 @@
"pio.renderers.default = \"vscode\"\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def capitalize(fields: list) -> list:\n",
" \"\"\"Applies automatic capitalization.\n",
" Parameters\n",
" ----------\n",
" fields : list\n",
" A list of names\n",
"\n",
" Returns\n",
" -------\n",
" fields : list\n",
" The input list, but each value properly capitalized\n",
" \"\"\"\n",
" capitalize_end_words = [\n",
" \"ID\",\n",
" \"Key\",\n",
" \"Name\",\n",
" \"Treatment\",\n",
" \"Count\",\n",
" \"Category\",\n",
" \"Class\",\n",
" \"Time\",\n",
" \"DateTime\",\n",
" \"UpdateTime\",\n",
" \"Version\",\n",
" \"Rate\",\n",
" \"Ratio\",\n",
" \"Negatives\",\n",
" \"Positives\",\n",
" \"Threshold\",\n",
" \"Error\",\n",
" \"Importance\",\n",
" \"Type\",\n",
" \"Percentage\",\n",
" \"Index\",\n",
" \"Symbol\",\n",
" \"ResponseCount\",\n",
" \"ConfigurationName\",\n",
" \"Configuration\",\n",
" ]\n",
" if not isinstance(fields, list):\n",
" fields = [fields]\n",
" fields_new = [re.sub(\"^p([xyz])\", \"\", field) for field in fields]\n",
" seen = set(fields)\n",
" for i, item in enumerate(fields_new):\n",
" if item in seen:\n",
" fields_new[i] = fields[i]\n",
" for word in capitalize_end_words:\n",
" fields_new = [re.sub(word + '\\b', word, field, flags=re.I) for field in fields_new]\n",
" fields_new = [field[:1].upper() + field[1:] for field in fields_new]\n",
" return fields_new"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TODO: see if we can generate such data rather than shipping it"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ih = read_ds_export(\"Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\", path=\".\")\n",
"from pathlib import Path\n",
"\n",
"# we really only need a few columns\n",
"# Outcome outcomes: Conversionm, Impression, Pending\n",
"ih = ih.select([\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"])\n",
"ih.collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dframe_columns = ih.collect_schema().names()\n",
"cols = capitalize(dframe_columns)\n",
"ih = ih.rename(dict(map(lambda i, j: (i, j), dframe_columns, cols)))\n",
"ih.collect_schema()"
"ih_cols = [\"pyOutcome\", \"pxOutcomeTime\", \"pyChannel\", \"pyIssue\", \"pyGroup\", \"pyName\", \"ExperimentGroup\"]\n",
"\n",
"ih_export_file = Path(\"./Data-pxStrategyResult_InteractionFiles_20241213T091932_GMT.zip\")\n",
"if not ih_export_file.exists():\n",
" ih = ih_generator().generate(100000).select(ih_cols)\n",
"else:\n",
" ih = read_ds_export(ih_export_file).select(ih_cols)\n",
"ih = cdh_utils._polars_capitalize(ih)\n",
"ih = ih.filter(pl.col('ExperimentGroup').is_not_null())\n",
"ih.collect().group_by(\"Outcome\").agg(pl.len())"
]
},
{
@@ -152,20 +84,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"positive_model_response = [\"Conversion\"]\n",
"all_model_response = [\"Impression\", \"Pending\"]\n",
"group_by = [\"Day\", \"Month\", \"Year\", \"Quarter\", \"Channel\", \"Issue\", \"Group\", \"Name\", \"ExperimentGroup\"]\n",
"\n",
"ih = ih.filter(pl.col('ExperimentGroup').is_not_null())"
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -220,8 +151,7 @@
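" # standard error of a proportion: sqrt(p * (1 - p) / n)\n",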
" (\n",
" (pl.col(\"ConversionRate\") * (1 - pl.col(\"ConversionRate\")))\n",
" / (pl.col(\"Positives\") + pl.col(\"Negatives\"))\n",
" )\n",
" ** 0.5\n",
" ).sqrt()\n",
" )\n",
" ).alias(\"StdErr\")\n",
" ]\n",
@@ -230,13 +160,13 @@
" .collect()\n",
" )\n",
"\n",
"gauge_data = gauge_data.to_pandas()\n",
"\n",
"cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
"rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
"\n",
"gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n",
"gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n",
"gauge_data = gauge_data.with_columns(\n",
" pl.concat_str(gauge_group_by).alias('Name'),\n",
" pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n",
" )\n",
"\n",
"fig = make_subplots(rows=rows,\n",
" cols=cols,\n",
@@ -247,8 +177,8 @@
" autosize=True,\n",
" title='[CONV] Conversion (Channel/Model Type)',\n",
" margin=dict(b=10, t=120, l=10, r=10))\n",
"\n",
"for index, row in gauge_data.iterrows():\n",
"index = 0\n",
"for row in gauge_data.iter_rows(named=True):\n",
" ref_value = reference.get(row['CName'], None)\n",
" gauge = {\n",
" 'axis': {'tickformat': ',.2%'},\n",
Expand Down Expand Up @@ -282,6 +212,8 @@
" trace1,\n",
" row=(r + 1), col=(c + 1)\n",
" )\n",
" index = index + 1\n",
"\n",
"fig.show()\n",
"gauge_data"
]
@@ -328,7 +260,7 @@
" .collect()\n",
" )\n",
"\n",
"treemap_data = treemap_data.to_pandas()\n",
"treemap_data = treemap_data\n",
"\n",
"fig = px.treemap(treemap_data, path=[px.Constant(\"ALL\")] + treemap_group_by, values='Count',\n",
" color=\"ConversionRate\",\n",
@@ -386,7 +318,7 @@
" .collect()\n",
" )\n",
"\n",
"line_data = line_data.to_pandas()\n",
"line_data = line_data\n",
"\n",
"if len(line_data[\"Day\"].unique()) < 30:\n",
" fig = px.bar(line_data,\n",
@@ -514,13 +446,16 @@
" .collect()\n",
" )\n",
"\n",
"gauge_data = gauge_data.to_pandas()\n",
"cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
"rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
"\n",
"cols = gauge_data[gauge_group_by[0]].unique().shape[0]\n",
"rows = gauge_data[gauge_group_by[1]].unique().shape[0]\n",
"\n",
"gauge_data['Name'] = gauge_data[gauge_group_by].apply(lambda r: ' '.join(r.values.astype(str)), axis=1)\n",
"gauge_data['CName'] = gauge_data[gauge_group_by].apply(lambda r: '_'.join(r.values.astype(str)), axis=1)\n",
"gauge_data = gauge_data.with_columns(\n",
" pl.concat_str(gauge_group_by).alias('Name'),\n",
" pl.concat_str(gauge_group_by, separator = \"_\").alias('CName'),\n",
" )\n",
"\n",
"fig = make_subplots(rows=rows,\n",
" cols=cols,\n",
Expand All @@ -529,10 +464,10 @@
"fig.update_layout(\n",
" height=270 * rows,\n",
" autosize=True,\n",
" title='[ENG] Click-through rates (Channel/Model Type)',\n",
" title='[CONV] Conversion (Channel/Model Type)',\n",
" margin=dict(b=10, t=120, l=10, r=10))\n",
"\n",
"for index, row in gauge_data.iterrows():\n",
"index = 0\n",
"for row in gauge_data.iter_rows(named=True):\n",
" ref_value = reference.get(row['CName'], None)\n",
" gauge = {\n",
" 'axis': {'tickformat': ',.2%'},\n",
Expand Down Expand Up @@ -566,6 +501,8 @@
" trace1,\n",
" row=(r + 1), col=(c + 1)\n",
" )\n",
" index = index + 1\n",
" \n",
"fig.show()\n",
"gauge_data"
]
@@ -605,7 +542,7 @@
" .collect()\n",
" )\n",
"\n",
"line_data = line_data.to_pandas()\n",
"line_data = line_data\n",
"\n",
"if len(line_data[\"Day\"].unique()) < 30:\n",
" fig = px.bar(line_data,\n",
82 changes: 82 additions & 0 deletions examples/ih/ih_helper.py
@@ -0,0 +1,82 @@
import datetime
import random
import polars as pl
from pdstools.utils import cdh_utils

# Some day this will move into a proper IH class

class ih_generator:
    interactions_period_days = 21
    accept_rate = 0.2
    accept_avg_duration_minutes = 10
    convert_over_accept_rate_test = 0.5
    convert_over_accept_rate_control = 0.3
    convert_avg_duration_days = 2
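    # With these defaults, generate(100000) yields roughly: 100k impressions
    # split evenly over test/control, ~20k accepts (20% of impressions),
    # ~5k test conversions (50% of ~10k test accepts) and ~3k control
    # conversions (30% of ~10k control accepts), minus the few events whose
    # timestamp falls past "now" and is filtered out below.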

    def generate(self, n):
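        """Return a lazy polars frame of n synthetic impressions plus the accepts and conversions derived from them."""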
        now = datetime.datetime.now()


        def _interpolate(min, max, i, n):
            return min + (max - min) * i / (n - 1)


        def to_prpc_time_str(timestamp):
            return cdh_utils.to_prpc_date_time(timestamp)[0:15]


        ih_fake_impressions = pl.DataFrame(
            {
                "InteractionID": [str(int(1e9 + i)) for i in range(n)],
                "TimeStamp": [
                    (now - datetime.timedelta(days=i * self.interactions_period_days / n))
                    for i in range(n)
                ],
                "AcceptDurationMinutes": [
                    random.uniform(0, 2 * self.accept_avg_duration_minutes) for i in range(n)
                ],
                "ConvertDurationDays": [
                    random.uniform(0, 2 * self.convert_avg_duration_days) for i in range(n)
                ],
                "pyChannel": random.choices(["Web"], k=n),  # random.choices(["Web", "Email"], k=n),
                "pyIssue": "Acquisition",
                "pyGroup": "Phones",
                "pyName": "AppleIPhone1564GB",
                "ExperimentGroup": ["Conversion-Test", "Conversion-Control"] * int(n / 2),
"pyOutcome": None,
}
).with_columns(
pyOutcome=pl.when(pl.col.pyChannel == "Web")
.then(pl.lit("Impression"))
.otherwise(pl.lit("Pending"))
)
        ih_fake_accepts = ih_fake_impressions.sample(fraction=self.accept_rate).with_columns(
            pl.col.TimeStamp + pl.duration(minutes=pl.col("AcceptDurationMinutes")),
            pyOutcome=pl.when(pl.col.pyChannel == "Web")
            .then(pl.lit("Clicked"))
            .otherwise(pl.lit("Accepted")),
        )
        ih_fake_converts_test = ih_fake_accepts.filter(
            pl.col.ExperimentGroup == "Conversion-Test"
        ).sample(fraction=self.convert_over_accept_rate_test).with_columns(
            pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
            pyOutcome=pl.lit("Conversion"),
        )
        ih_fake_converts_control = ih_fake_accepts.filter(
            pl.col.ExperimentGroup == "Conversion-Control"
        ).sample(fraction=self.convert_over_accept_rate_control).with_columns(
            pl.col.TimeStamp + pl.duration(days=pl.col("ConvertDurationDays")),
            pyOutcome=pl.lit("Conversion"),
        )

        ih_data = (
            pl.concat(
                [ih_fake_impressions, ih_fake_accepts, ih_fake_converts_test, ih_fake_converts_control]
            )
            .with_columns(
                pxOutcomeTime=pl.col("TimeStamp").map_elements(
                    to_prpc_time_str, return_dtype=pl.String
                ),
            )
            .filter(pl.col("TimeStamp") < pl.lit(now))
            .drop(["AcceptDurationMinutes", "ConvertDurationDays", "TimeStamp"])
            .sort("InteractionID", "pxOutcomeTime")
            .lazy()
        )

        return ih_data
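For reference, a minimal sketch of how the notebook above exercises this generator (the row count, column list and follow-up calls mirror the changed notebook cell; the final print and its comment are illustrative, not part of the commit):

import polars as pl
from pdstools.utils import cdh_utils
from ih_helper import ih_generator

ih_cols = ["pyOutcome", "pxOutcomeTime", "pyChannel", "pyIssue", "pyGroup", "pyName", "ExperimentGroup"]

# generate 100k synthetic interactions and keep only the columns the report needs
ih = ih_generator().generate(100000).select(ih_cols)
ih = cdh_utils._polars_capitalize(ih)  # normalizes names, e.g. pyOutcome -> Outcome
ih = ih.filter(pl.col("ExperimentGroup").is_not_null())

# outcome counts; with the Web-only default: Impression >> Clicked >> Conversion
print(ih.collect().group_by("Outcome").agg(pl.len()))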