Merge pull request #39 from amosproj/feature/evp-skeleton

Implemented the EVP skeleton class. Issue #22
amosproj · Nov 3, 2023 · f3c4d14 · f3c4d14
2 parents 42d0169 + b587b22
commit f3c4d14
Show file tree

Hide file tree

Showing 19 changed files with 1,187 additions and 8 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -46,24 +46,22 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
       - name: Set up Python 3.10
         uses: actions/setup-python@v4
         with:
           python-version: "3.10"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install flake8 pytest pipenv
+          pip install pipenv
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-          pipenv install
+          pipenv install --dev
       - name: Lint with flake8
         run: |
           # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          pipenv run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          pipenv run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
       - name: Test with pytest
         run: |
           pipenv run pytest
diff --git a/.gitignore b/.gitignore
@@ -21,7 +21,7 @@ pids
 # Python
 *.pyc
 __pycache__/
-Pipfile.lock
+# Pipfile.lock
 
 # Jupyter Notebook
 .ipynb_checkpoints

diff --git a/LICENSES/CC-BY-4.0.txt b/LICENSES/CC-BY-4.0.txt
diff --git a/Pipfile b/Pipfile
@@ -7,10 +7,15 @@ verify_ssl = true
 name = "pypi"
 
 [dev-packages]
-pytest = "==7.4.3"
+pytest = "*"
+pre-commit = "*"
+flake8 = "*"
 
 [packages]
 numpy = "==1.26.1"
+scikit-learn = "==1.3.2"
+pydantic = "==2.4.2"
+email-validator = "==2.1.0"
 
 [requires]
 python_version = "3.10"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/Pipfile.lock.license b/Pipfile.lock.license
diff --git a/src/database/__init__.py b/src/database/__init__.py
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+from .database_dummy import DatabaseDummy
+
+_database = None
+
+
+def get_database() -> DatabaseDummy:
+    """Return the shared ``DatabaseDummy`` instance, creating it on first call.
+
+    The instance is cached in the module-level ``_database`` variable, so every
+    caller receives the same object.
+    """
+    global _database
+    if _database is not None:
+        return _database
+    _database = DatabaseDummy()
+    return _database
diff --git a/src/database/database_dummy.py b/src/database/database_dummy.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+import json
+from typing import List
+
+from database.models import Lead
+from database.parsers import LeadParser
+
+
+class DatabaseDummy:
+    """In-memory stand-in for the lead database, backed by ``dummy_leads.json``.
+
+    The raw JSON entries are kept in ``self.data`` keyed by their ``lead_id``
+    and are parsed into ``Lead`` models on every read.
+    """
+
+    def __init__(self) -> None:
+        """Load the ``training_leads`` list from the JSON file next to this module."""
+        # Imported locally so the module's import block stays untouched.
+        from pathlib import Path
+
+        # Resolve the data file relative to this module instead of the current
+        # working directory, so loading works wherever Python is started from.
+        leads_file = Path(__file__).with_name("dummy_leads.json")
+        with open(leads_file, encoding="utf-8") as f:
+            json_data = json.load(f)["training_leads"]
+        self.data = {d["lead_id"]: d for d in json_data}
+
+    def get_lead_by_id(self, id_: int) -> Lead:
+        """Return the lead stored under ``id_``.
+
+        Raises:
+            KeyError: if no entry with that id was loaded.
+        """
+        return LeadParser.parse_lead_from_dict(self.data[id_])
+
+    def get_all_leads(self) -> List[Lead]:
+        """Return every loaded entry parsed into a ``Lead``."""
+        return [LeadParser.parse_lead_from_dict(entry) for entry in self.data.values()]
+
+    def update_lead(self, lead: Lead):
+        """Print the update that would be written; ``self.data`` is not modified."""
+        print(f"Updating database entry for lead#{lead.lead_id}")
+        print(f"Update values: {lead}")
diff --git a/src/database/dummy_leads.json b/src/database/dummy_leads.json
@@ -0,0 +1,59 @@
+{
+  "training_leads": [
+    {
+      "lead_id": 0,
+      "annual_income": 25000,
+      "product_of_interest": "Terminals",
+      "first_name": "Anton",
+      "last_name": "Kerner",
+      "phone_number": "49176123123",
+      "email_address": "[email protected]",
+      "customer_probability": 0.1,
+      "life_time_value": 400000
+    },
+    {
+      "lead_id": 1,
+      "annual_income": 70000,
+      "product_of_interest": "Terminals",
+      "first_name": "Anton",
+      "last_name": "Kerner",
+      "phone_number": "49176123123",
+      "email_address": "[email protected]",
+      "customer_probability": 0.4,
+      "life_time_value": 40000
+    },
+    {
+      "lead_id": 2,
+      "annual_income": 15000,
+      "product_of_interest": "Terminals",
+      "first_name": "Anton",
+      "last_name": "Kerner",
+      "phone_number": "49176123123",
+      "email_address": "[email protected]",
+      "customer_probability": 0.8,
+      "life_time_value": 40000
+    },
+    {
+      "lead_id": 3,
+      "annual_income": 2500000,
+      "product_of_interest": "Terminals",
+      "first_name": "Anton",
+      "last_name": "Kerner",
+      "phone_number": "49176123123",
+      "email_address": "[email protected]",
+      "customer_probability": 0.08,
+      "life_time_value": 400000
+    },
+    {
+      "lead_id": 4,
+      "annual_income": 1200,
+      "product_of_interest": "Terminals",
+      "first_name": "Anton",
+      "last_name": "Kerner",
+      "phone_number": "49176123123",
+      "email_address": "[email protected]",
+      "customer_probability": 0.9,
+      "life_time_value": 3400.23
+    }
+  ]
+}
diff --git a/src/database/dummy_leads.json.license b/src/database/dummy_leads.json.license
@@ -0,0 +1,2 @@
+SPDX-License-Identifier: CC-BY-4.0
+SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
diff --git a/src/database/models.py b/src/database/models.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+from enum import Enum, IntEnum
+from typing import List, Optional
+
+from pydantic import BaseModel, EmailStr, Field
+
+
+class AnnualIncome(IntEnum):
+    """Annual income bracket of a lead.
+
+    Each member's integer value is the smallest whole-euro income that falls
+    into the bracket; the trailing comments give the full interval.
+    """
+
+    Nothing = 0  # 0€
+    Class1 = 1  # (0€, 35000€]
+    Class2 = 35001  # (35000€, 60000€]
+    Class3 = 60001  # (60000€, 100000€]
+    Class4 = 100001  # (100000€, 200000€]
+    Class5 = 200001  # (200000€, 400000€]
+    Class6 = 400001  # (400000€, 600000€]
+    Class7 = 600001  # (600000€, 1000000€]
+    Class8 = 1000001  # (1000000€, 2000000€]
+    Class9 = 2000001  # (2000000€, 5000000€]
+    Class10 = 5000001  # (5000000€, inf€]
+
+
+class ProductOfInterest(str, Enum):
+    """Product a lead is interested in.
+
+    Members are also ``str`` instances, so a member compares equal to its
+    string value and can be looked up from it with ``ProductOfInterest(value)``.
+    """
+
+    Nothing = "Nothing"
+    Terminals = "Terminals"
+    CashRegisterSystem = "Cash Register System"
+    BusinessAccount = "Business Account"
+    All = "All"
+    Other = "Other"
+
+
+class LeadValue(BaseModel):
+    """Value estimate for a lead: a life time value and a conversion probability."""
+
+    # must be non-negative
+    life_time_value: float = Field(..., ge=0)
+    # must lie in the closed interval [0, 1]
+    customer_probability: float = Field(..., ge=0, le=1)
+
+    def get_lead_value(self) -> float:
+        """Return the life time value weighted by the customer probability."""
+        weighted_value = self.life_time_value * self.customer_probability
+        return weighted_value
+
+
+class Lead(BaseModel):
+    """A single lead with contact details, income bracket and optional value estimate."""
+
+    lead_id: int  # could be expanded to a UUID later
+    first_name: str
+    last_name: str
+    email_address: EmailStr
+    phone_number: str
+    annual_income: AnnualIncome
+    product_of_interest: ProductOfInterest
+    # May be None, but has no default: under pydantic 2 it must still be passed explicitly.
+    lead_value: Optional[LeadValue]
diff --git a/src/database/parsers.py b/src/database/parsers.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+from typing import Dict
+
+from database.models import AnnualIncome, Lead, LeadValue, ProductOfInterest
+
+
+class LeadParser:
+    """Builds ``Lead`` models from raw dictionary entries."""
+
+    @staticmethod
+    def parse_lead_from_dict(data: Dict) -> Lead:
+        """Convert one raw lead entry into a ``Lead``.
+
+        ``customer_probability`` and ``life_time_value`` are optional keys; a
+        ``LeadValue`` is attached only when both are present and not None.
+        The numeric ``annual_income`` is mapped to the highest ``AnnualIncome``
+        bracket whose value it reaches.
+
+        Raises:
+            KeyError: if one of the mandatory keys is missing from ``data``.
+        """
+        probability = data.get("customer_probability")
+        value = data.get("life_time_value")
+
+        lead_value = None
+        if probability is not None and value is not None:
+            lead_value = LeadValue(
+                life_time_value=value,
+                customer_probability=probability,
+            )
+
+        # Brackets iterate in ascending order: keep the last one the income
+        # reaches and stop at the first one it falls short of.
+        bracket = AnnualIncome.Nothing
+        for candidate in AnnualIncome:
+            if data["annual_income"] < candidate:
+                break
+            bracket = candidate
+
+        fields = {
+            "lead_id": data["lead_id"],
+            "first_name": data["first_name"],
+            "last_name": data["last_name"],
+            "email_address": data["email_address"],
+            "phone_number": data["phone_number"],
+            "annual_income": bracket,
+            "product_of_interest": ProductOfInterest(data["product_of_interest"]),
+            "lead_value": lead_value,
+        }
+        return Lead(**fields)
diff --git a/src/evp/__init__.py b/src/evp/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
diff --git a/src/evp/evp.py b/src/evp/evp.py
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+from database import get_database
+from database.models import LeadValue
+
+
+class EstimatedValuePredictor:
+    """Skeleton predictor for a lead's customer probability and life time value.
+
+    Two linear regressions are fitted on a one-hot encoding of the leads
+    returned by the database (one feature column per lead), so predictions
+    are only available for leads that were present at construction time.
+    """
+
+    def __init__(self) -> None:
+        """Fit both regressors on all leads currently in the database.
+
+        Every lead is expected to carry a ``lead_value``; a lead without one
+        raises ``AttributeError`` here.
+        """
+        self.probability_predictor = LinearRegression()
+        self.life_time_value_predictor = LinearRegression()
+
+        all_leads = get_database().get_all_leads()
+        # Remember which one-hot column belongs to which lead, so prediction
+        # does not rely on a fixed lead count or on ids being exactly 0..n-1.
+        self._feature_index = {
+            lead.lead_id: column for column, lead in enumerate(all_leads)
+        }
+        X = np.identity(len(all_leads))
+        y_probability = np.array(
+            [lead.lead_value.customer_probability for lead in all_leads]
+        )
+        y_value = np.array([lead.lead_value.life_time_value for lead in all_leads])
+
+        self.probability_predictor.fit(X, y_probability)
+        self.life_time_value_predictor.fit(X, y_value)
+
+    def estimate_value(self, lead_id) -> LeadValue:
+        """Predict the ``LeadValue`` for ``lead_id`` and pass the lead to the database.
+
+        Returns:
+            The predicted ``LeadValue``, which is also set on the lead that is
+            handed to ``update_lead``.
+
+        Raises:
+            KeyError: if the lead is unknown to the database or was not part
+                of the leads the models were fitted on.
+        """
+        # make call to database to retrieve relevant fields for this lead
+        lead = get_database().get_lead_by_id(lead_id)
+
+        # preprocess lead_data to get feature vector for our ML model
+        feature_vector = np.zeros((1, len(self._feature_index)))
+        feature_vector[0, self._feature_index[lead.lead_id]] = 1.0
+
+        # use the models to predict required values; predict() returns one
+        # entry per sample, so take the single scalar out of each result
+        lead_value_pred = float(
+            self.life_time_value_predictor.predict(feature_vector)[0]
+        )
+        raw_probability = float(self.probability_predictor.predict(feature_vector)[0])
+        # manually applying sigmoid to ensure value in range 0, 1
+        cust_prob_pred = float(1 / (1 + np.exp(-raw_probability)))
+
+        lead.lead_value = LeadValue(
+            life_time_value=lead_value_pred, customer_probability=cust_prob_pred
+        )
+        get_database().update_lead(lead)
+
+        # might not need to return here if the database is updated by this function
+        return lead.lead_value
diff --git a/src/evp_demo.py b/src/evp_demo.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+from database import get_database
+from evp.evp import EstimatedValuePredictor
+
+# id of the dummy lead the demo prediction is run for
+lead_id = 1
+
+# fetch the lead itself so its stored fields can be printed below
+lead = get_database().get_lead_by_id(lead_id)
+
+# constructing the predictor fits its models; estimate_value returns a LeadValue
+evp = EstimatedValuePredictor()
+lead_value = evp.estimate_value(lead_id)
+
+# print the lead, both predicted values and their product
+print(
+    f"""
+    Dummy prediction for lead#{lead.lead_id}:
+
+    Lead:
+    {lead}
+
+    This lead has a predicted probability of {lead_value.customer_probability:.2f} to become a customer.
+    This lead has a predicted life time value of {lead_value.life_time_value:.2f}.
+
+    This results in a total lead value of {lead_value.get_lead_value():.2f}.
+"""
+)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+import json
+from typing import Dict
+
+import pytest
+
+
+@pytest.fixture
+def create_lead_dict(request) -> Dict:
+    """Yield a baseline lead dict with the entries of ``request.param`` applied on top.
+
+    ``request.param`` must be a mapping; its keys override baseline keys of the
+    same name or are added as additional entries.
+    """
+    overrides = request.param
+    lead_data = {
+        "lead_id": 0,
+        "annual_income": 0,
+        "product_of_interest": "Nothing",
+        "first_name": "Manu",
+        "last_name": "Musterperson",
+        "phone_number": "49123123123",
+        "email_address": "[email protected]",
+    }
+    lead_data.update(overrides)
+    yield lead_data
diff --git a/src/test_dummy.py → tests/test_dummy.py b/src/test_dummy.py → tests/test_dummy.py
diff --git a/tests/test_evp.py b/tests/test_evp.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Felix Zailskas <[email protected]>
+
+import os
+import sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")))
+
+from database import get_database
+from database.models import LeadValue
+from evp.evp import EstimatedValuePredictor
+
+
+def test_estimate_value():
+    """Every lead in the database yields a ``LeadValue`` from the predictor."""
+    leads = get_database().get_all_leads()
+    evp = EstimatedValuePredictor()
+    for lead in leads:
+        value = evp.estimate_value(lead.lead_id)
+        # isinstance is the idiomatic type check and also accepts subclasses
+        assert isinstance(value, LeadValue)