Merge pull request #238 from amosproj/dev
Final sprint release
ultiwinter authored Feb 7, 2024
2 parents 31bdcae + 4a6ca72 commit 34576ab
Showing 79 changed files with 2,979 additions and 1,543 deletions.
10 changes: 2 additions & 8 deletions .env.template
@@ -9,15 +9,9 @@
GOOGLE_PLACES_API_KEY=
OPEN_AI_API_KEY=

DB_USER=
DB_PASSWORD=
DB_CONNECTION=

FACEBOOK_APP_ID=
FACEBOOK_APP_SECRET=
OPEN_AI_API_KEY=

# Need to be set when 'DATABASE_TYPE' is 'S3'
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=

# Choose between 'Local' and 'S3'
DATABASE_TYPE=
39 changes: 39 additions & 0 deletions .github/workflows/documentation.yml
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <[email protected]>

name: documentation

on: [push, pull_request, workflow_dispatch]

permissions:
contents: write

jobs:
docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pipenv
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pipenv install --dev
- name: Generate Sphinx
run: |
cd src/docs
pipenv run sphinx-apidoc -o . ..
pipenv run make clean
pipenv run make html
- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@v3
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
with:
publish_branch: gh-pages
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: src/docs/_build/html/
force_orphan: true
13 changes: 12 additions & 1 deletion .gitignore
@@ -53,9 +53,17 @@ bin/
!**/data/merged_geo.geojson
**/data/reviews/*.json
**/data/gpt-results/*.json
**/data/models/*
**/data/models/*.pkl
**/data/models/*.joblib
**/data/classification_reports/*

**/docs/*
!**/docs/conf.py
!**/docs/index.rst
!**/docs/make.bat
!**/docs/Makefile
!**/docs/readme_link.md

# Env files
*.env

@@ -70,3 +78,6 @@ report.pdf
**/cache/*

!.gitkeep

# testing
.coverage
16 changes: 0 additions & 16 deletions Dockerfile

This file was deleted.

61 changes: 61 additions & 0 deletions Documentation/SBOM_generator.md
@@ -0,0 +1,61 @@
# Automatic SBOM generation
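
The following PowerShell session generates a CycloneDX SBOM from the packages actually imported in the `.py` files (via `pipreqs`), joins in license information from `pip-licenses`, and exports the result as `SBOM.csv` with an accompanying license file: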

```powershell
pipenv install
pipenv shell

pip install pipreqs
pip install cyclonedx-bom
pip install pip-licenses

# Create the SBOM (cyclonedx-bom) based on (pipreqs) requirements that are actually imported in the .py files

$sbom = pipreqs --print | cyclonedx-py -r -pb -o - -i -

# Create an XmlDocument object
$xml = New-Object System.Xml.XmlDocument

# Load XML content into the XmlDocument
$xml.LoadXml($sbom)


# Create an empty CSV file
$csvPath = "SBOM.csv"

# Initialize an empty array to store rows
$result = @()

# Iterate through the XML nodes and create rows for each node
$xml.SelectNodes("//*[local-name()='component']") | ForEach-Object {

$row = @{
"Version" = $_.Version
"Context" = $_.Purl
"Name" = if ($_.Name -eq 'scikit_learn') { 'scikit-learn' } else { $_.Name }
}

# Get license information
$match = pip-licenses --from=mixed --format=csv --with-system --packages $row.Name | ConvertFrom-Csv

# Add license information to the row
$result += [PSCustomObject]@{
"Context" = $row.Context
"Name" = $row.Name
"Version" = $row.Version
"License" = $match.License
}
}

# Export the data to the CSV file
$result | Export-Csv -Path $csvPath -NoTypeInformation

# Create the license file
$licensePath = $csvPath + '.license'
@"
SPDX-License-Identifier: CC-BY-4.0
SPDX-FileCopyrightText: 2023 Fabian-Paul Utech <[email protected]>
"@ | Out-File -FilePath $licensePath

exit

```
41 changes: 41 additions & 0 deletions Documentation/ideas.md
@@ -0,0 +1,41 @@
<!--
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2024 Felix Zailskas <[email protected]>
-->

# Unused Ideas

This document lists ideas and implementations that either have not been tried yet or have been deprecated because they are not used in the current product version, but that still carry some conceptual value.

## Deprecated

The original implementation of the deprecated modules can be found in the `deprecated/` directory.

### Controller

**_Note:_** This package has the additional dependency `pydantic==2.4.2`.

The controller module was originally planned to be used as a communication device between the EVP and the BDC. Whenever the salesperson interface registers a new lead, the controller is supposed to trigger the BDC pipeline to enrich that lead's data and preprocess it into a feature vector. The successful completion of the BDC pipeline is then registered at the controller, which triggers an inference of the EVP to compute the predicted merchant size and write it back to the lead data. The computed merchant size can then be used to rank the leads, helping the salesperson judge the value of each lead and decide which ones to call.

The current implementation of the module supports queueing messages from the BDC and EVP, as indicated by their type. Depending on the message type, the message is routed to the corresponding module (EVP or BDC). The actual processing of the messages by the modules is not implemented. All of this is done asynchronously using the Python threading library; a minimal sketch of the routing idea is shown below.
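
A minimal sketch of this queueing-and-routing pattern, using only the standard library; the `Message` shape and the `"bdc"`/`"evp"` type names are illustrative, not the deprecated module's actual schema:

```python
import queue
import threading
from dataclasses import dataclass, field


@dataclass
class Message:
    target: str  # "bdc" or "evp" -- hypothetical type names
    payload: dict = field(default_factory=dict)


def route(inbox: queue.Queue, outboxes: dict[str, queue.Queue]) -> None:
    """Forward each incoming message to the queue matching its type."""
    while True:
        msg = inbox.get()
        if msg is None:  # sentinel: shut the router down
            break
        outboxes[msg.target].put(msg)


inbox = queue.Queue()
outboxes = {"bdc": queue.Queue(), "evp": queue.Queue()}
router = threading.Thread(target=route, args=(inbox, outboxes), daemon=True)
router.start()

inbox.put(Message(target="bdc", payload={"lead_id": 42}))
inbox.put(None)  # stop the router
router.join()
```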

### FacebookGraphAPI

**_Note:_** This package has the additional dependency `facebook-sdk==3.1.0`. Additionally, the environment variables `FACEBOOK_APP_ID` and `FACEBOOK_APP_SECRET` need to be set to valid credentials.

This step was supposed to query lead data from Facebook using either the business owner's name or the company name. The attempt was deprecated because the cost of the required API token was deemed too high and because the usage permissions of the Facebook API changed. Furthermore, it is paramount to check the legal ramifications of querying Facebook for this kind of data, as searching for individuals rather than their businesses may have legal consequences under EU data privacy regulations. A hypothetical sketch of such a query is shown below.
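
A hypothetical sketch of what such a query could have looked like with `facebook-sdk==3.1.0`; the `pages/search` endpoint, the requested fields, and the app-token format are assumptions, and this kind of page search requires an approved Facebook app:

```python
import os

import facebook  # provided by facebook-sdk==3.1.0

# App access token built from the env vars this module expected (assumed format).
token = f"{os.environ['FACEBOOK_APP_ID']}|{os.environ['FACEBOOK_APP_SECRET']}"
graph = facebook.GraphAPI(access_token=token, version="3.1")

# Search Facebook pages by company name; availability of this endpoint
# depends on Facebook's current API permissions and policies.
results = graph.request(
    "pages/search", args={"q": "Example Company GmbH", "fields": "id,name,location"}
)
for page in results.get("data", []):
    print(page.get("name"), page.get("location"))
```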

### ScrapeAddresses

This step was an early experiment that used only the custom domain from an email address. We check whether a live website is running for that domain and then try to parse the main page for a business address using a RegEx pattern (see the sketch below). The pattern is not very precise, and calling and parsing the website takes quite some time, which accumulates over many entries. The Google Places step yields better results for the business address and is faster, which is why `scrape_addresses.py` was deprecated.
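
A minimal sketch of the idea, assuming `requests` for fetching; the address pattern shown is illustrative, not the deprecated module's actual RegEx:

```python
import re

import requests

# Rough pattern for a German street address such as "Musterstraße 12a";
# deliberately simple and, like the original, not very precise.
ADDRESS_PATTERN = re.compile(
    r"[A-ZÄÖÜ][a-zäöüß]+(?:stra(?:ß|ss)e|weg|platz|allee)\s+\d+[a-z]?"
)


def scrape_address(email: str) -> str | None:
    """Check for a live website on the email's domain and grep it for an address."""
    domain = email.split("@")[-1]
    try:
        response = requests.get(f"https://{domain}", timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None  # no live website for this domain
    match = ADDRESS_PATTERN.search(response.text)
    return match.group(0) if match else None


print(scrape_address("info@example.com"))
```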

## Possible ML improvements

### Creating data subsets

The data collected by the BDC pipeline has not been refined to include only semantically valuable data fields. It is possible that some data fields contain no predictive power, meaning they practically pollute the dataset with unnecessary information. A proper analysis of the predictive power of all data fields would allow cutting down the amount of data per lead, reducing processing time and possibly making predictions more precise. This approach has been explored very briefly with subset 1, as described in `Classifier-Comparison.md`; however, the choice of included features has not been justified by experiments, making it somewhat arbitrary. Additionally, an analysis of this type could give insight into which data fields to expand on and what new data to collect to increase the EVP's performance in predicting merchant sizes. A sketch of such an analysis is shown below.
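
One way to sketch such an analysis, assuming the lead features are already numerically encoded in a hypothetical `leads_features.csv` with a `merchant_size` label column:

```python
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

leads = pd.read_csv("leads_features.csv")  # hypothetical file and column names
X = leads.drop(columns=["merchant_size"])
y = leads["merchant_size"]

# Estimate each field's predictive power; fields scoring near zero are
# candidates for removal from the feature vector.
scores = pd.Series(mutual_info_classif(X, y, random_state=0), index=X.columns)
print(scores.sort_values(ascending=False))
```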

Filtering the data based on some quality metric could also improve general performance. The `regional_atlas_score` and `google_confidence_score` have been tried for this but did not improve performance. However, these values are computed somewhat arbitrarily, and a more refined quality metric might yield more promising results. A sketch of this kind of filtering follows.
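
A sketch of such a filter, with illustrative thresholds (as noted above, these two scores did not yield a gain in the experiments):

```python
import pandas as pd

leads = pd.read_csv("leads_features.csv")  # hypothetical file name

# Keep only leads whose data quality clears a threshold before training.
quality_ok = (leads["regional_atlas_score"] >= 0.5) & (
    leads["google_confidence_score"] >= 0.7
)
filtered = leads[quality_ok]
print(f"kept {len(filtered)} of {len(leads)} leads")
```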
63 changes: 30 additions & 33 deletions Pipfile
@@ -7,53 +7,50 @@ verify_ssl = true
name = "pypi"

[dev-packages]
pytest = "==7.4.0"
coverage = "==7.4.1"
pre-commit = "==3.5.0"
flake8 = "==6.0.0"
pytest-env = "==1.0.1"
matplotlib = "==3.8.2"
plotly = "==5.18.0"
geopy = "==2.4.1"
matplotlib = "==3.8.2"
notebook = "==7.0.6"
plotly = "==5.18.0"
pre-commit = "==3.5.0"
pytest = "==7.4.0"
pytest-env = "==1.0.1"
sphinx = "==7.2.6"
sphinx_rtd_theme = "==2.0.0"
myst_parser = "==2.0.0"

[packages]
numpy = "==1.26.1"
requests = "==2.31.0"
scikit-learn = "==1.3.2"
pydantic = "==2.4.2"
email-validator = "==2.1.0.post1"
pandas = "==2.0.3"
autocorrect = "==2.6.1"
beautifulsoup4 = "==4.12.2"
tqdm = "==4.65.0"
python-dotenv = "==0.21.0"
googlemaps = "==4.10.0"
phonenumbers = "==8.13.25"
pymongo = "==4.6.0"
facebook-sdk = "==3.1.0"
boto3 = "==1.33.1"
colorama = "==0.4.6"
deep-translator = "==1.11.4"
deutschland = "==0.4.0"
email-validator = "==2.1.0.post1"
fsspec = "==2023.12.2"
geopandas = "==0.14.1"
googlemaps = "==4.10.0"
joblib = "==1.3.2"
lightgbm = "==4.3.0"
numpy = "==1.26.1"
openai = "==1.3.3"
tiktoken = "==0.5.1"
osmnx = "==1.7.1"
pandas = "==2.0.3"
phonenumbers = "==8.13.25"
pylanguagetool = "==0.10.0"
pyspellchecker = "==0.7.2"
python-dotenv = "==0.21.0"
reportlab = "==4.0.7"
osmnx = "==1.7.1"
geopandas = "==0.14.1"
requests = "==2.31.0"
s3fs = "==2023.12.2"
scikit-learn = "==1.3.2"
shapely = "==2.0.2"
pyspellchecker = "==0.7.2"
autocorrect = "==2.6.1"
textblob = "==0.17.1"
deep-translator = "==1.11.4"
fsspec = "2023.12.2"
s3fs = "2023.12.2"
imblearn = "==0.0"
sagemaker = "==2.198.0"
joblib = "1.3.2"
tiktoken = "==0.5.1"
torch = "==2.1.2"
tqdm = "==4.65.0"
xgboost = "==2.0.3"
colorama = "==0.4.6"
torch = "2.1.2"
deutschland = "0.4.0"
bs4 = "0.0.2"
lightgbm = "==4.3.0"

[requires]
python_version = "3.10"