feat: Add logging and update to ydata_profiling
100mi committed Sep 10, 2023
1 parent 2201ce5 commit d8ae96c
Showing 22 changed files with 2,954 additions and 1,844 deletions.
11 changes: 9 additions & 2 deletions .Dockerignore
@@ -78,6 +78,7 @@ target/
 # Virtual environment
 .venv/
 venv/
+.vscode/

 # PyCharm
 .idea
@@ -101,7 +102,6 @@ venv/
 README.md

 # Library dependency metadata
-poetry.lock

 # github workflows
 .github/
@@ -112,4 +112,11 @@ poetry.lock
 volumes/

 # Task
-tasks/
+tasks/
+
+# Example
+app/example/
+
+# Gitpod
+scripts/gitpod*
+scripts/codespaces*
6 changes: 5 additions & 1 deletion .gitignore
@@ -162,6 +162,7 @@ cython_debug/
 !.vscode/launch.json
 !.vscode/extensions.json
 !.vscode/*.code-snippets
+.vscode

 # Local History for Visual Studio Code
 .history/
@@ -175,4 +176,7 @@ cython_debug/
 .ionide

 # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode
-n
+n
+
+# Example
+app/example/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -9,6 +9,6 @@ repos:
     hooks:
       - id: flake8
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.9.3
+    rev: 5.12.0
    hooks:
      - id: isort
30 changes: 14 additions & 16 deletions Dockerfile
@@ -1,22 +1,20 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
+FROM python:3.10-slim-buster as requirements-stage

-WORKDIR /app
+WORKDIR /tmp
+RUN pip install poetry
+COPY ./pyproject.toml ./poetry.lock* /tmp/

-ENV POETRY_VERSION=1.2.0
+RUN mkdir -p /tmp/app
+COPY ./app /tmp/app

-# Install Poetry
-RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
-    cd /usr/local/bin && \
-    ln -s /opt/poetry/bin/poetry && \
-    poetry config experimental.new-installer false && \
-    poetry config virtualenvs.create false
+RUN poetry export -f requirements.txt --output requirements.txt --without-hashes

-# Copy poetry.lock* in case it doesn't exist in the repo
-COPY ./pyproject.toml ./poetry.lock* /
-
-# Allow installing dev dependencies to run tests
-ARG INSTALL_DEV=false
-RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --no-dev ; fi"
+FROM python:3.10-slim-buster

-COPY . .
-ENV PYTHONPATH=/app
+WORKDIR /code
+
+COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY --from=requirements-stage /tmp/app /code/app
11 changes: 4 additions & 7 deletions Dockerfile.dev
@@ -1,22 +1,19 @@
-FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
+FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10

 WORKDIR /app

-ENV POETRY_VERSION=1.2.0
-
 # Install Poetry
-RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
+RUN curl -sSL https://install.python-poetry.org/ | POETRY_HOME=/opt/poetry python && \
     cd /usr/local/bin && \
     ln -s /opt/poetry/bin/poetry && \
     poetry config experimental.new-installer false && \
     poetry config virtualenvs.create false

-# Copy poetry.lock* in case it doesn't exist in the repo
-COPY ./pyproject.toml ./poetry.lock* /
+COPY ./pyproject.toml /

 # Allow installing dev dependencies to run tests
 ARG INSTALL_DEV=false
-RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --no-dev ; fi"
+RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then poetry install --no-root ; else poetry install --no-root --only main ; fi"

 COPY . .
 ENV PYTHONPATH=/app
2 changes: 1 addition & 1 deletion app/api/api_v1/routers/profile.py
@@ -2,7 +2,7 @@
 from typing import List

 from fastapi import APIRouter, Depends
-from pandas_profiling import ProfileReport
+from ydata_profiling import ProfileReport

 from app.core.config import Settings
 from app.models.alerts import Alerts
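The only change here is the import: ydata-profiling v4 is the renamed continuation of pandas-profiling, and ProfileReport keeps the same constructor. A minimal sketch of how a router like this presumably builds a report (the sample DataFrame is illustrative, not from the repo):

```python
import pandas as pd
from ydata_profiling import ProfileReport

# Illustrative data; the real endpoint profiles a user-supplied dataset.
df = pd.DataFrame({"age": [23, 31, 47], "city": ["Pune", "Delhi", "Goa"]})

report = ProfileReport(df, title="Example Report", minimal=True)
json_payload = report.to_json()  # JSON string the API can return as-is
```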
7 changes: 7 additions & 0 deletions app/core/config.py
@@ -46,5 +46,12 @@ class Settings(BaseSettings):
     # PROFILE SEGMENTS
     SAMPLE_DATA_RENDERER: List[str] = ["head"]

+    # LOGGING SETTINGS
+    LOG_LEVEL: str = "DEBUG"
+    LOG_FILE_PATH: str = "logs/app.log"
+    LOG_FILE_SIZE: int = 100_000_000  # 100MB
+    LOG_FILE_BACKUP_COUNT: int = 5
+    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
     class Config:
         env_file = ".env"
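Because Settings extends pydantic's BaseSettings, each new logging field can be overridden through the environment or the .env file without code changes. A small sketch, assuming the remaining settings have defaults or are present in .env (the override value here is hypothetical):

```python
import os

from app.core.config import Settings

# Hypothetical override; each field name doubles as an environment variable.
os.environ["LOG_LEVEL"] = "WARNING"

settings = Settings()
print(settings.LOG_LEVEL)              # "WARNING", taken from the environment
print(settings.LOG_FILE_BACKUP_COUNT)  # 5, the coded default
```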
56 changes: 56 additions & 0 deletions app/core/logging.py
@@ -0,0 +1,56 @@
import logging
import logging.config
import os

from app.core.config import Settings

settings = Settings()

# Create the logs directory if it doesn't exist
log_directory = os.path.dirname(settings.LOG_FILE_PATH)
if not os.path.exists(log_directory):
    os.makedirs(log_directory)

# Configuration dictionary for logging
LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s [%(levelname)s] [%(name)s:%(lineno)d] - %(message)s",  # noqa: E501
            "datefmt": "%Y-%m-%d %H:%M:%S",
        },
    },
    "handlers": {
        "console": {
            "class": "rich.logging.RichHandler",
            "level": settings.LOG_LEVEL,
        },
    },
    "loggers": {
        "": {
            "level": settings.LOG_LEVEL,
            "handlers": ["console"],
            "propagate": True,
        },
        "celery": {
            "level": settings.LOG_LEVEL,
            "handlers": ["console"],
            "propagate": True,
        },
    },
}

# Load the logging configuration
logging.config.dictConfig(LOGGING_CONFIG)


def get_logger(name: str) -> logging.Logger:
    """
    Get a logger with the specified name.

    Args:
        name (str): The name of the logger.

    Returns:
        logging.Logger: The logger instance.
    """
    return logging.getLogger(name)
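Importing app.core.logging applies dictConfig as a side effect, so callers only need get_logger. Note that only a console handler is wired up here; the LOG_FILE_* fields added to config.py are not yet referenced by this configuration. A typical call site would look like:

```python
from app.core.logging import get_logger

logger = get_logger(__name__)
logger.info("Profiling started")  # rendered by rich.logging.RichHandler
```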
5 changes: 3 additions & 2 deletions app/models/analysis.py
@@ -1,4 +1,4 @@
-from datetime import datetime, timedelta
+from datetime import datetime

 from pydantic.main import BaseModel

@@ -7,7 +7,8 @@ class Analysis(BaseModel):
     title: str
     date_start: datetime
     date_end: datetime
-    duration: timedelta
+    # TIME DELTA IS REMOVED IN PROFILING VERSION 4 AND ABOVE
+    # duration: timedelta

     class Config:
         underscore_attrs_are_private = True
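With duration dropped from the v4 report payload, a caller that still needs it can derive the timedelta from the two datetimes. A minimal sketch (the values are illustrative):

```python
from datetime import datetime

from app.models.analysis import Analysis

analysis = Analysis(
    title="Example run",
    date_start=datetime(2023, 9, 10, 12, 0, 0),
    date_end=datetime(2023, 9, 10, 12, 0, 42),
)
duration = analysis.date_end - analysis.date_start  # timedelta(seconds=42)
```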
1 change: 1 addition & 0 deletions app/models/correlations.py
@@ -11,6 +11,7 @@ class Correlations(BaseModel):
     kendall: Optional[Union[Json, Dict]]
     cramers: Optional[Union[Json, Dict]]
     phi_k: Optional[Union[Json, Dict]]
+    # auto: Optional[Union[Json, Dict, Any]]

     class Config:
         underscore_attrs_are_private = True
6 changes: 3 additions & 3 deletions app/models/duplicates.py
@@ -1,7 +1,7 @@
-from typing import Union
+from typing import Any

-from pydantic import BaseModel, Json
+from pydantic import BaseModel


 class Duplicates(BaseModel):
-    __root__: Union[Json, str]
+    __root__: Any
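Loosening `__root__` to Any lets the model accept whatever shape ydata-profiling v4 emits for duplicates. A quick illustration using pydantic v1's custom-root syntax (the payloads are hypothetical):

```python
from app.models.duplicates import Duplicates

# All of these validate now, whereas Union[Json, str] rejected plain dicts/lists.
Duplicates.parse_obj({"row_hash": 2})
Duplicates.parse_obj([{"col": 1}])
Duplicates.parse_obj("no duplicates")
```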
4 changes: 2 additions & 2 deletions app/models/package.py
@@ -4,8 +4,8 @@


 class Package(BaseModel):
-    pandas_profiling_version: str
-    pandas_profiling_config: str
+    ydata_profiling_version: str
+    ydata_profiling_config: str

     class Config:
         underscore_attrs_are_private = True
4 changes: 3 additions & 1 deletion app/models/scatter.py
@@ -1,7 +1,9 @@
 from __future__ import annotations

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class Scatter(BaseModel):
-    pass
+    data: Dict[str, Any]
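Scatter moves from an empty placeholder to carrying the raw scatter payload. Hypothetical usage, with keys and values depending on the report configuration:

```python
from app.models.scatter import Scatter

scatter = Scatter(data={"x|y": "<svg>...</svg>"})  # illustrative key/value shape
```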
4 changes: 2 additions & 2 deletions app/models/variables.py
@@ -29,8 +29,8 @@ class VariableProperties(BaseModel):
     mean: Optional[float]
     std: Optional[float]
     variance: Optional[float]
-    min: Optional[int]
-    max: Optional[float]
+    min: Optional[Any]
+    max: Optional[Any]
     kurtosis: Optional[float]
     skewness: Optional[float]
     sum: Optional[float]
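Typing min/max as Optional[Any] accommodates non-numeric columns, whose extremes can be dates or strings rather than numbers. A sketch under the assumption that the remaining fields are optional, as all the visible ones are:

```python
from app.models.variables import VariableProperties

# Numeric column: extremes are numbers.
VariableProperties(min=0, max=99.5)
# Datetime column: the profiler may report ISO strings, which Any accepts.
VariableProperties(min="2001-01-03 00:00:00", max="2023-09-10 00:00:00")
```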
48 changes: 33 additions & 15 deletions app/utils/dataframes.py
@@ -7,13 +7,14 @@
 import polars.exceptions as pl_exc
 import s3fs
 from charset_normalizer import from_bytes
-from fastapi.logger import logger
 from numpy import bool_
 from requests import get

 from app.core.config import Settings
+from app.core.logging import get_logger

 setting = Settings()
+logger = get_logger(__name__)


def get_encoding(obj: Union[str, bytes], is_object=False) -> str:
@@ -50,7 +51,8 @@ async def get_dataframe_honouring_encoding_async(
     try:
         df = pl.read_csv(source, null_values="NA", infer_schema_length=0)
     except (UnicodeDecodeError, pl_exc.ComputeError) as err:
-        logger.error(f"Could not interpret File encoding : {err}")
+        logger.warning(f"File encoding is not default: {err}")
+        logger.warning("Trying to read file with proper encoding")
         encoding = get_encoding(obj=source, is_object=is_object)
         logger.info(f"File encoding : {encoding}")
         df = pl.read_csv(
@@ -122,7 +124,9 @@ async def get_dataframe_async(file_url: str):
     url = urlparse(file_url)

     if url.scheme == "http" or url.scheme == "https":
+        logger.info("Check for files with http/https extension")
         df = await get_dataframe_honouring_encoding_async(file_url)
+        logger.info("Dataframe generated from http/https file")
         return df

     elif url.scheme == "s3":
@@ -132,12 +136,19 @@
             secret=setting.S3_SECRET_ACCESS_KEY,
             client_kwargs={"endpoint_url": setting.S3_ENDPOINT_URL},
         )
-
-        with fs.open(f"{url.netloc}{url.path}", "rb") as f:
-            obj = f.read()
-
-        df = await get_dataframe_honouring_encoding_async(obj, is_object=True)
-        return df
+        try:
+            with fs.open(f"{url.netloc}{url.path}", "rb") as f:
+                obj = f.read()
+                logger.info(f"File read from s3 : {url.path}")
+        except Exception as err:
+            logger.error("Could not read file from s3")
+            raise err
+        else:
+            df = await get_dataframe_honouring_encoding_async(
+                obj, is_object=True
+            )
+            logger.info("Dataframe generated from s3 file")
+            return df


 def get_dataframe(file_url: str):
@@ -156,7 +167,9 @@ def get_dataframe(file_url: str):
     url = urlparse(file_url)

     if url.scheme == "http" or url.scheme == "https":
+        logger.info("Check for files with http/https extension")
         df = get_dataframe_honouring_encoding(source=file_url, is_object=False)
+        logger.info("Dataframe generated from http/https file")
         return df

     elif url.scheme == "s3":
@@ -166,10 +179,15 @@
             secret=setting.S3_SECRET_ACCESS_KEY,
             client_kwargs={"endpoint_url": setting.S3_ENDPOINT_URL},
         )
-
-        with fs.open(f"{url.netloc}{url.path}", "rb") as f:
-            file_content = f.read()
-        df = get_dataframe_honouring_encoding(
-            source=file_content, is_object=True
-        )
-        return df
+        try:
+            with fs.open(f"{url.netloc}{url.path}", "rb") as f:
+                file_content = f.read()
+        except Exception as err:
+            logger.error("Could not read file from s3")
+            raise err
+        else:
+            df = get_dataframe_honouring_encoding(
+                source=file_content, is_object=True
+            )
+            logger.info("Dataframe generated from s3 file")
+            return df
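Both entry points now log each branch and re-raise S3 read failures instead of failing silently. Typical calls, with hypothetical URLs:

```python
from app.utils.dataframes import get_dataframe

# HTTP(S) source: read directly, with an encoding-detection fallback on failure.
df = get_dataframe("https://example.com/data.csv")

# S3 source: read via s3fs; a read failure is logged and re-raised.
df = get_dataframe("s3://my-bucket/path/data.csv")
print(df.shape)  # polars DataFrame
```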