Skip to content

Commit

Permalink
Refactor POC analysis module to use POLARS instead of pandas. FOR MAC…
Browse files Browse the repository at this point in the history
…OS: you’ll need to install polars via pip install polars-lts-cpu.

You may first have to uninstall the regular polars package and then install the LTS CPU build:
pip uninstall polars
pip install polars-lts-cpu
(We also use pandera[ioc,polars].)
  • Loading branch information
Molier committed Aug 19, 2024
1 parent 518a3b3 commit cc40abb
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 179 deletions.
8 changes: 4 additions & 4 deletions openenergyid/pandera_poc/analysis.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Proof of concept of a data analysis module using pandera."""

import pandas as pd
import polars as pl


# NOTE(review): this span is a rendered diff without +/- markers. The pandas
# lines (`df`, `pd.DataFrame`) look like the removed side and the polars lines
# (`lf`, `pl.LazyFrame`) like the added side — confirm against the commit.
# In both versions the function is a placeholder: it returns its argument
# unchanged.
def analyse(
df: pd.DataFrame,
) -> pd.DataFrame:
lf: pl.LazyFrame,
) -> pl.LazyFrame:
"""Perform analysis on the input data and return the output data."""
# Perform analysis
return df
return lf
38 changes: 19 additions & 19 deletions openenergyid/pandera_poc/models.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
"""Example of a Pandera schema for input and output data validation."""

import pandera as pa
import pandera.polars as pap
import polars as pl
# from pandera.typing import Series

from pandera.typing import Series, Index


# NOTE(review): rendered diff without +/- markers. Lines using `pa.` /
# `Series[...]` / `Index[...]` look like the removed pandas-backed schema and
# lines using `pap.` / `pl.` like the added polars-backed schema — confirm
# against the commit.
class InputModel(pa.DataFrameModel):
class InputModel(pap.DataFrameModel):
"""Pandera schema for input data validation."""

# pandas-side fields: index >= 0, column1 <= 10, column2 < -1.2,
# column3 must start with "value_".
index: Index[int] = pa.Field(ge=0)
column1: Series[int] = pa.Field(le=10)
column2: Series[float] = pa.Field(lt=-1.2)
column3: Series[str] = pa.Field(str_startswith="value_")
# polars-side fields: the index constraint is only present as a comment here,
# so the ge=0 index check is not carried over.
# index: Optional[Index[int]]
column1: int = pap.Field(le=10)
# NOTE(review): column2 is annotated `pl.Float64` here but `float` in
# OutputModel — presumably these should match; confirm which is intended.
column2: pl.Float64 = pap.Field(lt=-1.2)
column3: str = pap.Field(str_startswith="value_")

@pa.check("column3")
@pap.check("column3")
@classmethod
def column_3_check(cls, series: Series[str]) -> bool:
# NOTE(review): pandera's polars backend appears to pass custom checks a
# PolarsData wrapper (lazyframe + key) rather than a pl.Series — verify that
# this signature is actually what the check receives.
def column_3_check(cls, series: pl.Series) -> bool:
"""Check that column3 values have two elements after being split with '_'"""
# pandas side: expands the split into columns and compares the column count.
return series.str.split("_", expand=True).shape[1] == 2
# NOTE(review): polars `str.split` presumably returns one list per row, so
# `len(...)` here looks like it counts rows, not split parts — i.e. the check
# would pass only for a two-row column. Confirm; a per-row list length
# comparison is probably what was meant.
return len(series.str.split("_")) == 2


class OutputModel(pa.DataFrameModel):
class OutputModel(pap.DataFrameModel):
"""Pandera schema for output data validation."""

index: Index[int] = pa.Field(ge=0)
column1: Series[int] = pa.Field(le=10)
column2: Series[float] = pa.Field(lt=-1.2)
column3: Series[str] = pa.Field(str_startswith="value_")
# index: pl.int = pap.Field(ge=0)
column1: int = pap.Field(le=10)
column2: float = pap.Field(lt=-1.2)
column3: str = pap.Field(str_startswith="value_")

# NOTE(review): rendered diff without +/- markers. The `@pa.check` /
# `Series[str]` / `expand=True` lines look like the removed pandas version and
# the `@pap.check` / `pl.Series` / `len(...)` lines like the added polars
# version — confirm against the commit.
@pa.check("column3")
@pap.check("column3")
@classmethod
def column_3_check(cls, series: Series[str]) -> bool:
# NOTE(review): pandera's polars backend appears to pass custom checks a
# PolarsData wrapper (lazyframe + key) rather than a pl.Series — verify that
# this signature is actually what the check receives.
def column_3_check(cls, series: pl.Series) -> bool:
"""Check that column3 values have two elements after being split with '_'"""
# pandas side: expands the split into columns and compares the column count.
return series.str.split("_", expand=True).shape[1] == 2
# NOTE(review): polars `str.split` presumably returns one list per row, so
# `len(...)` here looks like it counts rows, not split parts — i.e. the check
# would pass only for a two-row column. Confirm; a per-row list length
# comparison is probably what was meant.
return len(series.str.split("_")) == 2

class Config:
"""Pandera schema configuration."""
Expand Down
Loading

0 comments on commit cc40abb

Please sign in to comment.