-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feat: Validate dataframe with Pydantic schema #522
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
from typing import List, Dict | ||
from pydantic import ValidationError | ||
from pydantic import BaseModel | ||
from pandasai.helpers.df_info import DataFrameType, df_type | ||
|
||
|
||
class DFValidationResult: | ||
def __init__(self, passed: bool = True, errors: List[Dict] = []): | ||
self._passed = passed | ||
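There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Default mutable arguments are a common source of unexpected behaviors and bugs in Python. Here, `errors: List[Dict] = []` is a mutable default argument: the same list object is shared by every call that omits `errors`, so anything appended to it persists across instances. Default to `None` and create the list inside the function instead:
- def __init__(self, passed: bool = True, errors: List[Dict] = []):
+ def __init__(self, passed: bool = True, errors: List[Dict] = None):
+     if errors is None:
+         errors = [] |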
||
self._errors = errors | ||
|
||
@property | ||
def passed(self): | ||
return self._passed | ||
|
||
def errors(self) -> List[Dict]: | ||
return self._errors | ||
|
||
def add_error(self, error_message: str): | ||
""" | ||
Add an error message to the validation results. | ||
""" | ||
self.passed = False | ||
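There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You are trying to set the attribute `passed`, which is a read-only property (it has no setter), so this assignment raises an `AttributeError`. Set the private attribute `_passed` instead:
- self.passed = False
+ self._passed = False |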
||
self._errors.append(error_message) | ||
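There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `passed` property has no setter, so the assignment in `add_error` fails before the error is appended; assign to `self._passed` instead:
- self.passed = False
+ self._passed = False |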
||
|
||
def __bool__(self) -> bool: | ||
""" | ||
Define the truthiness of ValidationResults. | ||
""" | ||
return self.passed | ||
|
||
|
||
class DFValidator: | ||
def __init__(self, df, verbose=False): | ||
self._df = df | ||
self._verbose = verbose | ||
|
||
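There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider making the constructor arguments keyword-only so that call sites are explicit (existing positional callers, such as `DFValidator(self.original_import, verbose)`, would need to be updated accordingly):
- def __init__(self, df, verbose=False):
+ def __init__(self, *, df, verbose=False): |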
||
def _validate_batch(self, schema, df_json: List[Dict]): | ||
""" | ||
Args: | ||
schema: Pydantic schema | ||
batch_df: dataframe batch | ||
""" | ||
try: | ||
# Create a Pydantic Validator to validate rows of dataframe | ||
class PdVal(BaseModel): | ||
df: List[schema] | ||
|
||
PdVal(df=df_json) | ||
return [] | ||
|
||
except ValidationError as e: | ||
if self._verbose: | ||
print(e) | ||
return e.errors() | ||
|
||
def _df_to_list_of_dict(self, df: DataFrameType, dataframe_type: str) -> List[Dict]: | ||
""" | ||
Create list of dict of dataframe rows on basis of dataframe type | ||
Supports only polars and pandas dataframe | ||
""" | ||
if dataframe_type == "pandas": | ||
return df.to_dict(orient="records") | ||
elif dataframe_type == "polars": | ||
return df.to_dicts() | ||
else: | ||
[] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line seems to be a no-op. It creates an empty list and then does nothing with it, so the method implicitly returns `None`. If you intended to return an empty list when the dataframe type is neither "pandas" nor "polars", you should use the `return` statement:
- []
+ return [] |
||
|
||
def validate(self, schema: BaseModel) -> DFValidationResult: | ||
""" | ||
Args: | ||
schema: Pydantic schema to be validated for the dataframe row | ||
""" | ||
dataframe_type = df_type(self._df) | ||
if dataframe_type is None: | ||
raise ValueError("UnSupported DataFrame") | ||
|
||
df_json: List[Dict] = self._df_to_list_of_dict(self._df, dataframe_type) | ||
|
||
errors = self._validate_batch(schema, df_json) | ||
|
||
if len(errors) > 0: | ||
return DFValidationResult(False, errors) | ||
else: | ||
return DFValidationResult(True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This method could be simplified by directly returning the `DFValidationResult` constructed from the error list, with `passed` derived from whether the list is empty:
- if len(errors) > 0:
-     return DFValidationResult(False, errors)
- else:
-     return DFValidationResult(True)
+ return DFValidationResult(len(errors) == 0, errors) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,9 @@ | |
from io import StringIO | ||
|
||
import pandas as pd | ||
import pydantic | ||
|
||
from pandasai.helpers.df_validator import DFValidator | ||
|
||
from ..smart_datalake import SmartDatalake | ||
from ..schemas.df_config import Config | ||
|
@@ -235,6 +238,17 @@ def _get_head_csv(self): | |
self._sample_head = df_head.to_csv(index=False) | ||
return self._sample_head | ||
|
||
def validate(self, schema: pydantic.BaseModel, verbose: bool = False): | ||
""" | ||
Validates Dataframe rows on the basis Pydantic schema input | ||
(Args): | ||
schema: Pydantic schema class | ||
n_jobs: Parallelism for larger dataframe | ||
verbose: Print Errors | ||
""" | ||
df_validator = DFValidator(self.original_import, verbose) | ||
return df_validator.validate(schema) | ||
|
||
@property | ||
def datalake(self): | ||
return self._dl | ||
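There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The docstring lists an `n_jobs` argument that `validate` does not accept (its parameters are `schema` and `verbose`). Remove the stale entry:
- n_jobs: Parallelism for larger dataframe |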
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Default mutable arguments in Python can lead to unexpected behavior. Here, `errors: List[Dict] = []` is a mutable default argument. If this list is modified, the change will persist across function calls. Consider changing the default value to `None` and assigning an empty list within the function if the argument is `None`.