sinaptik-ai · gventuri · Sep 4, 2023 · Sep 2, 2023 · Sep 2, 2023 · Sep 2, 2023
diff --git a/pandasai/helpers/df_validator.py b/pandasai/helpers/df_validator.py
@@ -0,0 +1,85 @@
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel
+from pydantic import ValidationError
+
+from pandasai.helpers.df_info import DataFrameType, df_type
+
+
+class DFValidationResult:
+    """
+    Outcome of validating a dataframe against a schema.
+
+    Holds a pass/fail flag together with the list of collected errors.
+    An instance is truthy when validation passed and falsy otherwise.
+    """
+
+    def __init__(self, passed: bool = True, errors: Optional[List[Dict]] = None):
+        """
+        Args:
+            passed: whether the validation succeeded
+            errors: errors collected so far; omitted or None means no errors
+        """
+        self._passed = passed
+        # Build a fresh list per instance: a shared `[]` default would make
+        # errors appended through add_error() show up in later instances.
+        self._errors = [] if errors is None else errors
+
+    @property
+    def passed(self):
+        """
+        True while no validation error has been recorded.
+        """
+        return self._passed
+
+    def errors(self) -> List[Dict]:
+        """
+        Return the list of recorded errors (empty when there are none).
+        """
+        return self._errors
+
+    def add_error(self, error_message: str):
+        """
+        Add an error message to the validation results and mark them failed.
+        """
+        # `passed` is a read-only property, so the backing attribute has to
+        # be written directly; assigning to `self.passed` raises
+        # AttributeError.
+        self._passed = False
+        self._errors.append(error_message)
+
+    def __bool__(self) -> bool:
+        """
+        Define the truthiness of ValidationResults.
+        """
+        return self.passed
+
+class DFValidator:
+    """
+    Validate the rows of a dataframe against a Pydantic schema.
+    """
+
+    def __init__(self, df, verbose=False):
+        """
+        Args:
+            df: dataframe whose rows will be validated
+            verbose: when True, print the Pydantic ValidationError raised
+                while validating
+        """
+        self._df = df
+        self._verbose = verbose
+
+    def _validate_batch(self, schema, df_json: List[Dict]):
+        """
+        Validate all row dicts against the schema in one pass.
+
+        Args:
+            schema: Pydantic schema
+            df_json: dataframe rows as a list of dicts
+
+        Returns:
+            An empty list when every row is valid, otherwise the list
+            returned by ValidationError.errors().
+        """
+        # Wrap the row schema in a model with a single list field so that
+        # Pydantic validates all the rows together.
+        class PdVal(BaseModel):
+            df: List[schema]
+
+        try:
+            PdVal(df=df_json)
+        except ValidationError as e:
+            if self._verbose:
+                print(e)
+            return e.errors()
+        return []
+
+    def _df_to_list_of_dict(self, df: DataFrameType, dataframe_type: str) -> List[Dict]:
+        """
+        Create list of dict of dataframe rows on basis of dataframe type
+        Supports only polars and pandas dataframe
+
+        Args:
+            df: dataframe to convert
+            dataframe_type: "pandas" or "polars"
+
+        Returns:
+            One dict per row; an empty list for any other dataframe type.
+        """
+        if dataframe_type == "pandas":
+            return df.to_dict(orient="records")
+        if dataframe_type == "polars":
+            return df.to_dicts()
+        # The bare `[]` expression used here before was discarded, so the
+        # method returned None and broke the List[Dict] contract.
+        return []
+
+    def validate(self, schema: BaseModel) -> DFValidationResult:
+        """
+        Validate every row of the dataframe against the schema.
+
+        Args:
+            schema: Pydantic schema to be validated for the dataframe row
+
+        Returns:
+            A DFValidationResult that passed when no errors were found and
+            otherwise carries the errors.
+
+        Raises:
+            ValueError: if df_type() reports no type for the dataframe
+        """
+        dataframe_type = df_type(self._df)
+        if dataframe_type is None:
+            raise ValueError("UnSupported DataFrame")
+
+        df_json: List[Dict] = self._df_to_list_of_dict(self._df, dataframe_type)
+
+        errors = self._validate_batch(schema, df_json)
+
+        # An empty error list means every row satisfied the schema.
+        return DFValidationResult(not errors, errors)
diff --git a/pandasai/smart_dataframe/__init__.py b/pandasai/smart_dataframe/__init__.py
@@ -22,6 +22,9 @@
 from io import StringIO
 
 import pandas as pd
+import pydantic
+
+from pandasai.helpers.df_validator import DFValidator
 
 from ..smart_datalake import SmartDatalake
 from ..schemas.df_config import Config
@@ -235,6 +238,17 @@ def _get_head_csv(self):
         self._sample_head = df_head.to_csv(index=False)
         return self._sample_head
 
+    def validate(self, schema: pydantic.BaseModel, verbose: bool = False):
+        """
+        Check the rows of the original dataframe against a Pydantic schema.
+
+        Args:
+            schema: Pydantic schema class
+            verbose: Print Errors
+
+        Returns:
+            The result produced by DFValidator.validate for the schema.
+        """
+        return DFValidator(self.original_import, verbose).validate(schema)
+
     @property
     def datalake(self):
         return self._dl

diff --git a/tests/test_smartdataframe.py b/tests/test_smartdataframe.py
@@ -8,6 +8,7 @@
 
 import pandas as pd
 import polars as pl
+from pydantic import BaseModel, Field
 import pytest
 
 from pandasai import SmartDataframe
@@ -658,3 +659,101 @@ def test_save_pandas_no_name(self, llm):
         # Recover file for next test case
         with open("pandasai.json", "w") as json_file:
             json_file.write(backup_pandasai)
+
+    def test_pydantic_validate(self, llm):
+        """An all-integer pandas frame is expected to pass an all-int schema."""
+
+        # Row schema: both columns must be integers
+        class TestSchema(BaseModel):
+            A: int
+            B: int
+
+        frame = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
+        smart_df = SmartDataframe(
+            frame, description="Name", config={"llm": llm, "enable_cache": False}
+        )
+
+        result = smart_df.validate(TestSchema)
+
+        assert result.passed is True
+
+    def test_pydantic_validate_false(self, llm):
+        """A column of text values is expected to fail an int-typed field."""
+
+        # Row schema: both columns must be integers
+        class TestSchema(BaseModel):
+            A: int
+            B: int
+
+        # Column A holds strings, column B holds integers
+        frame = pd.DataFrame(
+            {"A": ["Test", "Test2", "Test3", "Test4"], "B": [5, 6, 7, 8]}
+        )
+        smart_df = SmartDataframe(
+            frame, description="Name", config={"llm": llm, "enable_cache": False}
+        )
+
+        result = smart_df.validate(TestSchema)
+
+        assert result.passed is False
+
+    def test_pydantic_validate_polars(self, llm):
+        """An all-integer polars frame is expected to pass an all-int schema."""
+
+        # Row schema: both columns must be integers
+        class TestSchema(BaseModel):
+            A: int
+            B: int
+
+        frame = pl.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
+        smart_df = SmartDataframe(
+            frame, description="Name", config={"llm": llm, "enable_cache": False}
+        )
+
+        result = smart_df.validate(TestSchema)
+
+        assert result.passed is True
+
+    def test_pydantic_validate_false_one_record(self, llm):
+        """A single bad value in column A is expected to yield exactly one error."""
+
+        # Row schema: both columns must be integers
+        class TestSchema(BaseModel):
+            A: int
+            B: int
+
+        # Only the second row of column A is not an integer
+        frame = pd.DataFrame({"A": [1, "test", 3, 4], "B": [5, 6, 7, 8]})
+        smart_df = SmartDataframe(
+            frame, description="Name", config={"llm": llm, "enable_cache": False}
+        )
+
+        result = smart_df.validate(TestSchema)
+
+        assert result.passed is False
+        assert len(result.errors()) == 1
+
+    def test_pydantic_validate_complex_Schema(self, llm):
+        """Field constraints (gt / lt) are expected to be applied to each row."""
+
+        # Column A holds 1..4, so "gt=5" rejects it and "lt=5" accepts it
+        class GreaterThanFive(BaseModel):
+            A: int = Field(..., gt=5)
+            B: int
+
+        class LessThanFive(BaseModel):
+            A: int = Field(..., lt=5)
+            B: int
+
+        frame = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
+        smart_df = SmartDataframe(
+            frame, description="Name", config={"llm": llm, "enable_cache": False}
+        )
+
+        failing = smart_df.validate(GreaterThanFive)
+        assert failing.passed is False
+
+        passing = smart_df.validate(LessThanFive)
+        assert passing.passed is True