From 0162eaead26328ec6351e7a99e1bd96291f9c989 Mon Sep 17 00:00:00 2001
From: Arslan Saleem
Date: Tue, 3 Dec 2024 11:48:11 +0100
Subject: [PATCH] feat(Dataframe): save dataframe yml (#1443)

* feat(dataframe): save dataframe to path

* feat(dataframe): save dataframe to path
---
 pandasai/__init__.py       |  7 ++++
 pandasai/dataframe/base.py | 74 ++++++++++++++++++++++++++++++++++++++
 pandasai/helpers/utils.py  | 27 ++++++++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 pandasai/helpers/utils.py

diff --git a/pandasai/__init__.py b/pandasai/__init__.py
index e71352fc6..99a8f847b 100644
--- a/pandasai/__init__.py
+++ b/pandasai/__init__.py
@@ -4,6 +4,8 @@
 """
 from typing import List
+
+import pandas as pd
 
 from .agent import Agent
 from .helpers.cache import Cache
 from .dataframe.base import DataFrame
@@ -75,6 +77,11 @@ def load(dataset_path: str, virtualized=False) -> DataFrame:
     return _dataset_loader.load(dataset_path, virtualized)
 
 
+def read_csv(filepath: str) -> DataFrame:
+    data = pd.read_csv(filepath)
+    return DataFrame(data, filepath=filepath)
+
+
 __all__ = [
     "Agent",
     "clear_cache",
diff --git a/pandasai/dataframe/base.py b/pandasai/dataframe/base.py
index bfcd3d46a..a35590335 100644
--- a/pandasai/dataframe/base.py
+++ b/pandasai/dataframe/base.py
@@ -1,13 +1,18 @@
 from __future__ import annotations
+import os
+import shutil
 import pandas as pd
 from typing import TYPE_CHECKING, Optional, Union, Dict, Any, ClassVar
 
+import yaml
+
 from pandasai.config import Config
 import hashlib
 from pandasai.helpers.dataframe_serializer import (
     DataframeSerializer,
     DataframeSerializerType,
 )
+from pandasai.helpers.path import find_project_root
 
 
 if TYPE_CHECKING:
@@ -28,6 +33,7 @@ class DataFrame(pd.DataFrame):
     _metadata: ClassVar[list] = [
         "name",
         "description",
+        "filepath",
         "schema",
         "config",
         "_agent",
@@ -37,6 +43,7 @@ class DataFrame(pd.DataFrame):
     def __init__(self, *args, **kwargs):
         self.name: Optional[str] = kwargs.pop("name", None)
         self.description: Optional[str] = kwargs.pop("description", None)
+        self.filepath: Optional[str] = kwargs.pop("filepath", None)
         schema: Optional[Dict] = kwargs.pop("schema", None)
 
         super().__init__(*args, **kwargs)
@@ -157,3 +164,70 @@ def serialize_dataframe(
 
     def get_head(self):
         return self.head()
+
+    def _create_yml_template(self, name, description, output_yml_path: str):
+        """
+        Generate a .yml file with a simplified metadata template from a pandas DataFrame.
+
+        Args:
+            name (str): Name of the table or dataset.
+            description (str): Description of the table or dataset.
+            output_yml_path (str): The file path where the generated .yml
+                file will be saved.
+        """
+        # Metadata template
+        metadata = {
+            "name": name,
+            "description": description,
+            "columns": [
+                {"name": column, "type": str(self[column].dtype)}
+                for column in self.columns
+            ],
+            "source": {
+                "type": "csv",
+                "path": (
+                    "data.csv" if self.filepath.endswith(".csv") else "data.parquet"
+                ),
+            },
+        }
+
+        # Save metadata to a .yml file
+        with open(output_yml_path, "w") as yml_file:
+            yaml.dump(metadata, yml_file, sort_keys=False)
+
+        print(f"YML file created at: {output_yml_path}")
+
+    def save(self, path: str, name: str, description: str = None):
+        self.name = name
+        self.description = description
+
+        # Validate path format
+        path_parts = path.split("/")
+        if len(path_parts) != 2:
+            raise ValueError("Path must be in format 'organization/dataset'")
+
+        org_name, dataset_name = path_parts
+        if not org_name or not dataset_name:
+            raise ValueError("Both organization and dataset names are required")
+
+        # Validate dataset name format
+        if not dataset_name.islower() or " " in dataset_name:
+            raise ValueError(
+                "Dataset name must be lowercase and use hyphens instead of spaces (e.g. 'my-dataset')"
+            )
+
+        # Create full path with slugified dataset name
+        dataset_directory = os.path.join(
+            find_project_root(), "datasets", org_name, dataset_name
+        )
+
+        os.makedirs(dataset_directory, exist_ok=True)
+
+        # save csv file
+        new_file_path = os.path.join(dataset_directory, "data.csv")
+        shutil.copy(self.filepath, new_file_path)
+
+        # create schema yaml file
+        schema_path = os.path.join(dataset_directory, "schema.yaml")
+        self._create_yml_template(self.name, self.description, schema_path)
+        print(f"Dataset saved successfully to path: {dataset_directory}")
diff --git a/pandasai/helpers/utils.py b/pandasai/helpers/utils.py
new file mode 100644
index 000000000..4095c6f45
--- /dev/null
+++ b/pandasai/helpers/utils.py
@@ -0,0 +1,27 @@
+import re
+import unicodedata
+
+
+def create_slug(text: str) -> str:
+    """
+    Generate a slug from a given text.
+
+    Args:
+        text (str): The input text to convert into a slug.
+
+    Returns:
+        str: A URL-friendly slug.
+    """
+    # Normalize text to remove accents and special characters
+    text = unicodedata.normalize("NFKD", text)
+    text = text.encode("ascii", "ignore").decode("ascii")
+
+    # Convert to lowercase
+    text = text.lower()
+
+    # Replace spaces and unwanted characters with a hyphen
+    text = re.sub(r"[^\w\s-]", "", text)
+    text = re.sub(r"[\s_]+", "-", text.strip())
+
+    # Remove leading or trailing hyphens
+    return text.strip("-")
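
A minimal usage sketch of the API this patch introduces (pandasai.read_csv plus DataFrame.save). The CSV path and the "acme-corp/heart-data" organization/dataset slug below are placeholders, not values taken from the patch:

    import pandasai as pai

    # read_csv wraps pd.read_csv and records the source file in the new
    # `filepath` attribute, which save() later copies into the dataset folder.
    df = pai.read_csv("data/heart.csv")  # placeholder path

    # save() expects an "organization/dataset" path with a lowercase,
    # hyphenated dataset name. It creates
    # <project root>/datasets/acme-corp/heart-data/, copies the CSV there as
    # data.csv, and writes a schema.yaml describing the columns.
    df.save(
        "acme-corp/heart-data",
        name="heart-data",
        description="Heart disease records loaded from a local CSV",
    )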