feat(Dataframe): save dataframe yml (#1443)
* feat(dataframe): save dataframe to path
ArslanSaleem authored Dec 3, 2024
1 parent 43e2265 commit 0162eae
Showing 3 changed files with 108 additions and 0 deletions.
7 changes: 7 additions & 0 deletions pandasai/__init__.py
@@ -4,6 +4,8 @@
"""

from typing import List

import pandas as pd
from .agent import Agent
from .helpers.cache import Cache
from .dataframe.base import DataFrame
@@ -75,6 +77,11 @@ def load(dataset_path: str, virtualized=False) -> DataFrame:
return _dataset_loader.load(dataset_path, virtualized)


def read_csv(filepath: str) -> DataFrame:
data = pd.read_csv(filepath)
return DataFrame(data, filepath=filepath)


__all__ = [
"Agent",
"clear_cache",
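For context, a minimal sketch of how the new read_csv helper could be used (the import alias and CSV path are illustrative, not part of this diff):

import pandasai as pai

# read_csv wraps pandas.read_csv and records the source path on the
# returned PandasAI DataFrame so the data can later be saved as a dataset.
df = pai.read_csv("data/heart.csv")  # illustrative path
print(df.filepath)  # "data/heart.csv"
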
74 changes: 74 additions & 0 deletions pandasai/dataframe/base.py
@@ -1,13 +1,18 @@
from __future__ import annotations
import os
import shutil
import pandas as pd
from typing import TYPE_CHECKING, Optional, Union, Dict, Any, ClassVar

import yaml

from pandasai.config import Config
import hashlib
from pandasai.helpers.dataframe_serializer import (
DataframeSerializer,
DataframeSerializerType,
)
from pandasai.helpers.path import find_project_root


if TYPE_CHECKING:
@@ -28,6 +33,7 @@ class DataFrame(pd.DataFrame):
_metadata: ClassVar[list] = [
"name",
"description",
"filepath",
"schema",
"config",
"_agent",
@@ -37,6 +43,7 @@ def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs):
self.name: Optional[str] = kwargs.pop("name", None)
self.description: Optional[str] = kwargs.pop("description", None)
self.filepath: Optional[str] = kwargs.pop("filepath", None)
schema: Optional[Dict] = kwargs.pop("schema", None)

super().__init__(*args, **kwargs)
@@ -157,3 +164,70 @@ def serialize_dataframe(

def get_head(self):
return self.head()

def _create_yml_template(self, name, description, output_yml_path: str):
"""
Generate a .yml file with a simplified metadata template from a pandas DataFrame.
Args:
name (str): Name of the table or dataset.
description (str): Description of the dataset.
output_yml_path (str): The file path where the .yml file will be saved.
"""
# Metadata template
metadata = {
"name": name,
"description": description,
"columns": [
{"name": column, "type": str(self[column].dtype)}
for column in self.columns
],
"source": {
"type": "csv",
"path": (
"data.csv" if self.filepath.endswith(".csv") else "data.parquet"
),
},
}

# Save metadata to a .yml file
with open(output_yml_path, "w") as yml_file:
yaml.dump(metadata, yml_file, sort_keys=False)

print(f"YML file created at: {output_yml_path}")

def save(self, path: str, name: str, description: str = None):
self.name = name
self.description = description

# Validate path format
path_parts = path.split("/")
if len(path_parts) != 2:
raise ValueError("Path must be in format 'organization/dataset'")

org_name, dataset_name = path_parts
if not org_name or not dataset_name:
raise ValueError("Both organization and dataset names are required")

# Validate dataset name format
if not dataset_name.islower() or " " in dataset_name:
raise ValueError(
"Dataset name must be lowercase and use hyphens instead of spaces (e.g. 'my-dataset')"
)

# Create the full dataset directory path under the project root
dataset_directory = os.path.join(
find_project_root(), "datasets", org_name, dataset_name
)

os.makedirs(dataset_directory, exist_ok=True)

# save csv file
new_file_path = os.path.join(dataset_directory, "data.csv")
shutil.copy(self.filepath, new_file_path)

# create schema yaml file
schema_path = os.path.join(dataset_directory, "schema.yaml")
self._create_yml_template(self.name, self.description, schema_path)
print(f"Dataset saved successfully to path: {dataset_directory}")
27 changes: 27 additions & 0 deletions pandasai/helpers/utils.py
@@ -0,0 +1,27 @@
import re
import unicodedata


def create_slug(text: str) -> str:
"""
Generate a slug from a given text.
Args:
text (str): The input text to convert into a slug.
Returns:
str: A URL-friendly slug.
"""
# Normalize text to remove accents and special characters
text = unicodedata.normalize("NFKD", text)
text = text.encode("ascii", "ignore").decode("ascii")

# Convert to lowercase
text = text.lower()

# Replace spaces and unwanted characters with a hyphen
text = re.sub(r"[^\w\s-]", "", text)
text = re.sub(r"[\s_]+", "-", text.strip())

# Remove leading or trailing hyphens
return text.strip("-")
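
A quick sketch of what create_slug produces; note that this diff defines the helper but does not yet call it from save (the example strings are illustrative):

from pandasai.helpers.utils import create_slug

print(create_slug("My Dataset Name"))        # my-dataset-name
print(create_slug("  Café données_2024 "))   # cafe-donnees-2024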
