Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat(push): dataframe push to the remote server #1445

Merged
merged 5 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandasai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def load(dataset_path: str, virtualized=False) -> DataFrame:

def read_csv(filepath: str) -> DataFrame:
    """Read a CSV file into a PandasAI DataFrame.

    Args:
        filepath (str): Path to the CSV file to load.

    Returns:
        DataFrame: A PandasAI DataFrame wrapping the loaded data.
    """
    data = pd.read_csv(filepath)
    # The `filepath` kwarg was removed from DataFrame in this change; only
    # the data itself is wrapped now.
    return DataFrame(data)


__all__ = [
Expand Down
7 changes: 5 additions & 2 deletions pandasai/data_loader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,16 @@ def load(self, dataset_path: str, virtualized=False) -> DataFrame:

table_name = self.schema["source"]["table"]

return DataFrame(df, schema=self.schema, name=table_name)
return DataFrame(df, schema=self.schema, name=table_name, path=dataset_path)
else:
# Initialize new dataset loader for virtualization
data_loader = self.copy()
table_name = self.schema["source"]["table"]
return VirtualDataFrame(
schema=self.schema, data_loader=data_loader, name=table_name
schema=self.schema,
data_loader=data_loader,
name=table_name,
path=dataset_path,
)

def _load_schema(self):
Expand Down
110 changes: 67 additions & 43 deletions pandasai/dataframe/base.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
from __future__ import annotations
import os
import shutil
import re
import pandas as pd
from typing import TYPE_CHECKING, Optional, Union, Dict, Any, ClassVar
from typing import TYPE_CHECKING, List, Optional, Union, Dict, ClassVar

import yaml


from pandasai.config import Config
import hashlib
from pandasai.exceptions import PandasAIApiKeyError
from pandasai.helpers.dataframe_serializer import (
DataframeSerializer,
DataframeSerializerType,
)
from pandasai.helpers.path import find_project_root
from pandasai.helpers.request import Session


if TYPE_CHECKING:
Expand All @@ -35,6 +38,7 @@ class DataFrame(pd.DataFrame):
"description",
"filepath",
"schema",
"path",
"config",
"_agent",
"_column_hash",
Expand All @@ -43,7 +47,7 @@ class DataFrame(pd.DataFrame):
def __init__(self, *args, **kwargs):
self.name: Optional[str] = kwargs.pop("name", None)
self.description: Optional[str] = kwargs.pop("description", None)
self.filepath: Optional[str] = kwargs.pop("filepath", None)
self.path: Optional[str] = kwargs.pop("path", None)
schema: Optional[Dict] = kwargs.pop("schema", None)

super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -108,22 +112,6 @@ def follow_up(self, query: str, output_type: Optional[str] = None):
)
return self._agent.follow_up(query, output_type)

@classmethod
def from_pandas(
cls, df: pd.DataFrame, schema: Optional[Dict[str, Any]] = None
) -> "DataFrame":
"""
Create a PandasAI DataFrame from a pandas DataFrame.

Args:
df (pd.DataFrame): The pandas DataFrame to convert.
schema (Optional[Dict[str, Any]]): The schema of the DataFrame.

Returns:
DataFrame: A new PandasAI DataFrame instance.
"""
return cls(df, schema=schema)

@property
def rows_count(self) -> int:
return len(self)
Expand Down Expand Up @@ -165,39 +153,29 @@ def serialize_dataframe(
def get_head(self):
return self.head()

def _create_yml_template(self, name, description, output_yml_path: str):
def _create_yml_template(self, name, description, columns: List[dict]):
"""
Generate a .yml file with a simplified metadata template from a pandas DataFrame.

Args:
dataframe (pd.DataFrame): The DataFrame to document.
filepath (str): The path to the original data source file.
description: dataset description
output_yml_path (str): The file path where the .yml file will be saved.
table_name (str): Name of the table or dataset.
"""
# Metadata template
metadata = {
"name": name,
"description": description,
"columns": [
{"name": column, "type": str(self[column].dtype)}
for column in self.columns
],
"source": {
"type": "csv",
"path": (
"data.csv" if self.filepath.endswith(".csv") else "data.parquet"
),
},
"columns": columns,
"source": {"type": "parquet", "path": "data.parquet"},
}

# Save metadata to a .yml file
with open(output_yml_path, "w") as yml_file:
yaml.dump(metadata, yml_file, sort_keys=False)

print(f"YML file created at: {output_yml_path}")
return metadata

def save(self, path: str, name: str, description: str = None):
def save(
self, path: str, name: str, description: str = None, columns: List[dict] = []
):
self.name = name
self.description = description

Expand All @@ -210,24 +188,70 @@ def save(self, path: str, name: str, description: str = None):
if not org_name or not dataset_name:
raise ValueError("Both organization and dataset names are required")

# Validate dataset name format
if not dataset_name.islower() or " " in dataset_name:
# Validate organization and dataset name format
if not bool(re.match(r"^[a-z0-9\-_]+$", org_name)):
raise ValueError(
"Organization name must be lowercase and use hyphens instead of spaces (e.g. 'my-org')"
)

if not bool(re.match(r"^[a-z0-9\-_]+$", dataset_name)):
raise ValueError(
"Dataset name must be lowercase and use hyphens instead of spaces (e.g. 'my-dataset')"
)

self.path = path

# Create full path with slugified dataset name
dataset_directory = os.path.join(
find_project_root(), "datasets", org_name, dataset_name
)

os.makedirs(dataset_directory, exist_ok=True)

# save csv file
new_file_path = os.path.join(dataset_directory, "data.csv")
shutil.copy(self.filepath, new_file_path)
self.to_parquet(os.path.join(dataset_directory, "data.parquet"))

# create schema yaml file
schema_path = os.path.join(dataset_directory, "schema.yaml")
self._create_yml_template(self.name, self.description, schema_path)
self.schema = self._create_yml_template(self.name, self.description, columns)
# Save metadata to a .yml file
with open(schema_path, "w") as yml_file:
yaml.dump(self.schema, yml_file, sort_keys=False)

print(f"Dataset saved successfully to path: {dataset_directory}")

def push(self):
api_url = os.environ.get("PANDAAI_API_URL", None)
api_key = os.environ.get("PANDAAI_API_KEY", None)
if not api_url or not api_key:
raise PandasAIApiKeyError(
"Set PANDAAI_API_URL and PANDAAI_API_KEY in environment to push dataset to the remote server"
)

request_session = Session(endpoint_url=api_url, api_key=api_key)

params = {
"path": self.path,
"description": self.description,
}

dataset_directory = os.path.join(find_project_root(), "datasets", self.path)

headers = {"accept": "application/json", "x-authorization": f"Bearer {api_key}"}

with open(
os.path.join(dataset_directory, "schema.yaml"), "rb"
) as schema_file, open(
os.path.join(dataset_directory, "data.parquet"), "rb"
) as data_file:
files = [
("files", ("schema.yaml", schema_file, "application/x-yaml")),
("files", ("data.parquet", data_file, "application/octet-stream")),
]

# Send the POST request
return request_session.post(
"/datasets/push",
files=files,
params=params,
headers=headers,
)
16 changes: 14 additions & 2 deletions pandasai/helpers/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,15 @@ def delete(self, path=None, **kwargs):
return self.make_request("DELETE", path, **kwargs)

def make_request(
self, method, path, headers=None, params=None, data=None, json=None, timeout=300
self,
method,
path,
headers=None,
params=None,
data=None,
json=None,
timeout=300,
**kwargs,
):
try:
url = urljoin(self._endpoint_url, self._version_path + path)
Expand All @@ -68,11 +76,15 @@ def make_request(
data=data,
json=json,
timeout=timeout,
**kwargs,
)

data = response.json()
if response.status_code not in [200, 201]:
raise PandasAIApiCallError(data["message"])
if "message" in data:
raise PandasAIApiCallError(data["message"])
elif "detail" in data:
raise PandasAIApiCallError(data["detail"])

return data

Expand Down
Loading