diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..de32bab Binary files /dev/null and b/.DS_Store differ diff --git a/Dataset_manager_module.ipynb b/Dataset_manager_module.ipynb new file mode 100644 index 0000000..4dc631d --- /dev/null +++ b/Dataset_manager_module.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Google Cloud Storage Dataset Manager\n", + "\n", + "This notebook demonstrates how to use the `Datasets` class for managing datasets in Google Cloud Storage (GCS). The module supports operations like uploading, reading, updating, deleting, and listing datasets in various formats (CSV, JSON, Pickle). \n", + "\n", + "The `Datasets` class provides the following methods:\n", + "1. `upload_dataset` – Uploads a dataset to GCS in the specified format.\n", + "2. `read_dataset` – Reads a dataset from GCS.\n", + "3. `update_dataset` – Updates an existing dataset on GCS.\n", + "4. `delete_dataset` – Deletes a dataset from GCS.\n", + "5. `list_datasets` – Lists all datasets in the specified bucket.\n", + "\n", + "We will demonstrate these methods step by step.\n", + "\n", + "---\n", + "\n", + "## 1. Initialize the Datasets Class\n", + "\n", + "Before performing any operations, we need to initialize the `Datasets` class by authenticating with Google Cloud and specifying the project ID and bucket name.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully authenticated to Google Cloud Storage.\n" + ] + } + ], + "source": [ + "# Import necessary modules\n", + "import warnings\n", + "import json\n", + "import csv\n", + "import pickle\n", + "import os\n", + "import pandas as pd\n", + "from google.cloud import storage\n", + "import io\n", + "from typing import Union, List, Dict, Any, Optional\n", + "\n", + "# Set the path to your service account key file\n", + "os.environ[\"GOOGLE_APPLICATION_CREDENTIALS\"] = \"/Users/mac/Downloads/strkfarm-88de68f85013.json\"\n", + "\n", + "# Suppress warnings\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, module='urllib3')\n", + "\n", + "# Constants for project ID and bucket name\n", + "DEFAULT_PROJECT_ID = \"strkfarm\"\n", + "DEFAULT_BUCKET_NAME = \"strkfarm\"\n", + "\n", + "# Datasets class with all necessary methods\n", + "class Datasets:\n", + " def __init__(self, project_id: str = DEFAULT_PROJECT_ID, bucket_name: str = DEFAULT_BUCKET_NAME) -> None:\n", + " self.project_id = project_id\n", + " self.bucket_name = bucket_name\n", + "\n", + " try:\n", + " self.storage_client = storage.Client(project=self.project_id)\n", + " self.bucket = self.storage_client.bucket(self.bucket_name)\n", + " self.bucket.reload() # Verify the bucket exists and is accessible\n", + " print(\"Successfully authenticated to Google Cloud Storage.\")\n", + " except Exception as e:\n", + " print(f\"Error authenticating or accessing bucket: {e}\")\n", + " raise\n", + "\n", + " SUPPORTED_FORMATS = [\"json\", \"csv\", \"pickle\"]\n", + "\n", + " def upload_dataset(self, \n", + " data: Union[pd.DataFrame, Dict, List, Any], \n", + " filename: str, \n", + " data_format: str = \"json\") -> None:\n", + " \"\"\"\n", + " Uploads data to Google Cloud Storage in the exact specified format.\n", + "\n", + " Args:\n", + " data: The data to upload. 
Can be a pandas DataFrame, dict, list, or other serializable object.\n", + " filename: The target filename in GCS.\n", + " data_format: The format to store the data (\"json\", \"csv\", or \"pickle\"). Defaults to \"json\".\n", + "\n", + " Raises:\n", + " ValueError: If the data_format is not supported.\n", + " Exception: If upload fails.\n", + " \"\"\"\n", + " if data_format not in self.SUPPORTED_FORMATS:\n", + " raise ValueError(f\"Unsupported data format: {data_format}. Must be one of {self.SUPPORTED_FORMATS}\")\n", + "\n", + " try:\n", + " # Handle different input data types and formats\n", + " if isinstance(data, pd.DataFrame):\n", + " blob_data = data.to_csv(index=False).encode('utf-8')\n", + " content_type = 'text/csv'\n", + " elif data_format == \"json\":\n", + " blob_data = json.dumps(data).encode('utf-8')\n", + " content_type = 'application/json'\n", + " elif data_format == \"pickle\":\n", + " blob_data = pickle.dumps(data)\n", + " content_type = 'application/octet-stream'\n", + "\n", + " # Upload to GCS\n", + " blob = self.bucket.blob(filename)\n", + " blob.upload_from_string(blob_data, content_type=content_type)\n", + " print(f\"Dataset '{filename}' uploaded successfully.\")\n", + "\n", + " except Exception as e:\n", + " print(f\"Error uploading dataset: {e}\")\n", + " raise\n", + "\n", + " def read_dataset(self, \n", + " filename: str, \n", + " data_format: str = \"json\",\n", + " as_dataframe: bool = False) -> Optional[Union[pd.DataFrame, Dict, List, Any]]:\n", + " \"\"\"\n", + " Reads data from Google Cloud Storage in the specified format.\n", + "\n", + " Args:\n", + " filename: The name of the file to read from GCS.\n", + " data_format: The format of the stored data (\"json\", \"csv\", or \"pickle\"). Defaults to \"json\".\n", + " as_dataframe: If True and format is \"csv\", returns a pandas DataFrame. Defaults to False.\n", + "\n", + " Returns:\n", + " The loaded data in its appropriate Python format, or None if reading fails.\n", + "\n", + " Raises:\n", + " ValueError: If the data_format is not supported.\n", + " Exception: If read operation fails.\n", + " \"\"\"\n", + " if data_format not in self.SUPPORTED_FORMATS:\n", + " raise ValueError(f\"Unsupported data format: {data_format}. 
Must be one of {self.SUPPORTED_FORMATS}\")\n", + "\n", + " try:\n", + " blob = self.bucket.blob(filename)\n", + " blob_data = blob.download_as_bytes()\n", + "\n", + " if data_format == \"json\":\n", + " return json.loads(blob_data.decode('utf-8'))\n", + " elif data_format == \"csv\":\n", + " csv_data = self._read_csv(blob_data)\n", + " if as_dataframe:\n", + " return pd.DataFrame(csv_data[1:], columns=csv_data[0])\n", + " return csv_data\n", + " elif data_format == \"pickle\":\n", + " return pickle.loads(blob_data)\n", + "\n", + " except Exception as e:\n", + " print(f\"Error reading {data_format} data: {e}\")\n", + " return None\n", + "\n", + " def _read_csv(self, data: bytes) -> List[List[str]]:\n", + " \"\"\"\n", + " Helper function to read CSV data.\n", + "\n", + " Args:\n", + " data: The CSV data as bytes.\n", + "\n", + " Returns:\n", + " List[List[str]]: List of rows from the CSV data.\n", + "\n", + " Raises:\n", + " Exception: If CSV parsing fails.\n", + " \"\"\"\n", + " try:\n", + " reader = csv.reader(io.StringIO(data.decode('utf-8')))\n", + " return list(reader)\n", + " except Exception as e:\n", + " print(f\"Error parsing CSV: {e}\")\n", + " raise\n", + "\n", + " def update_dataset(self, \n", + " data: Union[pd.DataFrame, Dict, List, Any], \n", + " filename: str, \n", + " data_format: str = \"json\") -> None:\n", + " \"\"\"\n", + " Updates (overwrites) an existing dataset in Google Cloud Storage.\n", + "\n", + " This is a wrapper around upload_dataset that makes the update operation explicit.\n", + " Args:\n", + " data: The new data to upload.\n", + " filename: The name of the file to update.\n", + " data_format: The format to store the data (\"json\", \"csv\", or \"pickle\"). Defaults to \"json\".\n", + "\n", + " Raises:\n", + " ValueError: If the data_format is not supported.\n", + " Exception: If update fails.\n", + " \"\"\"\n", + " self.upload_dataset(data, filename, data_format)\n", + "\n", + " def delete_dataset(self, filename: str) -> bool:\n", + " \"\"\"\n", + " Deletes a dataset from Google Cloud Storage.\n", + "\n", + " Args:\n", + " filename: The name of the file to delete.\n", + "\n", + " Returns:\n", + " bool: True if deletion was successful, False otherwise.\n", + "\n", + " Raises:\n", + " Exception: If deletion fails.\n", + " \"\"\"\n", + " blob = self.bucket.blob(filename)\n", + " try:\n", + " blob.delete()\n", + " print(f\"Dataset '{filename}' deleted successfully.\")\n", + " return True\n", + " except Exception as e:\n", + " print(f\"Error deleting dataset: {e}\")\n", + " return False\n", + "\n", + " def list_datasets(self, prefix: str = None) -> List[str]:\n", + " \"\"\"\n", + " Lists all datasets in the bucket, optionally filtered by prefix.\n", + "\n", + " Args:\n", + " prefix: Optional prefix to filter the files. Defaults to None.\n", + "\n", + " Returns:\n", + " List[str]: List of dataset filenames.\n", + " \"\"\"\n", + " try:\n", + " blobs = self.bucket.list_blobs(prefix=prefix)\n", + " return [blob.name for blob in blobs]\n", + " except Exception as e:\n", + " print(f\"Error listing datasets: {e}\")\n", + " return []\n", + "\n", + "# Example usage\n", + "datasets = Datasets()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that the `Datasets` class is initialized, we can start demonstrating how to interact with Google Cloud Storage.\n", + "\n", + "---\n", + "\n", + "## 2. Upload a Dataset to GCS\n", + "\n", + "The `upload_dataset` method uploads a dataset in one of the supported formats (`csv`, `json`, `pickle`). 
We'll demonstrate uploading a pandas DataFrame as a CSV file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully authenticated to Google Cloud Storage.\n", + "Dataset 'people_data.csv' uploaded successfully.\n" + ] + } + ], + "source": [ + "# Create an instance of Datasets class\n", + "datasets = Datasets()\n", + "\n", + "# Example data to upload\n", + "data = pd.DataFrame({\n", + " \"id\": [1, 2, 3],\n", + " \"name\": [\"Alice\", \"Bob\", \"Charlie\"],\n", + " \"age\": [23, 30, 35]\n", + "})\n", + "\n", + "# Upload data as CSV\n", + "filename = \"people_data.csv\"\n", + "datasets.upload_dataset(data, filename, data_format=\"csv\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we've uploaded a simple DataFrame as a CSV file to Google Cloud Storage. The dataset is now available in the GCS bucket under the name `people_data.csv`.\n", + "\n", + "---\n", + "\n", + "## 3. Read a Dataset from GCS\n", + "\n", + "Next, we demonstrate how to read a dataset from Google Cloud Storage. We'll read the CSV file we just uploaded and load it into a pandas DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'datasets' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Read the uploaded CSV dataset from GCS\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m read_data \u001b[38;5;241m=\u001b[39m \u001b[43mdatasets\u001b[49m\u001b[38;5;241m.\u001b[39mread_dataset(filename, data_format\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcsv\u001b[39m\u001b[38;5;124m\"\u001b[39m, as_dataframe\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Display the loaded data\u001b[39;00m\n\u001b[1;32m 5\u001b[0m read_data\n", + "\u001b[0;31mNameError\u001b[0m: name 'datasets' is not defined" + ] + } + ], + "source": [ + "# Read the uploaded CSV dataset from GCS\n", + "read_data = datasets.read_dataset(filename, data_format=\"csv\", as_dataframe=True)\n", + "\n", + "# Display the loaded data\n", + "read_data\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `read_dataset` method successfully loads the dataset from GCS and returns it as a pandas DataFrame.\n", + "\n", + "---\n", + "\n", + "## 4. 
Update an Existing Dataset\n", + "\n", + "If we need to update an existing dataset, we can use the `update_dataset` method, which works as a wrapper around the `upload_dataset` method to overwrite the existing dataset.\n", + "\n", + "For this example, let's update the CSV file by adding a new row.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'people_data.csv' uploaded successfully.\n" + ] + } + ], + "source": [ + "# Modify the data to simulate an update\n", + "updated_data = pd.DataFrame({\n", + " \"id\": [1, 2, 3, 4],\n", + " \"name\": [\"Alice\", \"Bob\", \"Charlie\", \"David\"],\n", + " \"age\": [23, 30, 35, 40]\n", + "})\n", + "\n", + "# Update the dataset on GCS\n", + "datasets.update_dataset(updated_data, filename, data_format=\"csv\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `update_dataset` method successfully overwrites the existing `people_data.csv` with the updated dataset. \n", + "\n", + "---\n", + "\n", + "## 5. List All Datasets in GCS\n", + "\n", + "We can list all datasets (files) in the GCS bucket. This is useful for checking which datasets are available.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['delimited_json.json',\n", + " 'events_response_positions_updated.pkl',\n", + " 'financial_data_json.json',\n", + " 'line_delimited_json.json',\n", + " 'people_data.csv',\n", + " 'sample_data.csv',\n", + " 'table-1_data_new_line_delimited_json.json']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# List all datasets in the bucket\n", + "datasets_list = datasets.list_datasets()\n", + "\n", + "# Display the list of datasets\n", + "datasets_list\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `list_datasets` method returns a list of all files (datasets) in the GCS bucket. You can optionally filter them by a prefix (if you have a naming convention).\n", + "\n", + "---\n", + "\n", + "## 6. Delete a Dataset\n", + "\n", + "If we no longer need a dataset, we can delete it using the `delete_dataset` method. 
Let's delete the `people_data.csv` file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 'people_data.csv' deleted successfully.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Delete the dataset from GCS\n", + "dataset_deleted = datasets.delete_dataset(filename)\n", + "\n", + "# Check if the dataset was successfully deleted\n", + "dataset_deleted\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `delete_dataset` method successfully deletes the `people_data.csv` dataset from Google Cloud Storage.\n", + "\n", + "---\n", + "\n", + "## Conclusion\n", + "\n", + "In this notebook, we've demonstrated the core functionalities of the `Datasets` class, including:\n", + "- Uploading datasets in various formats.\n", + "- Reading datasets from Google Cloud Storage.\n", + "- Updating and overwriting existing datasets.\n", + "- Listing available datasets.\n", + "- Deleting datasets from the bucket.\n", + "\n", + "These functions provide an easy-to-use interface for managing datasets in Google Cloud Storage. You can adapt these methods to work with different types of data and file formats depending on your use case.\n", + "\n", + "---\n", + "\n", + "### Notes:\n", + "- Ensure you have set up Google Cloud authentication (via `GOOGLE_APPLICATION_CREDENTIALS`).\n", + "- The `datasets` object assumes the default project and bucket, but you can provide custom values if necessary.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 78f8f84..22088fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,9 @@ seaborn = ">=0.11,<0.12" scipy = ">=1.5,<1.9" yfinance = ">=0.1.55,<0.2.0" starknet_py = "*" +google-cloud-bigquery = "^3.0.0" +google-cloud-storage = "^2.8.0" +pyyaml = "^6.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..b3872b0 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/data/.DS_Store b/src/data/.DS_Store new file mode 100644 index 0000000..9586c40 Binary files /dev/null and b/src/data/.DS_Store differ diff --git a/src/data/GCS-Dataset-Upload-Manager.md b/src/data/GCS-Dataset-Upload-Manager.md new file mode 100644 index 0000000..dfb070c --- /dev/null +++ b/src/data/GCS-Dataset-Upload-Manager.md @@ -0,0 +1,112 @@ +# Google Cloud Storage Dataset Manager + +This Python module provides a class for managing datasets in Google Cloud Storage (GCS). It includes functionalities for uploading, reading, updating, deleting, and listing datasets in various formats like **JSON**, **CSV**, and **Pickle**. + +## Features + +- **Upload Dataset**: Upload data in various formats (CSV, JSON, Pickle) to Google Cloud Storage. +- **Read Dataset**: Read and download datasets from Google Cloud Storage. +- **Update Dataset**: Update (overwrite) an existing dataset in the cloud. 
+- **Delete Dataset**: Delete a dataset from the cloud. +- **List Datasets**: List all datasets stored in a bucket with optional filtering by prefix. + +## Prerequisites + +Before you start using this module, ensure you have the following: + +1. **Google Cloud Account**: You must have a Google Cloud project and a GCS bucket to store your datasets. +2. **Google Cloud SDK**: Install and configure the Google Cloud SDK, including setting up authentication via service accounts. +3. **Required Python Packages**: + - `google-cloud-storage`: Library to interact with Google Cloud Storage. + - `pandas`: For handling and manipulating datasets (especially for CSV handling). + +### Installation + +1. **Install Python dependencies**: + - You can install the necessary libraries using pip. Run the following command: + + ```bash + pip install google-cloud-storage pandas + ``` + +2. **Set up Google Cloud Credentials**: + - You need to authenticate your Google Cloud access by setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable. You can do this by running: + + ```bash + export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-file.json" + ``` + + - Make sure to replace `/path/to/your/service-account-file.json` with the path to your service account JSON file. + +## Module Overview + +The **`Datasets`** class includes methods for uploading, reading, updating, deleting, and listing datasets in your Google Cloud Storage bucket. + +### Constants + +- `DEFAULT_PROJECT_ID`: The Google Cloud project ID (default: `"strkfarm"`). +- `DEFAULT_BUCKET_NAME`: The GCS bucket name (default: `"strkfarm"`). + +### Class: `Datasets` + +This class allows you to interact with your GCS bucket and perform dataset management tasks. + +#### Methods: + +1. **`__init__(self, project_id: str, bucket_name: str)`**: + - Initializes the `Datasets` class with Google Cloud credentials and bucket information. + +2. **`upload_dataset(self, data, filename, data_format='json')`**: + - Uploads a dataset to Google Cloud Storage in the specified format. + - Supported formats: `json`, `csv`, `pickle`. + +3. **`read_dataset(self, filename, data_format='json', as_dataframe=False)`**: + - Reads a dataset from Google Cloud Storage in the specified format and returns the data. + - Returns a pandas DataFrame if the data is in CSV format and `as_dataframe=True`. + +4. **`update_dataset(self, data, filename, data_format='json')`**: + - Updates (overwrites) an existing dataset in Google Cloud Storage. + +5. **`delete_dataset(self, filename)`**: + - Deletes a dataset from Google Cloud Storage. + +6. **`list_datasets(self, prefix=None)`**: + - Lists all datasets in the GCS bucket, optionally filtering by prefix. + + + +## Running the Code + +### To run the code: +1. **`Set up the environment:`** +- Ensure you have the Google Cloud SDK installed and configured. +- Set the GOOGLE_APPLICATION_CREDENTIALS environment variable to point to your service account key file: +```bash +export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-file.json" +``` + +## 2. Modify the Script with Your File Path and Desired Filename + +1. Open the Python script (`datasets2.py`) and locate the following section inside the `main()` function: + + ```python + file_path = "/Users/mac/Downloads/table-1_data_new_line_delimited_json.json" # Replace with the actual file path + target_filename = "line_delimited_json.json" # Desired name in GCS + ``` + +2. Replace `file_path` with the path to the file you want to upload. + +3. 
Set the `target_filename` variable to the name you want the file to have once it's uploaded to Google Cloud Storage (GCS). + +--- + +## 3. Run the Python Script + +1. After updating the script, execute it with the following command in your terminal: + + ```bash + python3 datasets2.py + ``` + +--- + diff --git a/src/data/datasets2.py b/src/data/datasets2.py new file mode 100644 index 0000000..68da25d --- /dev/null +++ b/src/data/datasets2.py @@ -0,0 +1,336 @@ +""" +Google Cloud Storage Dataset Manager Module + +This module provides a class for managing datasets in Google Cloud Storage (GCS). +It supports various data formats and common operations like upload, read, update, and delete. + +Constants: + DEFAULT_PROJECT_ID: The default Google Cloud project ID + DEFAULT_BUCKET_NAME: The default GCS bucket name + +Dependencies: + - google-cloud-storage + - pandas + - json + - csv + - pickle + - io + +Typical usage example: + datasets = Datasets() # Uses default constants + datasets.upload_dataset(my_dataframe, "dataset.csv", data_format="csv") + data = datasets.read_dataset("dataset.csv", data_format="csv") +""" + +import json +import csv +import pickle +import os +from typing import Union, List, Dict, Any, Optional +from google.cloud import storage +import io +import pandas as pd + +# Module-level constants +DEFAULT_PROJECT_ID: str = "strkfarm" +DEFAULT_BUCKET_NAME: str = "strkfarm" + + +# Environment variable overrides for constants +PROJECT_ID = os.getenv('GCS_PROJECT_ID', DEFAULT_PROJECT_ID) +BUCKET_NAME = os.getenv('GCS_BUCKET_NAME', DEFAULT_BUCKET_NAME) + +# Supported data formats +SUPPORTED_FORMATS = ["json", "csv", "pickle"] + + +class Datasets: + """ + A class to manage dataset operations in Google Cloud Storage. + + This class provides methods for uploading, reading, updating, and deleting + datasets in various formats (JSON, CSV, Pickle) to/from Google Cloud Storage. + + Attributes: + project_id (str): The Google Cloud project ID. + bucket_name (str): The name of the GCS bucket. + storage_client: The Google Cloud Storage client instance. + bucket: The GCS bucket instance. + """ + + def __init__(self, project_id: str = PROJECT_ID, bucket_name: str = BUCKET_NAME) -> None: + """ + Initializes the Datasets class with GCS credentials and bucket information. + + Args: + project_id: The Google Cloud project ID. Defaults to PROJECT_ID constant. + bucket_name: The name of the GCS bucket. Defaults to BUCKET_NAME constant. + + Raises: + Exception: If authentication fails or bucket access is denied. + """ + self.project_id = project_id + self.bucket_name = bucket_name + + # Initialize GCS client and bucket + try: + self.storage_client = storage.Client(project=self.project_id) + self.bucket = self.storage_client.bucket(self.bucket_name) + self.bucket.reload() # Verify bucket exists and is accessible + print("Successfully authenticated to Google Cloud Storage.") + except Exception as e: + print(f"Error authenticating or accessing bucket: {e}") + print("Ensure GOOGLE_APPLICATION_CREDENTIALS is set correctly and the service account has permissions.") + raise + + def upload_dataset(self, + data: Union[pd.DataFrame, Dict, List, Any], + filename: str, + data_format: str = "json") -> None: + """ + Uploads data to Google Cloud Storage in the specified format. + + Args: + data: The data to upload. Can be a pandas DataFrame, dict, list, or other serializable object. + filename: The target filename in GCS. + data_format: The format to store the data ("json", "csv", or "pickle"). Defaults to "json". 
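        Note:
            When ``data`` is a pandas DataFrame it is serialized as CSV regardless
            of ``data_format`` (the isinstance check below takes precedence), so
            pass ``data_format="csv"`` for DataFrames.

        Example (illustrative; assumes credentials and the default bucket are configured):
            >>> ds = Datasets()
            >>> ds.upload_dataset({"key": "value"}, "example.json", data_format="json")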
+ + Raises: + ValueError: If the data_format is not supported. + Exception: If upload fails. + """ + if data_format not in SUPPORTED_FORMATS: + raise ValueError(f"Unsupported data format: {data_format}. Must be one of {SUPPORTED_FORMATS}") + + try: + # Handle different input data types and formats + if isinstance(data, pd.DataFrame): + blob_data = data.to_csv(index=False).encode('utf-8') + content_type = 'text/csv' + elif data_format == "json": + blob_data = json.dumps(data).encode('utf-8') + content_type = 'application/json' + elif data_format == "csv": + blob_data = self._write_csv(data) + content_type = 'text/csv' + elif data_format == "pickle": + blob_data = pickle.dumps(data) + content_type = 'application/octet-stream' + + # Upload to GCS + blob = self.bucket.blob(filename) + blob.upload_from_string(blob_data, content_type=content_type) + print(f"Dataset '{filename}' uploaded successfully.") + + except Exception as e: + print(f"Error uploading dataset: {e}") + raise + + def _write_csv(self, data: List[List[Any]]) -> bytes: + """ + Helper function to convert data to CSV format. + + Args: + data: List of rows to write to CSV. + + Returns: + bytes: Encoded CSV data as bytes. + + Raises: + Exception: If CSV conversion fails. + """ + try: + output = io.StringIO() + writer = csv.writer(output) + writer.writerows(data) + return output.getvalue().encode('utf-8') + except Exception as e: + print(f"Error converting to CSV: {e}") + raise + + def read_dataset(self, + filename: str, + data_format: str = "json", + as_dataframe: bool = False) -> Optional[Union[pd.DataFrame, Dict, List, Any]]: + """ + Reads data from Google Cloud Storage in the specified format. + + Args: + filename: The name of the file to read from GCS. + data_format: The format of the stored data ("json", "csv", or "pickle"). Defaults to "json". + as_dataframe: If True and format is "csv", returns a pandas DataFrame. Defaults to False. + + Returns: + The loaded data in its appropriate Python format, or None if reading fails. + + Raises: + ValueError: If the data_format is not supported. + Exception: If read operation fails. + """ + if data_format not in SUPPORTED_FORMATS: + raise ValueError(f"Unsupported data format: {data_format}. Must be one of {SUPPORTED_FORMATS}") + + try: + blob = self.bucket.blob(filename) + blob_data = blob.download_as_bytes() + + if data_format == "json": + return json.loads(blob_data.decode('utf-8')) + elif data_format == "csv": + csv_data = self._read_csv(blob_data) + if as_dataframe: + return pd.DataFrame(csv_data[1:], columns=csv_data[0]) + return csv_data + elif data_format == "pickle": + return pickle.loads(blob_data) + + except Exception as e: + print(f"Error reading {data_format} data: {e}") + return None + + def _read_csv(self, data: bytes) -> List[List[str]]: + """ + Helper function to read CSV data. + + Args: + data: The CSV data as bytes. + + Returns: + List[List[str]]: List of rows from the CSV data. + + Raises: + Exception: If CSV parsing fails. + """ + try: + reader = csv.reader(io.StringIO(data.decode('utf-8'))) + return list(reader) + except Exception as e: + print(f"Error parsing CSV: {e}") + raise + + def update_dataset(self, + data: Union[pd.DataFrame, Dict, List, Any], + filename: str, + data_format: str = "json") -> None: + """ + Updates (overwrites) an existing dataset in Google Cloud Storage. + + This is a wrapper around upload_dataset that makes the update operation explicit. + + Args: + data: The new data to upload. + filename: The name of the file to update. 
+ data_format: The format to store the data ("json", "csv", or "pickle"). Defaults to "json". + + Raises: + ValueError: If the data_format is not supported. + Exception: If update fails. + """ + self.upload_dataset(data, filename, data_format) + + def delete_dataset(self, filename: str) -> bool: + """ + Deletes a dataset from Google Cloud Storage. + + Args: + filename: The name of the file to delete. + + Returns: + bool: True if deletion was successful, False otherwise. + + Raises: + Exception: If deletion fails. + """ + blob = self.bucket.blob(filename) + try: + blob.delete() + print(f"Dataset '{filename}' deleted successfully.") + return True + except Exception as e: + print(f"Error deleting dataset: {e}") + return False + + def list_datasets(self, prefix: str = None) -> List[str]: + """ + Lists all datasets in the bucket, optionally filtered by prefix. + + Args: + prefix: Optional prefix to filter the files. Defaults to None. + + Returns: + List[str]: List of dataset filenames. + """ + try: + blobs = self.bucket.list_blobs(prefix=prefix) + return [blob.name for blob in blobs] + except Exception as e: + print(f"Error listing datasets: {e}") + return [] + + +def main(): + """ + Main entry point for demonstration purposes. + Shows example usage of the Datasets class. + """ + try: + # Initialize with default settings + datasets = Datasets() + + # Specify the file path and target filename + file_path = "/Users/mac/Documents/React projects/strkfarm/strategy-research/data_downloaded/events_response_positions_updated.pkl" # Replace with the actual file path + target_filename = "events_response_positions_updated.pkl" # Desired name in GCS + + # Open the pickle file in binary mode and load it + with open(file_path, "rb") as f: + file_content = f.read() + + # Upload the file content (as a pickle object) + datasets.upload_dataset(file_content, target_filename, data_format="pickle") + + print(f"File '{file_path}' uploaded successfully as '{target_filename}'.") + + # List all datasets + print("\nAvailable datasets:") + for dataset in datasets.list_datasets(): + print(f"- {dataset}") + + except Exception as e: + print(f"An error occurred in main: {e}") + """ + Main entry point for demonstration purposes. + Shows example usage of the Datasets class. 
+ """ + try: + # Initialize with default settings + datasets = Datasets() + + # Local pickle file path (in the same directory as the script) + local_pickle_path = "/Users/mac/Documents/React projects/strkfarm/strategy-research/data_downloaded/events_response_positions_updated.pkl" # Replace with your pickle file name + + # Read the local pickle file + with open(local_pickle_path, 'rb') as file: + pickle_data = pickle.load(file) + + # Upload to GCS with the same filename (or change as needed) + gcs_filename = os.path.basename(local_pickle_path) # Uses same filename + + datasets.upload_dataset(pickle_data, gcs_filename, data_format="pickle") + + print(f"Successfully uploaded {local_pickle_path} to GCS") + + # Optional: Verify the upload by listing files + print("\nAvailable datasets in bucket:") + for dataset in datasets.list_datasets(): + print(f"- {dataset}") + + except FileNotFoundError as e: + print(f"Error: Pickle file not found - {e}") + except pickle.UnpicklingError as e: + print(f"Error: Invalid pickle file - {e}") + except Exception as e: + print(f"An error occurred in main: {e}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 0000000..abbc046 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,242 @@ +import pytest +import pickle +from unittest.mock import MagicMock, patch +from src.data.datasets2 import Datasets + + +@patch("datasets2.storage.Client") +def test_init(mock_storage_client): + """ + Test initialization of Datasets class. + Verifies correct project ID, bucket name, and bucket reload. + """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + + # Configure mock storage client to return predefined instances + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + + # Create Datasets instance + datasets = Datasets() + + # Assertions to validate initialization + assert datasets.project_id == "strkfarm" + assert datasets.bucket_name == "strkfarm" + mock_client_instance.bucket.assert_called_once_with("strkfarm") + mock_bucket.reload.assert_called_once() + + +@patch("datasets2.storage.Client") +def test_upload_dataset_json(mock_storage_client): + """ + Test uploading JSON dataset to Google Cloud Storage. + Verifies correct JSON serialization and upload. + """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + mock_blob = MagicMock() + + # Configure mock storage client to return predefined instances + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + + # Create Datasets instance + datasets = Datasets() + + # Upload sample JSON dataset + datasets.upload_dataset({"key": "value"}, "test.json", data_format="json") + + # Verify upload was called with correct parameters + mock_blob.upload_from_string.assert_called_once_with( + b'{"key": "value"}', content_type="application/json" + ) + + +@patch("datasets2.storage.Client") +def test_upload_dataset_csv(mock_storage_client): + """ + Test uploading CSV dataset to Google Cloud Storage. + Verifies correct CSV formatting and upload. 
+ """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + mock_blob = MagicMock() + + # Configure mock storage client to return predefined instances + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + + # Create Datasets instance + datasets = Datasets() + + # Upload sample CSV dataset + datasets.upload_dataset( + [["col1", "col2"], ["val1", "val2"]], + "test.csv", + data_format="csv", + ) + + # Verify upload was called with correct parameters + mock_blob.upload_from_string.assert_called_once_with( + b"col1,col2\r\nval1,val2\r\n", content_type="text/csv" + ) + + +@patch("datasets2.storage.Client") +def test_read_dataset_json(mock_storage_client): + """ + Test reading JSON dataset from Google Cloud Storage. + Verifies correct JSON deserialization. + """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + mock_blob = MagicMock() + + # Configure mock storage client and download behavior + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + mock_blob.download_as_bytes.return_value = b'{"key": "value"}' + + # Create Datasets instance + datasets = Datasets() + + # Read dataset and verify content + data = datasets.read_dataset("test.json", data_format="json") + + assert data == {"key": "value"} + mock_blob.download_as_bytes.assert_called_once() + + +@patch("datasets2.storage.Client") +def test_upload_dataset_pickle(mock_storage_client): + """ + Test uploading Pickle dataset to Google Cloud Storage. + Verifies correct pickle serialization and upload. + """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + mock_blob = MagicMock() + + # Configure mock storage client to return predefined instances + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + + # Create Datasets instance + datasets = Datasets() + + # Prepare sample data for pickle upload + sample_data = {"key": "value"} + datasets.upload_dataset(sample_data, "test.pkl", data_format="pickle") + + # Verify pickle upload + mock_blob.upload_from_string.assert_called_once() + uploaded_data = mock_blob.upload_from_string.call_args[0][0] + assert pickle.loads(uploaded_data) == sample_data + assert mock_blob.upload_from_string.call_args[1]["content_type"] == "application/octet-stream" + + +@patch("datasets2.storage.Client") +def test_read_dataset_pickle(mock_storage_client): + """ + Test reading Pickle dataset from Google Cloud Storage. + Verifies correct pickle deserialization. 
+ """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + mock_blob = MagicMock() + + # Configure mock storage client to return predefined instances + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + + # Prepare sample data for pickle download + sample_data = {"key": "value"} + mock_blob.download_as_bytes.return_value = pickle.dumps(sample_data) + + # Create Datasets instance + datasets = Datasets() + + # Read pickle dataset and verify content + data = datasets.read_dataset("test.pkl", data_format="pickle") + + # Assertions + assert data == sample_data # Check if data is correctly deserialized + mock_blob.download_as_bytes.assert_called_once() # Ensure download was called once + + + +@patch("datasets2.storage.Client") +def test_list_datasets(mock_storage_client): + """ + Test listing datasets from Google Cloud Storage. + Verifies retrieval of files with different extensions. + """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + + # Create mock blobs with different file extensions + mock_blob1 = MagicMock() + mock_blob2 = MagicMock() + mock_blob3 = MagicMock() + + # Set names for different file types + mock_blob1.name = "file1.json" + mock_blob2.name = "file2.csv" + mock_blob3.name = "file3.pkl" # Added pickle file + + # Configure mock storage client to return the blobs + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.list_blobs.return_value = [mock_blob1, mock_blob2, mock_blob3] + + # Create Datasets instance + datasets = Datasets() + + # Call list_datasets method + files = datasets.list_datasets() + + # Assert that all files are returned, including the new pickle file + assert files == ["file1.json", "file2.csv", "file3.pkl"] + + # Verify that list_blobs was called once + mock_bucket.list_blobs.assert_called_once() + + +@patch("datasets2.storage.Client") +def test_delete_dataset(mock_storage_client): + """ + Test deleting a dataset from Google Cloud Storage. + Verifies successful deletion and return value. + """ + # Create mock storage client instances + mock_client_instance = MagicMock() + mock_bucket = MagicMock() + mock_blob = MagicMock() + + # Configure mock storage client to return predefined instances + mock_storage_client.return_value = mock_client_instance + mock_client_instance.bucket.return_value = mock_bucket + mock_bucket.blob.return_value = mock_blob + + # Create Datasets instance + datasets = Datasets() + + # Attempt to delete a dataset + result = datasets.delete_dataset("test.json") + + # Verify deletion was successful + assert result is True + mock_blob.delete.assert_called_once() \ No newline at end of file