Skip to content

Commit

Permalink
Uploading tutorials
Browse files Browse the repository at this point in the history
  • Loading branch information
turetske committed Oct 25, 2024
1 parent 2cd65c9 commit c7e682c
Show file tree
Hide file tree
Showing 4 changed files with 597 additions and 0 deletions.
30 changes: 30 additions & 0 deletions examples/intake/intake-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import warnings

# Silence noisy third-party warnings (dask/distributed/intake) for the demo.
warnings.filterwarnings("ignore")

import intake
from distributed import LocalCluster, Client
from ncar_jobqueue import NCARCluster


if __name__ == '__main__':

    # If not using NCAR HPC, use the LocalCluster instead:
    #cluster = LocalCluster()
    cluster = NCARCluster()
    cluster.scale(10)

    client = Client(cluster)

    try:
        # Open the ESM catalog (served via Pelican) and load a small subset.
        catalog = intake.open_esm_datastore(
            'file://examples/intake/resources/pelican-test-intake.json'
        )

        catalog_subset = catalog.search(variable='FLNS', frequency='monthly')
        dsets = catalog_subset.to_dataset_dict()
    finally:
        # Release the Dask client and the job-queue workers on exit.
        client.close()
        cluster.close()
292 changes: 292 additions & 0 deletions examples/pytorch/BasePelicanPytorch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training and Evaluating a Model with PyTorch, FSSpec, and Remote CSV Data\n",
"\n",
"This notebook demonstrates how to train a simple neural network using PyTorch with data read from remote CSV files over HTTPS using `fsspec`. The example includes data pipelines for both training and test datasets and evaluates the model's accuracy on the test set.\n",
"\n",
"## Install Dependencies\n",
"\n",
"```python\n",
"!pip install torch fsspec pandas torchdata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import pandas as pd\n",
"import fsspec\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Nueral Network"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a simple feedforward nueral network for the example"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"class SimpleNN(nn.Module):\n",
"    \"\"\"Minimal feedforward classifier: 784 input pixels -> 50 hidden -> 10 class logits.\"\"\"\n",
"    def __init__(self):\n",
"        super(SimpleNN, self).__init__()\n",
"        # 784 = 28x28 flattened Fashion-MNIST image\n",
"        self.fc1 = nn.Linear(784, 50)\n",
"        # 10 output logits, one per Fashion-MNIST class (labels 0-9)\n",
"        self.fc2 = nn.Linear(50, 10)\n",
"\n",
"    def forward(self, x):\n",
"        x = torch.relu(self.fc1(x))\n",
"        x = self.fc2(x)\n",
"        return x\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Custom Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a custom dataset for PyTorch"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"class CSVDataset(Dataset):\n",
" def __init__(self, data):\n",
" self.data = data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
" \n",
" def __getitem__(self, index):\n",
" return self.data[index]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define Functions to Read and Process Remote CSV Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This will use the fsspec to read and process data.\n",
"\n",
"Note that this notebook isn't using the fsspec handling functions built into torchdata.datapipes because that package is being deprecated"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def read_csv_from_url(file_url):\n",
"    # Create a filesystem object for the OSDF protocol (PelicanFS)\n",
"    fs = fsspec.filesystem('osdf')\n",
"    # Open the remote file\n",
"    with fs.open(file_url, 'r') as f:\n",
"        # Read the file into a pandas DataFrame\n",
"        df = pd.read_csv(f, index_col=False)\n",
"    return df\n",
"\n",
"def dataframe_to_dataset(df):\n",
"    # Kaggle-format Fashion-MNIST CSVs store the class label in the FIRST\n",
"    # column, followed by the 784 pixel columns. (Using the last column as\n",
"    # the target would make the \"labels\" raw pixel values, which is what\n",
"    # caused the 'Target ... is out of bounds' CrossEntropyLoss error.)\n",
"    features = df.iloc[:, 1:].values.astype(np.float32)\n",
"    targets = df.iloc[:, 0].values.astype(np.int64)\n",
"    dataset = [(torch.tensor(feature), torch.tensor(target)) for feature, target in zip(features, targets)]\n",
"    return dataset\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare the Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the data remotely from Pelican using fsspec with the 'osdf' protocol. (Note that the OSDF protocol is a specific version of PelicanFS with the discoverURL alreayd set)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Define remote file URLs\n",
"train_csv_url = '/chtc/PUBLIC/hzhao292/fashion-mnist_train.csv'\n",
"test_csv_url = '/chtc/PUBLIC/hzhao292/fashion-mnist_test.csv'\n",
"\n",
"# Read and convert data\n",
"train_df = read_csv_from_url(train_csv_url)\n",
"test_df = read_csv_from_url(test_csv_url)\n",
"train_data = dataframe_to_dataset(train_df)\n",
"test_data = dataframe_to_dataset(test_df)\n",
"\n",
"# Create DataLoaders\n",
"train_dataset = CSVDataset(train_data)\n",
"test_dataset = CSVDataset(test_data)\n",
"train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train our example model using the data from Pelican."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Instantiate model, loss function, and optimizer\n",
"model = SimpleNN()\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=0.01)\n",
"\n",
"# Training loop\n",
"epochs = 5\n",
"for epoch in range(epochs):\n",
"    model.train()\n",
"    running_loss = 0.0\n",
"    for batch_X, batch_y in train_loader:\n",
"        optimizer.zero_grad()\n",
"        outputs = model(batch_X)\n",
"        loss = criterion(outputs, batch_y)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"        # Accumulate the sum of per-sample losses for the epoch average\n",
"        running_loss += loss.item() * batch_X.size(0)\n",
"\n",
"    epoch_loss = running_loss / len(train_loader.dataset)\n",
"    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate the Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate the accuracy of the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for batch_X, batch_y in test_loader:\n",
" outputs = model(batch_X)\n",
" _, predicted = torch.max(outputs, 1)\n",
" total += batch_y.size(0)\n",
" correct += (predicted == batch_y).sum().item()\n",
"\n",
"accuracy = correct / total\n",
"print(f'Accuracy on test data: {accuracy:.4f}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
12 changes: 12 additions & 0 deletions examples/pytorch/pytorch_with_pelicanfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""List and open remote files from an OSDF/Pelican origin via torchdata's fsspec datapipes."""
import torch
# Compatibility shim: newer torch moved the dill-availability check, but
# torchdata's datapipes still look up DILL_AVAILABLE on this module at import
# time. Must run after `import torch` and BEFORE importing torchdata.
# NOTE(review): workaround for a torch/torchdata version mismatch — confirm it
# is still needed for the pinned versions of both packages.
torch.utils.data.datapipes.utils.common.DILL_AVAILABLE = torch.utils._import_utils.dill_available()
from torchdata.datapipes.iter import IterableWrapper


if __name__ == '__main__':
    # Enumerate the files under the remote directory using the 'osdf' fsspec protocol.
    dp = IterableWrapper(["osdf:///chtc/PUBLIC/eturetsky/data/faces/"]).list_files_by_fsspec()
    print(list(dp))

    # Open each remote file and print its path alongside the file stream object.
    dp = IterableWrapper(["osdf:///chtc/PUBLIC/eturetsky/data/faces/"]).open_files_by_fsspec()
    for path, filestream in dp:
        print(path, filestream)
Loading

0 comments on commit c7e682c

Please sign in to comment.