Skip to content

Commit

Permalink
Merge pull request #381 from saturncloud/update-taxi-data
Browse files Browse the repository at this point in the history
update s3 bucket for yellowtrip data
  • Loading branch information
Nathan Ballou authored Jul 12, 2022
2 parents edb7a93 + 3ac6ee5 commit 1c5e846
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 27 deletions.
4 changes: 2 additions & 2 deletions examples/dask/machine-learning-grid-search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
},
"outputs": [],
"source": [
"taxi = pd.read_parquet(\"https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-05.parquet\")"
"taxi = pd.read_parquet(\"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\")"
]
},
{
Expand Down Expand Up @@ -308,7 +308,7 @@
"import dask.dataframe as dd\n",
"\n",
"taxi_dd = dd.read_parquet(\n",
" \"s3://nyc-tlc/trip data/yellow_tripdata_2020-05.parquet\",\n",
" \"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\",\n",
" storage_options={\"anon\": True},\n",
" assume_missing=True,\n",
")"
Expand Down
2 changes: 1 addition & 1 deletion examples/dask/special-topics-rolling-average.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"import dask.dataframe as dd\n",
"\n",
"taxi = dd.read_parquet(\n",
" \"s3://nyc-tlc/trip data/yellow_tripdata_2019-01.parquet\",\n",
" \"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\",\n",
" storage_options={\"anon\": True},\n",
").sample(frac=0.1, replace=False)"
]
Expand Down
6 changes: 3 additions & 3 deletions examples/load-data/load-data-s3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@
"source": [
"import pandas as pd\n",
"\n",
"file = \"nyc-tlc/trip data/yellow_tripdata_2019-01.parquet\"\n",
"file = \"saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\"\n",
"with s3.open(file, mode=\"rb\") as f:\n",
" df = pd.read_parquet(f)"
]
Expand All @@ -149,7 +149,7 @@
"source": [
"import dask.dataframe as dd\n",
"\n",
"file = \"nyc-tlc/trip data/yellow_tripdata_2019-01.parquet\"\n",
"file = \"saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\"\n",
"with s3.open(file, mode=\"rb\") as f:\n",
" df = dd.read_parquet(f)"
]
Expand All @@ -168,7 +168,7 @@
"metadata": {},
"outputs": [],
"source": [
"files = s3.glob(\"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\")\n",
"files = s3.glob(\"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-*.parquet\")\n",
"taxi = dd.read_parquet(\n",
" files,\n",
" storage_options={\"anon\": False},\n",
Expand Down
13 changes: 8 additions & 5 deletions examples/prefect/03-prefect-resource-manager.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,7 @@
"source": [
"@task\n",
"def read():\n",
" taxi = dd.read_parquet(\n",
" \"https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-01.parquet\"\n",
" )\n",
" taxi = dd.read_parquet(\"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\")\n",
" df2 = taxi[taxi.passenger_count > 1]\n",
" df3 = df2.groupby(\"VendorID\").passenger_count.std()\n",
" return df3"
Expand Down Expand Up @@ -269,7 +267,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "saturn (Python 3)",
"display_name": "Python 3.10.2 64-bit",
"language": "python",
"name": "python3"
},
Expand All @@ -283,7 +281,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
"version": "3.10.2"
},
"vscode": {
"interpreter": {
"hash": "b7848b2fbd737d4d16c30c2a265d9cb43a8b0508277d828bf32f61f61a6b4e46"
}
}
},
"nbformat": 4,
Expand Down
4 changes: 2 additions & 2 deletions examples/rapids-comparison/comparison.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
},
"outputs": [],
"source": [
"!curl https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-01.parquet > data.parquet"
"!curl https://saturn-public-data.s3.us-east-2.amazonaws.com/nyc-taxi/data/yellow_tripdata_2019-01.parquet > data.parquet"
]
},
{
Expand Down Expand Up @@ -279,7 +279,7 @@
"source": [
"with timing(\"GPU + Dask: Random Forest (12x the data)\"):\n",
" taxi_dask = dask_cudf.read_parquet(\n",
" \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.parquet\",\n",
" \"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-*.parquet\",\n",
" storage_options={\"anon\": True},\n",
" assume_missing=True,\n",
" )\n",
Expand Down
15 changes: 9 additions & 6 deletions examples/rapids/01-rapids-single-gpu.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,7 @@
"metadata": {},
"outputs": [],
"source": [
"taxi = cudf.read_parquet(\n",
" \"https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-01.parquet\"\n",
")"
"taxi = cudf.read_parquet(\"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\")"
]
},
{
Expand Down Expand Up @@ -281,7 +279,7 @@
"outputs": [],
"source": [
"taxi_test = cudf.read_parquet(\n",
" \"https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-02.parquet\"\n",
" \"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-02.parquet\"\n",
")"
]
},
Expand Down Expand Up @@ -373,7 +371,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "saturn (Python 3)",
"display_name": "Python 3.10.2 64-bit",
"language": "python",
"name": "python3"
},
Expand All @@ -387,7 +385,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
"version": "3.10.2"
},
"vscode": {
"interpreter": {
"hash": "b7848b2fbd737d4d16c30c2a265d9cb43a8b0508277d828bf32f61f61a6b4e46"
}
}
},
"nbformat": 4,
Expand Down
19 changes: 11 additions & 8 deletions examples/rapids/02-rapids-gpu-cluster.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@
"source": [
"taxi = (\n",
" dask_cudf.read_parquet(\n",
" \"s3://nyc-tlc/trip data/yellow_tripdata_2019-01.parquet\",\n",
" \"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-01.parquet\",\n",
" storage_options={\"anon\": True},\n",
" assume_missing=True,\n",
" )\n",
Expand Down Expand Up @@ -337,7 +337,7 @@
"outputs": [],
"source": [
"taxi_test = dask_cudf.read_parquet(\n",
" \"s3://nyc-tlc/trip data/yellow_tripdata_2019-02.parquet\",\n",
" \"s3://saturn-public-data/nyc-taxi/data/yellow_tripdata_2019-02.parquet\",\n",
" storage_options={\"anon\": True},\n",
" assume_missing=True,\n",
").persist()\n",
Expand Down Expand Up @@ -423,18 +423,16 @@
"\n",
"By only changing a few lines of code, we went from training on a single GPU to training on a GPU cluster! Wow! \n",
"\n",
"Feel free to play around with parameters and the volume of data. You could, for instance, read in and train on all of 2019's taxi data (`yellow_tripdata_2019-*.csv`). *Make sure you test on a different test set!*\n",
"Feel free to play around with parameters and the volume of data. You could, for instance, read in and train on all of 2019's taxi data (`yellow_tripdata_2019-*.parquet`). *Make sure you test on a different test set!*\n",
"\n",
"Take a look at our other [examples](https://saturncloud.io/docs/examples/) for more resources on running models on single and multiple GPUs!"
]
}
],
"metadata": {
"interpreter": {
"hash": "0c30809920022c12dc34b6aa5982c47acf3f18a4dd3ede4f803889865384c7fa"
},
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('base': conda)",
"display_name": "Python 3.10.2 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
Expand All @@ -447,7 +445,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
"version": "3.10.2"
},
"vscode": {
"interpreter": {
"hash": "b7848b2fbd737d4d16c30c2a265d9cb43a8b0508277d828bf32f61f61a6b4e46"
}
}
},
"nbformat": 4,
Expand Down

0 comments on commit 1c5e846

Please sign in to comment.