Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Remove HugeCTR examples
Browse files Browse the repository at this point in the history
radekosmulski committed May 9, 2023
1 parent 92dc8cb commit e4e5396
Showing 5 changed files with 44 additions and 1,181 deletions.
39 changes: 10 additions & 29 deletions examples/scaling-criteo/01-Download-Convert.ipynb
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -33,7 +33,7 @@
"\n",
"# Scaling Criteo: Download and Convert\n",
"\n",
"This notebook is created using the latest stable [merlin-hugectr](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-hugectr/tags), [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags), or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container. \n",
"This notebook was created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container. \n",
"\n",
"## Criteo 1TB Click Logs dataset\n",
"\n",
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -151,7 +151,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -177,7 +177,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -196,7 +196,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -227,7 +227,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -277,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -297,28 +297,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "python3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"merlin": {
"containers": [
"nvcr.io/nvidia/merlin/merlin-hugectr:latest",
"nvcr.io/nvidia/merlin/merlin-tensorflow:latest",
"nvcr.io/nvidia/merlin/merlin-pytorch:latest"
]
}
},
"nbformat": 4,
136 changes: 33 additions & 103 deletions examples/scaling-criteo/02-ETL-with-NVTabular.ipynb
Original file line number Diff line number Diff line change
@@ -2,12 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
@@ -37,7 +33,7 @@
"\n",
"# Scaling Criteo: ETL with NVTabular\n",
"\n",
"This notebook is created using the latest stable [merlin-hugectr](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-hugectr/tags), [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags), or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n",
"This notebook was created using the latest stable [merlin-tensorflow](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-tensorflow/tags) or [merlin-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/merlin/containers/merlin-pytorch/tags) container.\n",
"\n",
"## Overview\n",
"\n",
@@ -80,12 +76,8 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Standard Libraries\n",
@@ -122,19 +114,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define some information about where to get our data\n",
"BASE_DIR = os.environ.get(\"BASE_DIR\", \"/raid/data/criteo\")\n",
"INPUT_DATA_DIR = os.environ.get(\"INPUT_DATA_DIR\", BASE_DIR + \"/converted/criteo\")\n",
"OUTPUT_DATA_DIR = os.environ.get(\"OUTPUT_DATA_DIR\", BASE_DIR + \"/test_dask/output\")\n",
"USE_HUGECTR = bool(os.environ.get(\"USE_HUGECTR\", \"\"))\n",
"stats_path = os.path.join(OUTPUT_DATA_DIR, \"test_dask/stats\")\n",
"dask_workdir = os.path.join(OUTPUT_DATA_DIR, \"test_dask/workdir\")\n",
"\n",
@@ -163,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -179,12 +166,8 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
@@ -216,12 +199,8 @@
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
@@ -465,7 +444,7 @@
"<Client: 'tcp://127.0.0.1:44059' processes=2 threads=2, memory=100.00 GiB>"
]
},
"execution_count": 9,
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
@@ -537,12 +516,8 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define our dataset schema\n",
@@ -568,24 +543,19 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We need to enforce the required HugeCTR data types, so we set them in a dictionary and give as an argument when creating our dataset. The dictionary defines the output datatypes of our datasets."
"Optionally, we can define the output datatypes of our datasets."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dict_dtypes = {}\n",
"\n",
"# The environment variable USE_HUGECTR defines, if we want to use the output for HugeCTR or another framework\n",
"for col in CATEGORICAL_COLUMNS:\n",
" dict_dtypes[col] = np.int64 if USE_HUGECTR else np.int32\n",
" dict_dtypes[col] = np.int32\n",
"\n",
"for col in CONTINUOUS_COLUMNS:\n",
" dict_dtypes[col] = np.float32\n",
@@ -603,12 +573,8 @@
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = nvt.Dataset(train_paths, engine=\"parquet\", part_size=part_size)\n",
@@ -624,12 +590,8 @@
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output_train_dir = os.path.join(OUTPUT_DATA_DIR, \"train/\")\n",
@@ -647,7 +609,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -661,10 +623,10 @@
{
"data": {
"text/plain": [
"<nvtabular.workflow.workflow.Workflow at 0x7fdacec4fdc0>"
"<nvtabular.workflow.workflow.Workflow>"
]
},
"execution_count": 15,
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
@@ -676,12 +638,8 @@
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
@@ -708,12 +666,8 @@
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"jupyter": {
"outputs_hidden": false
}
},
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
@@ -745,7 +699,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -754,35 +708,11 @@
}
],
"metadata": {
"file_extension": ".py",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "python3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"merlin": {
"containers": [
"nvcr.io/nvidia/merlin/merlin-hugectr:latest",
"nvcr.io/nvidia/merlin/merlin-tensorflow:latest",
"nvcr.io/nvidia/merlin/merlin-pytorch:latest"
]
},
"mimetype": "text/x-python",
"npconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
}
},
"nbformat": 4,
"nbformat_minor": 4
402 changes: 0 additions & 402 deletions examples/scaling-criteo/03-Training-with-HugeCTR.ipynb

This file was deleted.

639 changes: 0 additions & 639 deletions examples/scaling-criteo/04-Triton-Inference-with-HugeCTR.ipynb

This file was deleted.

9 changes: 1 addition & 8 deletions examples/scaling-criteo/README.md
Original file line number Diff line number Diff line change
@@ -6,8 +6,7 @@ We demonstrate how to scale NVTabular, as well as:

- Use multiple GPUs and nodes with NVTabular for feature engineering.
- Train recommender system models with the Merlin Models for TensorFlow.
- Train recommender system models with HugeCTR using multiple GPUs.
- Inference with the Triton Inference Server and Merlin Models for TensorFlow or HugeCTR.
- Run inference with the Triton Inference Server and Merlin Models for TensorFlow.

We recommend using our latest stable [Merlin containers](https://catalog.ngc.nvidia.com/containers?filters=&orderBy=dateModifiedDESC&query=merlin) for the examples. Each notebook lists the container it requires.

@@ -18,9 +17,3 @@ Training and Deployment with **TensorFlow**:
- [Feature Engineering with NVTabular](02-ETL-with-NVTabular.ipynb)
- [Training with TensorFlow](03-Training-with-Merlin-Models-TensorFlow.ipynb)
- [Deploy the TensorFlow Model with Triton Inference Server](04-Triton-Inference-with-Merlin-Models-TensorFlow.ipynb)

Training and Deployment with **HugeCTR**:
- [Download and Convert](01-Download-Convert.ipynb)
- [Feature Engineering with NVTabular](02-ETL-with-NVTabular.ipynb)
- [Training with HugeCTR](03-Training-with-HugeCTR.ipynb)
- [Deploy the HugeCTR Model with Triton Inference Server](04-Triton-Inference-with-HugeCTR.ipynb)

0 comments on commit e4e5396

Please sign in to comment.