diff --git a/.kokoro/tests/training/sklearn_structured_custom_routines.sh b/.kokoro/tests/training/sklearn_structured_custom_routines.sh new file mode 100755 index 000000000..bed25056d --- /dev/null +++ b/.kokoro/tests/training/sklearn_structured_custom_routines.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -eo pipefail + + +download_files() { + # Download files for testing. + GCS_FOLDER="gs://cloud-samples-data/ml-engine/chicago_taxi" + + echo "Downloading files" + gsutil cp ${GCS_FOLDER}/training/small/taxi_trips_train.csv data/taxi_trips_train.csv + gsutil cp ${GCS_FOLDER}/training/small/taxi_trips_eval.csv data/taxi_trips_eval.csv + gsutil cp ${GCS_FOLDER}/prediction/taxi_trips_prediction_dict.ndjson data/taxi_trips_prediction_dict.ndjson + + # Define ENV for `train-local.sh` script + export TAXI_TRAIN_SMALL=data/taxi_trips_train.csv + export TAXI_EVAL_SMALL=data/taxi_trips_eval.csv + export TAXI_PREDICTION_DICT_NDJSON=data/taxi_trips_prediction_dict.ndjson +} + + +run_tests() { + # Run base tests. + echo "Running code tests in `pwd`." + download_files + # Run local training and local prediction + source scripts/train-local.sh +} + + +main(){ + cd ${KOKORO_ARTIFACTS_DIR}/github/ai-platform-samples/${CAIP_TEST_DIR} + run_tests + echo 'Test was successful' +} + +main diff --git a/.kokoro/training/sklearn/structured/custom_routines/common.cfg b/.kokoro/training/sklearn/structured/custom_routines/common.cfg new file mode 100644 index 000000000..537f1faac --- /dev/null +++ b/.kokoro/training/sklearn/structured/custom_routines/common.cfg @@ -0,0 +1,50 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Format: //devtools/kokoro/config/proto/build.proto + +# Download trampoline resources. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" + + +# Download credentials from Cloud Storage. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/ai-platform-samples" + + +# Use the trampoline script to run in docker. +build_file: "ai-platform-samples/.kokoro/trampoline.sh" + + +# Environment Variables. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/python" +} + +# Tell the trampoline which tests to run. 
+env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/ai-platform-samples/.kokoro/tests/run_tests.sh" +} + +env_vars: { + key: "CAIP_TEST_DIR" + value: "training/sklearn/structured/custom_routines" +} + +# Run specific tests +env_vars: { + key: "CAIP_TEST_SCRIPT" + value: "github/ai-platform-samples/.kokoro/tests/training/sklearn_structured_custom_routines.sh" +} diff --git a/.kokoro/training/sklearn/structured/custom_routines/continuous.cfg b/.kokoro/training/sklearn/structured/custom_routines/continuous.cfg new file mode 100644 index 000000000..ff03e8bf5 --- /dev/null +++ b/.kokoro/training/sklearn/structured/custom_routines/continuous.cfg @@ -0,0 +1,15 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Format: //devtools/kokoro/config/proto/build.proto diff --git a/.kokoro/training/sklearn/structured/custom_routines/periodic.cfg b/.kokoro/training/sklearn/structured/custom_routines/periodic.cfg new file mode 100644 index 000000000..ff03e8bf5 --- /dev/null +++ b/.kokoro/training/sklearn/structured/custom_routines/periodic.cfg @@ -0,0 +1,15 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Format: //devtools/kokoro/config/proto/build.proto diff --git a/.kokoro/training/sklearn/structured/custom_routines/presubmit.cfg b/.kokoro/training/sklearn/structured/custom_routines/presubmit.cfg new file mode 100644 index 000000000..ff03e8bf5 --- /dev/null +++ b/.kokoro/training/sklearn/structured/custom_routines/presubmit.cfg @@ -0,0 +1,15 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Format: //devtools/kokoro/config/proto/build.proto diff --git a/setup/README.md b/setup/README.md index 557e30cc6..01899a681 100644 --- a/setup/README.md +++ b/setup/README.md @@ -21,9 +21,9 @@ and follow the instructions. 
5- Enable the API for the following services: - * [Compute Engine](https://pantheon.corp.google.com/compute) - * [Storage](https://pantheon.corp.google.com/storage) - * [AI Platform](https://pantheon.corp.google.com/mlengine) + * [Compute Engine](https://console.cloud.google.com/compute) + * [Storage](https://console.cloud.google.com/storage) + * [AI Platform](https://console.cloud.google.com/mlengine) From your terminal, run: diff --git a/training/__init__.py b/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/sklearn/structured/__init__.py b/training/sklearn/structured/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/sklearn/structured/base/README.md b/training/sklearn/structured/base/README.md index 341cf27a5..46ae89964 100644 --- a/training/sklearn/structured/base/README.md +++ b/training/sklearn/structured/base/README.md @@ -30,7 +30,7 @@ executed on your local machine. * [task.py](trainer/task.py) initializes and parses task arguments. This is the entry point to the trainer. * [model.py](trainer/model.py) includes a function to create the scikit-learn estimator or pipeline * [metadata.py](trainer/metadata.py) contains the definition for the target and feature names, among other configuring variables - * [util.py](trainer/task.py) contains a number of helper functions used in task.py + * [util.py](trainer/util.py) contains a number of helper functions used in task.py * [scripts](./scripts) directory: command-line scripts to train the model locally or on AI Platform. We recommend to run the scripts in this directory in the following order, and use the `source` command to run them, in order to export the environment variables at each step: @@ -71,6 +71,12 @@ This will create a training job on AI Platform and displays some instructions on At the end of a successful training job, it will upload the trained model object to a GCS bucket and sets `$MODEL_DIR` environment variable to the directory containing the model. +### Monitoring +Once the training starts and the models are generated, you may view the training job in +the [AI Platform page](https://console.cloud.google.com/mlengine/jobs). If you click on the +corresponding training job, you will be able to view the chosen hyperparamters, along with the +metric scores for each model. All the generated model objects will be stored on GCS. + ## Explaining Key Elements In this section, we'll highlight the main elements of this sample. diff --git a/training/sklearn/structured/base/trainer/metadata.py b/training/sklearn/structured/base/trainer/metadata.py index 228e765cb..a836c495a 100644 --- a/training/sklearn/structured/base/trainer/metadata.py +++ b/training/sklearn/structured/base/trainer/metadata.py @@ -22,7 +22,9 @@ # Target name TARGET_NAME = 'tip' -# The features to be used for training +# The features to be used for training. +# If FEATURE_NAMES is None, then all the available columns will be +# used as features, except for the target column. 
 FEATURE_NAMES = [
     'trip_miles',
     'trip_seconds',
diff --git a/training/sklearn/structured/base/trainer/utils.py b/training/sklearn/structured/base/trainer/utils.py
index e74eaedbc..210544582 100644
--- a/training/sklearn/structured/base/trainer/utils.py
+++ b/training/sklearn/structured/base/trainer/utils.py
@@ -35,8 +35,14 @@ def data_train_test_split(data_df):
         pandas.DataFrame, pandas.Series)
     """
 
-    # Only use metadata.FEATURE_NAMES + metadata.TARGET_NAME
-    features = data_df[metadata.FEATURE_NAMES]
+    if metadata.FEATURE_NAMES is None:
+        # Use all the columns as features, except for the target column
+        feature_names = list(data_df.columns)
+        feature_names.remove(metadata.TARGET_NAME)
+        features = data_df[feature_names]
+    else:
+        # Only use metadata.FEATURE_NAMES
+        features = data_df[metadata.FEATURE_NAMES]
     target = data_df[metadata.TARGET_NAME]
 
     x_train, x_val, y_train, y_val = ms.train_test_split(features,
@@ -70,7 +76,7 @@ def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
 
 
 def read_df_from_gcs(file_pattern):
-    """Read data from Google Cloud Storage, split into train and validation sets.
+    """Read data from Google Cloud Storage, split into train and validation sets
 
     Assume that the data on GCS is in csv format without header.
     The column names will be provided through metadata
diff --git a/training/sklearn/structured/custom_routines/README.md b/training/sklearn/structured/custom_routines/README.md
new file mode 100644
index 000000000..b489894b9
--- /dev/null
+++ b/training/sklearn/structured/custom_routines/README.md
@@ -0,0 +1,188 @@
+# Training with scikit-learn - Custom Prediction Routines
+
+## Overview
+
+The purpose of this directory is to provide a sample to show how to train a
+scikit-learn model on AI Platform with custom routines. The sample makes it easier to organize
+your code, and to adapt it to your dataset. In more detail,
+the template covers the following functionality:
+
+* Metadata to define your dataset, features, and target.
+* Standard implementation of input, parsing, and serving functions.
+* Train, evaluate, and export the model.
+
+Although this sample provides a standard implementation of this
+functionality, you can customize these parts with your own implementation.
+
+## Prerequisites
+
+* Follow the instructions in the [setup](../../../../setup) directory in order to set up your environment
+* Follow the instructions in the [datasets](../../../../datasets) directory and
+run [download-taxi.sh](../../../../datasets/download-taxi.sh) to download the datasets
+* Change the directory to this sample and run `python setup.py install`. Note: This
+is mostly for local testing of your code. When you submit a training job, no code will be
+executed on your local machine.
+
+## Sample Structure
+
+* [trainer](./trainer) directory: containing the training package to be submitted to AI Platform
+  * [__init__.py](./trainer/__init__.py) which is an empty file. It is needed to make this directory a Python package.
+  * [task.py](trainer/task.py) initializes and parses task arguments. This is the entry point to the trainer.
+  * [model.py](trainer/model.py) includes a function to create the scikit-learn estimator or pipeline
+  * [metadata.py](trainer/metadata.py) contains the definition for the target and feature names, among other configuration variables
+  * [utils.py](trainer/utils.py) contains a number of helper functions used in task.py
+  * [my_pipeline.py](trainer/my_pipeline.py) contains the custom routines used to create a pipeline
+* [scripts](./scripts) directory: command-line scripts to train the model locally or on AI Platform.
+  We recommend running the scripts in this directory in the following order, and using
+  the `source` command to run them, in order to export the environment variables at each step:
+  * [train-local.sh](./scripts/train-local.sh) trains the model locally using `gcloud`. It is always a
+    good idea to try and train the model locally for debugging, before submitting it to AI Platform.
+  * [train-cloud.sh](./scripts/train-cloud.sh) submits a training job to AI Platform.
+* [setup.py](./setup.py): containing all the required Python packages for this tutorial.
+
+
+We recommend that you follow the same structure for your own work. In most cases, you only need to
+modify:
+
+  - `metadata.py`
+  - `model.py`
+
+and leave the other Python files untouched.
+
+## Running the Sample
+
+After you go over the steps in the prerequisites section, you are ready to run this sample.
+Here are the steps you need to take:
+
+1. _[Optional]_ Train the model locally. Run:
+
+```bash
+source ./scripts/train-local.sh
+```
+
+as many times as you like (this has no effect on your cloud usage). If successful, this script should
+create a new model at `trained/structured-taxi/model/model.joblib`, which means you may now submit a
+training job to AI Platform.
+
+2. Submit a training job to AI Platform. Run:
+
+```bash
+source ./scripts/train-cloud.sh
+```
+This will create a training job on AI Platform and display some instructions on how to track the job progress.
+At the end of a successful training job, it will upload the trained model object to a GCS
+bucket and set the `$MODEL_DIR` environment variable to the parent directory of all the generated models.
+
+It will also package up the custom routine, upload it to the bucket, and
+set the environment variable `CUSTOM_ROUTINE_PATH`, which points to it.
+`CUSTOM_ROUTINE_PATH` will later be used during the deployment of the model.
+
+### Monitoring
+Once the training starts and the models are generated, you may view the training job in
+the [AI Platform page](https://console.cloud.google.com/mlengine/jobs). If you click on the
+corresponding training job, you will be able to view the chosen hyperparameters, along with the
+metric scores for each model. All the generated model objects will be stored on GCS.
+
+## Explaining Key Elements
+
+In this section, we'll highlight the main elements of this sample.
+
+### [model.py](trainer/model.py)
+
+In this sample, we build a scikit-learn `Pipeline` that combines the custom pre-processing
+steps defined in `my_pipeline.py` with a `RandomForestClassifier` estimator, and return it.
+
+### [metadata.py](trainer/metadata.py)
+
+We define which features should be used for training. We also define what the target is.
+
+### [my_pipeline.py](trainer/my_pipeline.py)
+
+This file contains our custom routines. The code in this file is required for making predictions.
+We will package and ship it, along with our trained model,
+when we deploy our model to AI Platform to make predictions.
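+
+To get a feel for what these transformers do, here is a small, hypothetical usage sketch
+(it assumes the `trainer` package is importable locally, e.g. after running
+`python setup.py install` as described in the prerequisites; the input values are made up):
+
+```python
+import numpy as np
+from sklearn.pipeline import make_pipeline
+from trainer import my_pipeline as mp
+
+# Two fake rows: [trip_miles, trip_seconds, payment_type]
+rows = np.array([[1.2, 300, ' Credit Card'],
+                 [3.4, 600, ' Cash']], dtype=object)
+
+# Select column 2, strip whitespace, then one-hot encode it.
+categorical = make_pipeline(mp.PositionalSelector([2]),
+                            mp.StripString(),
+                            mp.SimpleOneHotEncoder())
+print(categorical.fit_transform(rows))   # [[0 1]
+                                         #  [1 0]]
+```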
+
+### [train-local.sh](./scripts/train-local.sh)
+
+The command to run the training job locally is this:
+
+```bash
+gcloud ai-platform local train \
+    --module-name=trainer.task \
+    --package-path=${PACKAGE_PATH} \
+    --job-dir=${MODEL_DIR} \
+    -- \
+    --log-level DEBUG \
+    --input=${TAXI_TRAIN_SMALL}
+```
+
+* `--job-dir`: path for the output artifacts, e.g. the model object
+* `module-name` is the name of the Python module inside the package which runs the training job
+* `package-path` determines where the training Python package is.
+* `--` is just a separator. Anything after it will be passed to the training job as input arguments.
+* `log-level` sets the Python logger level to DEBUG for this script (default is INFO)
+* `--input`: path to the input dataset
+
+
+### [train-cloud.sh](./scripts/train-cloud.sh)
+
+To submit a training job to AI Platform, the main command is:
+
+```bash
+gcloud ai-platform jobs submit training ${JOB_NAME} \
+    --job-dir=${MODEL_DIR} \
+    --runtime-version=${RUNTIME_VERSION} \
+    --region=${REGION} \
+    --scale-tier=${TIER} \
+    --module-name=trainer.task \
+    --package-path=${PACKAGE_PATH} \
+    --python-version=${PYTHON_VERSION} \
+    --stream-logs \
+    -- \
+    --input=${GCS_TAXI_TRAIN_BIG}
+```
+
+* `${JOB_NAME}` is a unique name for each job. We create one with a timestamp to make it unique each time.
+* `runtime-version`: which runtime to use. See [this](https://cloud.google.com/ml-engine/docs/tensorflow/runtime-version-list) for more information.
+* `scale-tier` selects the machine tier. For this sample, we use BASIC. However, if you need
+to use accelerators for instance, or run distributed training, you will need a different tier.
+* `region`: which region to run the training job in.
+* `stream-logs`: streams the logs until the job finishes.
+* Note that, unlike the [hp_tuning sample](../hp_tuning), no `--config` file is passed here, since this sample does not tune hyperparameters.
+
+## Clean Up
+If you were able to run [train-cloud.sh](./scripts/train-cloud.sh) successfully, you have
+created and stored some files in your GCS bucket. You may simply remove them by running
+
+```bash
+source ./scripts/cleanup.sh
+```
+
+## How Custom Prediction Routines Work
+
+A custom prediction routine is a piece of Python code that you submit along
+with your trained model when you deploy it to AI Platform. A typical use case
+for custom prediction routines is when you want to create a custom scikit-learn
+pipeline, and you do not need to use Docker containers for your prediction.
+
+### Highlights
+
+Let's take a quick look at how the custom routines work on AI Platform.
+If you look closely, this sample is quite similar to the [base sample for scikit-learn](../base).
+To highlight the differences:
+
+1. [my_pipeline.py](trainer/my_pipeline.py) contains the custom code that we need to package and pass alongside the
+model during deployment.
+
+2. In [train-cloud.sh](./scripts/train-cloud.sh) we package our Python code and upload it to the bucket.
+Note that we package everything in the trainer folder since it is easier, even though most of the other Python files
+in that directory do not need to be packaged.
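+
+3. `CUSTOM_ROUTINE_PATH` is consumed at deployment time. Deployment is not part of this sample's
+scripts, but as a rough sketch (assuming a model resource has already been created, with
+`${VERSION_NAME}` as a placeholder; check the exact flags against the AI Platform documentation),
+it might look like:
+
+```bash
+# ${MODEL_DIR}/model is where task.py exported model.joblib
+gcloud beta ai-platform versions create ${VERSION_NAME} \
+    --model=${MODEL_NAME} \
+    --origin=${MODEL_DIR}/model \
+    --runtime-version=${RUNTIME_VERSION} \
+    --python-version=${PYTHON_VERSION} \
+    --framework=SCIKIT_LEARN \
+    --package-uris=${CUSTOM_ROUTINE_PATH}
+```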
+ + \ No newline at end of file diff --git a/training/sklearn/structured/custom_routines/package.py b/training/sklearn/structured/custom_routines/package.py new file mode 100644 index 000000000..ce501a672 --- /dev/null +++ b/training/sklearn/structured/custom_routines/package.py @@ -0,0 +1,5 @@ +import setuptools +setuptools.setup(name='custom_routine', + packages=['trainer'], + version="1.0", + ) diff --git a/training/sklearn/structured/custom_routines/requirements.txt b/training/sklearn/structured/custom_routines/requirements.txt new file mode 100644 index 000000000..8d38cfb6b --- /dev/null +++ b/training/sklearn/structured/custom_routines/requirements.txt @@ -0,0 +1,8 @@ +# The pip syntax below allows us to not repeat +# In order to not maintain two separate dependency +# lists in setup.py vs requirements.txt +# See https://caremad.io/posts/2013/07/setup-vs-requirement/ + +--index-url https://pypi.python.org/simple/ + +-e . \ No newline at end of file diff --git a/training/sklearn/structured/custom_routines/scripts/cleanup.sh b/training/sklearn/structured/custom_routines/scripts/cleanup.sh new file mode 100755 index 000000000..501e67311 --- /dev/null +++ b/training/sklearn/structured/custom_routines/scripts/cleanup.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -v + +# Delete the directories created by setup.py: +rm -rf dist +rm -rf trainer.egg-info +rm -rf build +rm -rf trained +rm -rf custom_routine.egg-info + + +# This has to be run after train-cloud.sh is successfully executed + +# Delete Cloud Storage objects that were created +gsutil -m rm -r ${MODEL_DIR} +gsutil -m rm -r ${CUSTOM_ROUTINE_PATH} + +set - diff --git a/training/sklearn/structured/custom_routines/scripts/train-cloud.sh b/training/sklearn/structured/custom_routines/scripts/train-cloud.sh new file mode 100755 index 000000000..ff5ca1303 --- /dev/null +++ b/training/sklearn/structured/custom_routines/scripts/train-cloud.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -v + +echo "Submitting an AI Platform job..." 
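+
+# Note: BUCKET_NAME, REGION, RUNTIME_VERSION and PYTHON_VERSION are assumed to be
+# exported in the current shell before running this script (e.g. by the setup
+# scripts); they are not defined in this file.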
+ +TIER="BASIC" # BASIC | BASIC_GPU | STANDARD_1 | PREMIUM_1 + +export MODEL_NAME="sklearn_taxi" + +PACKAGE_PATH=./trainer # this can be a gcs location to a zipped and uploaded package +export MODEL_DIR=gs://${BUCKET_NAME}/${MODEL_NAME} +export CUSTOM_ROUTINE_PATH=gs://${BUCKET_NAME}/${MODEL_NAME}/library/custom_routine-1.0.tar.gz + +gsutil mb gs://${BUCKET_NAME} + +CURRENT_DATE=`date +%Y%m%d_%H%M%S` +JOB_NAME=train_${MODEL_NAME}_${CURRENT_DATE} + +gcloud ai-platform jobs submit training ${JOB_NAME} \ + --job-dir=${MODEL_DIR} \ + --runtime-version=${RUNTIME_VERSION} \ + --region=${REGION} \ + --scale-tier=${TIER} \ + --module-name=trainer.task \ + --package-path=${PACKAGE_PATH} \ + --python-version=${PYTHON_VERSION} \ + --stream-logs \ + -- \ + --input=${GCS_TAXI_TRAIN_BIG} \ + --n-estimators=20 \ + --max-depth=3 + +python ./package.py sdist --formats=gztar +gsutil cp ./dist/custom_routine-1.0.tar.gz ${CUSTOM_ROUTINE_PATH} + +set - + +# Notes: +# GCS_TAXI_TRAIN_BIG is set by datasets/downlaod-taxi.sh script +# use --packages instead of --package-path if gcs location +# add --reuse-job-dir to resume training diff --git a/training/sklearn/structured/custom_routines/scripts/train-local.sh b/training/sklearn/structured/custom_routines/scripts/train-local.sh new file mode 100755 index 000000000..072208e8d --- /dev/null +++ b/training/sklearn/structured/custom_routines/scripts/train-local.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -v + +echo "Training local ML model" + +MODEL_NAME="structured-taxi" + +PACKAGE_PATH=./trainer +MODEL_DIR=./trained/${MODEL_NAME} + +gcloud ai-platform local train \ + --module-name=trainer.task \ + --package-path=${PACKAGE_PATH} \ + --job-dir=${MODEL_DIR} \ + -- \ + --log-level DEBUG \ + --input=${TAXI_TRAIN_SMALL} \ + --n-estimators=20 \ + --max-depth=3 + +set - + +# Notes: +# TAXI_TRAIN_SMALL is set by datasets/downlaod-taxi.sh script diff --git a/training/sklearn/structured/custom_routines/setup.py b/training/sklearn/structured/custom_routines/setup.py new file mode 100644 index 000000000..f5b0449e9 --- /dev/null +++ b/training/sklearn/structured/custom_routines/setup.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from setuptools import find_packages +from setuptools import setup + +REQUIRED_PACKAGES = [ + 'tensorflow==1.14.0', + 'scikit-learn>=0.20.2', + 'pandas==0.24.2', + 'cloudml-hypertune', +] + +setup( + name='trainer', + version='0.1', + install_requires=REQUIRED_PACKAGES, + packages=find_packages(), + include_package_data=True, + description='AI Platform | Training | scikit-learn | Base' +) diff --git a/training/sklearn/structured/custom_routines/trainer/__init__.py b/training/sklearn/structured/custom_routines/trainer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/sklearn/structured/custom_routines/trainer/metadata.py b/training/sklearn/structured/custom_routines/trainer/metadata.py new file mode 100644 index 000000000..e337dbf01 --- /dev/null +++ b/training/sklearn/structured/custom_routines/trainer/metadata.py @@ -0,0 +1,43 @@ +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Dataset metadata.""" + +# If the input CSV file has a header row, then set CSV_COLUMNS to None. +# Otherwise, set CSV_COLUMNS to a list of target and feature names: +CSV_COLUMNS = None + +# Target name +TARGET_NAME = 'tip' + +# The features to be used for training. +# If FEATURE_NAMES is None, then all the available columns will be +# used as features, except for the target column. +FEATURE_NAMES = None + +# If the model is serialized using joblib +# then use 'model.joblib' for the model name +MODEL_FILE_NAME = 'model.joblib' + +# Set to True if you want to tune some hyperparameters +HYPERPARAMTER_TUNING = False + +# Used only if the dataset is to be read from BigQuery +BASE_QUERY = ''' + SELECT + * + FROM + `{table}` + ''' diff --git a/training/sklearn/structured/custom_routines/trainer/model.py b/training/sklearn/structured/custom_routines/trainer/model.py new file mode 100644 index 000000000..0635149cc --- /dev/null +++ b/training/sklearn/structured/custom_routines/trainer/model.py @@ -0,0 +1,58 @@ +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+"""ML model definitions."""
+
+from sklearn import ensemble
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
+from trainer import my_pipeline as mp
+
+
+def get_estimator(arguments):
+    """Generate an ML pipeline which includes both pre-processing and model training.
+
+    Args:
+      arguments: (argparse.Namespace), parameters passed from the command line
+
+    Returns:
+      sklearn.pipeline.Pipeline
+    """
+
+    # We want to use 6 numerical features and 1 categorical feature in this sample.
+    numerical_indices = [0, 1, 2, 3, 4, 5]  # trip_miles, ..., trip_start_day
+    categorical_indices = [14]  # the feature at index 14 is payment_type
+
+    p1 = make_pipeline(mp.PositionalSelector(categorical_indices),
+                       mp.StripString(),
+                       mp.SimpleOneHotEncoder())
+    p2 = make_pipeline(mp.PositionalSelector(numerical_indices),
+                       StandardScaler())
+
+    feats = FeatureUnion([
+        ('categoricals', p1),
+        ('numericals', p2),
+    ])
+
+    # n_estimators and max_depth are expected to be passed as
+    # command-line arguments to task.py
+    pipeline = Pipeline([
+        ('pre', feats),
+        ('estimator', ensemble.RandomForestClassifier(
+            n_estimators=arguments.n_estimators,
+            max_depth=arguments.max_depth)
+         )
+    ])
+    return pipeline
diff --git a/training/sklearn/structured/custom_routines/trainer/my_pipeline.py b/training/sklearn/structured/custom_routines/trainer/my_pipeline.py
new file mode 100644
index 000000000..370caedf9
--- /dev/null
+++ b/training/sklearn/structured/custom_routines/trainer/my_pipeline.py
@@ -0,0 +1,47 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+# A transformer to select a subset of features, given their positional indices
+class PositionalSelector(BaseEstimator, TransformerMixin):
+    def __init__(self, positions):
+        self.positions = positions
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        return np.array(X)[:, self.positions]
+
+
+class StripString(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        strip = np.vectorize(str.strip)
+        return strip(np.array(X))
+
+
+# A simple one-hot encoder for scikit-learn
+class SimpleOneHotEncoder(BaseEstimator, TransformerMixin):
+    def fit(self, X, y=None):
+        self.values = []
+        for c in range(X.shape[1]):
+            Y = X[:, c]
+            values = {v: i for i, v in enumerate(np.unique(Y))}
+            self.values.append(values)
+        return self
+
+    def transform(self, X):
+        X = np.array(X)
+        matrices = []
+        for c in range(X.shape[1]):
+            Y = X[:, c]
+            mat = np.zeros(shape=(len(Y), len(self.values[c])), dtype=np.int8)
+            for i, x in enumerate(Y):
+                if x in self.values[c]:
+                    mat[i][self.values[c][x]] = 1
+            matrices.append(mat)
+        res = np.concatenate(matrices, axis=1)
+        return res
diff --git a/training/sklearn/structured/custom_routines/trainer/task.py b/training/sklearn/structured/custom_routines/trainer/task.py
new file mode 100644
index 000000000..c0852088a
--- /dev/null
+++ b/training/sklearn/structured/custom_routines/trainer/task.py
@@ -0,0 +1,150 @@
+# Copyright 2019 Google LLC. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Executes model training and evaluation.""" + +import argparse +import logging +import os + +import hypertune +import numpy as np +from datetime import datetime +from sklearn import model_selection +from trainer import metadata +from trainer import model +from trainer import utils + + +def _train_and_evaluate(estimator, dataset, output_dir): + """Runs model training and evaluation. + + Args: + estimator: (pipeline.Pipeline), Pipeline instance assemble pre-processing + steps and model training + dataset: (pandas.DataFrame), DataFrame containing training data + output_dir: (string), directory that the trained model will be exported + + Returns: + None + """ + x_train, y_train, x_val, y_val = utils.data_train_test_split(dataset) + estimator.fit(x_train, y_train) + + # Write model and eval metrics to `output_dir` + model_output_path = os.path.join(output_dir, 'model', + metadata.MODEL_FILE_NAME) + + utils.dump_object(estimator, model_output_path) + + if metadata.HYPERPARAMTER_TUNING: + # Note: for now, use `cross_val_score` defaults (i.e. 3-fold) + scores = model_selection.cross_val_score(estimator, x_val, y_val, cv=3) + + logging.info('Scores: %s', scores) + + # The default name of the metric is training/hptuning/metric. + # We recommend that you assign a custom name + # The only functional difference is that if you use a custom name, + # you must set the hyperparameterMetricTag value in the + # HyperparameterSpec object in the job request to match the chosen name + hpt = hypertune.HyperTune() + hpt.report_hyperparameter_tuning_metric( + hyperparameter_metric_tag='Taxi Model Accuracy', + metric_value=np.mean(scores), + global_step=900) + + +def run_experiment(arguments): + """Testbed for running model training and evaluation.""" + # Get data for training and evaluation + + logging.info('Arguments: %s', arguments) + + dataset = utils.read_df_from_gcs(arguments.input) + + # Get estimator + estimator = model.get_estimator(arguments) + + # Run training and evaluation + _train_and_evaluate(estimator, dataset, arguments.job_dir) + + +def _parse_args(): + """Parses command-line arguments.""" + + parser = argparse.ArgumentParser() + + parser.add_argument( + '--log-level', + help='Logging level.', + choices=[ + 'DEBUG', + 'ERROR', + 'FATAL', + 'INFO', + 'WARN', + ], + default='INFO', + ) + + parser.add_argument( + '--input', + help='''Dataset to use for training and evaluation. + Can be BigQuery table or a file (CSV). + If BigQuery table, specify as as PROJECT_ID.DATASET.TABLE_NAME. 
+ ''', + required=True, + ) + + parser.add_argument( + '--job-dir', + help='Output directory for exporting model and other metadata.', + required=True, + ) + + parser.add_argument( + '--n-estimators', + help='Number of trees in the forest.', + default=10, + type=int, + ) + + parser.add_argument( + '--max-depth', + help='The maximum depth of the tree.', + type=int, + default=3, + ) + + return parser.parse_args() + + +def main(): + """Entry point""" + + arguments = _parse_args() + logging.basicConfig(level=arguments.log_level) + # Run the train and evaluate experiment + time_start = datetime.utcnow() + run_experiment(arguments) + time_end = datetime.utcnow() + time_elapsed = time_end - time_start + logging.info('Experiment elapsed time: {} seconds'.format( + time_elapsed.total_seconds())) + + +if __name__ == '__main__': + main() diff --git a/training/sklearn/structured/custom_routines/trainer/utils.py b/training/sklearn/structured/custom_routines/trainer/utils.py new file mode 100644 index 000000000..499f69dbd --- /dev/null +++ b/training/sklearn/structured/custom_routines/trainer/utils.py @@ -0,0 +1,150 @@ +# Copyright 2019 Google LLC. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Hold utility functions.""" + +import os + +import pandas as pd +from sklearn import model_selection as ms +from sklearn.externals import joblib +from tensorflow import gfile +from trainer import metadata + + +def data_train_test_split(data_df): + """Split the DataFrame two subsets for training and testing. + + Args: + data_df: (pandas.DataFrame) DataFrame the splitting to be performed on + + Returns: + A Tuple of (pandas.DataFrame, pandas.Series, + pandas.DataFrame, pandas.Series) + """ + + if metadata.FEATURE_NAMES is None: + # Use all the columns as features, except for the target column + feature_names = list(data_df.columns) + feature_names.remove(metadata.TARGET_NAME) + features = data_df[feature_names] + else: + # Only use metadata.FEATURE_NAMES + features = data_df[metadata.FEATURE_NAMES] + target = data_df[metadata.TARGET_NAME] + + x_train, x_val, y_train, y_val = ms.train_test_split(features, + target, + test_size=0.2) + return x_train.values, y_train, x_val.values, y_val + + +def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None): + """Read data from BigQuery and split into train and validation sets. + + Args: + full_table_path: (string) full path of the table containing training data + in the format of [project_id.dataset_name.table_name]. + project_id: (string, Optional) Google BigQuery Account project ID. + num_samples: (int, Optional) Number of data samples to read. 
+ + Returns: + pandas.DataFrame + """ + + query = metadata.BASE_QUERY.format(table=full_table_path) + limit = ' LIMIT {}'.format(num_samples) if num_samples else '' + query += limit + + # Use "application default credentials" + # Use SQL syntax dialect + data_df = pd.read_gbq(query, project_id=project_id, dialect='standard') + + return data_df + + +def read_df_from_gcs(file_pattern): + """Read data from Google Cloud Storage, split into train and validation sets. + + Assume that the data on GCS is in csv format without header. + The column names will be provided through metadata + + Args: + file_pattern: (string) pattern of the files containing training data. + For example: [gs://bucket/folder_name/prefix] + + Returns: + pandas.DataFrame + """ + + # Download the files to local /tmp/ folder + df_list = [] + + for filepath in gfile.Glob(file_pattern): + with gfile.Open(filepath, 'r') as f: + if metadata.CSV_COLUMNS is None: + df_list.append(pd.read_csv(f)) + else: + df_list.append(pd.read_csv(f, names=metadata.CSV_COLUMNS, + header=None)) + + data_df = pd.concat(df_list) + + return data_df + + +def upload_to_gcs(local_path, gcs_path): + """Upload local file to Google Cloud Storage. + + Args: + local_path: (string) Local file + gcs_path: (string) Google Cloud Storage destination + + Returns: + None + """ + gfile.Copy(local_path, gcs_path) + + +def dump_object(object_to_dump, output_path): + """Pickle the object and save to the output_path. + + Args: + object_to_dump: Python object to be pickled + output_path: (string) output path which can be Google Cloud Storage + + Returns: + None + """ + + if not gfile.Exists(output_path): + gfile.MakeDirs(os.path.dirname(output_path)) + with gfile.Open(output_path, 'w') as wf: + joblib.dump(object_to_dump, wf) + + +def boolean_mask(columns, target_columns): + """Create a boolean mask indicating location of target_columns in columns. + + Args: + columns: (List[string]), list of all columns considered. + target_columns: (List[string]), columns whose position + should be masked as 1. + + Returns: + List[bool] + """ + target_set = set(target_columns) + return [x in target_set for x in columns] diff --git a/training/sklearn/structured/hp_tuning/README.md b/training/sklearn/structured/hp_tuning/README.md index ee53c3b2f..1edc1e5da 100644 --- a/training/sklearn/structured/hp_tuning/README.md +++ b/training/sklearn/structured/hp_tuning/README.md @@ -30,7 +30,7 @@ executed on your local machine. * [task.py](trainer/task.py) initializes and parses task arguments. This is the entry point to the trainer. * [model.py](trainer/model.py) includes a function to create the scikit-learn estimator or pipeline * [metadata.py](trainer/metadata.py) contains the definition for the target and feature names, among other configuring variables - * [util.py](trainer/task.py) contains a number of helper functions used in task.py + * [util.py](trainer/util.py) contains a number of helper functions used in task.py * [scripts](./scripts) directory: command-line scripts to train the model locally or on AI Platform. We recommend to run the scripts in this directory in the following order, and use the `source` command to run them, in order to export the environment variables at each step: @@ -73,6 +73,12 @@ This will create a training job on AI Platform and displays some instructions on At the end of a successful training job, it will upload the trained model object to a GCS bucket and sets `$MODEL_DIR` environment variable to the parent directory of all the generated models. 
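+
+You can list the exported model objects directly with, for example:
+
+```bash
+gsutil ls -r ${MODEL_DIR}
+```
+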
+### Monitoring +Once the training starts and the models are generated, you may view the training job in +the [AI Platform page](https://console.cloud.google.com/mlengine/jobs). If you click on the +corresponding training job, you will be able to view the chosen hyperparamters, along with the +metric scores for each model. All the generated model objects will be stored on GCS. + ## Explaining Key Elements In this section, we'll highlight the main elements of this sample. @@ -134,8 +140,6 @@ to use accelerators for instance, or do a distributed training, you will need a * `stream-logs`: streams the logs until the job finishes. * `config`: passing the config file which contains the hyperparameter tuning information. -### - ## Clean Up If you were able to run [train-cloud.sh](./scripts/train-cloud.sh) successfully, you have created and stored some files in your GCS bucket. You may simply remove them by running @@ -159,7 +163,7 @@ In this sample, we will be tuning the following three hyperparameters: * `criterion` with a category type from the set of `{"gini", "entropy"}` -### Setup +### Highlights Let's take a quick look at how the hyperparameter tuning works on AI-Platform. If you look closely, this sample is quite similar to the [base sample for scikit-learn](../base). @@ -177,12 +181,6 @@ We also defined how many models should be trained, and how many of them can be t Finally, we used the same value which we used for `hyperparameter_metric_tag` in step 2, for `hyperparameterMetricTag` in this file. -### Monitoring -Once the training starts and the models are generated, you may view the training job in -the [AI Platform page](https://pantheon.corp.google.com/mlengine/jobs). If you click on the -corresponding training job, you will be able to view the chosen hyperparamters, along with the -metric scores for each model. All the generated model objects will be stored on GCS. - ## What's Next In this sample, we trained a simple classifier with scikit-learn using hyperparameter tuning. @@ -191,4 +189,4 @@ please continue with [this sample](../../../../prediction/sklearn/structured/bas You may need to manually set the environment variable `MODEL_DIR` though, depending on which one of the trained models you actually want to deploy. -For further information on hyperparameter tuning on AI Platform, please visit [this page](https://cloud.google.com/ml-engine/docs/using-hyperparameter-tuning). \ No newline at end of file +For further information on hyperparameter tuning on AI Platform, please visit [this page](https://cloud.google.com/ml-engine/docs/using-hyperparameter-tuning). diff --git a/training/sklearn/structured/hp_tuning/trainer/metadata.py b/training/sklearn/structured/hp_tuning/trainer/metadata.py index 228e765cb..a836c495a 100644 --- a/training/sklearn/structured/hp_tuning/trainer/metadata.py +++ b/training/sklearn/structured/hp_tuning/trainer/metadata.py @@ -22,7 +22,9 @@ # Target name TARGET_NAME = 'tip' -# The features to be used for training +# The features to be used for training. +# If FEATURE_NAMES is None, then all the available columns will be +# used as features, except for the target column. 
FEATURE_NAMES = [ 'trip_miles', 'trip_seconds', diff --git a/training/sklearn/structured/hp_tuning/trainer/task.py b/training/sklearn/structured/hp_tuning/trainer/task.py index c1f752bf7..3813835c6 100644 --- a/training/sklearn/structured/hp_tuning/trainer/task.py +++ b/training/sklearn/structured/hp_tuning/trainer/task.py @@ -32,7 +32,7 @@ def _train_and_evaluate(estimator, dataset, output_dir): """Runs model training and evaluation. Args: - estimator: (pipeline.Pipeline), Pipeline instance, assemble pre-processing + estimator: (pipeline.Pipeline), Pipeline instance assemble pre-processing steps and model training dataset: (pandas.DataFrame), DataFrame containing training data output_dir: (string), directory that the trained model will be exported @@ -59,7 +59,7 @@ def _train_and_evaluate(estimator, dataset, output_dir): # We recommend that you assign a custom name # The only functional difference is that if you use a custom name, # you must set the hyperparameterMetricTag value in the - # HyperparameterSpec object in the job request to match your chosen name + # HyperparameterSpec object in the job request to match the chosen name hpt = hypertune.HyperTune() hpt.report_hyperparameter_tuning_metric( hyperparameter_metric_tag='Taxi Model Accuracy', diff --git a/training/sklearn/structured/hp_tuning/trainer/utils.py b/training/sklearn/structured/hp_tuning/trainer/utils.py index 46ebe8bec..499f69dbd 100644 --- a/training/sklearn/structured/hp_tuning/trainer/utils.py +++ b/training/sklearn/structured/hp_tuning/trainer/utils.py @@ -18,7 +18,7 @@ import os import pandas as pd -from sklearn import model_selection +from sklearn import model_selection as ms from sklearn.externals import joblib from tensorflow import gfile from trainer import metadata @@ -35,13 +35,19 @@ def data_train_test_split(data_df): pandas.DataFrame, pandas.Series) """ - # Only use metadata.FEATURE_NAMES + metadata.TARGET_NAME - features = data_df[metadata.FEATURE_NAMES] + if metadata.FEATURE_NAMES is None: + # Use all the columns as features, except for the target column + feature_names = list(data_df.columns) + feature_names.remove(metadata.TARGET_NAME) + features = data_df[feature_names] + else: + # Only use metadata.FEATURE_NAMES + features = data_df[metadata.FEATURE_NAMES] target = data_df[metadata.TARGET_NAME] - x_train, x_val, y_train, y_val = model_selection.train_test_split(features, - target, - test_size=0.2) + x_train, x_val, y_train, y_val = ms.train_test_split(features, + target, + test_size=0.2) return x_train.values, y_train, x_val.values, y_val