pipelines/tensorflow/training/pipeline.py

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import pathlib

from kfp.v2 import compiler, dsl
from google_cloud_pipeline_components.experimental.custom_job.utils import (
    create_custom_training_job_op_from_component,
)
from pipelines import generate_query
from pipelines.kfp_components.dependencies import TF_SERVING_CONTAINER_IMAGE_URI
from pipelines.kfp_components.aiplatform import (
    lookup_model,
    export_model,
    upload_model,
    get_current_time,
)
from pipelines.kfp_components.helpers import copy_artifact
from pipelines.kfp_components.bigquery import extract_bq_to_dataset, bq_query_to_table
from pipelines.kfp_components.tfdv import (
    show_anomalies,
    validate_schema,
    visualise_statistics,
    generate_statistics,
)
from pipelines.kfp_components.tensorflow import (
    train_tensorflow_model,
    predict_tensorflow_model,
)
from pipelines.kfp_components.evaluation import calculate_eval_metrics, compare_models


@dsl.pipeline(name="tensorflow-train-pipeline")
def tensorflow_pipeline(
    project_id: str,
    project_location: str,
    pipeline_files_gcs_path: str,
    ingestion_project_id: str,
    tfdv_schema_filename: str,
    tfdv_train_stats_path: str,
    model_name: str,
    model_label: str,
    dataset_id: str,
    dataset_location: str,
    ingestion_dataset_id: str,
    timestamp: str,
):
    """
    Tensorflow Keras training pipeline which:
     1. Extracts a dataset from BQ
     2. Generates statistics
     3. Performs the validation of it against a tfdv schema
     4. Alert if there are any anomalies
     5. Trains the model via Vertex AI CustomTrainJob
     6. Evaluates the model against the current champion model
     7. If better than the current champion model it pushes the model to
     Vertex AI Models

    Args:
        project_id (str): project id of the Google Cloud project
        project_location (str): location of the Google Cloud project
        pipeline_files_gcs_path (str): GCS path where the pipeline files are located
        ingestion_project_id (str): project id containing the source bigquery data
            for ingestion. This can be the same as `project_id` if the source data is
            in the same project where the ML pipeline is executed.
        tfdv_schema_filename (str): filename of schema generated by tfdv
            (in assets directory)
        tfdv_train_stats_path (str): path for statistics generated by tfdv
        model_name (str): name of model
        model_label (str): label of model
        dataset_id (str): id of BQ dataset used to store all staging data & predictions
        dataset_location (str): location of dataset
        ingestion_dataset_id (str): dataset id of ingestion data
        timestamp (str): Optional. Empty or a specific timestamp in ISO 8601 format
            (YYYY-MM-DDThh:mm:ss.sss±hh:mm or YYYY-MM-DDThh:mm:ss).
            If any time part is missing, it will be regarded as zero.

    Returns:
        None
    """

    # Create variables to ensure the same arguments are passed
    # into different components of the pipeline
    label_column_name = "total_fare"
    pred_column_name = "predictions"
    metrics_names = ["MeanSquaredError"]
    custom_metrics = {
        "SquaredPearson": "squared_pearson",
    }
    # Define path to TFMA custom metric modules. Set to None if no custom metric used
    tfma_custom_metrics_path = (
        f"{pipeline_files_gcs_path}/training/assets/tfma_custom_metrics"
    )
    file_pattern = ""  # e.g. "files-*.csv", used as file pattern on storage
    time_column = "trip_start_timestamp"
    ingestion_table = "taxi_trips"
    table_suffix = "_tf_training"  # suffix to table names
    ingested_table = "ingested_data" + table_suffix
    preprocessed_table = "preprocessed_data" + table_suffix
    train_table = "train_data" + table_suffix
    valid_table = "valid_data" + table_suffix
    test_table = "test_data" + table_suffix

    # generate sql queries which are used in ingestion and preprocessing
    # operations

    queries_folder = pathlib.Path(__file__).parent / "queries"

    time_filter = get_current_time(timestamp=timestamp).set_display_name(
        "Get time filter for ingestion query"
    )

    ingest_query = generate_query(
        queries_folder / "ingest.sql",
        source_dataset=f"{ingestion_project_id}.{ingestion_dataset_id}",
        source_table=ingestion_table,
        filter_column=time_column,
        target_column=label_column_name,
        filter_start_value=time_filter.output,
    )
    split_train_query = generate_query(
        queries_folder / "sample.sql",
        source_dataset=dataset_id,
        source_table=ingested_table,
        num_lots=10,
        lots=tuple(range(8)),
    )
    split_valid_query = generate_query(
        queries_folder / "sample.sql",
        source_dataset=dataset_id,
        source_table=ingested_table,
        num_lots=10,
        lots="(8)",
    )
    split_test_query = generate_query(
        queries_folder / "sample.sql",
        source_dataset=dataset_id,
        source_table=ingested_table,
        num_lots=10,
        lots="(9)",
    )
    data_cleaning_query = generate_query(
        queries_folder / "engineer_features.sql",
        source_dataset=dataset_id,
        source_table=train_table,
    )

    # data ingestion and preprocessing operations

    kwargs = dict(
        bq_client_project_id=project_id,
        destination_project_id=project_id,
        dataset_id=dataset_id,
        dataset_location=dataset_location,
        query_job_config=json.dumps(dict(write_disposition="WRITE_TRUNCATE")),
    )
    ingest = bq_query_to_table(
        query=ingest_query, table_id=ingested_table, **kwargs
    ).set_display_name("Ingest data")

    # exporting data to GCS from BQ

    ingested_dataset = (
        extract_bq_to_dataset(
            bq_client_project_id=project_id,
            source_project_id=project_id,
            dataset_id=dataset_id,
            table_name=ingested_table,
            dataset_location=dataset_location,
            file_pattern=file_pattern,
        )
        .after(ingest)
        .set_display_name("Extract data to storage")
    )
    # validate data

    # generate statistics
    gen_statistics = generate_statistics(
        dataset=ingested_dataset.outputs["dataset"],
        file_pattern=file_pattern,
    ).set_display_name("Generate data statistics")
    # visualise statistics
    visualised_statistics = visualise_statistics(
        statistics=gen_statistics.output, statistics_name="Data Statistics"
    ).set_display_name("Visualise data statistics")

    # Construct schema_path from base GCS path + filename
    tfdv_schema_path = (
        f"{pipeline_files_gcs_path}/training/assets/{tfdv_schema_filename}"
    )
    # validate data schema
    validated_schema = validate_schema(
        statistics=gen_statistics.output, schema_path=tfdv_schema_path
    ).set_display_name("Validate data schema")
    # show anomalies and fail if any anomalies were detected
    anomalies = show_anomalies(
        anomalies=validated_schema.output, fail_on_anomalies=True
    ).set_display_name("Show anomalies")

    split_train_data = (
        bq_query_to_table(query=split_train_query, table_id=train_table, **kwargs)
        .after(anomalies)
        .set_display_name("Split train data")
    )
    split_valid_data = (
        bq_query_to_table(query=split_valid_query, table_id=valid_table, **kwargs)
        .after(anomalies)
        .set_display_name("Split validation data")
    )
    split_test_data = (
        bq_query_to_table(query=split_test_query, table_id=test_table, **kwargs)
        .after(anomalies)
        .set_display_name("Split test data")
    )
    data_cleaning = (
        bq_query_to_table(
            query=data_cleaning_query, table_id=preprocessed_table, **kwargs
        )
        .after(split_train_data)
        .set_display_name("Data Cleansing")
    )

    # data extraction to gcs

    train_dataset = (
        extract_bq_to_dataset(
            bq_client_project_id=project_id,
            source_project_id=project_id,
            dataset_id=dataset_id,
            table_name=preprocessed_table,
            dataset_location=dataset_location,
            file_pattern=file_pattern,
        )
        .after(data_cleaning)
        .set_display_name("Extract train data to storage")
    )
    valid_dataset = (
        extract_bq_to_dataset(
            bq_client_project_id=project_id,
            source_project_id=project_id,
            dataset_id=dataset_id,
            table_name=valid_table,
            dataset_location=dataset_location,
            file_pattern=file_pattern,
        )
        .after(split_valid_data)
        .set_display_name("Extract validation data to storage")
    )
    test_dataset = (
        extract_bq_to_dataset(
            bq_client_project_id=project_id,
            source_project_id=project_id,
            dataset_id=dataset_id,
            table_name=test_table,
            dataset_location=dataset_location,
            file_pattern=file_pattern,
        )
        .after(split_test_data)
        .set_display_name("Extract test data to storage")
    )

    # train tensorflow model
    model_params = dict(
        batch_size=100,
        epochs=5,
        loss_fn="MeanSquaredError",
        optimizer="Adam",
        metrics=metrics_names,
        learning_rate=0.01,
        hidden_units=[(64, "relu"), (32, "relu")],
        distribute_strategy="single",
        early_stopping_epochs=5,
    )

    train_model = (
        custom_train_job(
            training_data=train_dataset.outputs["dataset"],
            validation_data=valid_dataset.outputs["dataset"],
            file_pattern=file_pattern,
            label_name=label_column_name,
            model_params=json.dumps(model_params),
            # Training wrapper specific parameters
            project=project_id,
            location=project_location,
        )
        .after(train_dataset)
        .set_display_name("Vertex Training for TF model")
    )

    model = train_model.outputs["model"]
    metrics_artifact = train_model.outputs["metrics_artifact"]

    # predict test dataset using trained model
    challenger_predictions = predict_tensorflow_model(
        test_dataset.outputs["dataset"],
        model,
        label_column_name=label_column_name,
        predictions_column_name=pred_column_name,
        file_pattern=file_pattern,
    ).set_display_name("Predict test data")

    # Calculate evaluation metrics of challenger model
    challenger_eval_metrics = calculate_eval_metrics(
        # Generic inputs
        csv_file=challenger_predictions.output,
        metrics_names=json.dumps(metrics_names),
        label_column_name=label_column_name,
        pred_column_name=pred_column_name,
        # Custom metric config
        project_id=project_id,
        custom_metrics=json.dumps(custom_metrics),
        custom_metrics_path=tfma_custom_metrics_path,
        # Slicing config
        slicing_specs=[
            'feature_keys: ["payment_type"]',
            'feature_keys: ["payment_type", "company"]',
            'feature_values: [{key: "payment_type", value: "Cash"}]',
            'feature_keys: ["company", "dayofweek"] '  # Note this is the same line
            + 'feature_values: [{key: "payment_type", value: "Cash"}]',
        ],
    ).set_display_name("Evaluate test metrics for challenger model")

    # Lookup champion model
    champion_model_lookup = lookup_model(
        model_label=f"{model_label}",
        model_name=model_name,
        project_location=project_location,
        project_id=project_id,
        fail_on_model_not_found=False,
    ).set_display_name("Lookup champion model")

    champion_model_resource_name = champion_model_lookup.outputs["Output"]

    # If there is no champion model, upload challenger model
    with dsl.Condition(
        name="champion-model-not-exists",
        condition=(champion_model_resource_name == ""),
    ):

        # Upload model
        upload_model(
            display_name=model_name,
            serving_container_image_uri=TF_SERVING_CONTAINER_IMAGE_URI,
            model=model,
            project_id=project_id,
            project_location=project_location,
            description="",
            labels=json.dumps(
                dict(
                    model_label=f"{model_label}",
                    pipeline_job_uuid="{{$.pipeline_job_uuid}}",
                    pipeline_job_name="{{$.pipeline_job_name}}",
                )
            ),
        ).set_display_name("Upload challenger model")

        # Copy training stats to well-known location (to be consumed during prediction)
        copy_artifact(
            src_artifact=gen_statistics.output, des_uri=tfdv_train_stats_path
        ).set_display_name("Copy train statistics")

    with dsl.Condition(
        name="champion-model-exists",
        condition=(champion_model_resource_name != ""),
    ):

        exported_champion_model = export_model(
            model_resource_name=champion_model_resource_name,
        ).set_display_name("Export champion model")

        champion_model = exported_champion_model.outputs["model"]

        champion_predictions = predict_tensorflow_model(
            test_dataset.outputs["dataset"],
            champion_model,
            label_column_name=label_column_name,
            predictions_column_name=pred_column_name,
            file_pattern=file_pattern,
        ).set_display_name("Predict test data")

        # Calculate evaluation metrics of champion model
        champion_eval_metrics = calculate_eval_metrics(
            # Generic inputs
            csv_file=champion_predictions.output,
            metrics_names=json.dumps(metrics_names),
            label_column_name=label_column_name,
            pred_column_name=pred_column_name,
            # Custom metric config
            project_id=project_id,
            custom_metrics=json.dumps(custom_metrics),
            custom_metrics_path=tfma_custom_metrics_path,
            # Slicing config
            slicing_specs=[
                'feature_keys: ["payment_type"]',
                'feature_keys: ["payment_type", "company"]',
                'feature_values: [{key: "payment_type", value: "Cash"}]',
                'feature_keys: ["company", "dayofweek"] '  # Note this is the same line
                + 'feature_values: [{key: "payment_type", value: "Cash"}]',
            ],
        ).set_display_name("Evaluate test metrics for the champion model")

        # Determine if challenger model is better than champion model
        compare_champion_challenger_models = compare_models(
            metrics=champion_eval_metrics.outputs["eval_metrics"],
            other_metrics=challenger_eval_metrics.outputs["eval_metrics"],
            evaluation_metric="mean_squared_error",
            higher_is_better=False,
            absolute_difference=0.0,
        ).set_display_name("Compare champion and challenger models")

        # Upload challenger model if it is better than champion model
        with dsl.Condition(
            name="challenger-better-than-champion",
            condition=(compare_champion_challenger_models.output == "true"),
        ):

            # Upload model
            upload_model(
                display_name=model_name,
                serving_container_image_uri=TF_SERVING_CONTAINER_IMAGE_URI,
                model=model,
                project_id=project_id,
                project_location=project_location,
                description="",
                labels=json.dumps(
                    dict(
                        model_label=f"{model_label}",
                        pipeline_job_uuid="{{$.pipeline_job_uuid}}",
                        pipeline_job_name="{{$.pipeline_job_name}}",
                    )
                ),
            ).set_display_name("Upload challenger model")

            # Copy training stats to well-known location
            # (to be consumed during prediction)
            copy_artifact(
                src_artifact=gen_statistics.output, des_uri=tfdv_train_stats_path
            ).set_display_name("Copy train statistics to GCS for challenger model")


def compile():
    """
    Uses the kfp compiler package to compile the pipeline function into a workflow yaml

    Args:
        None

    Returns:
        None
    """
    compiler.Compiler().compile(
        pipeline_func=tensorflow_pipeline,
        package_path="training.json",
        type_check=False,
    )


if __name__ == "__main__":
    custom_train_job = create_custom_training_job_op_from_component(
        component_spec=train_tensorflow_model,
        replica_count=1,
        machine_type="n1-standard-4",
    )
    compile()