Added section on removing global variables
JaumeAmoresDS committed Apr 3, 2024
1 parent 3c0f3c2 commit 868963b
Showing 1 changed file with 291 additions and 4 deletions.

posts/data_science/hello_world.ipynb
@@ -3978,8 +3978,7 @@
"id": "35ca4a4d",
"metadata": {},
"source": [
"\n",
"### Optional changes\n",
"### Removing global variables\n",
"\n",
"Let's introduce two optional changes: \n",
"\n",
@@ -3992,10 +3991,298 @@
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": null,
"id": "3b093697",
"metadata": {},
"source": []
"outputs": [],
"source": [
"%%writefile hello_world_pipeline.py\n",
"# -------------------------------------------------------------------------------------\n",
"# Imports\n",
"# -------------------------------------------------------------------------------------\n",
"# Standard imports\n",
"import os\n",
"import argparse\n",
"\n",
"# Third-party imports\n",
"import pandas as pd\n",
"from sklearn import Bunch\n",
"\n",
"# AML imports\n",
"from azure.ai.ml import (\n",
" command,\n",
" dsl,\n",
" Input,\n",
" Output,\n",
" MLClient\n",
")\n",
"from azure.identity import DefaultAzureCredential\n",
"\n",
"# -------------------------------------------------------------------------------------\n",
"# Connection\n",
"# -------------------------------------------------------------------------------------\n",
"def connect ():\n",
" # authenticate\n",
" credential = DefaultAzureCredential()\n",
"\n",
" # Get a handle to the workspace\n",
" ml_client = MLClient.from_config (\n",
" credential=credential,\n",
" )\n",
" return ml_client\n",
"\n",
"# -------------------------------------------------------------------------------------\n",
"# Pipeline definition\n",
"# -------------------------------------------------------------------------------------\n",
"@dsl.pipeline(\n",
" compute=\"serverless\", # \"serverless\" value runs pipeline on serverless compute\n",
" description=\"E2E hello world pipeline with input\",\n",
")\n",
"def three_components_pipeline(\n",
" # Preprocessing component parameters, first component:\n",
" preprocessing_training_input_file: str,\n",
" preprocessing_training_output_filename: str,\n",
" x: int,\n",
" \n",
" # Preprocessing component parameters, second component:\n",
" preprocessing_test_input_file: str,\n",
" preprocessing_test_output_filename: str,\n",
" \n",
" # Training component parameters:\n",
" training_output_filename: str, \n",
" \n",
" # Inference component parameters:\n",
" inference_output_filename: str,\n",
"):\n",
" \"\"\"\n",
" Third pipeline: preprocessing, training and inference.\n",
" \n",
" Parameters\n",
" ----------\n",
" preprocessing_training_input_file: str\n",
" Path to file containing training data to be preprocessed.\n",
" preprocessing_training_output_filename: str\n",
" Name of file containing the preprocessed, training data.\n",
" x: int\n",
" Number to add to input data for preprocessing it.\n",
" preprocessing_test_input_file: str\n",
" Path to file containing test data to be preprocessed.\n",
" preprocessing_test_output_filename: str\n",
" Name of file containing the preprocessed, test data.\n",
" training_output_filename: str\n",
" Name of file containing the trained model.\n",
" inference_output_filename: str\n",
" Name of file containing the output data with inference results.\n",
" \"\"\"\n",
" # -------------------------------------------------------------------------------------\n",
" # Preprocessing\n",
" # -------------------------------------------------------------------------------------\n",
" # Interface\n",
" preprocessing_component = command(\n",
" inputs=dict(\n",
" input_file=Input (type=\"uri_file\"),\n",
" x=Input (type=\"number\"),\n",
" output_filename=Input (type=\"string\"),\n",
" ),\n",
" outputs=dict(\n",
" output_folder=Output (type=\"uri_folder\"),\n",
" ),\n",
" code=f\"./preprocessing/\", # location of source code: in this case, the root folder\n",
" command=\"python preprocessing.py \"\n",
" \"--input_file ${{inputs.input_file}} \"\n",
" \"-x ${{inputs.x}} \"\n",
" \"--output_folder ${{outputs.output_folder}} \"\n",
" \"--output_filename ${{inputs.output_filename}}\",\n",
" environment=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest\",\n",
" display_name=\"Pre-processing\",\n",
" )\n",
" #preprocessing_component = ml_client.create_or_update(preprocessing_component.component)\n",
"\n",
" # Instantiation\n",
" preprocessing_training_job = preprocessing_component(\n",
" input_file=preprocessing_training_input_file,\n",
" #output_folder: automatically determined\n",
" output_filename=preprocessing_training_output_filename,\n",
" x=x,\n",
" )\n",
" preprocessing_test_job = preprocessing_component(\n",
" input_file=preprocessing_test_input_file,\n",
" #output_folder: automatically determined\n",
" output_filename=preprocessing_test_output_filename,\n",
" x=x,\n",
" )\n",
"\n",
" # -------------------------------------------------------------------------------------\n",
" # Training component\n",
" # -------------------------------------------------------------------------------------\n",
" # Interface\n",
" training_component = command(\n",
" inputs=dict(\n",
" input_folder=Input (type=\"uri_folder\"),\n",
" input_filename=Input (type=\"string\"),\n",
" output_filename=Input (type=\"string\"),\n",
" ),\n",
" outputs=dict(\n",
" output_folder=Output (type=\"uri_folder\"),\n",
" ),\n",
" code=f\"./training/\", # location of source code: in this case, the root folder\n",
" command=\"python training.py \"\n",
" \"--input_folder ${{inputs.input_folder}} \"\n",
" \"--input_filename ${{inputs.input_filename}} \"\n",
" \"--output_folder ${{outputs.output_folder}} \"\n",
" \"--output_filename ${{inputs.output_filename}}\",\n",
" environment=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest\",\n",
" display_name=\"Training\",\n",
" )\n",
" #training_component = ml_client.create_or_update(training_component.component)\n",
"\n",
" # Instantiation\n",
" training_job = training_component(\n",
" input_folder=preprocessing_training_job.outputs.output_folder,\n",
" input_filename=preprocessing_training_output_filename,\n",
" #output_folder: automatically determined\n",
" output_filename=training_output_filename,\n",
" )\n",
"\n",
" # -------------------------------------------------------------------------------------\n",
" # Inference\n",
" # -------------------------------------------------------------------------------------\n",
" # Interface\n",
" inference_component = command(\n",
" inputs=dict(\n",
" preprocessed_input_folder=Input (type=\"uri_folder\"),\n",
" preprocessed_input_filename=Input (type=\"string\"),\n",
" model_input_folder=Input (type=\"uri_folder\"),\n",
" model_input_filename=Input (type=\"string\"),\n",
" output_filename=Input (type=\"string\"),\n",
" ),\n",
" outputs=dict(\n",
" output_folder=Output (type=\"uri_folder\"),\n",
" ),\n",
" code=f\"./inference/\", # location of source code: in this case, the root folder\n",
" command=\"python inference.py \" \n",
" \"--preprocessed_input_folder ${{inputs.preprocessed_input_folder}} \"\n",
" \"--preprocessed_input_filename ${{inputs.preprocessed_input_filename}} \"\n",
" \"--model_input_folder ${{inputs.model_input_folder}} \"\n",
" \"--model_input_filename ${{inputs.model_input_filename}} \"\n",
" \"--output_folder ${{outputs.output_folder}} \"\n",
" \"--output_filename ${{inputs.output_filename}} \",\n",
"\n",
" environment=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest\",\n",
" display_name=\"inference\",\n",
" )\n",
" #inference_component = ml_client.create_or_update(inference_component.component)\n",
"\n",
" # Instantiation\n",
" inference_job = inference_component(\n",
" preprocessed_input_folder=preprocessing_test_job.outputs.output_folder,\n",
" preprocessed_input_filename=preprocessing_test_output_filename,\n",
" model_input_folder=training_job.outputs.output_folder,\n",
" model_input_filename=training_output_filename,\n",
" #output_folder: automatically determined\n",
" output_filename=inference_output_filename,\n",
" )\n",
"\n",
"# -------------------------------------------------------------------------------------\n",
"# Pipeline running\n",
"# -------------------------------------------------------------------------------------\n",
"def run_pipeline (\n",
" config_path: str=\"./pipeline_input.json\",\n",
"):\n",
"\n",
" # Read config json file\n",
" with open (config_path,\"rt\") as config_file\n",
" config = json.load (config_file)\n",
"\n",
" # Convert config dictionary into a Bunch object.\n",
" # This allows to get access to fields as object attributes\n",
" # Which I find more convenient.\n",
" config = Bunch (**config)\n",
"\n",
" # Connect to AML client\n",
" ml_client = connect ()\n",
"\n",
" # Build pipeline \n",
" three_components_pipeline = three_components_pipeline(\n",
" # first preprocessing component\n",
" preprocessing_training_input_file=Input(type=\"uri_file\", path=config.preprocessing_training_input_file),\n",
" preprocessing_training_output_filename=config.preprocessing_training_output_filename,\n",
" x=config.x,\n",
" \n",
" # second preprocessing component\n",
" preprocessing_test_input_file=Input(type=\"uri_file\", path=config.preprocessing_test_input_file),\n",
" preprocessing_test_output_filename=config.preprocessing_test_output_filename,\n",
" \n",
" # Training component parameters:\n",
" training_output_filename=config.training_output_filename,\n",
" \n",
" # Inference component parameters:\n",
" inference_output_filename=config.inference_output_filename,\n",
" )\n",
"\n",
" three_components_pipeline_job = ml_client.jobs.create_or_update(\n",
" three_components_pipeline,\n",
" # Project's name\n",
" experiment_name=config.experiment_name,\n",
" )\n",
"\n",
" # ----------------------------------------------------\n",
" # Pipeline running\n",
" # ----------------------------------------------------\n",
" ml_client.jobs.stream(three_components_pipeline_job.name)\n",
"\n",
"# -------------------------------------------------------------------------------------\n",
"# Parsing\n",
"# -------------------------------------------------------------------------------------\n",
"def parse_args ():\n",
" \"\"\"Parses input arguments\"\"\"\n",
" \n",
" parser = argparse.ArgumentParser()\n",
" parser.add_argument (\n",
" \"--config-path\", \n",
" type=str, \n",
" default=\"pipeline_input.json\",\n",
" help=\"Path to config file specifying pipeline input parameters.\",\n",
" )\n",
" parser.add_argument (\n",
" \"--experiment-name\", \n",
" type=str, \n",
" default=\"hello-world-experiment\",\n",
" help=\"Name of experiment.\",\n",
" )\n",
"\n",
" args = parser.parse_args()\n",
" \n",
" return args\n",
"\n",
"\n",
"# -------------------------------------------------------------------------------------\n",
"# main\n",
"# -------------------------------------------------------------------------------------\n",
"def main ():\n",
" \"\"\"Parses arguments and runs pipeline\"\"\"\n",
" args = parse_args ()\n",
" run_pipeline (\n",
" args.config_path,\n",
" args.experiment_name,\n",
" )\n",
"\n",
"# -------------------------------------------------------------------------------------\n",
"# -------------------------------------------------------------------------------------\n",
"if __name__ == \"__main__\":\n",
" main ()"
]
},
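{
"cell_type": "markdown",
"id": "usage-note",
"metadata": {},
"source": [
"Assuming the cell above has written `hello_world_pipeline.py`, the script can be launched from a terminal; a sketch using the flags defined in `parse_args` (the config path is wherever your `pipeline_input.json` lives):\n",
"\n",
"```bash\n",
"python hello_world_pipeline.py --config-path ./pipeline_input.json --experiment-name hello-world-experiment\n",
"```"
]
},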
{
"cell_type": "markdown",
"id": "286cb55b",
"metadata": {},
"source": [
"## Further refactorings\n",
"\n",
"- Create more structure on config input file: one dictionary per pipeline component."
]
}
],
"metadata": {
