From cfec46dde650c045661ae710cf847496bf31c8ba Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Tue, 27 Oct 2020 16:29:21 -0400 Subject: [PATCH 1/5] feat: add container/CI for openaddresses API --- .github/workflows/build_openaddresses.yml | 51 +++ .github/workflows/publish_openaddresses.yml | 57 +++ openaddresses-batch-machine/README.md | 9 + .../container/Dockerfile | 28 ++ .../pipeline/components/copy_to_minio.yaml | 35 ++ .../components/openaddresses_get_data.yaml | 26 ++ .../pipeline/get_openaddresses_data.ipynb | 423 ++++++++++++++++++ .../pipeline/utilities.py | 135 ++++++ 8 files changed, 764 insertions(+) create mode 100644 .github/workflows/build_openaddresses.yml create mode 100644 .github/workflows/publish_openaddresses.yml create mode 100644 openaddresses-batch-machine/README.md create mode 100644 openaddresses-batch-machine/container/Dockerfile create mode 100644 openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml create mode 100644 openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml create mode 100644 openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb create mode 100644 openaddresses-batch-machine/pipeline/utilities.py diff --git a/.github/workflows/build_openaddresses.yml b/.github/workflows/build_openaddresses.yml new file mode 100644 index 0000000..9bbcc54 --- /dev/null +++ b/.github/workflows/build_openaddresses.yml @@ -0,0 +1,51 @@ +# This workflow will build a docker container, publish it to Azure Container Registry, and deploy it to Azure Kubernetes Service using a helm chart. +# +# https://github.com/Azure/actions-workflow-samples/tree/master/Kubernetes +# +# To configure this workflow: +# +# 1. Set up the following secrets in your workspace: +# a. REGISTRY_USERNAME with ACR username +# b. REGISTRY_PASSWORD with ACR Password +# c. AZURE_CREDENTIALS with the output of `az ad sp create-for-rbac --sdk-auth` +# +# 2. Change the values for the REGISTRY_NAME, CLUSTER_NAME, CLUSTER_RESOURCE_GROUP and NAMESPACE environment variables (below). 
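+#
+# For example (values are placeholders, adjust to your own subscription), the
+# AZURE_CREDENTIALS secret can be generated with something like:
+#
+#   az ad sp create-for-rbac --sdk-auth --role contributor \
+#     --scopes /subscriptions/<SUBSCRIPTION-ID>/resourceGroups/<RESOURCE-GROUP>
+#
+# and the ACR username/password can be read (if the registry admin account is
+# enabled) with:
+#
+#   az acr credential show --name <REGISTRY_NAME>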
+name: build_openaddresses +on: [pull_request] + +# Environment variables available to all jobs and steps in this workflow +env: + REGISTRY_NAME: k8scc01covidacr + CLUSTER_NAME: k8s-cancentral-02-covid-aks + CLUSTER_RESOURCE_GROUP: k8s-cancentral-01-covid-aks +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + + # Connect to Azure Container registry (ACR) + - uses: azure/docker-login@v1 + with: + login-server: ${{ env.REGISTRY_NAME }}.azurecr.io + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + - name: Free disk space + run: | + sudo swapoff -a + sudo rm -f /swapfile + sudo apt clean + docker rmi $(docker image ls -aq) + df -h + + - run: | + docker build -f ./openaddresses-batch-machine/openaddresses-batch-machine/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/openaddresses-batch-machine + docker tag ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest + + # Scan image for vulnerabilities + - uses: Azure/container-scan@v0 + with: + image-name: ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} + severity-threshold: CRITICAL + run-quality-checks: false diff --git a/.github/workflows/publish_openaddresses.yml b/.github/workflows/publish_openaddresses.yml new file mode 100644 index 0000000..6e550eb --- /dev/null +++ b/.github/workflows/publish_openaddresses.yml @@ -0,0 +1,57 @@ +# This workflow will build a docker container, publish it to Azure Container Registry, and deploy it to Azure Kubernetes Service using a helm chart. +# +# https://github.com/Azure/actions-workflow-samples/tree/master/Kubernetes +# +# To configure this workflow: +# +# 1. Set up the following secrets in your workspace: +# a. REGISTRY_USERNAME with ACR username +# b. REGISTRY_PASSWORD with ACR Password +# c. AZURE_CREDENTIALS with the output of `az ad sp create-for-rbac --sdk-auth` +# +# 2. Change the values for the REGISTRY_NAME, CLUSTER_NAME, CLUSTER_RESOURCE_GROUP and NAMESPACE environment variables (below). 
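+#
+# Once published, the image can be pulled from ACR (given access to the
+# registry), for example:
+#
+#   az acr login --name k8scc01covidacr
+#   docker pull k8scc01covidacr.azurecr.io/daaas-openaddresses-batch-machine:latest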
+name: publish_openaddresses
+on:
+  push:
+    branches:
+      - master
+
+# Environment variables available to all jobs and steps in this workflow
+env:
+  REGISTRY_NAME: k8scc01covidacr
+  CLUSTER_NAME: k8s-cancentral-02-covid-aks
+  CLUSTER_RESOURCE_GROUP: k8s-cancentral-01-covid-aks
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@master
+
+    # Connect to Azure Container registry (ACR)
+    - uses: azure/docker-login@v1
+      with:
+        login-server: ${{ env.REGISTRY_NAME }}.azurecr.io
+        username: ${{ secrets.REGISTRY_USERNAME }}
+        password: ${{ secrets.REGISTRY_PASSWORD }}
+
+    - name: Free disk space
+      run: |
+        sudo swapoff -a
+        sudo rm -f /swapfile
+        sudo apt clean
+        docker rmi $(docker image ls -aq)
+        df -h
+
+    # Container build and push to a Azure Container registry (ACR)
+    - run: |
+        docker build -f ./openaddresses-batch-machine/openaddresses-batch-machine/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/openaddresses-batch-machine
+        docker tag ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest
+        docker push ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }}
+        docker push ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest
+
+    # Scan image for vulnerabilities
+    - uses: Azure/container-scan@v0
+      with:
+        image-name: ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }}
+        severity-threshold: CRITICAL
+        run-quality-checks: false
diff --git a/openaddresses-batch-machine/README.md b/openaddresses-batch-machine/README.md
new file mode 100644
index 0000000..04e8868
--- /dev/null
+++ b/openaddresses-batch-machine/README.md
@@ -0,0 +1,9 @@
+# Summary
+
+This repo builds and provides access to [this modification](https://github.com/JosephKuchar/batch-machine) of [OpenAddresses batch-machine](https://github.com/openaddresses/batch-machine). The container build is similar to the one specified by the OpenAddresses repo, but it is pinned to a specific commit from JosephKuchar/batch-machine and restricts the user to a non-root account.
+
+# Usage
+
+See `../.github/workflows/build_openaddresses.yml` (or `publish_openaddresses.yml`) for CI/build details; both workflows build `./container/Dockerfile`.
+
+See `./pipeline/get_openaddresses_data.ipynb` for example usage. Typically, the easiest way to invoke this is through a Kubeflow Pipeline.
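+
+For quick local testing you can also run the image directly. A minimal sketch
+(the mounted path and source-file name are illustrative, not part of this repo):
+
+```sh
+docker run --rm -v "$(pwd)":/data \
+    k8scc01covidacr.azurecr.io/daaas-openaddresses-batch-machine:latest \
+    openaddr-process-one /data/my-source.json /data/output --layer addresses --layersource city
+```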
diff --git a/openaddresses-batch-machine/container/Dockerfile b/openaddresses-batch-machine/container/Dockerfile
new file mode 100644
index 0000000..154a055
--- /dev/null
+++ b/openaddresses-batch-machine/container/Dockerfile
@@ -0,0 +1,28 @@
+FROM alpine:3.11
+
+ENV BATCH_MACHINE_PATH=/batch-machine
+
+RUN apk add nodejs yarn git python3 python3-dev py3-pip \
+    py3-gdal gdal gdal-dev make bash sqlite-dev zlib-dev \
+    postgresql-libs gcc g++ musl-dev postgresql-dev cairo \
+    py3-cairo file
+
+# Download and install Tippecanoe
+RUN git clone -b 1.35.0 https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe && \
+    cd /tmp/tippecanoe && \
+    make && \
+    PREFIX=/usr/local make install && \
+    rm -rf /tmp/tippecanoe
+
+# Get/install batch-machine
+RUN git clone https://github.com/JosephKuchar/batch-machine $BATCH_MACHINE_PATH && \
+    pip3 install $BATCH_MACHINE_PATH
+
+# Restrict to non-root access
+RUN addgroup appgroup && \
+    adduser -S -G appgroup appuser
+USER appuser
+
+COPY test.json test.json
+
+CMD python3 ${BATCH_MACHINE_PATH}/test.py
diff --git a/openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml b/openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml
new file mode 100644
index 0000000..352fbeb
--- /dev/null
+++ b/openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml
@@ -0,0 +1,35 @@
+name: Copy to Minio
+inputs:
+- {name: Minio URL, type: URL, description: 'Minio instance URL, starting with http://'}
+- {name: Minio access key, type: String}
+- {name: Minio secret key, type: String}
+- {name: Local source, description: 'Local source of upload'}
+- {name: Minio destination, type: String, description: 'Minio destination location, in the format bucket-name/path/to/destination'}
+- {name: Flags, optional: true, default: '', type: String, description: 'Flags/options passed to mc'}
+outputs:
+- {name: Minio destination, type: String}
+- {name: md5sum, type: String, description: 'A combined md5sum of all data passed to MinIO'}
+implementation:
+  container:
+    image: minio/mc
+    command:
+    - sh
+    - -ex
+    - -c
+    - |
+      FLAGS=$7
+      mkdir -p "$(dirname "$5")"
+      mkdir -p "$(dirname "$6")"
+      mc config host add my_minio $0 $1 $2
+      mc cp $FLAGS $3 my_minio/$4
+      echo "$4" > "$5"
+      # Use find in case we retrieved a directory - this gets all files in the dir
+      find $3 -type f -exec md5sum {} \; | sort -k 2 | md5sum | awk '{print $1}' > $6
+    - {inputValue: Minio URL}
+    - {inputValue: Minio access key}
+    - {inputValue: Minio secret key}
+    - {inputPath: Local source}
+    - {inputValue: Minio destination}
+    - {outputPath: Minio destination}
+    - {outputPath: md5sum}
+    - {inputValue: Flags}
diff --git a/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml b/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml
new file mode 100644
index 0000000..2fb814a
--- /dev/null
+++ b/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml
@@ -0,0 +1,26 @@
+name: Download Data from OpenAddresses
+inputs:
+- {name: source_json, type: JsonObject, description: 'OpenAddresses source specification in JSON format'}
+- {name: args, type: String, optional: true, default: '', description: 'Optional command line args to pass to openaddr-process-one, such as "--layer addresses --layersource city"'}
+outputs:
+- {name: data, description: 'All data downloaded from OpenAddresses call'}
+implementation:
+  container:
+    image: k8scc01covidacr.azurecr.io/daaas-openaddresses-batch-machine
+    command:
+    - sh
+    - -ex
+    - -c
+    - |
+      SOURCE_JSON=$0
+      ARGS=$1
+      OUTPUT_PATH=$2
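+
+      # $0, $1 and $2 above are the positional arguments that Kubeflow Pipelines
+      # supplies from the {inputPath: source_json}, {inputValue: args} and
+      # {outputPath: data} entries listed at the bottom of this component.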
mkdir -p $OUTPUT_PATH + + cat $SOURCE_JSON + + openaddr-process-one $SOURCE_JSON $OUTPUT_PATH $ARGS + + - {inputPath: source_json} + - {inputValue: args} + - {outputPath: data} diff --git a/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb b/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb new file mode 100644 index 0000000..9240385 --- /dev/null +++ b/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb @@ -0,0 +1,423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary\n", + "\n", + "Uses the OpenAddresses tooling to \n", + "\n", + "* download data based on a user-defined JSON source specification\n", + "* save the data in a user-defined location in MinIO\n", + "\n", + "Usage: \n", + "\n", + "* Edit the below settings" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-21T17:59:49.579078Z", + "iopub.status.busy": "2020-10-21T17:59:49.578558Z", + "iopub.status.idle": "2020-10-21T17:59:49.583721Z", + "shell.execute_reply": "2020-10-21T17:59:49.583075Z", + "shell.execute_reply.started": "2020-10-21T17:59:49.579031Z" + } + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "import kfp\n", + "from kfp.components import load_component_from_file\n", + "from kfp import dsl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# User settings:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-21T17:59:47.088665Z", + "iopub.status.busy": "2020-10-21T17:59:47.088273Z", + "iopub.status.idle": "2020-10-21T17:59:47.093738Z", + "shell.execute_reply": "2020-10-21T17:59:47.092523Z", + "shell.execute_reply.started": "2020-10-21T17:59:47.088621Z" + } + }, + "outputs": [], + "source": [ + "json_source_file = \"myfile.json\"\n", + "minio_tenant = 'minimal'\n", + "openaddresses_args = \"--layer addresses --layersource city\",\n", + "minio_output_uri = 'MINIO-BUCKET-NAME/path/to/storage/location'," + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Additional Inputs for Debugging/Testing:\n", + "\n", + "If testing/debugging, you can use the following demo JSON file. \n", + "\n", + "You can also use this method to write your own JSON file from the notebook if that makes sense for your workflow. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-20T18:23:07.972789Z", + "iopub.status.busy": "2020-10-20T18:23:07.972583Z", + "iopub.status.idle": "2020-10-20T18:23:08.043174Z", + "shell.execute_reply": "2020-10-20T18:23:08.042300Z", + "shell.execute_reply.started": "2020-10-20T18:23:07.972764Z" + } + }, + "outputs": [], + "source": [ + "# json_source_file = \"demo_calgary.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-20T18:23:08.044634Z", + "iopub.status.busy": "2020-10-20T18:23:08.044373Z", + "iopub.status.idle": "2020-10-20T18:23:08.055601Z", + "shell.execute_reply": "2020-10-20T18:23:08.054867Z", + "shell.execute_reply.started": "2020-10-20T18:23:08.044603Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting demo_calgary.json\n" + ] + } + ], + "source": [ + "# %%writefile $json_source_file\n", + "# {\n", + "# \"coverage\": {\n", + "# \"country\": \"ca\",\n", + "# \"state\": \"ab\",\n", + "# \"city\": \"Calgary\",\n", + "# \"geometry\": {\n", + "# \"type\": \"Point\",\n", + "# \"coordinates\": [\n", + "# -114.08,\n", + "# 51.08\n", + "# ]\n", + "# }\n", + "# },\n", + "# \"schema\": 2,\n", + "# \"layers\": {\n", + "# \"addresses\": [\n", + "# {\n", + "# \"name\": \"city\",\n", + "# \"data\": \"https://data.calgary.ca/api/views/uwj2-d2wc/rows.csv?accessType=DOWNLOAD\",\n", + "# \"website\": \"https://data.calgary.ca/\",\n", + "# \"license\": {\n", + "# \"url\": \"https://data.calgary.ca/stories/s/Open-Calgary-Terms-of-Use/u45n-7awa\",\n", + "# \"text\": \"Contains information licensed under the Open Government Licence – City of Calgary.\",\n", + "# \"attribution name\": \"City of Calgary\"\n", + "# },\n", + "# \"protocol\": \"http\",\n", + "# \"conform\": {\n", + "# \"format\": \"csv\",\n", + "# \"lat\": \"latitude\",\n", + "# \"lon\": \"longitude\",\n", + "# \"number\": {\n", + "# \"function\": \"join\",\n", + "# \"fields\": [\n", + "# \"HOUSE_NUMBER\",\n", + "# \"HOUSE_ALPHA\"\n", + "# ],\n", + "# \"separator\": \"\"\n", + "# },\n", + "# \"street\": [\n", + "# \"STREET_NAME\",\n", + "# \"STREET_TYPE\",\n", + "# \"STREET_QUAD\"\n", + "# ],\n", + "# \"str_name\": \"STREET_NAME\",\n", + "# \"str_type\": \"STREET_TYPE\",\n", + "# \"str_dir\": \"STREET_QUAD\",\n", + "# \"full_addr\": \"ADDRESS\"\n", + "# },\n", + "# \"attribution\": \"City of Calgary\"\n", + "# }\n", + "# ]\n", + "# }\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Component/Pipeline definitions" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-20T18:23:07.888183Z", + "iopub.status.busy": "2020-10-20T18:23:07.887892Z", + "iopub.status.idle": "2020-10-20T18:23:07.963361Z", + "shell.execute_reply": "2020-10-20T18:23:07.962600Z", + "shell.execute_reply.started": "2020-10-20T18:23:07.888152Z" + } + }, + "outputs": [], + "source": [ + "openaddresses_get_op = load_component_from_file(\n", + " \"./components/openaddresses_get_data.yaml\"\n", + ")\n", + "copy_to_minio_op = load_component_from_file(\n", + " \"./components/copy_to_minio.yaml\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-20T18:23:07.964790Z", + "iopub.status.busy": "2020-10-20T18:23:07.964546Z", + "iopub.status.idle": 
"2020-10-20T18:23:07.971652Z", + "shell.execute_reply": "2020-10-20T18:23:07.970994Z", + "shell.execute_reply.started": "2020-10-20T18:23:07.964763Z" + } + }, + "outputs": [], + "source": [ + "@dsl.pipeline(\n", + " name=\"Download OpenAddresses Data to Minio\"\n", + ")\n", + "def pipeline(\n", + " source_json,\n", + " minio_output_uri: str,\n", + " # TODO: Handle these automatically once multitenancy is available\n", + " minio_url,\n", + " minio_access_key: str,\n", + " minio_secret_key: str,\n", + " openaddresses_args: str = \"\",\n", + "):\n", + " operations = {}\n", + "\n", + " operations['Get Data'] = openaddresses_get_op(\n", + " source_json=source_json,\n", + " args=openaddresses_args,\n", + " ).set_image_pull_policy(\"Always\")\n", + "\n", + " operations['Store Data'] = copy_to_minio_op(\n", + " local_source=operations['Get Data'].outputs['data'],\n", + " minio_destination=minio_output_uri,\n", + " minio_url=minio_url,\n", + " minio_access_key=minio_access_key,\n", + " minio_secret_key=minio_secret_key,\n", + " flags=\"--recursive\", # Because outputs['data'] is a directory\n", + " )\n", + " # Set all operations display names to their key in the operations dict\n", + " for name, op in operations.items():\n", + " op.set_display_name(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build pipeline arguments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the JSON source file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-21T17:57:25.511835Z", + "iopub.status.busy": "2020-10-21T17:57:25.511436Z", + "iopub.status.idle": "2020-10-21T17:57:25.589954Z", + "shell.execute_reply": "2020-10-21T17:57:25.589133Z", + "shell.execute_reply.started": "2020-10-21T17:57:25.511793Z" + } + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'json_source_file' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjson_source_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfin\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msource_json\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfin\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'json_source_file' is not defined" + ] + } + ], + "source": [ + "with open(json_source_file, 'r') as fin:\n", + " source_json = json.load(fin)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-21T18:02:22.261643Z", + "iopub.status.busy": "2020-10-21T18:02:22.261379Z", + "iopub.status.idle": "2020-10-21T18:02:22.264434Z", + "shell.execute_reply": "2020-10-21T18:02:22.263870Z", + "shell.execute_reply.started": "2020-10-21T18:02:22.261618Z" + } + }, + "source": [ + "## Get MinIO credentials from the Notebook Server (could also specify these things manually)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, 
+ "metadata": { + "execution": { + "iopub.execute_input": "2020-10-20T18:23:08.072695Z", + "iopub.status.busy": "2020-10-20T18:23:08.072480Z", + "iopub.status.idle": "2020-10-20T18:23:08.082385Z", + "shell.execute_reply": "2020-10-20T18:23:08.081808Z", + "shell.execute_reply.started": "2020-10-20T18:23:08.072670Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to access minio credentials from:\n", + "/vault/secrets/minio-minimal-tenant1\n" + ] + } + ], + "source": [ + "# Get minio credentials using a helper\n", + "from utilities import get_minio_credentials\n", + "\n", + "minio_settings = get_minio_credentials(minio_tenant, strip_http=False)\n", + "minio_url = minio_settings[\"url\"]\n", + "minio_access_key = minio_settings[\"access_key\"]\n", + "minio_secret_key = minio_settings[\"secret_key\"]\n", + "\n", + "arguments = dict(\n", + " source_json=json.dumps(source_json),\n", + " openaddresses_args=openaddresses_args,\n", + " minio_output_uri=minio_output_uri,\n", + " minio_url=minio_url,\n", + " minio_access_key=minio_access_key,\n", + " minio_secret_key=minio_secret_key,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-20T18:23:08.083546Z", + "iopub.status.busy": "2020-10-20T18:23:08.083355Z", + "iopub.status.idle": "2020-10-20T18:23:08.272886Z", + "shell.execute_reply": "2020-10-20T18:23:08.272147Z", + "shell.execute_reply.started": "2020-10-20T18:23:08.083523Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "Experiment link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pipeline_run = kfp.Client().create_run_from_pipeline_func(\n", + " pipeline,\n", + " arguments=arguments,\n", + " run_name=\"openaddresses-get-store-data\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/openaddresses-batch-machine/pipeline/utilities.py b/openaddresses-batch-machine/pipeline/utilities.py new file mode 100644 index 0000000..b18823f --- /dev/null +++ b/openaddresses-batch-machine/pipeline/utilities.py @@ -0,0 +1,135 @@ +import re + + +def parse_env_var_def(s): + """ + Parse string defining a shell environment variable, returning name and val + + Returns (varname, value) if matching pattern, else None + """ + match = re.search(r"\s*(?<=export)\s+([^=]+)=(.*)", s) + if match: + lhs, rhs = match.groups() + + # Remove whitespace and any quoted strings' quotes + lhs = lhs.strip().strip('\'').strip('"') + rhs = rhs.strip().strip('\'').strip('"') + # If both sides exist, return them + if lhs and rhs: + return lhs, rhs + return None + + +def get_env_variables_from_file(filepath): + """ + Returns a dictionary of the environment variables defined in a file + """ + with open(filepath, 'r') as fin: + lines = fin.readlines() + lines = [parse_env_var_def(line) for line in 
lines] + + # Return a dict of lhs:rhs, skipping any lines that are blank + return {line[0]: line[1] for line in lines if line} + + +def get_minio_credentials(tenant, strip_http=True, verbose=True): + """ + Retrieve minio credentials from the vault (available from notebook server) + + Args: + strip_http (bool): If True, strips http:// from the start of the minio + URL + tenant (str): Minio tenant name, such as "minimal" or "premium" + + Returns: + (dict): Dict with keys: + url + access_key + secret_key + """ + vault = f"/vault/secrets/minio-{tenant}-tenant1" + if verbose: + print("Trying to access minio credentials from:") + print(vault) + d = get_env_variables_from_file(vault) + + # Select only the keys that we want, also checking that they exist at all + key_map = { + "MINIO_URL": "url", + "MINIO_ACCESS_KEY": "access_key", + "MINIO_SECRET_KEY": "secret_key", + } + minio_credentials = {} + for k in key_map: + try: + minio_credentials[key_map[k]] = d[k] + except KeyError: + raise KeyError(f"Cannot find minio credential {k} in vault file") + + if strip_http: + # Get rid of http:// in minio URL + minio_credentials["url"] = re.sub(r'^https?://', + "", + minio_credentials["url"], + ) + + return minio_credentials + + +def create_bucket_if_missing(minio_obj, bucket): + if not minio_obj.bucket_exists(bucket): + minio_obj.make_bucket(bucket) + print(f"Created bucket {bucket}") + + +def copy_to_minio(minio_url, bucket, access_key, secret_key, sourcefile, + destination): + from minio import Minio + + # Store results to minio + s3 = Minio( + minio_url, + access_key=access_key, + secret_key=secret_key, + secure=False, + region="us-west-1", + ) + + # Create bucket if needed + create_bucket_if_missing(s3, bucket) + + # Put file into bucket + s3.fput_object(bucket, destination, sourcefile) + + +def minio_find_files_matching_pattern(minio_url, bucket, access_key, + secret_key, pattern, prefix='', + recursive=True): + """ + Returns all files in a minio location that match the given pattern + + This function is glob-like in idea, but uses regex patterns instead + of glob patterns. 
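+
+    Args:
+        minio_url (str): Minio endpoint, without the http(s):// prefix
+        bucket (str): Name of the bucket to search
+        access_key (str): Minio access key
+        secret_key (str): Minio secret key
+        pattern (str): Regex pattern that returned object names must match
+        prefix (str): Prefix within the bucket to limit the search to
+        recursive (bool): Intended to control recursive listing (the call below
+            currently always lists recursively)
+
+    Returns:
+        (list): Object names that match pattern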
+ """ + import re + from minio import Minio + pattern = re.compile(pattern) + + s3 = Minio( + minio_url, + access_key=access_key, + secret_key=secret_key, + secure=False, + region="us-west-1", + ) + + # Get everything in bucket/prefix + objs = s3.list_objects(bucket, prefix=prefix, recursive=True) + + # Discard directories + filepaths = [obj.object_name for obj in objs if not obj.is_dir] + + # Select only those that fit the pattern + matching = [filepath for filepath in filepaths if pattern.match(filepath)] + + return matching From e3442a0e8173ec6c054381f2e08a5a23916576b5 Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Tue, 27 Oct 2020 20:37:00 +0000 Subject: [PATCH 2/5] feat: Update component and pipeline to access openaddresses --- .../components/openaddresses_get_data.yaml | 2 +- .../pipeline/get_openaddresses_data.ipynb | 180 +++--------------- 2 files changed, 24 insertions(+), 158 deletions(-) diff --git a/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml b/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml index 2fb814a..cfde79b 100644 --- a/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml +++ b/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml @@ -6,7 +6,7 @@ outputs: - {name: data, description: 'All data downloaded from OpenAddresses call'} implementation: container: - image: k8scc01covidacr.azurecr.io/daaas-openaddresses-batch-machine + image: k8scc01covidacr.azurecr.io/daaas-openaddresses-batch-machine:latest command: - sh - -ex diff --git a/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb b/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb index 9240385..634533b 100644 --- a/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb +++ b/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb @@ -13,21 +13,13 @@ "\n", "Usage: \n", "\n", - "* Edit the below settings" + "* Edit the below settings and run notebook" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-21T17:59:49.579078Z", - "iopub.status.busy": "2020-10-21T17:59:49.578558Z", - "iopub.status.idle": "2020-10-21T17:59:49.583721Z", - "shell.execute_reply": "2020-10-21T17:59:49.583075Z", - "shell.execute_reply.started": "2020-10-21T17:59:49.579031Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -46,22 +38,15 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-21T17:59:47.088665Z", - "iopub.status.busy": "2020-10-21T17:59:47.088273Z", - "iopub.status.idle": "2020-10-21T17:59:47.093738Z", - "shell.execute_reply": "2020-10-21T17:59:47.092523Z", - "shell.execute_reply.started": "2020-10-21T17:59:47.088621Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ + "minio_bucket_name = 'FIRSTNAME-LASTNAME'\n", "json_source_file = \"myfile.json\"\n", "minio_tenant = 'minimal'\n", - "openaddresses_args = \"--layer addresses --layersource city\",\n", - "minio_output_uri = 'MINIO-BUCKET-NAME/path/to/storage/location'," + "openaddresses_args = \"--layer addresses --layersource city\"\n", + "minio_output_uri = f'{minio_bucket_name}/path/to/storage/location'" ] }, { @@ -77,42 +62,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-20T18:23:07.972789Z", - "iopub.status.busy": 
"2020-10-20T18:23:07.972583Z", - "iopub.status.idle": "2020-10-20T18:23:08.043174Z", - "shell.execute_reply": "2020-10-20T18:23:08.042300Z", - "shell.execute_reply.started": "2020-10-20T18:23:07.972764Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], - "source": [ - "# json_source_file = \"demo_calgary.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-20T18:23:08.044634Z", - "iopub.status.busy": "2020-10-20T18:23:08.044373Z", - "iopub.status.idle": "2020-10-20T18:23:08.055601Z", - "shell.execute_reply": "2020-10-20T18:23:08.054867Z", - "shell.execute_reply.started": "2020-10-20T18:23:08.044603Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting demo_calgary.json\n" - ] - } - ], "source": [ "# %%writefile $json_source_file\n", "# {\n", @@ -179,16 +131,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-20T18:23:07.888183Z", - "iopub.status.busy": "2020-10-20T18:23:07.887892Z", - "iopub.status.idle": "2020-10-20T18:23:07.963361Z", - "shell.execute_reply": "2020-10-20T18:23:07.962600Z", - "shell.execute_reply.started": "2020-10-20T18:23:07.888152Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "openaddresses_get_op = load_component_from_file(\n", @@ -201,16 +145,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-20T18:23:07.964790Z", - "iopub.status.busy": "2020-10-20T18:23:07.964546Z", - "iopub.status.idle": "2020-10-20T18:23:07.971652Z", - "shell.execute_reply": "2020-10-20T18:23:07.970994Z", - "shell.execute_reply.started": "2020-10-20T18:23:07.964763Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", @@ -261,29 +197,9 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-21T17:57:25.511835Z", - "iopub.status.busy": "2020-10-21T17:57:25.511436Z", - "iopub.status.idle": "2020-10-21T17:57:25.589954Z", - "shell.execute_reply": "2020-10-21T17:57:25.589133Z", - "shell.execute_reply.started": "2020-10-21T17:57:25.511793Z" - } - }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'json_source_file' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjson_source_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfin\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0msource_json\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfin\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'json_source_file' is not defined" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "with open(json_source_file, 'r') as fin:\n", " source_json = json.load(fin)" @@ -306,26 +222,9 @@ }, { 
"cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-20T18:23:08.072695Z", - "iopub.status.busy": "2020-10-20T18:23:08.072480Z", - "iopub.status.idle": "2020-10-20T18:23:08.082385Z", - "shell.execute_reply": "2020-10-20T18:23:08.081808Z", - "shell.execute_reply.started": "2020-10-20T18:23:08.072670Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trying to access minio credentials from:\n", - "/vault/secrets/minio-minimal-tenant1\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Get minio credentials using a helper\n", "from utilities import get_minio_credentials\n", @@ -354,42 +253,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2020-10-20T18:23:08.083546Z", - "iopub.status.busy": "2020-10-20T18:23:08.083355Z", - "iopub.status.idle": "2020-10-20T18:23:08.272886Z", - "shell.execute_reply": "2020-10-20T18:23:08.272147Z", - "shell.execute_reply.started": "2020-10-20T18:23:08.083523Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "Experiment link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Run link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "pipeline_run = kfp.Client().create_run_from_pipeline_func(\n", " pipeline,\n", From 0e4c450d670872eb3d449a4bad303ad3ca10bd64 Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Tue, 27 Oct 2020 16:46:31 -0400 Subject: [PATCH 3/5] fix: update paths in openaddresses CI --- .github/workflows/build_openaddresses.yml | 2 +- .github/workflows/publish_openaddresses.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_openaddresses.yml b/.github/workflows/build_openaddresses.yml index 9bbcc54..d2f3c26 100644 --- a/.github/workflows/build_openaddresses.yml +++ b/.github/workflows/build_openaddresses.yml @@ -40,7 +40,7 @@ jobs: df -h - run: | - docker build -f ./openaddresses-batch-machine/openaddresses-batch-machine/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/openaddresses-batch-machine + docker build -f ./openaddresses-batch-machine/container/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/container docker tag ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest # Scan image for vulnerabilities diff --git a/.github/workflows/publish_openaddresses.yml b/.github/workflows/publish_openaddresses.yml index 6e550eb..94f4078 100644 --- a/.github/workflows/publish_openaddresses.yml +++ b/.github/workflows/publish_openaddresses.yml @@ -44,7 +44,7 @@ jobs: # Container build and push to a Azure Container registry (ACR) - run: | - docker build -f ./openaddresses-batch-machine/openaddresses-batch-machine/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/openaddresses-batch-machine + docker build -f ./openaddresses-batch-machine/container/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha 
}} ./openaddresses-batch-machine/container
         docker tag ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest
         docker push ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }}
         docker push ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest

From 300404cdbfe9e0f93d7f428c46690b665fb4fc7f Mon Sep 17 00:00:00 2001
From: Andrew Scribner
Date: Tue, 27 Oct 2020 16:53:39 -0400
Subject: [PATCH 4/5] fix: Remove temp debugging file

---
 openaddresses-batch-machine/container/Dockerfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/openaddresses-batch-machine/container/Dockerfile b/openaddresses-batch-machine/container/Dockerfile
index 154a055..b90c463 100644
--- a/openaddresses-batch-machine/container/Dockerfile
+++ b/openaddresses-batch-machine/container/Dockerfile
@@ -23,6 +23,4 @@ RUN addgroup appgroup && \
     adduser -S -G appgroup appuser
 USER appuser
 
-COPY test.json test.json
-
 CMD python3 ${BATCH_MACHINE_PATH}/test.py

From 0b1a677c11e3eaac9ff823e61c8fd64d47e445ba Mon Sep 17 00:00:00 2001
From: Andrew Scribner
Date: Tue, 15 Dec 2020 15:13:22 -0500
Subject: [PATCH 5/5] Non-functional change to trigger CI

---
 .github/workflows/build_openaddresses.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/build_openaddresses.yml b/.github/workflows/build_openaddresses.yml
index d2f3c26..feecdf2 100644
--- a/.github/workflows/build_openaddresses.yml
+++ b/.github/workflows/build_openaddresses.yml
@@ -49,3 +49,5 @@ jobs:
         image-name: ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }}
         severity-threshold: CRITICAL
         run-quality-checks: false
+
+# Insignificant change to trigger a build. Delete this line if found