From a2ab84ae9af01b8ce178bc416553d68a6b2c28d4 Mon Sep 17 00:00:00 2001
From: Saman Ehsan
Date: Fri, 1 Feb 2019 10:05:42 -0500
Subject: [PATCH] Remove cromwell-metadata docker (#105)

* Remove cromwell-metadata docker

Use the default application credentials of the VM, which are the same
credentials provided in the workflow options, when requesting metadata from
Cromwell, instead of storing a JSON key in a private docker image.

* Use PAPIv2

* Remove cromwell-tools requirement

* Remove use_caas parameter

The parameter was previously used to determine which credentials to use when
sending an API request to Cromwell to retrieve analysis workflow metadata.
Now the submit WDL will only run in Cromwell-as-a-Service (CaaS), so the
parameter is no longer needed.

* Update documentation

* Update pipeline-tools docker image version
---
 README.md                                  |  6 +-
 adapter_pipelines/Optimus/adapter.wdl      |  4 +-
 adapter_pipelines/cellranger/adapter.wdl   |  4 +-
 .../ss2_single_sample/adapter.wdl          |  4 +-
 adapter_pipelines/submit.wdl               | 10 ++--
 adapter_pipelines/submit_stub/submit.wdl   |  1 -
 .../get_analysis_workflow_metadata.py      | 58 +++++++------------
 .../test_get_analysis_workflow_metadata.py | 47 +++------------
 requirements.txt                           |  2 +-
 setup.py                                   |  3 +-
 10 files changed, 41 insertions(+), 98 deletions(-)

diff --git a/README.md b/README.md
index c43e42e1..59af1c8b 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,11 @@
 
 [![Documentation Status](https://readthedocs.org/projects/pipeline-tools/badge/?version=latest)](http://pipeline-tools.readthedocs.io/en/latest/?badge=latest)
 
-This repo contains Python code and pipelines for interacting with the Human Cell Atlas Data Coordination Platform. They are used by the Secondary Analysis Service.
+This repository contains Python code and pipelines for interacting with the Human Cell Atlas Data Coordination Platform (DCP). They are used by the Secondary Analysis Service.
 
-The pipelines wrap analysis pipelines from the Skylab repo and provide some glue to interface with the DCP. The adapter pipelines take bundle ids as inputs, query the Data Storage Service to find the input files needed by the analysis pipelines, then run the analysis pipelines and submit the results to the Ingest Service. This helps us keep the analysis pipelines themselves free of dependencies on the DCP.
+The pipelines wrap analysis pipelines from the Skylab repository and provide some glue to interface with the DCP. The adapter pipelines take bundle ids as inputs, query the Data Storage Service to find the input files needed by the analysis pipelines, then run the analysis pipelines and submit the results to the Ingest Service. This helps us keep the analysis pipelines themselves free of dependencies on the DCP.
+
+Note: The adapter pipelines can only run in Cromwell instances that use SAM for Identity and Access Management (IAM), such as Cromwell-as-a-Service.
 
 ## Run tests
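Note: the commit message above swaps the private cromwell-metadata image, which carried a Cromwell JSON key, for the application default credentials of the worker VM. A minimal sketch of that flow, mirroring the get_auth_headers helper introduced in pipeline_tools/get_analysis_workflow_metadata.py further down in this patch (the scopes and the google-auth calls come from that file; the function name here is illustrative only):

    import google.auth
    import google.auth.transport.requests

    def bearer_headers():
        # Application default credentials: on a Pipelines API v2 worker these are
        # the same credentials Cromwell was given in the workflow options.
        credentials, _project = google.auth.default()
        if not credentials.valid:
            credentials.refresh(google.auth.transport.requests.Request())
        headers = {}
        credentials.apply(headers)  # adds the Authorization: Bearer header
        return headers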
diff --git a/adapter_pipelines/Optimus/adapter.wdl b/adapter_pipelines/Optimus/adapter.wdl
index f334d912..dceb6305 100644
--- a/adapter_pipelines/Optimus/adapter.wdl
+++ b/adapter_pipelines/Optimus/adapter.wdl
@@ -118,7 +118,6 @@ workflow AdapterOptimus {
     Int? retry_timeout
     Int? individual_request_timeout
     String reference_bundle
-    Boolean use_caas
     # Set runtime environment such as "dev" or "staging" or "prod" so submit task could choose proper docker image to use
     String runtime_environment
@@ -127,7 +126,7 @@ workflow AdapterOptimus {
     Int max_cromwell_retries = 0
     Boolean add_md5s = false
-    String pipeline_tools_version = "v0.45.0"
+    String pipeline_tools_version = "v0.46.0"
 
     call GetInputs as prep {
         input:
@@ -214,7 +213,6 @@ workflow AdapterOptimus {
             retry_timeout = retry_timeout,
             individual_request_timeout = individual_request_timeout,
             runtime_environment = runtime_environment,
-            use_caas = use_caas,
             record_http = record_http,
             pipeline_tools_version = pipeline_tools_version,
             add_md5s = add_md5s,
diff --git a/adapter_pipelines/cellranger/adapter.wdl b/adapter_pipelines/cellranger/adapter.wdl
index 380b966f..fbd244a0 100644
--- a/adapter_pipelines/cellranger/adapter.wdl
+++ b/adapter_pipelines/cellranger/adapter.wdl
@@ -142,7 +142,6 @@ workflow Adapter10xCount {
     Int? retry_timeout
     Int? individual_request_timeout
     String reference_bundle
-    Boolean use_caas
     # Set runtime environment such as "dev" or "staging" or "prod" so submit task could choose proper docker image to use
     String runtime_environment
@@ -151,7 +150,7 @@ workflow Adapter10xCount {
     Int max_cromwell_retries = 0
     Boolean add_md5s = false
-    String pipeline_tools_version = "v0.45.0"
+    String pipeline_tools_version = "v0.46.0"
 
     call GetInputs {
         input:
@@ -252,7 +251,6 @@ workflow Adapter10xCount {
             retry_timeout = retry_timeout,
             individual_request_timeout = individual_request_timeout,
             runtime_environment = runtime_environment,
-            use_caas = use_caas,
             record_http = record_http,
             pipeline_tools_version = pipeline_tools_version,
             add_md5s = add_md5s,
diff --git a/adapter_pipelines/ss2_single_sample/adapter.wdl b/adapter_pipelines/ss2_single_sample/adapter.wdl
index 67c0c959..27fb3e24 100644
--- a/adapter_pipelines/ss2_single_sample/adapter.wdl
+++ b/adapter_pipelines/ss2_single_sample/adapter.wdl
@@ -74,7 +74,6 @@ workflow AdapterSmartSeq2SingleCell{
     Int? retry_timeout
     Int? individual_request_timeout
     String reference_bundle
-    Boolean use_caas
     # Set runtime environment such as "dev" or "staging" or "prod" so submit task could choose proper docker image to use
     String runtime_environment
@@ -83,7 +82,7 @@ workflow AdapterSmartSeq2SingleCell{
     Int max_cromwell_retries = 0
     Boolean add_md5s = false
-    String pipeline_tools_version = "v0.45.0"
+    String pipeline_tools_version = "v0.46.0"
 
     call GetInputs as prep {
         input:
@@ -208,7 +207,6 @@ workflow AdapterSmartSeq2SingleCell{
             retry_timeout = retry_timeout,
             individual_request_timeout = individual_request_timeout,
             runtime_environment = runtime_environment,
-            use_caas = use_caas,
             record_http = record_http,
             pipeline_tools_version = pipeline_tools_version,
             add_md5s = add_md5s,
diff --git a/adapter_pipelines/submit.wdl b/adapter_pipelines/submit.wdl
index 1244325a..1ab14097 100644
--- a/adapter_pipelines/submit.wdl
+++ b/adapter_pipelines/submit.wdl
@@ -7,8 +7,8 @@ task get_metadata {
     Float? retry_multiplier
     Int? retry_timeout
     Int? individual_request_timeout
-    Boolean use_caas
     Boolean record_http
+    String pipeline_tools_version
     Int max_retries = 0
 
     command <<<
@@ -25,11 +25,10 @@ task get_metadata {
 
     get-analysis-workflow-metadata \
       --analysis_output_path ${analysis_output_path} \
-      --cromwell_url ${cromwell_url} \
-      --use_caas ${use_caas}
+      --cromwell_url ${cromwell_url}
   >>>
   runtime {
-    docker: (if runtime_environment == "prod" then "gcr.io/hca-dcp-pipelines-prod/cromwell-metadata:" else "gcr.io/broad-dsde-mint-${runtime_environment}/cromwell-metadata:") + "v1.2.0"
+    docker: "quay.io/humancellatlas/secondary-analysis-pipeline-tools:" + pipeline_tools_version
     maxRetries: max_retries
   }
   output {
@@ -251,7 +250,6 @@ workflow submit {
     Float? retry_multiplier
     Int? retry_timeout
     Int? individual_request_timeout
-    Boolean use_caas
     # By default, don't record http requests
     Boolean record_http = false
     String pipeline_tools_version
@@ -267,12 +265,12 @@ workflow submit {
            analysis_output_path = outputs[0],
            runtime_environment = runtime_environment,
            cromwell_url = cromwell_url,
-           use_caas=use_caas,
            record_http = record_http,
            retry_timeout = retry_timeout,
            individual_request_timeout = individual_request_timeout,
            retry_multiplier = retry_multiplier,
            retry_max_interval = retry_max_interval,
+           pipeline_tools_version = pipeline_tools_version,
            max_retries = max_retries
        }
diff --git a/adapter_pipelines/submit_stub/submit.wdl b/adapter_pipelines/submit_stub/submit.wdl
index 237f4470..b920f684 100644
--- a/adapter_pipelines/submit_stub/submit.wdl
+++ b/adapter_pipelines/submit_stub/submit.wdl
@@ -23,7 +23,6 @@ workflow submit {
    String analysis_file_version
    String method
    String runtime_environment
-   Boolean use_caas
    Int? retry_max_interval
    Float? retry_multiplier
    Int? retry_timeout
diff --git a/pipeline_tools/get_analysis_workflow_metadata.py b/pipeline_tools/get_analysis_workflow_metadata.py
index 684c1f55..701c0612 100644
--- a/pipeline_tools/get_analysis_workflow_metadata.py
+++ b/pipeline_tools/get_analysis_workflow_metadata.py
@@ -1,8 +1,7 @@
 import argparse
 import json
-from cromwell_tools import cromwell_tools
-from requests.auth import HTTPBasicAuth
-
+import google.auth
+import google.auth.transport.requests
 from pipeline_tools.http_requests import HttpRequests
 
 
@@ -25,34 +24,33 @@ def get_analysis_workflow_id(analysis_output_path):
     return workflow_id
 
 
-def get_auth(credentials_file=None):
-    """Parse cromwell username and password from credentials file.
-
-    Args:
-        credentials_file (str): Path to the file containing cromwell authentication credentials.
+def get_auth_headers():
+    """Get a bearer token from the default Google account credentials on the machine that executes
+    this function. The credentials must have the scopes "https://www.googleapis.com/auth/userinfo.email"
+    and "https://www.googleapis.com/auth/userinfo.profile", which Cromwell will add automatically if
+    it is configured to use the Pipelines API v2 backend.
 
     Returns:
-        requests.auth.HTTPBasicAuth: An object to be used for cromwell requests.
+        headers (dict): authorization header containing bearer token, e.g. {'Authorization': 'bearer 12345'}
     """
-    credentials_file = credentials_file or '/cromwell-metadata/cromwell_credentials.txt'
-    with open(credentials_file) as f:
-        credentials = f.read().split()
-    user = credentials[0]
-    password = credentials[1]
-    return HTTPBasicAuth(user, password)
+    credentials, project = google.auth.default()
+    if not credentials.valid:
+        credentials.refresh(google.auth.transport.requests.Request())
+    headers = {}
+    credentials.apply(headers)
+    return headers
 
 
-def get_metadata(cromwell_url, workflow_id, http_requests, use_caas=False, caas_key_file=None):
-    """Get metadata for analysis workflow from Cromwell and write it to a JSON file. Retry the request with
-    exponentially increasing wait times if there is an error.
+def get_metadata(cromwell_url, workflow_id, http_requests):
+    """Get metadata for analysis workflow from Cromwell and write it to a JSON file. This is only
+    compatible with instances of Cromwell that use SAM for Identity and Access Management (IAM), such
+    as Cromwell-as-a-Service.
 
     Args:
         cromwell_url (str): Url to the cromwell environment the workflow was run in.
         workflow_id (str): The analysis workflow id.
         http_requests: `http_requests.HttpRequests` instance, a wrapper around requests provides better
            retry and logging.
-        use_caas (bool): Whether or not to use Cromwell-as-a-Service.
-        caas_key_file (str): Path to CaaS service account JSON key file.
 
     Raises:
         requests.HTTPError: For 4xx errors or 5xx errors beyond the timeout
@@ -61,17 +59,9 @@ def get_metadata(cromwell_url, workflow_id, http_requests, use_caas=False, caas_
     def log_before(workflow_id):
         print('Getting metadata for workflow {}'.format(workflow_id))
 
-    cromwell_url = cromwell_url
-
-    if use_caas:
-        json_credentials = caas_key_file or "/cromwell-metadata/caas_key.json"
-        headers = cromwell_tools.generate_auth_header_from_key_file(json_credentials)
-        auth = None
-    else:
-        headers = None
-        auth = get_auth()
+    headers = get_auth_headers()
     url = '{0}/{1}/metadata?expandSubWorkflows=true'.format(cromwell_url, workflow_id)
-    response = http_requests.get(url, auth=auth, headers=headers, before=log_before(workflow_id))
+    response = http_requests.get(url, headers=headers, before=log_before(workflow_id))
     with open('metadata.json', 'w') as f:
         json.dump(response.json(), f, indent=2)
 
 
@@ -80,21 +70,15 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--analysis_output_path', required=True)
     parser.add_argument('--cromwell_url', required=True)
-    parser.add_argument('--use_caas', required=True)
-    parser.add_argument('--caas_key_file', required=False, default=None)
     args = parser.parse_args()
 
-    use_caas = True if args.use_caas.lower() == 'true' else False
-
     print('Using analysis output path: {0}'.format(args.analysis_output_path))
 
     # Get the workflow id and metadata, write them to files
     workflow_id = get_analysis_workflow_id(analysis_output_path=args.analysis_output_path)
     get_metadata(cromwell_url=args.cromwell_url,
                  workflow_id=workflow_id,
-                 http_requests=HttpRequests(),
-                 use_caas=use_caas,
-                 caas_key_file=args.caas_key_file)
+                 http_requests=HttpRequests())
 
 
 if __name__ == '__main__':
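Note: the rewritten get_metadata above only changes how the request is authorized; the metadata fetch itself is unchanged. A rough standalone equivalent using plain requests (the module itself goes through its HttpRequests wrapper to get retries and logging; the URL shown is just the format the module builds, and the arguments are placeholders):

    import json
    import requests

    def fetch_metadata(cromwell_url, workflow_id, headers):
        # cromwell_url is the Cromwell/CaaS API root passed to the submit workflow;
        # headers are the bearer headers from the application default credentials.
        url = '{0}/{1}/metadata?expandSubWorkflows=true'.format(cromwell_url, workflow_id)
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        with open('metadata.json', 'w') as f:
            json.dump(response.json(), f, indent=2)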
diff --git a/pipeline_tools/tests/test_get_analysis_workflow_metadata.py b/pipeline_tools/tests/test_get_analysis_workflow_metadata.py
index e97e063f..0db8fe1f 100644
--- a/pipeline_tools/tests/test_get_analysis_workflow_metadata.py
+++ b/pipeline_tools/tests/test_get_analysis_workflow_metadata.py
@@ -1,7 +1,6 @@
 import os
 import pytest
 import requests
-from requests.auth import HTTPBasicAuth
 from unittest.mock import patch
 
 from pipeline_tools import get_analysis_workflow_metadata
@@ -52,11 +51,7 @@ class Data:
     return Data
 
 
-def mocked_get_auth():
-    return HTTPBasicAuth('user', 'password')
-
-
-def mocked_generate_auth_header_from_key_file(foo_credentials):
+def mocked_get_auth_headers():
     return {'Authorization': 'bearer 12345'}
 
 
@@ -72,32 +67,6 @@ def test_get_analysis_workflow_id(self, test_data, tmpdir):
         assert result == expected
         assert current_file_path.read() == 'analysis_subworkflow_id'
 
-    def test_get_auth(self):
-        credentials_file = '{0}test_credentials.txt'.format(data_dir)
-        auth = get_analysis_workflow_metadata.get_auth(credentials_file)
-        expected_auth = HTTPBasicAuth('fake-user', 'fake-password')
-        assert auth == expected_auth
-
-    def test_get_metadata_success(self, requests_mock, test_data, tmpdir):
-        current_file_path = tmpdir.join('metadata.json')
-
-        def _request_callback(request, context):
-            context.status_code = 200
-            return {
-                'workflowName': 'TestWorkflow'
-            }
-
-        requests_mock.get(test_data.cromwell_metadata_url, json=_request_callback)
-        with patch('pipeline_tools.get_analysis_workflow_metadata.get_auth', side_effect=mocked_get_auth), \
-                tmpdir.as_cwd(), \
-                HttpRequestsManager():
-            get_analysis_workflow_metadata.get_metadata(test_data.base_url,
-                                                        test_data.workflow_id,
-                                                        HttpRequests(),
-                                                        use_caas=False)
-        assert requests_mock.call_count == 1
-        assert current_file_path.read() is not None
-
     def test_get_metadata_using_caas(self, requests_mock, test_data, tmpdir):
         current_file_path = tmpdir.join('metadata.json')
 
@@ -108,12 +77,11 @@ def _request_callback(request, context):
             }
 
         requests_mock.get(test_data.caas_metadata_url, json=_request_callback)
-        with patch('pipeline_tools.get_analysis_workflow_metadata.cromwell_tools.generate_auth_header_from_key_file',
-                   side_effect=mocked_generate_auth_header_from_key_file), tmpdir.as_cwd(), HttpRequestsManager():
+        with patch('pipeline_tools.get_analysis_workflow_metadata.get_auth_headers',
+                   side_effect=mocked_get_auth_headers), tmpdir.as_cwd(), HttpRequestsManager():
             get_analysis_workflow_metadata.get_metadata(test_data.caas_base_url,
                                                         test_data.workflow_id,
-                                                        HttpRequests(),
-                                                        use_caas=True)
+                                                        HttpRequests())
         assert requests_mock.call_count == 1
         assert current_file_path.read() is not None
 
@@ -123,10 +91,9 @@ def _request_callback(request, context):
             return {'status': 'error', 'message': 'Internal Server Error'}
 
         requests_mock.get(test_data.cromwell_metadata_url, json=_request_callback)
-        with patch('pipeline_tools.get_analysis_workflow_metadata.get_auth', side_effect=mocked_get_auth), \
-                pytest.raises(requests.HTTPError), HttpRequestsManager():
+        with patch('pipeline_tools.get_analysis_workflow_metadata.get_auth_headers',
+                   side_effect=mocked_get_auth_headers), pytest.raises(requests.HTTPError), HttpRequestsManager():
             get_analysis_workflow_metadata.get_metadata(test_data.base_url,
                                                         test_data.workflow_id,
-                                                        HttpRequests(),
-                                                        use_caas=False)
+                                                        HttpRequests())
         assert requests_mock.call_count == 3
diff --git a/requirements.txt b/requirements.txt
index 8667c5df..0761dd2e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 requests==2.20.0
+google-auth>=1.6.1,<2
 google-cloud-storage==1.8.0
 tenacity==4.10.0
 PyJWT==1.6.4
-git+git://github.com/broadinstitute/cromwell-tools.git@v0.5.0
 git+git://github.com/HumanCellAtlas/metadata-api@release/1.0b4#egg=hca-metadata-api[dss]
diff --git a/setup.py b/setup.py
index 70b1504c..e1c6235a 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
       license='BSD 3-clause "New" or "Revised" License',
       packages=['pipeline_tools'],
       install_requires=[
-          'cromwell-tools',
+          'google-auth>=1.6.1,<2',
           'google-cloud-storage>=1.8.0,<2',
           'hca>=4.5.0,<5',
           'hca-metadata-api',
@@ -39,7 +39,6 @@
       },
       # FIXME: DEPRECATION: Dependency Links processing has been deprecated and will be removed in a future release.
       dependency_links=[
-          'git+git://github.com/broadinstitute/cromwell-tools.git@v0.5.0#egg=cromwell-tools-1.0.1',
           # FIXME: install hca-metadata-api from PyPI once it is available (shortly)
           # Pin to a specific commit of the hca-metadata-api so we won't be broken by changes to that repo before it's
          # available on PyPI