diff --git a/.dockerignore b/.dockerignore
index 4dce722dae3..0f114c15005 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -9,7 +9,6 @@ hail/.bloop/
 hail/.gradle/
 hail/.idea/
 hail/.pytest_cache/
-.git/
 hail/.ensime.cache.d/
 hail/.ensime_cache.d/
 hail/.ensime_cache/
diff --git a/.github/workflows/prod_deploy.yaml b/.github/workflows/prod_deploy.yaml
new file mode 100644
index 00000000000..4fc2fc62fbc
--- /dev/null
+++ b/.github/workflows/prod_deploy.yaml
@@ -0,0 +1,43 @@
+name: prod-deploy
+on:
+  push:
+    branches:
+      - main
+jobs:
+  invoke-prod-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: deploy to GCP
+        continue-on-error: true
+        run: |
+          DEPLOY_BATCH_URL_GCP=$(curl --fail --silent --show-error -X POST \
+            -H "Authorization: Bearer ${{ secrets.CI_TOKEN }}" \
+            -H "Content-Type: application/json" \
+            -d '{"steps": ["deploy_auth", "deploy_batch", "deploy_ci", "deploy_hailgenetics_image", "deploy_wheel", "upload_query_jar"], "sha": "${{ github.sha }}"}' \
+            https://ci.hail.populationgenomics.org.au/api/v1alpha/prod_deploy || echo "failed")
+          echo DEPLOY_BATCH_URL_GCP="$DEPLOY_BATCH_URL_GCP" >> $GITHUB_ENV
+
+      - name: deploy to Azure
+        continue-on-error: true
+        run: |
+          DEPLOY_BATCH_URL_AZURE=$(curl --fail --silent --show-error -X POST \
+            -H "Authorization: Bearer ${{ secrets.CI_TOKEN_AZURE }}" \
+            -H "Content-Type: application/json" \
+            -d '{"steps": ["deploy_auth", "deploy_batch", "deploy_ci", "upload_query_jar"], "sha": "${{ github.sha }}"}' \
+            https://ci.azhail.populationgenomics.org.au/api/v1alpha/prod_deploy || echo "failed")
+          echo DEPLOY_BATCH_URL_AZURE="$DEPLOY_BATCH_URL_AZURE" >> $GITHUB_ENV
+
+      - name: post to Slack
+        run: |
+          SLACK_MSG="Deploying Hail Batch:\n*GCP:* $DEPLOY_BATCH_URL_GCP\n*Azure:* $DEPLOY_BATCH_URL_AZURE"
+          curl --fail --silent --show-error -X POST \
+            -H "Authorization: Bearer ${{ secrets.SLACK_BOT_TOKEN }}" \
+            -H "Content-Type: application/json" \
+            -d "{\"channel\": \"production-announcements\", \"text\": \"$SLACK_MSG\"}" \
+            https://slack.com/api/chat.postMessage
+
+      - name: check if any deploy failed
+        run: |
+          if [ "$DEPLOY_BATCH_URL_GCP" == "failed" ] || [ "$DEPLOY_BATCH_URL_AZURE" == "failed" ]; then
+            exit 1
+          fi
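Both deploy steps share one pattern: POST to the CI service's `prod_deploy` endpoint, capture the returned deploy-batch URL, and substitute the sentinel string `failed` on any curl error, so Slack is still notified before the final step fails the run. For reference, a rough Python equivalent of the GCP call — the URL and payload are copied from the step above, while `CI_TOKEN` is a hypothetical environment variable standing in for the repository secret:

```python
# Sketch of the "deploy to GCP" step; CI_TOKEN here is a hypothetical
# environment variable standing in for the secrets.CI_TOKEN secret.
import os

import requests

resp = requests.post(
    'https://ci.hail.populationgenomics.org.au/api/v1alpha/prod_deploy',
    headers={'Authorization': f'Bearer {os.environ["CI_TOKEN"]}'},
    json={
        'steps': [
            'deploy_auth', 'deploy_batch', 'deploy_ci',
            'deploy_hailgenetics_image', 'deploy_wheel', 'upload_query_jar',
        ],
        'sha': '0123456789abcdef',  # the workflow passes ${{ github.sha }}
    },
    timeout=60,
)
resp.raise_for_status()  # the workflow maps curl failure to the string "failed"
deploy_batch_url = resp.text  # exported as DEPLOY_BATCH_URL_GCP via $GITHUB_ENV
```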
diff --git a/.gitignore b/.gitignore
index 108a935e55f..5bb09fbcca5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,9 @@ node_modules
 GPATH
 GRTAGS
 GTAGS
+*.dylib
+*/hail.jar
+infra/.terraform.lock.hcl
 hail/python/hail/docs/experimental/hail.experimental.DB.rst
 hail/python/hailtop/batch/docs/api/
 hail/upload-qob-jar
@@ -46,4 +49,5 @@ wheel-container.tar
 hail/python/hail/backend/extra_classpath
 hail/python/hail/backend/hail.jar
 hail/install-editable
+_/
 .helix
diff --git a/amundsen/Makefile b/amundsen/Makefile
new file mode 100644
index 00000000000..f1adad9f537
--- /dev/null
+++ b/amundsen/Makefile
@@ -0,0 +1,12 @@
+include ../config.mk
+
+PYTHON := PYTHONPATH=$${PYTHONPATH:+$${PYTHONPATH}:}$(EXTRA_PYTHONPATH) python3
+
+JINJA_ENVIRONMENT = '{"code":{"sha":"$(shell git rev-parse --short=12 HEAD)"},"deploy":$(DEPLOY),"default_ns":{"name":"$(NAMESPACE)"},"global":{"docker_prefix":"$(DOCKER_PREFIX)","domain":"$(DOMAIN)","k8s_server_url":"$(KUBERNETES_SERVER_URL)"},"scope":"$(SCOPE)"}'
+
+.PHONY: deploy
+deploy:
+	! [ -z $(NAMESPACE) ]  # call this like: make deploy NAMESPACE=default
+	E=$(JINJA_ENVIRONMENT) && \
+		python3 ../ci/jinja2_render.py $$E deployment.yaml deployment.yaml.out
+	kubectl -n $(NAMESPACE) apply -f deployment.yaml.out
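The `JINJA_ENVIRONMENT` one-liner is easier to read expanded. It is the JSON context handed to `ci/jinja2_render.py`; a sketch of its shape, with illustrative values (the real ones come from `config.mk` and git):

```python
# Expanded shape of $(JINJA_ENVIRONMENT); values are illustrative.
jinja_environment = {
    'code': {'sha': '0123456789ab'},           # git rev-parse --short=12 HEAD
    'deploy': True,                            # $(DEPLOY)
    'default_ns': {'name': 'default'},         # $(NAMESPACE)
    'global': {
        'docker_prefix': 'gcr.io/my-project',  # $(DOCKER_PREFIX)
        'domain': 'hail.is',                   # $(DOMAIN)
        'k8s_server_url': 'https://1.2.3.4',   # $(KUBERNETES_SERVER_URL)
    },
    'scope': 'deploy',                         # $(SCOPE)
}
```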
diff --git a/amundsen/deployment.yaml b/amundsen/deployment.yaml
new file mode 100644
index 00000000000..8ee78b4e7e1
--- /dev/null
+++ b/amundsen/deployment.yaml
@@ -0,0 +1,81 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: amundsen-frontend
+  labels:
+    app: amundsen-frontend
+    hail.is/sha: "{{ code.sha }}"
+spec:
+  selector:
+    matchLabels:
+      app: amundsen-frontend
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: amundsen-frontend
+        hail.is/sha: "{{ code.sha }}"
+    spec:
+{% if deploy %}
+      priorityClassName: production
+      nodeSelector:
+        preemptible: "false"
+{% else %}
+      nodeSelector:
+        preemptible: "true"
+      tolerations:
+        - key: preemptible
+          value: "true"
+{% endif %}
+      containers:
+        - name: amundsen-frontend
+          image: {{ global.docker_prefix }}/amundsendev/amundsen-frontend:2.3.0
+          imagePullPolicy: Always
+          resources:
+            requests:
+              cpu: "20m"
+              memory: "20M"
+            limits:
+              cpu: "1"
+              memory: "1G"
+          ports:
+            - containerPort: 5000
+          env:
+{% if deploy %}
+            - name: FRONTEND_BASE
+              value: https://amundsen-frontend.hail.is
+            - name: SEARCHSERVICE_BASE
+              value: https://amundsen-search.hail.is
+            - name: METADATASERVICE_BASE
+              value: https://amundsen-metadata.hail.is
+{% else %}
+            - name: FRONTEND_BASE
+              value: https://internal.hail.is/{{ default_ns.name }}/amundsen-frontend
+            - name: SEARCHSERVICE_BASE
+              value: https://internal.hail.is/{{ default_ns.name }}/amundsen-search
+            - name: METADATASERVICE_BASE
+              value: https://internal.hail.is/{{ default_ns.name }}/amundsen-metadata
+{% endif %}
+            - name: LONG_RANDOM_STRING
+              value: 4532y7y2389faehuwfteyw8704y329
+          command: ["gunicorn"]
+          args: ['-w', '4', '--bind', ':5000', 'amundsen_application.wsgi']
+          readinessProbe:
+            httpGet:
+              path: "/healthcheck"
+              port: 5000
+            initialDelaySeconds: 10
+            periodSeconds: 60
+            timeoutSeconds: 1
+            successThreshold: 1
+            failureThreshold: 5
+          livenessProbe:
+            httpGet:
+              path: "/healthcheck"
+              port: 5000
+            initialDelaySeconds: 10
+            periodSeconds: 60
+            timeoutSeconds: 1
+            successThreshold: 1
+            failureThreshold: 5
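The manifest is a Jinja2 template over that context: `deploy` selects production scheduling and the public `amundsen-*.hail.is` URLs, while any other namespace gets preemptible nodes and `internal.hail.is/<namespace>/` paths. A minimal sketch of the render step, assuming `ci/jinja2_render.py` is a thin wrapper that takes the JSON context, an input file, and an output file (its source isn't part of this diff):

```python
# Approximation of `python3 ../ci/jinja2_render.py $$E deployment.yaml
# deployment.yaml.out`: render a template file with a JSON context.
import json
import sys

import jinja2

context_json, in_path, out_path = sys.argv[1], sys.argv[2], sys.argv[3]
context = json.loads(context_json)

with open(in_path) as f:
    template = jinja2.Template(f.read(), undefined=jinja2.StrictUndefined)

with open(out_path, 'w') as f:
    f.write(template.render(**context))
```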
diff --git a/auth/auth/driver/driver.py b/auth/auth/driver/driver.py
index f7f99917b68..5388b657d8a 100644
--- a/auth/auth/driver/driver.py
+++ b/auth/auth/driver/driver.py
@@ -442,7 +442,9 @@ async def _create_user(app, user, skip_trial_bp, cleanup):
         raise DatabaseConflictError
 
 
-async def create_user(app, user, skip_trial_bp=False):
+# 2023-11-16 mfranklin: disable trial bp because there's an auth problem
+# https://hail.zulipchat.com/#narrow/stream/300487-Hail-Batch-Dev/topic/Issue.20creating.20users/near/401890787
+async def create_user(app, user, skip_trial_bp=True):
     cleanup: List[Callable[[], Awaitable[None]]] = []
     try:
         await _create_user(app, user, skip_trial_bp, cleanup)
diff --git a/batch/batch/batch.py b/batch/batch/batch.py
index fdffd053239..a2da5ba685b 100644
--- a/batch/batch/batch.py
+++ b/batch/batch/batch.py
@@ -48,8 +48,9 @@ def batch_record_to_dict(record: Dict[str, Any]) -> Dict[str, Any]:
         duration_ms = None
         duration = None
 
-    if record['cost_breakdown'] is not None:
-        record['cost_breakdown'] = cost_breakdown_to_dict(json.loads(record['cost_breakdown']))
+    cost_breakdown = record.get('cost_breakdown')
+    if cost_breakdown is not None:
+        cost_breakdown = cost_breakdown_to_dict(json.loads(cost_breakdown))
 
     batch_response = {
         'id': record['id'],
@@ -70,8 +71,8 @@ def batch_record_to_dict(record: Dict[str, Any]) -> Dict[str, Any]:
         'duration_ms': duration_ms,
         'duration': duration,
         'msec_mcpu': record['msec_mcpu'],
-        'cost': coalesce(record['cost'], 0),
-        'cost_breakdown': record['cost_breakdown'],
+        'cost': coalesce(record.get('cost'), 0),
+        'cost_breakdown': cost_breakdown,
     }
 
     attributes = json.loads(record['attributes'])
@@ -138,8 +139,9 @@ def job_record_to_dict(record: Dict[str, Any], name: Optional[str]) -> JobListEntryV1Alpha:
         exit_code = None
         duration = None
 
-    if record['cost_breakdown'] is not None:
-        record['cost_breakdown'] = cost_breakdown_to_dict(json.loads(record['cost_breakdown']))
+    cost_breakdown = record.get('cost_breakdown')
+    if cost_breakdown is not None:
+        cost_breakdown = cost_breakdown_to_dict(json.loads(cost_breakdown))
 
     return cast(
         JobListEntryV1Alpha,
@@ -152,9 +154,9 @@ def job_record_to_dict(record: Dict[str, Any], name: Optional[str]) -> JobListEntryV1Alpha:
             'state': record['state'],
             'exit_code': exit_code,
             'duration': duration,
-            'cost': coalesce(record['cost'], 0),
+            'cost': coalesce(record.get('cost'), 0),
             'msec_mcpu': record['msec_mcpu'],
-            'cost_breakdown': record['cost_breakdown'],
+            'cost_breakdown': cost_breakdown,
             'always_run': bool(record['always_run']),
             'display_state': None,
         },
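The `batch.py` change is behavioural, not cosmetic: the old code decoded `cost_breakdown` by writing back into the row dict, so a record passed through the function twice — or reused by the caller afterwards — would feed an already-decoded value to `json.loads`. Using a local plus `record.get(...)` also tolerates queries that don't select the cost columns. A toy reproduction of the old failure mode:

```python
import json


def to_dict_old(record):
    # Old style: decode by mutating the row in place.
    if record['cost_breakdown'] is not None:
        record['cost_breakdown'] = json.loads(record['cost_breakdown'])
    return {'cost_breakdown': record['cost_breakdown']}


row = {'cost_breakdown': '[{"category": "compute", "cost": 1.5}]'}
to_dict_old(row)  # first pass: decodes the JSON string
try:
    to_dict_old(row)  # second pass: the value is already a list
except TypeError as e:
    print(e)  # the JSON object must be str, bytes or bytearray, not list
```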
diff --git a/batch/batch/cloud/gcp/driver/create_instance.py b/batch/batch/cloud/gcp/driver/create_instance.py
index 014ad7fdf35..502bae02306 100644
--- a/batch/batch/cloud/gcp/driver/create_instance.py
+++ b/batch/batch/cloud/gcp/driver/create_instance.py
@@ -311,6 +311,7 @@ def scheduling() -> dict:
 -e DOCKER_PREFIX=$DOCKER_PREFIX \
 -e DOCKER_ROOT_IMAGE=$DOCKER_ROOT_IMAGE \
 -e INSTANCE_CONFIG=$INSTANCE_CONFIG \
+-e DOCKER_PREFIX=$DOCKER_PREFIX \
 -e MAX_IDLE_TIME_MSECS=$MAX_IDLE_TIME_MSECS \
 -e BATCH_WORKER_IMAGE=$BATCH_WORKER_IMAGE \
 -e BATCH_WORKER_IMAGE_ID=$BATCH_WORKER_IMAGE_ID \
diff --git a/batch/batch/driver/instance_collection/pool.py b/batch/batch/driver/instance_collection/pool.py
index 39ff5046b60..42e2469e882 100644
--- a/batch/batch/driver/instance_collection/pool.py
+++ b/batch/batch/driver/instance_collection/pool.py
@@ -139,6 +139,7 @@ def __init__(
         self.worker_max_idle_time_secs = config.worker_max_idle_time_secs
         self.job_queue_scheduling_window_secs = config.job_queue_scheduling_window_secs
         self.min_instances = config.min_instances
+        self.label = config.label
 
         self.all_supported_regions = self.inst_coll_manager.regions
 
@@ -167,6 +168,7 @@ def config(self):
             'standing_worker_max_idle_time_secs': self.standing_worker_max_idle_time_secs,
             'worker_max_idle_time_secs': self.worker_max_idle_time_secs,
             'job_queue_scheduling_window_secs': self.job_queue_scheduling_window_secs,
+            'label': self.label,
         }
 
     def configure(self, pool_config: PoolConfig):
@@ -190,6 +192,7 @@ def configure(self, pool_config: PoolConfig):
         self.standing_worker_max_idle_time_secs = pool_config.standing_worker_max_idle_time_secs
         self.worker_max_idle_time_secs = pool_config.worker_max_idle_time_secs
         self.job_queue_scheduling_window_secs = pool_config.job_queue_scheduling_window_secs
+        self.label = pool_config.label
 
     def adjust_for_remove_instance(self, instance):
         super().adjust_for_remove_instance(instance)
diff --git a/batch/batch/driver/main.py b/batch/batch/driver/main.py
index ac6f99d756e..9380a4cd149 100644
--- a/batch/batch/driver/main.py
+++ b/batch/batch/driver/main.py
@@ -663,6 +663,8 @@ async def pool_config_update(request: web.Request, _) -> NoReturn:
             f'a non-negative integer less than or equal to max_live_instances {max_live_instances}',
         )
 
+    label = post['label']
+
     possible_worker_cores = []
     for cores in possible_cores_from_worker_type(pool.cloud, worker_type):
         if not worker_local_ssd_data_disk:
@@ -758,6 +760,7 @@ async def pool_config_update(request: web.Request, _) -> NoReturn:
         worker_max_idle_time_secs=worker_max_idle_time_secs,
         standing_worker_max_idle_time_secs=standing_worker_max_idle_time_secs,
         job_queue_scheduling_window_secs=job_queue_scheduling_window_secs,
+        label=label,
    )
 
     current_client_pool_config = json.loads(str(post['_pool_config_json']))
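`pool_config_update` reads the new field with `post['label']`, consistent with `post['_pool_config_json']` below, and threads it into the pool's configuration; the matching `label` attribute on `PoolConfig` is presumably added in a part of the PR not shown here. If the form could ever omit the field, a defensive variant (hypothetical — not what this diff does) would be:

```python
# Assuming `post = await request.post()` (an aiohttp MultiDict), .get()
# is available; empty submissions collapse to None.
label = str(post.get('label', '')).strip() or None
```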
diff --git a/batch/batch/driver/templates/pool.html b/batch/batch/driver/templates/pool.html
index 6a62fb827e7..b6a720dbb46 100644
--- a/batch/batch/driver/templates/pool.html
+++ b/batch/batch/driver/templates/pool.html
@@ -29,6 +29,7 @@
 Configuration
 
 Worker max idle time in seconds:
 Standing worker max idle time in seconds:
 Job queue scheduling window in seconds:
+Label:
 