diff --git a/.dockerignore b/.dockerignore
index 4dce722dae3..0f114c15005 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -9,7 +9,6 @@ hail/.bloop/
hail/.gradle/
hail/.idea/
hail/.pytest_cache/
-.git/
hail/.ensime.cache.d/
hail/.ensime_cache.d/
hail/.ensime_cache/
diff --git a/.github/workflows/prod_deploy.yaml b/.github/workflows/prod_deploy.yaml
new file mode 100644
index 00000000000..4fc2fc62fbc
--- /dev/null
+++ b/.github/workflows/prod_deploy.yaml
@@ -0,0 +1,43 @@
+name: prod-deploy
+on:
+ push:
+ branches:
+ - main
+jobs:
+ invoke-prod-deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - name: deploy to GCP
+ continue-on-error: true
+ run: |
+ DEPLOY_BATCH_URL_GCP=$(curl --fail --silent --show-error -X POST \
+ -H "Authorization: Bearer ${{ secrets.CI_TOKEN }}" \
+ -H "Content-Type:application/json" \
+ -d '{"steps": ["deploy_auth", "deploy_batch", "deploy_ci", "deploy_hailgenetics_image", "deploy_wheel", "upload_query_jar"], "sha": "${{ github.sha }}"}' \
+ https://ci.hail.populationgenomics.org.au/api/v1alpha/prod_deploy || echo "failed")
+ echo DEPLOY_BATCH_URL_GCP="$DEPLOY_BATCH_URL_GCP" >> $GITHUB_ENV
+
+ - name: deploy to Azure
+ continue-on-error: true
+ run: |
+ DEPLOY_BATCH_URL_AZURE=$(curl --fail --silent --show-error -X POST \
+ -H "Authorization: Bearer ${{ secrets.CI_TOKEN_AZURE }}" \
+ -H "Content-Type:application/json" \
+ -d '{"steps": ["deploy_auth", "deploy_batch", "deploy_ci", "upload_query_jar"], "sha": "${{ github.sha }}"}' \
+ https://ci.azhail.populationgenomics.org.au/api/v1alpha/prod_deploy || echo "failed")
+ echo DEPLOY_BATCH_URL_AZURE="$DEPLOY_BATCH_URL_AZURE" >> $GITHUB_ENV
+
+ - name: post to Slack
+ run: |
+ SLACK_MSG="Deploying Hail Batch:\n*GCP:* $DEPLOY_BATCH_URL_GCP\n*Azure:* $DEPLOY_BATCH_URL_AZURE"
+ curl --fail --silent --show-error -X POST \
+ -H "Authorization: Bearer ${{ secrets.SLACK_BOT_TOKEN }}" \
+ -H "Content-type: application/json" \
+ -d "{\"channel\": \"production-announcements\", \"text\": \"$SLACK_MSG\"}" \
+ https://slack.com/api/chat.postMessage
+
+ - name: check if any deploy failed
+ run: |
+ if [ "$DEPLOY_BATCH_URL_GCP" == "failed" ] || [ "$DEPLOY_BATCH_URL_AZURE" == "failed" ]; then
+ exit 1
+ fi
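
For reference, each deploy step above is a single POST to the CI service's prod_deploy endpoint; the rest is plumbing the resulting batch URL into $GITHUB_ENV. A minimal Python sketch of the same request, assuming the token is available as a CI_TOKEN environment variable (the endpoint and payload are copied from the GCP step; the sha is a placeholder):

    import json
    import os
    import urllib.error
    import urllib.request

    # Payload mirrors the GCP step above; the sha is a placeholder.
    payload = {
        "steps": ["deploy_auth", "deploy_batch", "deploy_ci",
                  "deploy_hailgenetics_image", "deploy_wheel", "upload_query_jar"],
        "sha": "0123456789ab",
    }
    req = urllib.request.Request(
        "https://ci.hail.populationgenomics.org.au/api/v1alpha/prod_deploy",
        data=json.dumps(payload).encode(),
        headers={
            "Authorization": f"Bearer {os.environ['CI_TOKEN']}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req) as resp:
            print(resp.read().decode())  # the deploy batch URL on success
    except urllib.error.URLError:
        print("failed")  # mirrors the workflow's `|| echo "failed"` fallback
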
diff --git a/.gitignore b/.gitignore
index 108a935e55f..5bb09fbcca5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,9 @@ node_modules
GPATH
GRTAGS
GTAGS
+*.dylib
+*/hail.jar
+infra/.terraform.lock.hcl
hail/python/hail/docs/experimental/hail.experimental.DB.rst
hail/python/hailtop/batch/docs/api/
hail/upload-qob-jar
@@ -46,4 +49,5 @@ wheel-container.tar
hail/python/hail/backend/extra_classpath
hail/python/hail/backend/hail.jar
hail/install-editable
+_/
.helix
diff --git a/amundsen/Makefile b/amundsen/Makefile
new file mode 100644
index 00000000000..f1adad9f537
--- /dev/null
+++ b/amundsen/Makefile
@@ -0,0 +1,12 @@
+include ../config.mk
+
+PYTHON := PYTHONPATH=$${PYTHONPATH:+$${PYTHONPATH}:}$(EXTRA_PYTHONPATH) python3
+
+JINJA_ENVIRONMENT = '{"code":{"sha":"$(shell git rev-parse --short=12 HEAD)"},"deploy":$(DEPLOY),"default_ns":{"name":"$(NAMESPACE)"},"global":{"docker_prefix":"$(DOCKER_PREFIX)","domain":"$(DOMAIN)","k8s_server_url":"$(KUBERNETES_SERVER_URL)"},"scope":"$(SCOPE)"}'
+
+.PHONY: deploy
+deploy:
+ ! [ -z "$(NAMESPACE)" ] # call this like: make deploy NAMESPACE=default
+ E=$(JINJA_ENVIRONMENT) && \
+ $(PYTHON) ../ci/jinja2_render.py "$$E" deployment.yaml deployment.yaml.out
+ kubectl -n $(NAMESPACE) apply -f deployment.yaml.out
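
The deploy recipe renders deployment.yaml through ci/jinja2_render.py with the JSON environment defined above. A sketch of what that render step amounts to, assuming the script simply parses the JSON argument and feeds it to a Jinja2 template (the real script lives in ci/ and may do more):

    import json
    import sys

    import jinja2

    # Usage: python3 jinja2_render.py '<json env>' <template-in> <rendered-out>
    env_json, template_path, out_path = sys.argv[1:4]
    env = json.loads(env_json)

    with open(template_path) as f:
        template = jinja2.Template(f.read(), undefined=jinja2.StrictUndefined)

    with open(out_path, "w") as f:
        f.write(template.render(**env))

Under that assumption, `{{ code.sha }}`, `{% if deploy %}` and friends in deployment.yaml below are resolved before kubectl ever sees the file.
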
diff --git a/amundsen/deployment.yaml b/amundsen/deployment.yaml
new file mode 100644
index 00000000000..8ee78b4e7e1
--- /dev/null
+++ b/amundsen/deployment.yaml
@@ -0,0 +1,81 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: amundsen-frontend
+ labels:
+ app: amundsen-frontend
+ hail.is/sha: "{{ code.sha }}"
+spec:
+ selector:
+ matchLabels:
+ app: amundsen-frontend
+ replicas: 1
+ template:
+ metadata:
+ labels:
+ app: amundsen-frontend
+ hail.is/sha: "{{ code.sha }}"
+ spec:
+{% if deploy %}
+ priorityClassName: production
+ nodeSelector:
+ preemptible: "false"
+{% else %}
+ nodeSelector:
+ preemptible: "true"
+ tolerations:
+ - key: preemptible
+ value: "true"
+{% endif %}
+ containers:
+ - name: amundsen-frontend
+ image: {{ global.docker_prefix }}/amundsendev/amundsen-frontend:2.3.0
+ imagePullPolicy: Always
+ resources:
+ requests:
+ cpu: "20m"
+ memory: "20M"
+ limits:
+ cpu: "1"
+ memory: "1G"
+ ports:
+ - containerPort: 5000
+ env:
+{% if deploy %}
+ - name: FRONTEND_BASE
+ value: https://amundsen-frontend.hail.is
+ - name: SEARCHSERVICE_BASE
+ value: https://amundsen-search.hail.is
+ - name: METADATASERVICE_BASE
+ value: https://amundsen-metadata.hail.is
+{% else %}
+ - name: FRONTEND_BASE
+ value: https://internal.hail.is/{{ default_ns.name }}/amundsen-frontend
+ - name: SEARCHSERVICE_BASE
+ value: https://internal.hail.is/{{ default_ns.name }}/amundsen-search
+ - name: METADATASERVICE_BASE
+ value: https://internal.hail.is/{{ default_ns.name }}/amundsen-metadata
+{% endif %}
+ - name: LONG_RANDOM_STRING
+ value: 4532y7y2389faehuwfteyw8704y329
+ command: ["gunicorn"]
+ args: ['-w', '4', '--bind', ':5000', 'amundsen_application.wsgi']
+ readinessProbe:
+ httpGet:
+ path: "/healthcheck"
+ port: 5000
+ initialDelaySeconds: 10
+ periodSeconds: 60
+ timeoutSeconds: 1
+ successThreshold: 1
+ failureThreshold: 5
+ livenessProbe:
+ httpGet:
+ path: "/healthcheck"
+ port: 5000
+ initialDelaySeconds: 10
+ periodSeconds: 60
+ timeoutSeconds: 1
+ successThreshold: 1
+ failureThreshold: 5
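
Both probes poll /healthcheck on port 5000; five consecutive failures mark the pod unready (readiness) or restart it (liveness). A quick local sanity check against a port-forwarded pod might look like this (a sketch; the port-forward setup and local port are assumptions):

    import urllib.request

    # Assumes `kubectl port-forward deployment/amundsen-frontend 5000:5000`
    # is running in another terminal.
    with urllib.request.urlopen("http://localhost:5000/healthcheck", timeout=1) as resp:
        assert resp.status == 200, f"healthcheck returned {resp.status}"
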
diff --git a/auth/auth/driver/driver.py b/auth/auth/driver/driver.py
index f7f99917b68..5388b657d8a 100644
--- a/auth/auth/driver/driver.py
+++ b/auth/auth/driver/driver.py
@@ -442,7 +442,9 @@ async def _create_user(app, user, skip_trial_bp, cleanup):
raise DatabaseConflictError
-async def create_user(app, user, skip_trial_bp=False):
+# 2023-11-16 mfranklin: disable trial bp because there's an auth problem
+# https://hail.zulipchat.com/#narrow/stream/300487-Hail-Batch-Dev/topic/Issue.20creating.20users/near/401890787
+async def create_user(app, user, skip_trial_bp=True):
cleanup: List[Callable[[], Awaitable[None]]] = []
try:
await _create_user(app, user, skip_trial_bp, cleanup)
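
The surrounding code threads a cleanup list of async callbacks through _create_user so that a failure partway through user creation can undo the resources already provisioned. A toy sketch of that pattern (the resource steps here are hypothetical stand-ins, not the module's actual helpers, and the real unwinding in driver.py may differ):

    import asyncio
    from typing import Awaitable, Callable, List

    async def allocate_resource() -> None:  # hypothetical provisioning step
        print("resource allocated")

    async def release_resource() -> None:  # hypothetical undo callback
        print("resource released")

    async def _create_user_like(cleanup: List[Callable[[], Awaitable[None]]]) -> None:
        # Each successful step registers its undo before moving on.
        await allocate_resource()
        cleanup.append(release_resource)
        raise RuntimeError("simulated failure partway through")

    async def create_user_like() -> None:
        cleanup: List[Callable[[], Awaitable[None]]] = []
        try:
            await _create_user_like(cleanup)
        except Exception:
            # Unwind in reverse order of registration, then re-raise.
            for undo in reversed(cleanup):
                await undo()
            raise

    try:
        asyncio.run(create_user_like())
    except RuntimeError as e:
        print(f"failed as expected: {e}")
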
diff --git a/batch/batch/batch.py b/batch/batch/batch.py
index fdffd053239..a2da5ba685b 100644
--- a/batch/batch/batch.py
+++ b/batch/batch/batch.py
@@ -48,8 +48,9 @@ def batch_record_to_dict(record: Dict[str, Any]) -> Dict[str, Any]:
duration_ms = None
duration = None
- if record['cost_breakdown'] is not None:
- record['cost_breakdown'] = cost_breakdown_to_dict(json.loads(record['cost_breakdown']))
+ cost_breakdown = record.get('cost_breakdown')
+ if cost_breakdown is not None:
+ cost_breakdown = cost_breakdown_to_dict(json.loads(cost_breakdown))
batch_response = {
'id': record['id'],
@@ -70,8 +71,8 @@ def batch_record_to_dict(record: Dict[str, Any]) -> Dict[str, Any]:
'duration_ms': duration_ms,
'duration': duration,
'msec_mcpu': record['msec_mcpu'],
- 'cost': coalesce(record['cost'], 0),
- 'cost_breakdown': record['cost_breakdown'],
+ 'cost': coalesce(record.get('cost'), 0),
+ 'cost_breakdown': cost_breakdown,
}
attributes = json.loads(record['attributes'])
@@ -138,8 +139,9 @@ def job_record_to_dict(record: Dict[str, Any], name: Optional[str]) -> JobListEn
exit_code = None
duration = None
- if record['cost_breakdown'] is not None:
- record['cost_breakdown'] = cost_breakdown_to_dict(json.loads(record['cost_breakdown']))
+ cost_breakdown = record.get('cost_breakdown')
+ if cost_breakdown is not None:
+ cost_breakdown = cost_breakdown_to_dict(json.loads(cost_breakdown))
return cast(
JobListEntryV1Alpha,
@@ -152,9 +154,9 @@ def job_record_to_dict(record: Dict[str, Any], name: Optional[str]) -> JobListEn
'state': record['state'],
'exit_code': exit_code,
'duration': duration,
- 'cost': coalesce(record['cost'], 0),
+ 'cost': coalesce(record.get('cost'), 0),
'msec_mcpu': record['msec_mcpu'],
- 'cost_breakdown': record['cost_breakdown'],
+ 'cost_breakdown': cost_breakdown,
'always_run': bool(record['always_run']),
'display_state': None,
},
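
The switch from record['cost_breakdown'] to record.get('cost_breakdown') means rows that lack the column (or queries that don't select it) serialize as None instead of raising KeyError, and the parsed value no longer overwrites the input record in place. A small illustration (this coalesce is a stand-in with first-non-None semantics, matching how the imported helper is used above):

    from typing import Optional, TypeVar

    T = TypeVar("T")

    def coalesce(value: Optional[T], default: T) -> T:
        # Stand-in: return the first non-None argument.
        return value if value is not None else default

    old_record = {"id": 1, "msec_mcpu": 0}  # predates the cost columns

    # old style: old_record['cost'] raises KeyError here
    # new style: missing keys degrade to None, then coalesce to 0
    assert coalesce(old_record.get("cost"), 0) == 0
    assert old_record.get("cost_breakdown") is None
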
diff --git a/batch/batch/driver/instance_collection/pool.py b/batch/batch/driver/instance_collection/pool.py
index 39ff5046b60..42e2469e882 100644
--- a/batch/batch/driver/instance_collection/pool.py
+++ b/batch/batch/driver/instance_collection/pool.py
@@ -139,6 +139,7 @@ def __init__(
self.worker_max_idle_time_secs = config.worker_max_idle_time_secs
self.job_queue_scheduling_window_secs = config.job_queue_scheduling_window_secs
self.min_instances = config.min_instances
+ self.label = config.label
self.all_supported_regions = self.inst_coll_manager.regions
@@ -167,6 +168,7 @@ def config(self):
'standing_worker_max_idle_time_secs': self.standing_worker_max_idle_time_secs,
'worker_max_idle_time_secs': self.worker_max_idle_time_secs,
'job_queue_scheduling_window_secs': self.job_queue_scheduling_window_secs,
+ 'label': self.label,
}
def configure(self, pool_config: PoolConfig):
@@ -190,6 +192,7 @@ def configure(self, pool_config: PoolConfig):
self.standing_worker_max_idle_time_secs = pool_config.standing_worker_max_idle_time_secs
self.worker_max_idle_time_secs = pool_config.worker_max_idle_time_secs
self.job_queue_scheduling_window_secs = pool_config.job_queue_scheduling_window_secs
+ self.label = pool_config.label
def adjust_for_remove_instance(self, instance):
super().adjust_for_remove_instance(instance)
diff --git a/batch/batch/driver/main.py b/batch/batch/driver/main.py
index ac6f99d756e..9380a4cd149 100644
--- a/batch/batch/driver/main.py
+++ b/batch/batch/driver/main.py
@@ -663,6 +663,8 @@ async def pool_config_update(request: web.Request, _) -> NoReturn:
f'a non-negative integer less than or equal to max_live_instances {max_live_instances}',
)
+ label = post['label']
+
possible_worker_cores = []
for cores in possible_cores_from_worker_type(pool.cloud, worker_type):
if not worker_local_ssd_data_disk:
@@ -758,6 +760,7 @@ async def pool_config_update(request: web.Request, _) -> NoReturn:
worker_max_idle_time_secs=worker_max_idle_time_secs,
standing_worker_max_idle_time_secs=standing_worker_max_idle_time_secs,
job_queue_scheduling_window_secs=job_queue_scheduling_window_secs,
+ label=label,
)
current_client_pool_config = json.loads(str(post['_pool_config_json']))
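
With these two changes, label flows from the pool form POST through PoolConfig into the pool's live state and its config dict. A compact sketch of that round trip (a toy PoolConfig; the real class carries many more fields):

    from dataclasses import dataclass

    @dataclass
    class PoolConfig:  # toy stand-in for the real config class
        name: str
        label: str

    class Pool:
        def __init__(self, config: PoolConfig):
            self.name = config.name
            self.label = config.label  # mirrors the new assignment in __init__

        def configure(self, pool_config: PoolConfig) -> None:
            # pool_config_update builds a fresh PoolConfig from the form
            # (including post['label']) and applies it to the running pool.
            self.label = pool_config.label

        @property
        def config(self) -> dict:
            return {"name": self.name, "label": self.label}

    pool = Pool(PoolConfig(name="standard", label="batch-workers"))
    pool.configure(PoolConfig(name="standard", label="relabelled"))
    assert pool.config["label"] == "relabelled"
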
diff --git a/batch/batch/driver/templates/pool.html b/batch/batch/driver/templates/pool.html
index 6a62fb827e7..b6a720dbb46 100644
--- a/batch/batch/driver/templates/pool.html
+++ b/batch/batch/driver/templates/pool.html
@@ -29,6 +29,7 @@
Configuration
Worker max idle time in seconds:
Standing worker max idle time in seconds:
Job queue scheduling window in seconds:
+ Label: