Skip to content

Commit

Permalink
Update nightly branch after successful build (#3195)
Browse files Browse the repository at this point in the history
* Enable updating of nightly branch after successful build.
* Skip nightly build if no changes since last successful build.
* Checkout the full repo so we have all tags etc.
* Respond to PR comments / clean up git commands

---------

Co-authored-by: Dazhong Xia <[email protected]>
  • Loading branch information
zaneselvans and jdangerx authored Jan 3, 2024
1 parent e9a91be commit 5a81260
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 59 deletions.
29 changes: 25 additions & 4 deletions .github/workflows/build-deploy-pudl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,35 @@ jobs:
uses: actions/checkout@v4
with:
ref: ${{ env.BUILD_REF }}
fetch-depth: 0

- name: Skip the build if no changes since the last successful nightly build.
if: ${{ (github.event_name == 'schedule') }}
run: |
CURRENT_COMMIT=$(git rev-parse HEAD)
NIGHTLY_COMMIT=$(git rev-parse origin/nightly)
if [[ "$CURRENT_COMMIT" == "$NIGHTLY_COMMIT" ]]; then
echo "::notice::No changes since last successful nightly build. Skipping."
echo "SKIP_BUILD=true" >> $GITHUB_ENV
exit 0
fi
- name: Set action environment variables
if: ${{ env.SKIP_BUILD != 'true' }}
run: |
echo "NIGHTLY_TAG=nightly-$(date +%Y-%m-%d)" >> $GITHUB_ENV
echo "BUILD_ID=$(date +%Y-%m-%d-%H%M)-$(git rev-parse --short HEAD)-${BUILD_REF}" >> $GITHUB_ENV
- name: Show freshly set envvars
if: ${{ env.SKIP_BUILD != 'true' }}
run: |
echo "GCE_INSTANCE: $GCE_INSTANCE"
echo "BUILD_REF: $BUILD_REF"
echo "NIGHTLY_TAG: $NIGHTLY_TAG"
echo "BUILD_ID: $BUILD_ID"
- name: Tag nightly build
if: ${{ (github.event_name == 'schedule') }}
if: ${{ (github.event_name == 'schedule') && (env.SKIP_BUILD != 'true') }}
run: |
git config user.email "[email protected]"
git config user.name "pudlbot"
Expand All @@ -56,6 +70,7 @@ jobs:
- name: Docker Metadata
id: docker_metadata
if: ${{ env.SKIP_BUILD != 'true' }}
uses: docker/metadata-action@v5
with:
images: catalystcoop/pudl-etl
Expand All @@ -66,16 +81,18 @@ jobs:
type=ref,event=tag
- name: Set up Docker Buildx
if: ${{ env.SKIP_BUILD != 'true' }}
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
if: github.event_name != 'pull_request'
if: ${{ (github.event_name != 'pull_request') && (env.SKIP_BUILD != 'true') }}
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Build image and push to Docker Hub
if: ${{ env.SKIP_BUILD != 'true' }}
uses: docker/build-push-action@v5
with:
context: .
Expand All @@ -87,17 +104,20 @@ jobs:
cache-to: type=gha,mode=max

- id: "auth"
if: ${{ env.SKIP_BUILD != 'true' }}
uses: "google-github-actions/auth@v2"
with:
workload_identity_provider: "projects/345950277072/locations/global/workloadIdentityPools/gh-actions-pool/providers/gh-actions-provider"
service_account: "deploy-pudl-github-action@catalyst-cooperative-pudl.iam.gserviceaccount.com"

# Setup gcloud CLI
- name: Set up Cloud SDK
if: ${{ env.SKIP_BUILD != 'true' }}
uses: google-github-actions/setup-gcloud@v2

# Deploy PUDL image to GCE
- name: Deploy
if: ${{ env.SKIP_BUILD != 'true' }}
env:
DAGSTER_PG_PASSWORD: ${{ secrets.DAGSTER_PG_PASSWORD }}
PUDL_OUTPUT_PATH: ${{ env.GCS_OUTPUT_BUCKET }}/${{ env.BUILD_ID }}
Expand All @@ -107,7 +127,7 @@ jobs:
--metadata-from-file startup-script=./docker/vm_startup_script.sh
gcloud compute instances update-container "$GCE_INSTANCE" \
--zone "$GCE_INSTANCE_ZONE" \
--container-image "docker.io/catalystcoop/pudl-etl:${{ env.BUILD_REF}}" \
--container-image "docker.io/catalystcoop/pudl-etl:${{ env.BUILD_REF }}" \
--container-command "micromamba" \
--container-arg="run" \
--container-arg="--prefix" \
Expand All @@ -117,7 +137,7 @@ jobs:
--container-arg="bash" \
--container-arg="./docker/gcp_pudl_etl.sh" \
--container-env-file="./docker/.env" \
--container-env BUILD_REF=${{ env.BUILD_REF}} \
--container-env BUILD_REF=${{ env.BUILD_REF }} \
--container-env BUILD_ID=${{ env.BUILD_ID }} \
--container-env NIGHTLY_TAG=${{ env.NIGHTLY_TAG }} \
--container-env GITHUB_ACTION_TRIGGER=${{ github.event_name }} \
Expand All @@ -140,6 +160,7 @@ jobs:
# Start the VM
- name: Start the deploy-pudl-vm
if: ${{ env.SKIP_BUILD != 'true' }}
run: gcloud compute instances start "$GCE_INSTANCE" --zone="$GCE_INSTANCE_ZONE"

- name: Post to a pudl-deployments channel
Expand Down
140 changes: 85 additions & 55 deletions docker/gcp_pudl_etl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,6 @@
# This script runs the entire ETL and validation tests in a docker container on a Google Compute Engine instance.
# This script won't work locally because it needs adequate GCP permissions.

# Set PUDL_GCS_OUTPUT *only* if it is currently unset
: "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$BUILD_ID}"

set -x

# Post a message to the pudl-deployments Slack channel (hard-coded C03FHB9N0PQ).
# $1: the message text, interpolated into the JSON payload verbatim — assumes
#     callers never pass embedded double quotes (TODO confirm; would break JSON).
# Requires SLACK_TOKEN in the environment.
function send_slack_msg() {
    local payload="{\"channel\": \"C03FHB9N0PQ\", \"text\": \"$1\"}"
    curl -X POST -H "Content-type: application/json" -H "Authorization: Bearer ${SLACK_TOKEN}" https://slack.com/api/chat.postMessage --data "$payload"
}
Expand Down Expand Up @@ -57,38 +52,38 @@ function shutdown_vm() {
curl -X POST -H "Content-Length: 0" -H "Authorization: Bearer ${ACCESS_TOKEN}" "https://compute.googleapis.com/compute/v1/projects/catalyst-cooperative-pudl/zones/$GCE_INSTANCE_ZONE/instances/$GCE_INSTANCE/stop"
}

function copy_outputs_to_gcs() {
echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT"
gsutil -m cp -r "$PUDL_OUTPUT" "$PUDL_GCS_OUTPUT"
# Copy the full build output directory to the nightly-build GCS bucket, then
# delete the local "success" sentinel file — presumably so a stale sentinel
# cannot mark a later build as successful (TODO confirm intent with callers).
# Requires PUDL_OUTPUT and PUDL_GCS_OUTPUT in the environment.
# Exit status: that of the first failing command; 0 if everything succeeds.
function save_outputs_to_gcs() {
    echo "Copying outputs to GCP bucket $PUDL_GCS_OUTPUT" || return
    gsutil -m cp -r "$PUDL_OUTPUT" "$PUDL_GCS_OUTPUT" || return
    rm "$PUDL_OUTPUT/success"
}

function copy_outputs_to_distribution_bucket() {
# Only attempt to update outputs if we have a real value of BUILD_REF
if [ -n "$BUILD_REF" ]; then
echo "Removing old $BUILD_REF outputs from GCP distributon bucket."
gsutil -m -u "$GCP_BILLING_PROJECT" rm -r "gs://pudl.catalyst.coop/$BUILD_REF"
echo "Copying outputs to GCP distribution bucket"
gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$BUILD_REF"

echo "Removing old $BUILD_REF outputs from AWS distributon bucket."
aws s3 rm "s3://pudl.catalyst.coop/$BUILD_REF" --recursive
echo "Copying outputs to AWS distribution bucket"
# This avoids accidentally blowing away the whole bucket if it's not set.
if [[ -n "$BUILD_REF" ]]; then
echo "Removing old $BUILD_REF outputs from GCP distributon bucket." && \
gsutil -m -u "$GCP_BILLING_PROJECT" rm -r "gs://pudl.catalyst.coop/$BUILD_REF" && \
echo "Copying outputs to GCP distribution bucket" && \
gsutil -m -u "$GCP_BILLING_PROJECT" cp -r "$PUDL_OUTPUT/*" "gs://pudl.catalyst.coop/$BUILD_REF" && \
echo "Removing old $BUILD_REF outputs from AWS distributon bucket." && \
aws s3 rm "s3://pudl.catalyst.coop/$BUILD_REF" --recursive && \
echo "Copying outputs to AWS distribution bucket" && \
aws s3 cp "$PUDL_OUTPUT/" "s3://pudl.catalyst.coop/$BUILD_REF" --recursive
fi
}

function zenodo_data_release() {
echo "Creating a new PUDL data release on Zenodo."
~/pudl/devtools/zenodo/zenodo_data_release.py --publish --env sandbox --source-dir "$PUDL_OUTPUT"
echo "Creating a new PUDL data release on Zenodo." && \
~/pudl/devtools/zenodo/zenodo_data_release.py --publish --env "$1" --source-dir "$PUDL_OUTPUT"
}

function notify_slack() {
# Notify pudl-builds slack channel of deployment status
if [ "$1" = "success" ]; then
if [[ "$1" == "success" ]]; then
message=":large_green_circle: :sunglasses: :unicorn_face: :rainbow: The deployment succeeded!! :partygritty: :database_parrot: :blob-dance: :large_green_circle:\n\n "
message+="<https://github.com/catalyst-cooperative/pudl/compare/main...${BUILD_REF}|Make a PR for \`${BUILD_REF}\` into \`main\`!>\n\n"
elif [ "$1" = "failure" ]; then
elif [[ "$1" == "failure" ]]; then
message=":large_red_square: Oh bummer the deployment failed ::fiiiiine: :sob: :cry_spin:\n\n "
else
echo "Invalid deployment status"
Expand All @@ -100,60 +95,95 @@ function notify_slack() {
}

function update_nightly_branch() {
git config --unset http.https://github.com/.extraheader
git config user.email "[email protected]"
git config user.name "pudlbot"
git remote set-url origin "https://pudlbot:$PUDL_BOT_PAT@github.com/catalyst-cooperative/pudl.git"
echo "BOGUS: Updating nightly branch to point at $NIGHTLY_TAG."
git fetch origin nightly:nightly
git checkout nightly
git merge --ff-only "$NIGHTLY_TAG"
ETL_SUCCESS=${PIPESTATUS[0]}
git push -u origin
# When building the image, GHA adds an HTTP basic auth header in git
# config, which overrides the auth we set below. So we unset it.
git config --unset http.https://github.com/.extraheader && \
git config user.email "[email protected]" && \
git config user.name "pudlbot" && \
git remote set-url origin "https://pudlbot:$PUDL_BOT_PAT@github.com/catalyst-cooperative/pudl.git" && \
echo "Updating nightly branch to point at $NIGHTLY_TAG." && \
git fetch --force --tags origin "$NIGHTLY_TAG" && \
git fetch origin nightly:nightly && \
git checkout nightly && \
git show-ref -d nightly "$NIGHTLY_TAG" && \
git merge --ff-only "$NIGHTLY_TAG" && \
git push -u origin nightly
}

# Trim the build outputs down to what we actually distribute.
# Requires PUDL_OUTPUT in the environment.
# Exit status: that of the first failing command; 0 if everything succeeds.
function clean_up_outputs_for_distribution() {
    # Compress the SQLite DBs for easier distribution.
    gzip --verbose "$PUDL_OUTPUT"/*.sqlite || return
    # Remove redundant multi-file EPA CEMS outputs prior to distribution.
    rm -rf "$PUDL_OUTPUT/core_epacems__hourly_emissions/" || return
    rm -f "$PUDL_OUTPUT/metadata.yml"
}

# # Run ETL. Copy outputs to GCS and shutdown VM if ETL succeeds or fails
########################################################################################
# MAIN SCRIPT
########################################################################################
# Initialize our success variables so they all definitely have a value to check
ETL_SUCCESS=0
SAVE_OUTPUTS_SUCCESS=0
UPDATE_NIGHTLY_SUCCESS=0
DATASETTE_SUCCESS=0
CLEAN_UP_OUTPUTS_SUCCESS=0
DISTRIBUTION_BUCKET_SUCCESS=0
ZENODO_SUCCESS=0

# Set PUDL_GCS_OUTPUT *only* if it is currently unset
: "${PUDL_GCS_OUTPUT:=gs://nightly-build-outputs.catalyst.coop/$BUILD_ID}"

# Run ETL. Copy outputs to GCS and shutdown VM if ETL succeeds or fails
# 2>&1 redirects stderr to stdout.
run_pudl_etl 2>&1 | tee "$LOGFILE"
ETL_SUCCESS=${PIPESTATUS[0]}

copy_outputs_to_gcs
save_outputs_to_gcs 2>&1 | tee -a "$LOGFILE"
SAVE_OUTPUTS_SUCCESS=${PIPESTATUS[0]}

# if pipeline is successful, distribute + publish datasette
if [[ $ETL_SUCCESS == 0 ]]; then
# Deploy the updated data to datasette
if [ "$BUILD_REF" = "dev" ]; then
python ~/pudl/devtools/datasette/publish.py 2>&1 | tee -a "$LOGFILE"
ETL_SUCCESS=${PIPESTATUS[0]}
if [[ "$GITHUB_ACTION_TRIGGER" == "schedule" ]]; then
update_nightly_branch 2>&1 | tee -a "$LOGFILE"
UPDATE_NIGHTLY_SUCCESS=${PIPESTATUS[0]}
fi

# Compress the SQLite DBs for easier distribution
# Remove redundant multi-file EPA CEMS outputs prior to distribution
gzip --verbose "$PUDL_OUTPUT"/*.sqlite && \
rm -rf "$PUDL_OUTPUT/core_epacems__hourly_emissions/" && \
rm -f "$PUDL_OUTPUT/metadata.yml"
ETL_SUCCESS=${PIPESTATUS[0]}
# Deploy the updated data to datasette if we're on dev
if [[ "$BUILD_REF" == "dev" ]]; then
python ~/pudl/devtools/datasette/publish.py 2>&1 | tee -a "$LOGFILE"
DATASETTE_SUCCESS=${PIPESTATUS[0]}
fi

# Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
# TODO: this behavior should be controlled by on/off switch here and this logic
# should be moved to the triggering github action. Having it here feels
# fragmented.
if [ "$GITHUB_ACTION_TRIGGER" = "push" ] || [ "$BUILD_REF" = "dev" ]; then
copy_outputs_to_distribution_bucket
ETL_SUCCESS=${PIPESTATUS[0]}
# TEMPORARY: this currently just makes a sandbox release, for testing:
zenodo_data_release 2>&1 | tee -a "$LOGFILE"
ETL_SUCCESS=${PIPESTATUS[0]}
# should be moved to the triggering github action. Having it here feels fragmented.
# Distribute outputs if branch is dev or the build was triggered by tag push
if [[ "$GITHUB_ACTION_TRIGGER" == "push" || "$BUILD_REF" == "dev" ]]; then
# Remove some cruft from the builds that we don't want to distribute
clean_up_outputs_for_distribution 2>&1 | tee -a "$LOGFILE"
CLEAN_UP_OUTPUTS_SUCCESS=${PIPESTATUS[0]}
# Copy cleaned up outputs to the S3 and GCS distribution buckets
copy_outputs_to_distribution_bucket | tee -a "$LOGFILE"
DISTRIBUTION_BUCKET_SUCCESS=${PIPESTATUS[0]}
# TODO: this currently just makes a sandbox release, for testing. Should be
# switched to production and only run on push of a version tag eventually.
# Push a data release to Zenodo for long-term accessibility
zenodo_data_release sandbox 2>&1 | tee -a "$LOGFILE"
ZENODO_SUCCESS=${PIPESTATUS[0]}
fi
fi

# This way we also save the logs from latter steps in the script
gsutil cp "$LOGFILE" "$PUDL_GCS_OUTPUT"

# Notify slack about entire pipeline's success or failure;
# PIPESTATUS[0] either refers to the failed ETL run or the last distribution
# task that was run above
if [[ $ETL_SUCCESS == 0 ]]; then
if [[ $ETL_SUCCESS == 0 && \
$SAVE_OUTPUTS_SUCCESS == 0 && \
$UPDATE_NIGHTLY_SUCCESS == 0 && \
$DATASETTE_SUCCESS == 0 && \
$CLEAN_UP_OUTPUTS_SUCCESS == 0 && \
$DISTRIBUTION_BUCKET_SUCCESS == 0 && \
$ZENODO_SUCCESS == 0
]]; then
notify_slack "success"
else
notify_slack "failure"
Expand Down

0 comments on commit 5a81260

Please sign in to comment.