radius-project · ytimocin · Apr 29, 2024 · Apr 24, 2024 · willdavsmith · Apr 29, 2024
diff --git a/.github/scripts/delete-aws-resources.sh b/.github/scripts/delete-aws-resources.sh
@@ -6,7 +6,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#    
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
@@ -16,7 +16,6 @@
 # limitations under the License.
 # ------------------------------------------------------------
 
-
 APP_ID=$1
 APP_LABEL='radapp.io/application'
 RESOURCE_TYPES='AWS::RDS::DBInstance,AWS::RDS::DBSubnetGroup,AWS::MemoryDB::Cluster,AWS::MemoryDB::SubnetGroup'
@@ -34,21 +33,16 @@ function delete_aws_resources() {
   # Empty the file
   truncate -s 0 $DELETED_RESOURCES_FILE
 
-  for resource_type in ${RESOURCE_TYPES//,/ }
-  do
-    aws cloudcontrol list-resources --type-name "$resource_type" --query "ResourceDescriptions[].Identifier" --output text | tr '\t' '\n' | while read identifier
-    do
-      aws cloudcontrol get-resource --type-name "$resource_type" --identifier "$identifier" --query "ResourceDescription.Properties" --output text | while read resource
-      do
-        resource_tags=$(jq -c -r .Tags <<< "$resource")
-        for tag in $(jq -c -r '.[]' <<< "$resource_tags")
-        do
-          key=$(jq -r '.Key' <<< "$tag")
-          value=$(jq -r '.Value' <<< "$tag")
-          if [[ "$key" == "$APP_LABEL" && "$value" == "$APP_ID" ]]
-          then
+  for resource_type in ${RESOURCE_TYPES//,/ }; do
+    aws cloudcontrol list-resources --type-name "$resource_type" --query "ResourceDescriptions[].Identifier" --output text | tr '\t' '\n' | while read identifier; do
+      aws cloudcontrol get-resource --type-name "$resource_type" --identifier "$identifier" --query "ResourceDescription.Properties" --output text | while read resource; do
+        resource_tags=$(jq -c -r .Tags <<<"$resource")
+        for tag in $(jq -c -r '.[]' <<<"$resource_tags"); do
+          key=$(jq -r '.Key' <<<"$tag")
+          value=$(jq -r '.Value' <<<"$tag")
+          if [[ "$key" == "$APP_LABEL" && "$value" == "$APP_ID" ]]; then
             echo "Deleting resource of type: $resource_type with identifier: $identifier"
-            echo "$identifier\n" >> $DELETED_RESOURCES_FILE
+            printf '%s\n' "$identifier" >>"$DELETED_RESOURCES_FILE" # printf: bash echo without -e would write a literal "\n"
             aws cloudcontrol delete-resource --type-name "$resource_type" --identifier "$identifier"
           fi
         done
@@ -65,28 +59,28 @@ function delete_aws_resources() {
 
 RETRY_COUNT=0
 while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
-    # Trigger the function to delete the resources
-    delete_aws_resources
+  # Trigger the function to delete the resources
+  delete_aws_resources
 
-    # If the function returned 0, then no resources needed to be deleted
-    # on this run. This means that all resources have been deleted.
-    if [ $? -eq 0 ]; then
-        echo "All resources deleted successfully"
-        break
-    fi
+  # If the function returned 0, then no resources needed to be deleted
+  # on this run. This means that all resources have been deleted.
+  if [ $? -eq 0 ]; then
+    echo "All resources deleted successfully"
+    break
+  fi
 
-    # Still have resources to delete, increase the retry count
-    RETRY_COUNT=$((RETRY_COUNT + 1))
+  # Still have resources to delete, increase the retry count
+  RETRY_COUNT=$((RETRY_COUNT + 1))
 
-    # Check if there are more retries left
-    if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
-        # Retry after delay
-        echo "Retrying in $RETRY_DELAY seconds..."
-        sleep $RETRY_DELAY
-    fi
+  # Check if there are more retries left
+  if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
+    # Retry after delay
+    echo "Retrying in $RETRY_DELAY seconds..."
+    sleep $RETRY_DELAY
+  fi
 done
 
 # Check if the maximum number of retries exceeded
 if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then
-    echo "Maximum number of retries exceeded"
+  echo "Maximum number of retries exceeded"
 fi
diff --git a/.github/workflows/test-aks.yaml b/.github/workflows/test-aks.yaml
@@ -111,7 +111,7 @@ jobs:
             deployArgs: -p environment='/planes/radius/local/resourceGroups/eshop-containers/providers/Applications.Core/environments/containers'
           - name: eshop-azure
             os: ubuntu-latest-m
-            runOnPullRequest: true
+            runOnPullRequest: false
             app: eshop-azure
             env: azure
             path: ./samples/eshop/eshop.bicep
@@ -120,7 +120,7 @@ jobs:
             credential: azure
           - name: eshop-aws
             os: ubuntu-latest-m
-            runOnPullRequest: true
+            runOnPullRequest: false
             app: eshop-aws
             env: aws
             path: ./samples/eshop/eshop.bicep
@@ -283,14 +283,12 @@ jobs:
       - name: Deploy app
         if: steps.gen-id.outputs.RUN_TEST == 'true'
         id: deploy-app
-        run: rad deploy ${{ matrix.path }} ${{ matrix.deployArgs }} -e ${{ matrix.env }}
-      - name: Wait for all pods to be ready
-        if: steps.gen-id.outputs.RUN_TEST == 'true'
-        id: wait-for-pods
-        run: |
-          namespace="${{ matrix.env }}-${{ matrix.app }}"
-          label="radapp.io/application=${{ matrix.app }}"
-          kubectl rollout status deployment -l $label -n $namespace --timeout=90s
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 30
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: rad deploy ${{ matrix.path }} ${{ matrix.deployArgs }} -e ${{ matrix.env }}
       - name: Run Playwright Test
         if: steps.gen-id.outputs.RUN_TEST == 'true' && matrix.uiTestFile != ''
         id: run-playwright-test
@@ -308,18 +306,28 @@ jobs:
           npm ci
           npx playwright install --with-deps
           npx playwright test ${{ matrix.uiTestFile }} --retries 3
+      # Upload Playwright test results even if the workflow is cancelled.
       - name: Upload Playwright Results
-        uses: actions/upload-artifact@v3
-        if: always() && ( steps.run-playwright-test.outcome == 'success' || steps.run-playwright-test.outcome == 'failure' )
+        uses: actions/upload-artifact@v4
+        if: always() && (steps.run-playwright-test.outcome == 'success' || steps.run-playwright-test.outcome == 'failure')
         with:
           name: playwright-report-${{ matrix.name }}
           path: playwright/playwright-report/
           retention-days: 30
           if-no-files-found: error
+      # Upload Playwright test videos in case of test failure even if the workflow is cancelled.
+      - name: Upload Playwright Videos
+        uses: actions/upload-artifact@v4
+        if: always() && steps.run-playwright-test.outcome == 'failure'
+        with:
+          name: playwright-video-${{ matrix.name }}
+          path: playwright/test-results/
+          retention-days: 30
+          if-no-files-found: error
       # Handle failures
       - name: Get Pod logs for failed tests
         id: get-pod-logs
-        if: failure() && (steps.run-playwright-test.outcome == 'failure' || steps.wait-for-pods.outcome == 'failure' || steps.deploy-app.outcome == 'failure')
+        if: failure() && (steps.run-playwright-test.outcome == 'failure' || steps.deploy-app.outcome == 'failure')
         run: |
           # Create pod-logs directory
           mkdir -p playwright/pod-logs/${{ matrix.name }}
@@ -343,17 +351,17 @@ jobs:
           if-no-files-found: error
       - name: Create GitHub issue on failure
         if: failure() && github.event_name == 'schedule'
-        run: gh issue create --title "Samples deployment failed for ${{ matrix.app }}" --body "Test failed on ${{ github.repository }}. See [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for more details." --repo ${{ github.repository }} --label test-failure
+        run: gh issue create --title "Samples deployment failed for ${{ matrix.name }}" --body "Test failed on ${{ github.repository }}. See [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for more details." --repo ${{ github.repository }} --label test-failure
       # Cleanup
       - name: Delete app and environment
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true'
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && steps.deploy-app.outcome == 'success'
         run: |
           if command -v rad &> /dev/null; then
             rad app delete ${{ matrix.app }} -y
             rad env delete ${{ matrix.env }} -y
           fi
       - name: Delete Azure resource group
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true' && steps.create-azure-resource-group.outcome == 'success'
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && steps.create-azure-resource-group.outcome == 'success'
         run: |
           # Delete Azure resources created by the test
           # if deletion fails, purge workflow will purge the resource group and its resources later
@@ -362,13 +370,12 @@ jobs:
             --name ${{ steps.gen-id.outputs.TEST_AZURE_RESOURCE_GROUP }} \
             --yes
       - name: Delete AWS Resources
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true' && matrix.credential == 'aws'
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && matrix.credential == 'aws' && steps.deploy-app.outcome == 'success'
         run: |
           # Delete all AWS resources created by the test
           ./.github/scripts/delete-aws-resources.sh '/planes/radius/local/resourcegroups/${{ matrix.env }}/providers/Applications.Core/applications/${{ matrix.app }}'
-      - name: Delete EKS Cluster
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true' && matrix.credential == 'aws'
+      - name: Delete EKS Cluster ${{ steps.gen-id.outputs.TEST_EKS_CLUSTER_NAME }}
+        if: steps.create-eks.outcome == 'success'
         run: |
-          # Delete EKS cluster
           echo "Deleting EKS cluster: ${{ steps.gen-id.outputs.TEST_EKS_CLUSTER_NAME }}"
           eksctl delete cluster --name ${{ steps.gen-id.outputs.TEST_EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }} --wait --force
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -256,51 +256,53 @@ jobs:
           fi
           rad env switch ${{ matrix.env }}
       # Deploy application and run tests
+      # Retry the deployment step in case of transient failures
       - name: Deploy app
         if: steps.gen-id.outputs.RUN_TEST == 'true'
         id: deploy-app
-        run: rad deploy ${{ matrix.path }} ${{ matrix.deployArgs }}
-      - name: Wait for all pods to be ready
-        if: steps.gen-id.outputs.RUN_TEST == 'true'
-        id: wait-for-pods
-        run: |
-          namespace="${{ matrix.env }}-${{ matrix.app }}"
-          label="radapp.io/application=${{ matrix.app }}"
-          kubectl rollout status deployment -l $label -n $namespace --timeout=90s
-      - name: Run Playwright Test
-        if: steps.gen-id.outputs.RUN_TEST == 'true' && matrix.uiTestFile != ''
-        id: run-playwright-test
         uses: nick-fields/retry@v3
         with:
-          timeout_minutes: 5
+          timeout_minutes: 30
           max_attempts: 3
           retry_wait_seconds: 30
-          command: |
-            if [[ "${{ matrix.container }}" != "" ]]; then
-              rad resource expose containers ${{ matrix.container }} ${{ matrix.exposeArgs }} --port ${{ matrix.port }} &
-              echo "Endpoint: http://localhost:${{ matrix.port }}"
-              export ENDPOINT="http://localhost:${{ matrix.port }}"
-            else
-              endpoint="$(rad app status -a ${{ matrix.app }} | sed 's/ /\n/g' | grep http)"
-              echo "Endpoint: $endpoint"
-              export ENDPOINT=$endpoint
-            fi
-            cd playwright/
-            npm ci
-            npx playwright install --with-deps
-            npx playwright test ${{ matrix.uiTestFile }} --retries 3
+          command: rad deploy ${{ matrix.path }} ${{ matrix.deployArgs }}
+      - name: Run Playwright Test
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && matrix.uiTestFile != ''
+        id: run-playwright-test
+        run: |
+          if [[ "${{ matrix.container }}" != "" ]]; then
+            rad resource expose containers ${{ matrix.container }} ${{ matrix.exposeArgs }} --port ${{ matrix.port }} &
+            export ENDPOINT="http://localhost:${{ matrix.port }}" # must match the --port forwarded on the line above
+          else
+            endpoint="$(rad app status -a ${{ matrix.app }} | sed 's/ /\n/g' | grep http)"
+            echo "Endpoint: $endpoint"
+            export ENDPOINT=$endpoint
+          fi
+
+          cd playwright/
+          npm ci
+          npx playwright install --with-deps
+          npx playwright test ${{ matrix.uiTestFile }} --retries 3
       - name: Upload Playwright Results
         uses: actions/upload-artifact@v4
-        if: always() && ( steps.run-playwright-test.outcome == 'success' || steps.run-playwright-test.outcome == 'failure' )
+        if: always() && (steps.run-playwright-test.outcome == 'success' || steps.run-playwright-test.outcome == 'failure')
         with:
           name: playwright-report-${{ matrix.name }}
           path: playwright/playwright-report/
           retention-days: 30
           if-no-files-found: error
+      - name: Upload Playwright Videos
+        uses: actions/upload-artifact@v4
+        if: always() && steps.run-playwright-test.outcome == 'failure'
+        with:
+          name: playwright-video-${{ matrix.name }}
+          path: playwright/test-results/
+          retention-days: 30
+          if-no-files-found: error
       # Handle failures
       - name: Get Pod logs for failed tests
         id: get-pod-logs
-        if: failure() && (steps.run-playwright-test.outcome == 'failure' || steps.wait-for-pods.outcome == 'failure' || steps.deploy-app.outcome == 'failure')
+        if: failure() && (steps.run-playwright-test.outcome == 'failure' || steps.deploy-app.outcome == 'failure')
         run: |
           # Create pod-logs directory
           mkdir -p playwright/pod-logs/${{ matrix.name }}
@@ -324,16 +326,16 @@ jobs:
           if-no-files-found: error
       - name: Create GitHub issue on failure
         if: failure() && github.event_name == 'schedule'
-        run: gh issue create --title "Samples deployment failed for ${{ matrix.app }}" --body "Test failed on ${{ github.repository }}. See [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for more details." --repo ${{ github.repository }} --label test-failure
+        run: gh issue create --title "Samples deployment failed for ${{ matrix.name }}" --body "Test failed on ${{ github.repository }}. See [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for more details." --repo ${{ github.repository }} --label test-failure
       # Cleanup
       - name: Delete app
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true'
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && steps.deploy-app.outcome == 'success'
         run: |
           if command -v rad &> /dev/null; then
             rad app delete ${{ matrix.app }} -y
           fi
       - name: Delete Azure resource group
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true' && steps.create-azure-resource-group.outcome == 'success'
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && steps.create-azure-resource-group.outcome == 'success'
         run: |
           # Delete Azure resources created by the test
           # if deletion fails, purge workflow will purge the resource group and its resources later
@@ -342,18 +344,12 @@ jobs:
             --name ${{ steps.gen-id.outputs.TEST_AZURE_RESOURCE_GROUP }} \
             --yes
       - name: Delete AWS Resources
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true' && matrix.credential == 'aws'
+        if: steps.gen-id.outputs.RUN_TEST == 'true' && matrix.credential == 'aws' && steps.deploy-app.outcome == 'success'
         run: |
           # Delete all AWS resources created by the test
           ./.github/scripts/delete-aws-resources.sh '/planes/radius/local/resourcegroups/default/providers/Applications.Core/applications/${{ matrix.app }}'
-      - name: Delete EKS Cluster
-        if: always() && steps.gen-id.outputs.RUN_TEST == 'true' && matrix.credential == 'aws'
+      - name: Delete EKS Cluster ${{ steps.gen-id.outputs.TEST_EKS_CLUSTER_NAME }}
+        if: steps.create-eks.outcome == 'success'
         run: |
-          # Uninstall Radius from EKS cluster
-          # if rad cli exists
-          if command -v rad &> /dev/null; then
-            rad uninstall kubernetes
-          fi
-          # Delete EKS cluster
           echo "Deleting EKS cluster: ${{ steps.gen-id.outputs.TEST_EKS_CLUSTER_NAME }}"
           eksctl delete cluster --name ${{ steps.gen-id.outputs.TEST_EKS_CLUSTER_NAME }} --region ${{ env.AWS_REGION }} --wait --force