Skip to content

Commit

Permalink
Merge pull request #17 from jkremser/finish-last-task-before-death
Browse files Browse the repository at this point in the history
This is a more effective approach to handling scale-down events.
  • Loading branch information
jkremser authored May 30, 2024
2 parents 5e3a433 + ef951cf commit e876257
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 11 deletions.
6 changes: 4 additions & 2 deletions samples/stable-diffusion/manifests/app-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
name: stable-diffusion-worker
imagePullPolicy: IfNotPresent
resources:
# uncomment if you want one GPU exclusively for this app, otherwise the GPU will be shared among many
# uncomment if you want one GPU exclusively for this app, otherwise the GPU will be shared among many (gpu slicing)
# limits:
# nvidia.com/gpu: "1"
requests:
Expand Down Expand Up @@ -58,10 +58,11 @@ spec:
command: ["/bin/sh", "-c"]
args:
- |
trap 'while [ -f "/images/working" ]; do sleep 1; done; sleep 5' SIGTERM
mc alias set shared http://minio:9000 $MINIO_USERNAME $MINIO_PASSWORD;
mc admin info shared;
echo "Minio configured, starting sync.."
mc mirror --watch /images shared/images;
mc mirror --exclude working --watch /images shared/images;
volumeMounts:
- name: shared-images
Expand All @@ -79,3 +80,4 @@ spec:
volumes:
- name: shared-images
emptyDir: {}
terminationGracePeriodSeconds: 120
4 changes: 3 additions & 1 deletion samples/stable-diffusion/manifests/app.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,11 @@ spec:
command: ["/bin/sh", "-c"]
args:
- |
trap 'while [ -f "/images/working" ]; do sleep 1; done; sleep 5' SIGTERM
mc alias set shared http://minio:9000 $MINIO_USERNAME $MINIO_PASSWORD;
mc admin info shared;
echo "Minio configured, starting sync.."
mc mirror --watch /images shared/images;
mc mirror --exclude working --watch /images shared/images;
volumeMounts:
- name: shared-images
Expand All @@ -74,3 +75,4 @@ spec:
volumes:
- name: shared-images
emptyDir: {}
terminationGracePeriodSeconds: 120
2 changes: 1 addition & 1 deletion samples/stable-diffusion/manifests/scaledjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ spec:
echo "Minio configured, waiting for the result.."
until [ -f /images/*.png ] && [ -f /images/*.json ]; do sleep 1; printf .; done
echo -e "\nResults have been found: \n$(ls /images)\nSyncing.."
mc mirror /images shared/images;
mc mirror --exclude working /images shared/images;
volumeMounts:
- name: shared-images
Expand Down
6 changes: 3 additions & 3 deletions samples/stable-diffusion/manifests/scaledobject.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ spec:
scaleTargetRef:
name: stable-diffusion-worker
pollingInterval: 10
cooldownPeriod: 300
cooldownPeriod: 150
minReplicaCount: 0
maxReplicaCount: 8
advanced:
horizontalPodAutoscalerConfig:
behavior:
scaleUp:
stabilizationWindowSeconds: 15
stabilizationWindowSeconds: 5
scaleDown:
stabilizationWindowSeconds: 15
stabilizationWindowSeconds: 150
triggers:
# https://keda.sh/docs/2.14/scalers/rabbitmq-queue/
- type: rabbitmq
Expand Down
2 changes: 1 addition & 1 deletion samples/stable-diffusion/manifests/webapp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ spec:
mc alias set shared http://minio:9000 $MINIO_USERNAME $MINIO_PASSWORD
mc admin info shared
echo "Minio configured, starting sync.."
while true; do mc mirror --overwrite shared/images /images; sleep 1; done
while true; do mc mirror --exclude working --overwrite shared/images /images; sleep 1; done
volumeMounts:
- name: shared-images
Expand Down
9 changes: 6 additions & 3 deletions samples/stable-diffusion/tweaks/consumer.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#!/bin/bash
Q_NAME=${Q_NAME:-"tasks"}
FLAG_FILE=${FLAG_FILE:-"/app/results/working"}

handle_sigterm() {
rm -rf ${FLAG_FILE}
if [ -n "$_imageRequest" ]; then
echo "SIGTERM signal received while generating image \"${_imageRequest}\""
reQueue "${_imageRequest}"
else
echo "SIGTERM signal received, but no image was being processed."
fi
Expand All @@ -23,7 +24,9 @@ reQueue() {
generate() {
_prompt=$(echo ${_imageRequest} | jq '.prompt')
_count=$(echo ${_imageRequest} | jq 2> /dev/null '.count // 1')
python /app/src/app.py --number_of_images "${_count}" --prompt "\"${_prompt}\""
touch ${FLAG_FILE}
python /app/src/app.py --number_of_images "${_count}" --prompt "${_prompt}"
rm -rf ${FLAG_FILE}
echo "Done. Image for ${_imageRequest} has been stored in /app/results."
sleep 1
}
Expand All @@ -37,7 +40,7 @@ main() {
sleep 2
continue
fi
echo "Task received, generating: \"${_imageRequest}\""
echo -e "\n\n\nTask received, generating: \"${_imageRequest}\""
generate "${_imageRequest}"
[ "${EXIT_AFTER_ONE_TASK}" = "1" ] && exit 0
done
Expand Down

0 comments on commit e876257

Please sign in to comment.