diff --git a/containers/whisper/Dockerfile b/containers/whisper/Dockerfile
index 0a5db25..f6d02f7 100644
--- a/containers/whisper/Dockerfile
+++ b/containers/whisper/Dockerfile
@@ -1,6 +1,6 @@
 FROM registry.access.redhat.com/ubi10/ubi:10.0-1758699521
 RUN dnf install -y --nodocs https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm
 RUN dnf install -y --nogpgcheck https://mirrors.rpmfusion.org/free/el/rpmfusion-free-release-10.noarch.rpm
-RUN dnf update -y && dnf install -y python3-pip ffmpeg
+RUN dnf update -y && dnf install -y python3-pip ffmpeg jq
 RUN pip install -U openai-whisper
diff --git a/das/.gitignore b/das/.gitignore
new file mode 100644
index 0000000..8e22199
--- /dev/null
+++ b/das/.gitignore
@@ -0,0 +1,7 @@
# Kubernetes secrets with actual credentials
manifests/*-secret.yml
!manifests/*-secret.yml.template

# Common patterns
*.secret.yml
*.secret.yaml
diff --git a/das/README.md b/das/README.md
new file mode 100644
index 0000000..5c219aa
--- /dev/null
+++ b/das/README.md
@@ -0,0 +1,7 @@
## DAS deployment

See [Dynamic Accelerator Slicer (DAS) Operator](https://docs.redhat.com/en/documentation/openshift_container_platform/4.19/html/hardware_accelerators/das-about-dynamic-accelerator-slicer-operator#das-operator-installing_das-about-dynamic-accelerator-slicer-operator) for detailed installation instructions.
NFD Instance manifest: https://github.com/openshift/instaslice-operator/blob/next/hack/manifests/nfd-instance.yaml.

### Installation guide for "pure" Nvidia setup
https://docs.nvidia.com/datacenter/cloud-native/openshift/25.3.2/install-gpu-ocp.html
diff --git a/das/manifests/download-script-configmap.yml b/das/manifests/download-script-configmap.yml
new file mode 100644
index 0000000..5ab00aa
--- /dev/null
+++ b/das/manifests/download-script-configmap.yml
@@ -0,0 +1,237 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: audio-download-script
data:
  download-audio.sh: |
    #!/bin/bash
    # NOTE(review): shebang changed from /bin/sh — the functions below use
    # the `local` keyword, which is a bashism not guaranteed by POSIX sh.

    # This script fetches the list of audio files in a GitHub repository
    # directory and downloads the one selected by BASE_INDEX + ITERATION.
    #
    # Required tools: curl, jq (both present in the whisper image).
    # Environment:
    #   GITHUB_TOKEN - optional; enables authenticated API requests
    #   ITERATION    - per-job index, defaults to 0
    #   BASE_INDEX   - offset added to ITERATION, defaults to 0

    # --- Configuration ---
    # Base index offset (defaults to 0, can be overridden via BASE_INDEX env var)
    BASE_INDEX=${BASE_INDEX:-0}

    # The index of the file to download (BASE_INDEX + ITERATION)
    JOB_COMP_INDEX=$((BASE_INDEX + ${ITERATION:-0}))

    # --- Script Logic ---
    set -e # Exit immediately if a command exits with a non-zero status.

    echo "Running for index: ${JOB_COMP_INDEX}"

    # Set up authentication header if token is provided
    if [ -n "$GITHUB_TOKEN" ]; then
      AUTH_HEADER="Authorization: token $GITHUB_TOKEN"
      echo "Using authenticated GitHub API requests"
    else
      AUTH_HEADER=""
      echo "Warning: No GITHUB_TOKEN set. Using unauthenticated requests (rate limited to 60/hour)"
    fi

    echo "Fetching file list from GitHub API..."

    REPO_OWNER="revdotcom"
    REPO_NAME="speech-datasets"
    DIR_PATH="earnings22/media"
    API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/contents/${DIR_PATH}"

    # Report the current GitHub API rate-limit status on stderr.
    check_rate_limit() {
      echo "Checking GitHub API rate limit..."
>&2 + + if [ -n "$AUTH_HEADER" ]; then + rate_response=$(curl -s -H "$AUTH_HEADER" "https://api.github.com/rate_limit") + else + rate_response=$(curl -s "https://api.github.com/rate_limit") + fi + + remaining=$(echo "$rate_response" | jq -r '.rate.remaining') + limit=$(echo "$rate_response" | jq -r '.rate.limit') + reset_time=$(echo "$rate_response" | jq -r '.rate.reset') + + if [ "$remaining" != "null" ]; then + echo "Rate limit: $remaining/$limit remaining" >&2 + if [ "$remaining" -lt 10 ]; then + reset_date=$(date -r "$reset_time" 2>/dev/null || date -d "@$reset_time" 2>/dev/null || echo "unknown") + echo "Warning: Low rate limit remaining. Resets at: $reset_date" >&2 + fi + fi + } + + fetch_with_retry() { + local url="$1" + local max_retries=5 + local retry_count=0 + local base_delay=60 + + while [ $retry_count -lt $max_retries ]; do + echo "Attempt $((retry_count + 1))..." >&2 + + # Fetch directory contents with pagination support + # Use authentication header if available + if [ -n "$AUTH_HEADER" ]; then + http_code=$(curl -s -H "$AUTH_HEADER" -D /tmp/github_headers.txt -o /tmp/github_response.json -w "%{http_code}" "${url}") + else + http_code=$(curl -s -D /tmp/github_headers.txt -o /tmp/github_response.json -w "%{http_code}" "${url}") + fi + + if [ "$http_code" = "200" ]; then + cat /tmp/github_response.json + # Extract next page URL from Link header if it exists + if [ -f /tmp/github_headers.txt ]; then + cat /tmp/github_headers.txt > /tmp/link_header.txt + fi + rm -f /tmp/github_response.json /tmp/github_headers.txt + return 0 + elif [ "$http_code" = "403" ]; then + # Rate limited - check details and wait + echo "Rate limited (HTTP 403)." >&2 + check_rate_limit + echo "Waiting ${base_delay}s before retry..." 
>&2 + sleep $base_delay + retry_count=$((retry_count + 1)) + base_delay=$((base_delay * 2)) # Exponential backoff + else + echo "Error: HTTP $http_code" >&2 + if [ -f /tmp/github_response.json ]; then + cat /tmp/github_response.json >&2 + rm -f /tmp/github_response.json + fi + rm -f /tmp/github_headers.txt /tmp/link_header.txt + return 1 + fi + + # Clean up temp files if they exist + rm -f /tmp/github_response.json /tmp/github_headers.txt + done + + echo "Failed after $max_retries attempts" >&2 + return 1 + } + + # Extract next page URL from Link header + get_next_page_url() { + local header_file="$1" + # Link header format: ; rel="next", ; rel="last" + grep -i '^Link:' "$header_file" | sed -n 's/.*<\([^>]*\)>; rel="next".*/\1/p' + } + + # Fetch directory contents with pagination support for large directories + fetch_all_files() { + echo "Attempting to fetch via Contents API with pagination..." >&2 + + # Start with the first page URL (with per_page=100 to minimize requests) + local current_url="${API_URL}?per_page=100" + local all_items="[]" + local page=1 + + rm -f /tmp/link_header.txt + + while [ -n "$current_url" ]; do + echo "Fetching page $page..." >&2 + + # Fetch current page + page_data=$(fetch_with_retry "$current_url") + + if [ $? -ne 0 ]; then + echo "Failed to fetch page $page from GitHub API" >&2 + return 1 + fi + + # Check if we got an array response + if ! echo "$page_data" | jq -e 'type == "array"' > /dev/null 2>&1; then + echo "Error: Unexpected API response format on page $page" >&2 + echo "$page_data" | jq '.' 
>&2 + return 1 + fi + + # Count items on this page + page_count=$(echo "$page_data" | jq 'length') + echo "Page $page: $page_count items" >&2 + + # Merge this page's data with all_items + all_items=$(echo "$all_items" "$page_data" | jq -s '.[0] + .[1]') + + # Check for next page in Link header + if [ -f /tmp/link_header.txt ]; then + current_url=$(get_next_page_url /tmp/link_header.txt) + rm -f /tmp/link_header.txt + else + current_url="" + fi + + page=$((page + 1)) + + # Safety check: don't loop forever + if [ $page -gt 100 ]; then + echo "Warning: Stopped after 100 pages. This seems unusual." >&2 + break + fi + done + + total_count=$(echo "$all_items" | jq 'length') + echo "Total items fetched: $total_count" >&2 + + echo "$all_items" + return 0 + } + + DIRECTORY_JSON=$(fetch_all_files) + if [ $? -ne 0 ]; then + echo "Failed to fetch directory contents from GitHub API" + exit 1 + fi + + # Filter for MP3 files only + MP3_FILES_JSON=$(echo "$DIRECTORY_JSON" | jq '[.[] | select(.name | endswith(".mp3"))]') + + TOTAL_FILES=$(echo "$MP3_FILES_JSON" | jq 'length') + echo "Total .mp3 files found: $TOTAL_FILES" + + # Select the file at the specified index from the filtered list. + FILENAME=$(echo "$MP3_FILES_JSON" | jq -r ".[${JOB_COMP_INDEX}].name") + + # Exit gracefully if the index is out of bounds. + if [ -z "$FILENAME" ] || [ "$FILENAME" == "null" ]; then + echo "No file found for index ${JOB_COMP_INDEX}. Maximum index is $(($TOTAL_FILES - 1))." + exit 0 + fi + + echo "Index ${JOB_COMP_INDEX} selected file: $FILENAME" + + # Construct the Git LFS media download URL (for actual MP3 files, not pointers). + DOWNLOAD_URL="https://media.githubusercontent.com/media/revdotcom/speech-datasets/main/earnings22/media/$FILENAME" + echo "Downloading from $DOWNLOAD_URL..." + + # Download the file into /data directory with retry logic. 
+ download_file() { + local url="$1" + local filename="$2" + local max_retries=3 + local retry_count=0 + + while [ $retry_count -lt $max_retries ]; do + echo "Download attempt $((retry_count + 1))..." >&2 + + if curl -L -f -o "/data/$filename" "$url"; then + echo "Download complete. File saved to /data/$filename" + return 0 + else + echo "Download failed. Retrying..." >&2 + retry_count=$((retry_count + 1)) + sleep 5 + fi + done + + echo "Failed to download $filename after $max_retries attempts" + return 1 + } + + download_file "$DOWNLOAD_URL" "$FILENAME" + + if [ $? -ne 0 ]; then + echo "Download failed for $FILENAME" + exit 1 + fi + + echo "Successfully downloaded audio file: $FILENAME" diff --git a/das/manifests/kube-burner-whisper.yml b/das/manifests/kube-burner-whisper.yml new file mode 100644 index 0000000..11fb772 --- /dev/null +++ b/das/manifests/kube-burner-whisper.yml @@ -0,0 +1,68 @@ +global: + gc: false + gcMetrics: false + measurements: + - name: jobLatency + +metricsEndpoints: +{{ if .ES_SERVER }} + - metrics: [metrics.yml] + indexer: + insecureSkipVerify: true + esServers: [{{.ES_SERVER}}] + defaultIndex: {{.ES_INDEX}} + type: opensearch +{{ end }} +{{ if .LOCAL_INDEXING }} + - metrics: [metrics.yml] + indexer: + type: local + metricsDirectory: whisper-result-{{.UUID}} +{{ end }} + +jobs: + - name: create-namespace + qps: 1 + burst: 1 + jobIterations: 1 + jobPause: 1s + cleanup: true + skipIndexing: true + objects: + - objectTemplate: namespace.yml + replicas: 1 + + - name: pre-requisites + namespace: whisper-scale + qps: 1 + burst: 1 + jobIterations: 1 + namespacedIterations: false + jobPause: 1s + cleanup: true + skipIndexing: true + objects: + - objectTemplate: download-script-configmap.yml + replicas: 1 + - objectTemplate: github-token-secret.yml + replicas: 1 + + - name: whisper-scale + namespace: whisper-scale + jobIterations: {{.ITERATIONS}} + preLoadImages: false #TODO - true + preLoadPeriod: 10m + namespacedIterations: false + cleanup: true + 
podWait: true + qps: 1 + burst: 1 + objects: + - objectTemplate: whisper-transcription-medium.yml + replicas: 1 + inputVars: + das: {{.DAS}} + - objectTemplate: whisper-transcription-large.yml + replicas: 1 + inputVars: + das: {{.DAS}} diff --git a/das/manifests/namespace.yml b/das/manifests/namespace.yml new file mode 100644 index 0000000..3d8e642 --- /dev/null +++ b/das/manifests/namespace.yml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: whisper-scale \ No newline at end of file diff --git a/das/manifests/whisper-transcription-large.yml b/das/manifests/whisper-transcription-large.yml new file mode 100644 index 0000000..c46ae99 --- /dev/null +++ b/das/manifests/whisper-transcription-large.yml @@ -0,0 +1,63 @@ +kind: Job +apiVersion: batch/v1 +metadata: + name: whisper-transcription-large-{{.JobName}}-{{.Iteration}} + labels: + group: whisper-large +spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + ttlSecondsAfterFinished: 0 + template: + spec: + restartPolicy: Never + containers: + - name: whisper-transcriber + image: quay.io/smalleni/whisper:latest + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - | + echo "Running download script..." + /scripts/download-audio.sh + FILE=$(ls /data/* | head -n 1) + echo "Running whisper transcription..." 
+ whisper "$FILE" --model_dir /tmp/whisper_models --model large --output_dir /tmp + env: + - name: GITHUB_TOKEN + valueFrom: + secretKeyRef: + name: github-token + key: token + - name: ITERATION + value: "{{.Iteration}}" + - name: BASE_INDEX + value: "50" + resources: + requests: + cpu: "4" + memory: "12Gi" + limits: + memory: "12Gi" +{{ if .das }} + nvidia.com/mig-3g.20gb: 1 +{{ else }} + nvidia.com/gpu: 1 +{{ end}} + volumeMounts: + - name: audio-data + mountPath: /data + - name: model-cache-volume + mountPath: /tmp/whisper_models + - name: download-script + mountPath: /scripts + volumes: + - name: audio-data + emptyDir: {} + - name: model-cache-volume + emptyDir: {} + - name: download-script + configMap: + name: audio-download-script + defaultMode: 0755 diff --git a/das/manifests/whisper-transcription-medium.yml b/das/manifests/whisper-transcription-medium.yml new file mode 100644 index 0000000..eda56a8 --- /dev/null +++ b/das/manifests/whisper-transcription-medium.yml @@ -0,0 +1,61 @@ +kind: Job +apiVersion: batch/v1 +metadata: + name: whisper-transcription-medium-{{.JobName}}-{{.Iteration}} + labels: + group: whisper-medium +spec: + parallelism: 1 + completions: 1 + completionMode: Indexed + ttlSecondsAfterFinished: 0 + template: + spec: + restartPolicy: Never + containers: + - name: whisper-transcriber + image: quay.io/smalleni/whisper:latest + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - | + echo "Running download script..." + /scripts/download-audio.sh + FILE=$(ls /data/* | head -n 1) + echo "Running whisper transcription..." 
              whisper "$FILE" --model_dir /tmp/whisper_models --model medium --output_dir /tmp
          env:
            # Optional token consumed by the download script for the higher
            # authenticated GitHub API rate limit.
            - name: GITHUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: github-token
                  key: token
            # Dataset index for this Job instance.
            # NOTE(review): unlike the large job, no BASE_INDEX is set here,
            # so medium jobs presumably start from index 0 — confirm intended.
            - name: ITERATION
              value: "{{.Iteration}}"
          resources:
            requests:
              cpu: "2"
              memory: "8Gi"
            limits:
              memory: "8Gi"
{{ if .das }}
              nvidia.com/mig-1g.5gb: 1
{{ else }}
              nvidia.com/gpu: 1
{{ end}}
          volumeMounts:
            # Scratch space where the download script saves the audio file.
            - name: audio-data
              mountPath: /data
            # Cache directory whisper uses for model weights.
            - name: model-cache-volume
              mountPath: /tmp/whisper_models
            # The download script mounted from the ConfigMap.
            - name: download-script
              mountPath: /scripts
      volumes:
        - name: audio-data
          emptyDir: {}
        - name: model-cache-volume
          emptyDir: {}
        - name: download-script
          configMap:
            name: audio-download-script
            # 0755 so the mounted download script is executable.
            defaultMode: 0755