Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion containers/whisper/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM registry.access.redhat.com/ubi10/ubi:10.0-1758699521
RUN dnf install -y --nodocs https://dl.fedoraproject.org/pub/epel/epel-release-latest-10.noarch.rpm
RUN dnf install -y --nogpgcheck https://mirrors.rpmfusion.org/free/el/rpmfusion-free-release-10.noarch.rpm
RUN dnf update -y && dnf install -y python3-pip ffmpeg
RUN dnf update -y && dnf install -y python3-pip ffmpeg jq
RUN pip install -U openai-whisper
~
7 changes: 7 additions & 0 deletions das/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Kubernetes secrets with actual credentials
manifests/*-secret.yml
!manifests/*-secret.yml.template

# Common patterns
*.secret.yml
*.secret.yaml
7 changes: 7 additions & 0 deletions das/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
## DAS deployment

See [Dynamic Accelerator Slicer (DAS) Operator](https://docs.redhat.com/en/documentation/openshift_container_platform/4.19/html/hardware_accelerators/das-about-dynamic-accelerator-slicer-operator#das-operator-installing_das-about-dynamic-accelerator-slicer-operator) for detailed installation instructions.
NFD Instance manifest: [nfd-instance.yaml](https://github.com/openshift/instaslice-operator/blob/next/hack/manifests/nfd-instance.yaml).

### Installation guide for "pure" Nvidia setup
https://docs.nvidia.com/datacenter/cloud-native/openshift/25.3.2/install-gpu-ocp.html
237 changes: 237 additions & 0 deletions das/manifests/download-script-configmap.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: audio-download-script
data:
download-audio.sh: |
#!/bin/sh

# This script fetches a list of audio files from a GitHub repository and downloads
# a specific file based on the ITERATION environment variable.
#
# Environment variables:
#   BASE_INDEX   - optional offset added to ITERATION (defaults to 0)
#   ITERATION    - per-job index; presumably injected by the kube-burner Job
#                  template for each iteration — confirm against the caller
#   GITHUB_TOKEN - optional token for authenticated GitHub API requests
# --- Configuration ---
# Base index offset (defaults to 0, can be overridden via BASE_INDEX env var)
BASE_INDEX=${BASE_INDEX:-0}

# The index of the file to download (BASE_INDEX + ITERATION)
JOB_COMP_INDEX=$((${BASE_INDEX} + ${ITERATION:-0}))

# --- Script Logic ---
set -e # Exit immediately if a command exits with a non-zero status.

echo "Running for index: ${JOB_COMP_INDEX}"

# Set up authentication header if token is provided. Unauthenticated GitHub
# API access is rate limited to 60 requests/hour per IP.
if [ -n "$GITHUB_TOKEN" ]; then
AUTH_HEADER="Authorization: token $GITHUB_TOKEN"
echo "Using authenticated GitHub API requests"
else
AUTH_HEADER=""
echo "Warning: No GITHUB_TOKEN set. Using unauthenticated requests (rate limited to 60/hour)"
fi

echo "Fetching file list from GitHub API..."

# Source dataset: earnings22 media files in the revdotcom/speech-datasets repo.
REPO_OWNER="revdotcom"
REPO_NAME="speech-datasets"
DIR_PATH="earnings22/media"
API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/contents/${DIR_PATH}"

# Print the current GitHub API rate-limit status to stderr and warn when
# fewer than 10 requests remain. Purely informational.
# Reads: AUTH_HEADER (optional "Authorization: token ..." header).
check_rate_limit() {
    echo "Checking GitHub API rate limit..." >&2

    if [ -n "$AUTH_HEADER" ]; then
        rate_response=$(curl -s -H "$AUTH_HEADER" "https://api.github.com/rate_limit")
    else
        rate_response=$(curl -s "https://api.github.com/rate_limit")
    fi

    remaining=$(echo "$rate_response" | jq -r '.rate.remaining')
    limit=$(echo "$rate_response" | jq -r '.rate.limit')
    reset_time=$(echo "$rate_response" | jq -r '.rate.reset')

    # Guard against an EMPTY $remaining as well as jq's literal "null": if the
    # server returns an empty body, jq emits nothing and the unguarded numeric
    # test `[ "" -lt 10 ]` is a test(1) error — fatal under `set -e`.
    if [ -n "$remaining" ] && [ "$remaining" != "null" ]; then
        echo "Rate limit: $remaining/$limit remaining" >&2
        if [ "$remaining" -lt 10 ]; then
            # BSD date takes -r <epoch>; GNU date takes -d @<epoch>. Try both.
            reset_date=$(date -r "$reset_time" 2>/dev/null || date -d "@$reset_time" 2>/dev/null || echo "unknown")
            echo "Warning: Low rate limit remaining. Resets at: $reset_date" >&2
        fi
    fi
}

# Fetch a single GitHub API URL, retrying on rate-limit (HTTP 403) responses
# with exponential backoff (60s, 120s, 240s, ...).
#
# Side channel: on success the raw response headers are copied to
# /tmp/link_header.txt so the caller can extract the pagination "next" link
# (see get_next_page_url / fetch_all_files).
# Outputs: response body on stdout; all diagnostics on stderr.
# Returns: 0 on HTTP 200; 1 on any other error or after 5 rate-limit retries.
# NOTE(review): `local` is not strictly POSIX (script declares #!/bin/sh) but
# is supported by dash/busybox ash — confirm against the container's shell.
fetch_with_retry() {
local url="$1"
local max_retries=5
local retry_count=0
local base_delay=60
 
while [ $retry_count -lt $max_retries ]; do
echo "Attempt $((retry_count + 1))..." >&2
 
# Fetch directory contents with pagination support
# Use authentication header if available.
# -D saves response headers, -o saves the body, -w prints only the status
# code to stdout so it lands in $http_code.
if [ -n "$AUTH_HEADER" ]; then
http_code=$(curl -s -H "$AUTH_HEADER" -D /tmp/github_headers.txt -o /tmp/github_response.json -w "%{http_code}" "${url}")
else
http_code=$(curl -s -D /tmp/github_headers.txt -o /tmp/github_response.json -w "%{http_code}" "${url}")
fi
 
if [ "$http_code" = "200" ]; then
# Success: emit the body on stdout for the caller to capture.
cat /tmp/github_response.json
# Extract next page URL from Link header if it exists; the copy survives
# the cleanup below so the caller can read it after we return.
if [ -f /tmp/github_headers.txt ]; then
cat /tmp/github_headers.txt > /tmp/link_header.txt
fi
rm -f /tmp/github_response.json /tmp/github_headers.txt
return 0
elif [ "$http_code" = "403" ]; then
# Rate limited - check details and wait before the next attempt.
echo "Rate limited (HTTP 403)." >&2
check_rate_limit
echo "Waiting ${base_delay}s before retry..." >&2
sleep $base_delay
retry_count=$((retry_count + 1))
base_delay=$((base_delay * 2)) # Exponential backoff
else
# Hard failure (404, 5xx, ...): dump the body for debugging and bail out.
echo "Error: HTTP $http_code" >&2
if [ -f /tmp/github_response.json ]; then
cat /tmp/github_response.json >&2
rm -f /tmp/github_response.json
fi
rm -f /tmp/github_headers.txt /tmp/link_header.txt
return 1
fi
 
# Clean up temp files if they exist (rate-limited path falls through here).
rm -f /tmp/github_response.json /tmp/github_headers.txt
done
 
echo "Failed after $max_retries attempts" >&2
return 1
}

# Pull the rel="next" target out of a saved curl header dump (-D file).
# GitHub's header has the shape:
#   Link: <https://api.github.com/...?page=2>; rel="next", <https://...>; rel="last"
# Prints the next-page URL, or nothing when there is no further page.
get_next_page_url() {
    local header_dump="$1"
    grep -i '^Link:' "$header_dump" \
        | sed -n 's/.*<\([^>]*\)>; rel="next".*/\1/p'
}

# Fetch the complete directory listing via the Contents API, following
# Link-header pagination (100 entries per request).
# Outputs: one JSON array containing every entry, on stdout.
# Returns: 0 on success; 1 when a page fails or has an unexpected shape.
fetch_all_files() {
    echo "Attempting to fetch via Contents API with pagination..." >&2

    # Start with the first page URL (per_page=100 minimizes request count).
    local current_url="${API_URL}?per_page=100"
    local all_items="[]"
    local page=1

    # Drop any stale pagination state from a previous run.
    rm -f /tmp/link_header.txt

    while [ -n "$current_url" ]; do
        echo "Fetching page $page..." >&2

        # Test the assignment directly: with `set -e` in force, the form
        # `page_data=$(...); if [ $? -ne 0 ]` aborts on the failed assignment
        # before the error message below could ever print.
        if ! page_data=$(fetch_with_retry "$current_url"); then
            echo "Failed to fetch page $page from GitHub API" >&2
            return 1
        fi

        # A directory listing must be a JSON array; anything else (e.g. an
        # error object) means the request went wrong.
        if ! echo "$page_data" | jq -e 'type == "array"' > /dev/null 2>&1; then
            echo "Error: Unexpected API response format on page $page" >&2
            echo "$page_data" | jq '.' >&2
            return 1
        fi

        page_count=$(echo "$page_data" | jq 'length')
        echo "Page $page: $page_count items" >&2

        # Append this page's entries to the accumulated array.
        all_items=$(echo "$all_items" "$page_data" | jq -s '.[0] + .[1]')

        # fetch_with_retry leaves the response headers in /tmp/link_header.txt;
        # a rel="next" link there points at the following page.
        if [ -f /tmp/link_header.txt ]; then
            current_url=$(get_next_page_url /tmp/link_header.txt)
            rm -f /tmp/link_header.txt
        else
            current_url=""
        fi

        page=$((page + 1))

        # Safety valve: never loop forever on a pathological Link header.
        if [ $page -gt 100 ]; then
            echo "Warning: Stopped after 100 pages. This seems unusual." >&2
            break
        fi
    done

    total_count=$(echo "$all_items" | jq 'length')
    echo "Total items fetched: $total_count" >&2

    echo "$all_items"
    return 0
}

# Fetch the repository listing; fail loudly if the API could not be reached.
# Test the assignment directly: `VAR=$(cmd); if [ $? -ne 0 ]` is unreachable
# under `set -e` because the script exits on the failed assignment first.
if ! DIRECTORY_JSON=$(fetch_all_files); then
echo "Failed to fetch directory contents from GitHub API"
exit 1
fi

# Filter for MP3 files only
MP3_FILES_JSON=$(echo "$DIRECTORY_JSON" | jq '[.[] | select(.name | endswith(".mp3"))]')

TOTAL_FILES=$(echo "$MP3_FILES_JSON" | jq 'length')
echo "Total .mp3 files found: $TOTAL_FILES"

# Select the file at the specified index from the filtered list
# (jq -r prints the literal string "null" for an out-of-range index).
FILENAME=$(echo "$MP3_FILES_JSON" | jq -r ".[${JOB_COMP_INDEX}].name")

# Exit gracefully if the index is out of bounds. POSIX test(1) uses "=" for
# string equality; "==" is a bashism and this script declares #!/bin/sh.
if [ -z "$FILENAME" ] || [ "$FILENAME" = "null" ]; then
echo "No file found for index ${JOB_COMP_INDEX}. Maximum index is $((TOTAL_FILES - 1))."
exit 0
fi

echo "Index ${JOB_COMP_INDEX} selected file: $FILENAME"

# Construct the Git LFS media download URL (for actual MP3 files, not pointers).
DOWNLOAD_URL="https://media.githubusercontent.com/media/revdotcom/speech-datasets/main/earnings22/media/$FILENAME"
echo "Downloading from $DOWNLOAD_URL..."
# Download the file into /data directory with retry logic.
download_file() {
local url="$1"
local filename="$2"
local max_retries=3
local retry_count=0

while [ $retry_count -lt $max_retries ]; do
echo "Download attempt $((retry_count + 1))..." >&2

if curl -L -f -o "/data/$filename" "$url"; then
echo "Download complete. File saved to /data/$filename"
return 0
else
echo "Download failed. Retrying..." >&2
retry_count=$((retry_count + 1))
sleep 5
fi
done

echo "Failed to download $filename after $max_retries attempts"
return 1
}

# Test the call directly: under `set -e` the original
# `download_file ...; if [ $? -ne 0 ]` aborted on the failing call before the
# error message could print.
if ! download_file "$DOWNLOAD_URL" "$FILENAME"; then
echo "Download failed for $FILENAME"
exit 1
fi

echo "Successfully downloaded audio file: $FILENAME"
68 changes: 68 additions & 0 deletions das/manifests/kube-burner-whisper.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
global:
gc: false
gcMetrics: false
measurements:
- name: jobLatency

metricsEndpoints:
{{ if .ES_SERVER }}
- metrics: [metrics.yml]
indexer:
insecureSkipVerify: true
esServers: [{{.ES_SERVER}}]
defaultIndex: {{.ES_INDEX}}
type: opensearch
{{ end }}
{{ if .LOCAL_INDEXING }}
- metrics: [metrics.yml]
indexer:
type: local
metricsDirectory: whisper-result-{{.UUID}}
{{ end }}

jobs:
- name: create-namespace
qps: 1
burst: 1
jobIterations: 1
jobPause: 1s
cleanup: true
skipIndexing: true
objects:
- objectTemplate: namespace.yml
replicas: 1

- name: pre-requisites
namespace: whisper-scale
qps: 1
burst: 1
jobIterations: 1
namespacedIterations: false
jobPause: 1s
cleanup: true
skipIndexing: true
objects:
- objectTemplate: download-script-configmap.yml
replicas: 1
- objectTemplate: github-token-secret.yml
replicas: 1

- name: whisper-scale
namespace: whisper-scale
jobIterations: {{.ITERATIONS}}
preLoadImages: false # TODO: flip to true once image preloading is validated
preLoadPeriod: 10m
namespacedIterations: false
cleanup: true
podWait: true
qps: 1
burst: 1
objects:
- objectTemplate: whisper-transcription-medium.yml
replicas: 1
inputVars:
das: {{.DAS}}
- objectTemplate: whisper-transcription-large.yml
replicas: 1
inputVars:
das: {{.DAS}}
4 changes: 4 additions & 0 deletions das/manifests/namespace.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: whisper-scale
Loading