Skip to content

Fetch Paginated JSON and Update CSV #70

Fetch Paginated JSON and Update CSV

Fetch Paginated JSON and Update CSV #70

name: Fetch Paginated JSON and Update CSV
on:
schedule:
- cron: "0 0 * * *" # Runs once a day at midnight (UTC)
workflow_dispatch: # Allows for manual runs as well
push:
branches:
- main
jobs:
update_csv:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Install jq and csvkit
run: |
sudo apt-get update
sudo apt-get install -y jq
pip install csvkit
- name: Fetch paginated JSON data from API
env:
API_URL: ${{ secrets.API_URL }}
API_TOKEN: ${{ secrets.API_TOKEN }}
run: |
# Initialize page and limit parameters
PAGE=1
LIMIT=250
AGGREGATE_FILE=data.json
echo "[]" > $AGGREGATE_FILE # Initialize an empty JSON array
while true; do
# Fetch a page of data from the API
RESPONSE=$(curl -s -X GET \
-H "Authorization: Bearer $API_TOKEN" \
--data-urlencode "page=$PAGE" \
--data-urlencode "limit=$LIMIT" \
--data-urlencode "verified[eq]=true" \
"$API_URL")
# Check if the response is valid JSON and an array
if ! echo "$RESPONSE" | jq 'if type == "array" then . else error("Expected an array but got another type") end' > /dev/null 2>&1; then
echo "Error: Expected an array but received a different data structure from the API."
exit 1
fi
# Parse response and check if it's empty
PAGE_DATA=$(echo "$RESPONSE" | jq '.')
if [ "$(echo "$PAGE_DATA" | jq 'length')" -eq 0 ]; then
echo "No more data to fetch. Exiting pagination loop."
break
fi
# Append the page data to the aggregate JSON array
jq -s '.[0] + .[1]' $AGGREGATE_FILE <(echo "$PAGE_DATA") > tmp.json && mv tmp.json $AGGREGATE_FILE
# Increment page number for the next request
PAGE=$((PAGE + 1))
done
- name: Filter JSON to only include school names and save as school.csv
run: |
# Add first line to CSV file
CURRENT_TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
echo "\"Below is a list of all schools that we've manually verified. If you see one that hasn't been included, please add it via my.mlh.io and we'll verify it soon. List last updated at: $CURRENT_TIMESTAMP.\"" > schools.csv
# Extract only the "name" field from each entry and overwrite data.json
jq -r '.[].name' data.json >> schools.csv
# Wrap each "name" in an array to use @csv for correct CSV formatting
jq -r '.[].name | [.] | @csv' data.json >> schools.csv
- name: Commit and push CSV file using deploy key
env:
DEPLOY_KEY: ${{ secrets.UPDATE_SCHOOLS_DEPLOY_KEY }}
run: |
# Set up SSH
mkdir -p ~/.ssh
echo "$DEPLOY_KEY" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
# Configure git
git config --global user.name "github-actions[bot]"
git config --global user.email "github-actions[bot]@users.noreply.github.com"
# Commit and push changes
git add schools.csv
git commit -m "Update school CSV file from API data [skip ci]" || echo "No changes to commit"
git push [email protected]:mlh/mlh-policies.git HEAD:main