Skip to content

Commit

Permalink
Added logs and commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
gaurav committed Aug 7, 2024
1 parent 8fd23a4 commit f6892e5
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 9 deletions.
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ RUN apk add bash
# Needed to sync with LakeFS.
RUN apk add rclone

# Needed to send commands to LakeFS. (NOTE(review): ingest.sh uses curl, not wget — confirm wget is actually required.)
RUN apk add wget

# Update Python
RUN pip install --upgrade pip

Expand Down
28 changes: 19 additions & 9 deletions scripts/bdc/ingest.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
#!/usr/bin/bash

# Bash strict mode
# Bash strict mode (minus IFS change, since we're not using arrays).
set -euo pipefail

# But not the IFS change, because we're not using arrays and it
# messes up RCLONE_FLAGS.

# A script for ingesting data from BDC into LakeFS.
echo Started ingest from BDC at `date`.
START_DATE=$(date)
echo "Started ingest from BDC at ${START_DATE}."

echo cleaning /data directory
# Step 1. Prepare directory.
echo Cleaning /data directory
rm -rf /data/*

echo Create log directory.
mkdir -p /data/logs

# Step 1. Download the list of dbGaP IDs from BDC.
python bdc/get_bdc_studies_from_gen3.py /data/bdc_dbgap_ids.csv
python bdc/get_bdc_studies_from_gen3.py /data/bdc_dbgap_ids.csv 2>&1 | tee /data/logs/get_bdc_studies_from_gen3.log

# Step 2. Download the dbGaP XML files from BDC.
mkdir -p /data/bdc
python bdc/get_dbgap_data_dicts.py /data/bdc_dbgap_ids.csv --format CSV --field "Accession" --outdir /data/bdc --group-by Program
python bdc/get_dbgap_data_dicts.py /data/bdc_dbgap_ids.csv --format CSV --field "Accession" --outdir /data/bdc --group-by Program 2>&1 | tee /data/logs/get_dbgap_data_dicts.log

# Step 3. Upload the dbGaP XML files to LakeFS.
echo Uploading dbGaP XML files to LakeFS using Rclone.
Expand Down Expand Up @@ -52,5 +54,13 @@ rclone sync "/data/bdc/PCGC/" "lakefs:$LAKEFS_REPOSITORY/main/PCGC/" $RCLONE_FLA
rclone sync "/data/bdc/RECOVER/" "lakefs:$LAKEFS_REPOSITORY/main/RECOVER/" $RCLONE_FLAGS
rclone sync "/data/bdc/topmed/" "lakefs:$LAKEFS_REPOSITORY/main/topmed/" $RCLONE_FLAGS

# Report errors.
# Save logs into repository as well.
rclone sync "/data/logs" "lakefs:$LAKEFS_REPOSITORY/main/logs/" $RCLONE_FLAGS

# Commit these changes. We could do this via lakefs CLI, but it's easier to just do it via curl.
curl -X POST -u "$LAKEFS_USERNAME:$LAKEFS_PASSWORD" "$LAKEFS_HOST/api/v1/repositories/$LAKEFS_REPOSITORY/branches/main/commit" \
-H "Content-Type: application/json" \
-d "{\"message\": \"Updated BDC data dictionaries starting at ${START_DATE}.\"}"

# Report completion.
echo Downloads complete at `date`.

0 comments on commit f6892e5

Please sign in to comment.