# run_variation.sh
# bot-specific settings
BOT_SIGNAL1='bot-downloading'
BOT_SIGNAL2='bot-processing'
BOT_SIGNAL3='bot-processed'
BOT_SIGNAL4='bot-failed'
DEST_BOT_TAGS='bot-go-report bot-go-consensus'
JOB_YML='variation-job.yml'
# read bot config
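# config values are embedded in the job yml template as '#key: value' comment lines
# and extracted with grep/cut; e.g. a (hypothetical) template line
# "#use_server: https://my.galaxy.example.org" would set GALAXY_SERVER accordingly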
JOB_YML_DIR='job-yml-templates'
GALAXY_SERVER=$(grep '#use_server:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
DEST_NAME_BASE=$(grep '#new_history_base_name:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
DEST_TAG=$(grep '#new_history_tag:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
# variation bot-only config
DOWNLOAD_HISTORY=$(grep '#history_for_downloads:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
LINKS_HISTORY_TAG=$(grep '#metadata_history_tag:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
LINKS_COLLECTION_NAME=$(grep '#metadata_collection_name:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
DEFAULT_PROTOCOL=$(grep '#download_protocol:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
MIN_RUN_DELTA=$(grep '#min_run_delta:' "$JOB_YML_DIR/$JOB_YML" | cut -d ' ' -f 2-)
# start processing
WORKDIR="${DEST_TAG}_run_$(date '+%s')"
mkdir $WORKDIR &&
trap "rm -R $WORKDIR" EXIT
# check progress of previous invocation
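# if the query below returns anything, an earlier invocation is still busy scheduling
# its workflow for a links dataset, so this run backs off without doing anything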
SCHEDULING=$(echo "{collections[0][elements][0][element_identifier]}" | python bioblend-scripts/find_collection_elements.py "$LINKS_COLLECTION_NAME" -g "$GALAXY_SERVER" -a $API_KEY -t "$LINKS_HISTORY_TAG" -c $BOT_SIGNAL1 -n 1 --from-template)
if [ -n "$SCHEDULING" ]; then
echo "Another bot run is still scheduling; ID: $SCHEDULING"
exit 0
fi
# no scheduling WF invocation found => proceed
PREVIOUS_HISTORY=$(python bioblend-scripts/get_most_recent_history_by_tag.py -g "$GALAXY_SERVER" -a $API_KEY --tag $DEST_TAG)
if [ -n "$PREVIOUS_HISTORY" ]; then
echo "Previous history ID is: '$PREVIOUS_HISTORY'"
# this bot has run before
# => check if the history generated by its last run has progressed sufficiently
python bioblend-scripts/check_history.py -g "$GALAXY_SERVER" -a $API_KEY -p $MIN_RUN_DELTA $PREVIOUS_HISTORY || exit 0
fi
# start building the job.yml needed by planemo run from its template
python bioblend-scripts/find_collection_elements.py "$LINKS_COLLECTION_NAME" -g "$GALAXY_SERVER" -a $API_KEY -t "$LINKS_HISTORY_TAG" -n 1 --from-template -o "$WORKDIR/$JOB_YML" < "$JOB_YML_DIR/$JOB_YML"
if [ ! -s "$WORKDIR/$JOB_YML" ]; then
echo "No history tagged with $LINKS_HISTORY_TAG has a collection named $LINKS_COLLECTION_NAME. Nothing to do."
exit 0
fi
# ------------- main actions: get data and run workflow -------------------
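# the batch-specific values below get read from the working copy of the job yml
# that was generated from the template in the previous step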
SOURCE_HISTORY_ID=$(grep '#from_history_id:' "$WORKDIR/$JOB_YML" | cut -d ' ' -f 2-)
SOURCE_HISTORY_NAME=$(grep '#from_history_name:' "$WORKDIR/$JOB_YML" | cut -d ' ' -f 2-)
ENA_LINKS=$(grep '#from_ena_links_in:' "$WORKDIR/$JOB_YML" | cut -d ' ' -f 2-)
DEST_NAME_SUFFIX=$(grep '#batch_name:' "$WORKDIR/$JOB_YML" | cut -d ' ' -f 2-)
echo "Going to work on links discovered in dataset ID: '$ENA_LINKS' of history ID: '$SOURCE_HISTORY_ID'"
# put a bot-reserved tag on the input dataset to prevent it from being picked up again;
# the tag will be updated as the analysis proceeds
python bioblend-scripts/tag_history.py $SOURCE_HISTORY_ID --dataset-id $ENA_LINKS -g "$GALAXY_SERVER" -a $API_KEY -t $BOT_SIGNAL1 &&
# from here on make sure the bot-downloading tag gets replaced with bot-failed if anything
# goes wrong: a left-behind bot-downloading tag would prevent future bot runs,
# while bot-failed still lets us identify links that have not been analyzed.
trap 'python bioblend-scripts/tag_history.py $SOURCE_HISTORY_ID --dataset-id $ENA_LINKS -g "$GALAXY_SERVER" -a $API_KEY -t $BOT_SIGNAL4 -r $BOT_SIGNAL1; exit 1' err &&
# download the data and add information about the collection to be built from it to the job yml file
INPUT_COLLECTION='Input Collection' &&
python bioblend-scripts/ftp_links_to_yaml.py $ENA_LINKS "$INPUT_COLLECTION" -i $DOWNLOAD_HISTORY -p $DEFAULT_PROTOCOL -g "$GALAXY_SERVER" -a $API_KEY >> "$WORKDIR/$JOB_YML" &&
echo "Data upload complete!" &&
# for the following replacements in the job yml file
# we need to move the current version to a temporary file
mv "$WORKDIR/$JOB_YML" "$WORKDIR/$JOB_YML".tmp &&
# now detect the type of workflow to run from the structure of the generated job yml file
# and modify the yml's collection name to match the input collection name expected by the workflow
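# - a list:paired collection        -> paired-end workflow
# - two list:list collections       -> nested paired-end workflow (separate fw and rv lists)
# - one list:list collection        -> nested single-end workflow
# - anything else                   -> plain single-end workflow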
if grep "list:paired" "$WORKDIR/$JOB_YML".tmp; then
    WF_ID=$(grep '#pe_workflow_id:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
    INPUT_REPLACE=$(grep '#pe_collection_name:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
    sed "s/$INPUT_COLLECTION:/$INPUT_REPLACE:/" "$WORKDIR/$JOB_YML".tmp > "$WORKDIR/$JOB_YML"
else
    # grep -c exits non-zero when the count is 0, so this assignment must not be
    # chained with && or the plain single-end branch below could never be reached
    NUM_LIST_LIST=$(grep -c "list:list" "$WORKDIR/$JOB_YML".tmp)
    if [ $NUM_LIST_LIST -eq 2 ]; then
        WF_ID=$(grep '#nested_pe_workflow_id:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        INPUT_REPLACE_FW=$(grep '#nested_pe_collection_name_fw:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        INPUT_REPLACE_RV=$(grep '#nested_pe_collection_name_rv:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        sed "s/${INPUT_COLLECTION}_fw:/$INPUT_REPLACE_FW:/;s/${INPUT_COLLECTION}_rv:/$INPUT_REPLACE_RV:/" "$WORKDIR/$JOB_YML".tmp > "$WORKDIR/$JOB_YML"
    elif [ $NUM_LIST_LIST -eq 1 ]; then
        WF_ID=$(grep '#nested_se_workflow_id:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        INPUT_REPLACE=$(grep '#nested_se_collection_name:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        sed "s/$INPUT_COLLECTION:/$INPUT_REPLACE:/" "$WORKDIR/$JOB_YML".tmp > "$WORKDIR/$JOB_YML"
    else
        WF_ID=$(grep '#se_workflow_id:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        INPUT_REPLACE=$(grep '#se_collection_name:' "$WORKDIR/$JOB_YML".tmp | cut -d ' ' -f 2-) &&
        sed "s/$INPUT_COLLECTION:/$INPUT_REPLACE:/" "$WORKDIR/$JOB_YML".tmp > "$WORKDIR/$JOB_YML"
    fi
fi
# data should be downloaded at this point, time to let planemo handle the rest!
python bioblend-scripts/tag_history.py $SOURCE_HISTORY_ID --dataset-id $ENA_LINKS -g "$GALAXY_SERVER" -a $API_KEY -t $BOT_SIGNAL2 -r $BOT_SIGNAL1 &&
# the bot-downloading tag has been removed from the links dataset, so it no longer needs special handling on errors
trap - err &&
# run the WF
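# stdout is discarded while stderr, which carries planemo's verbose log, is piped to
# grep to capture the GET /api/histories/... request that reveals the new history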
planemo -v run $WF_ID "$WORKDIR/$JOB_YML" --history_name "$DEST_NAME_BASE $DEST_NAME_SUFFIX" --tags $DEST_TAG --galaxy_url "$GALAXY_SERVER" --galaxy_user_key $API_KEY --engine external_galaxy 2>&1 > /dev/null | grep -o 'GET /api/histories/[^?]*\?' > "$WORKDIR/run_info.txt" &&
# on successful completion of the WF invocation inform downstream bots
# by tagging the new history accordingly
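# the new history's ID is the hex part of the first captured histories/<id> URL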
DEST_HISTORY_ID=$(grep -m1 -o 'histories/[0-9a-f]*' "$WORKDIR/run_info.txt" | cut -d / -f 2) &&
python bioblend-scripts/tag_history.py $DEST_HISTORY_ID -g "$GALAXY_SERVER" -a $API_KEY -t $DEST_BOT_TAGS &&
# mark the source history ENA links dataset as processed
python bioblend-scripts/tag_history.py $SOURCE_HISTORY_ID --dataset-id $ENA_LINKS -g "$GALAXY_SERVER" -a $API_KEY -t $BOT_SIGNAL3 -r $BOT_SIGNAL1 $BOT_SIGNAL2