Merge pull request #25 from VirtualFlyBrain/vfb_pipeline_parallel
Vfb pipeline parallel
Robbie1977 authored Aug 29, 2024
2 parents 616b9d6 + f97f55c commit 3c0bfaa
Showing 1 changed file with 132 additions and 67 deletions.
199 changes: 132 additions & 67 deletions process.sh
@@ -7,6 +7,7 @@ echo "Start: vfb-pipeline-collectdata"
echo "VFBTIME:"
date

# Define and export necessary variables
VFB_FULL_DIR=/tmp/vfb_fullontologies
VFB_SLICES_DIR=/tmp/vfb_slices
VFB_DOWNLOAD_DIR=/tmp/vfb_download
@@ -19,21 +20,20 @@ SHACL_DIR=${WORKSPACE}/shacl
KB_FILE=$VFB_DOWNLOAD_DIR/kb.owl
VFB_NEO4J_SRC=${WORKSPACE}/VFB_neo4j


export ROBOT_JAVA_ARGS=${ROBOT_ARGS}
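# Note: ROBOT_JAVA_ARGS is the environment variable the robot CLI reads for JVM
# options, so ROBOT_ARGS is expected to carry flags such as "-Xmx8G" (an
# illustrative value, not taken from this repository's configuration).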

echo "** Collecting Data! **"

echo 'START' >> ${WORKSPACE}/tick.out
## tail -f ${WORKSPACE}/tick.out >&1 &>&1

echo "** Updateing Neo4J VFB codebase **"
# Update Neo4J VFB codebase
echo "** Updating Neo4J VFB codebase **"
cd $VFB_NEO4J_SRC
git pull origin master
git checkout ${GITBRANCH}
git pull
pip install -r requirements.txt

# Create temporary directories
echo "** Creating temporary directories.. **"
cd ${WORKSPACE}
ls -l $VFB_FINAL
@@ -46,8 +46,10 @@ date

echo '** Downloading relevant ontologies.. **'
echo '** in full: **'

# Process each URL pattern in parallel
while read -r url_pattern; do
echo "Processing: $url_pattern"
if [[ "$url_pattern" == *"*"* ]]; then
base_url="${url_pattern%/*}/"
pattern="${url_pattern##*/}"
@@ -57,16 +59,19 @@ while read -r url_pattern; do

for file in $file_list; do
file_url="${base_url}${file}"
wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" &
done
else
wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" &
fi
done < vfb_fullontologies.txt
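# Illustration of the glob handling above (hypothetical URL, for example only):
#   url_pattern="http://purl.example.org/ontologies/*.owl"
#   base_url="${url_pattern%/*}/"    # -> "http://purl.example.org/ontologies/"
#   pattern="${url_pattern##*/}"     # -> "*.owl"
# Each matched file is fetched by a backgrounded wget, so downloads run in
# parallel and are only reaped by the 'wait' further down.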


echo '** in slices: **'

# Process each URL pattern in slices in parallel
while read -r url_pattern; do
echo "Processing: $url_pattern"
if [[ "$url_pattern" == *"*"* ]]; then
base_url="${url_pattern%/*}/"
pattern="${url_pattern##*/}"
@@ -76,13 +81,30 @@ while read -r url_pattern; do

for file in $file_list; do
file_url="${base_url}${file}"
wget -N -P "$VFB_SLICES_DIR" "$file_url" &
done
else
wget -N -P "$VFB_SLICES_DIR" "$url_pattern" &
fi
done < vfb_slices.txt



echo '** Downloads called. **'
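# The message says "called" rather than "complete" because the wget jobs are
# still running in the background; nothing blocks on them until the 'wait'
# below, so the Neo4j cleanup that follows overlaps with the downloads.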

echo "VFBTIME:"
date

echo '** Removing embargoed data directly from KB before export **'
echo 'Non Production Datasets:'
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:DataSet)<-[:has_source]-(i:Individual)<-[:depicts]-(ic:Individual) WHERE not n.production=[true] DETACH DELETE ic DETACH DELETE i DETACH DELETE n"}]}'
echo 'Blocked Anatomical Individuals:'
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual)-[ir:in_register_with]->(tc:Template) WHERE exists(i.block) DETACH DELETE ic DETACH DELETE i"}]}'
echo 'Blocked Images:'
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual)-[ir:in_register_with]->(tc:Template) WHERE exists(ir.block) DELETE ir"}]}'
echo 'Clean Channels/Individuals with no Image:'
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual) WHERE not (ic)-[:in_register_with]->(:Template) and i.short_form starts with \"VFB_\" DETACH DELETE ic DETACH DELETE i"}]}'
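# These statements use the Neo4j 3.x transactional HTTP endpoint. In the last
# query the Cypher literal is written as \"VFB_\" so it survives the shell's
# single quotes. The JSON responses are not inspected; a minimal guard (an
# assumption, not part of the original pipeline) could be:
#   ... | grep -q '"errors":\[\]' || echo "WARN: Cypher cleanup reported errors"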

echo "VFBTIME:"
date

@@ -92,62 +114,70 @@ curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:Entity) WHERE exists(n.block) DETACH DELETE n"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt
curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH ()-[r]-() WHERE exists(r.block) DELETE r"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt

echo "VFBTIME:"
date

echo '** Merging parts into KB.OWL **'
python3 ${SCRIPTS}neo4j_kb_export.py ${KBserver} ${KBuser} ${KBpassword} ${KB_FILE}
robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.owl -i $VFB_DOWNLOAD_DIR/kb_part_2.owl -i $VFB_DOWNLOAD_DIR/kb_part_3.owl -i $VFB_DOWNLOAD_DIR/kb_rels.owl -o ${KB_FILE}
rm -fv $VFB_DOWNLOAD_DIR/kb_*.owl

echo "VFBTIME:"
date


if [ "$REMOVE_EMBARGOED_DATA" = true ]; then
echo '** Deleting embargoed data.. **'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt

echo 'First 10 embargoed datasets: '
head -10 ${VFB_FINAL}/embargoed_datasets.txt

echo 'Embargoed datasets: select_embargoed_channels'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt
echo 'Embargoed datasets: select_embargoed_images'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt
echo 'Embargoed datasets: select_embargoed_datasets'
robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt

echo 'Embargoed data: Removing everything'
cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt
robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl
mv ${KB_FILE}.tmp.owl ${KB_FILE}

echo "VFBTIME:"
date
fi
# if [ "$REMOVE_EMBARGOED_DATA" = true ]; then
# echo '** Deleting embargoed data.. **'
# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt

# echo 'First 10 embargoed datasets: '
# head -10 ${VFB_FINAL}/embargoed_datasets.txt

# echo 'Embargoed datasets: select_embargoed_channels'
# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt &
# echo 'Embargoed datasets: select_embargoed_images'
# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt &
# echo 'Embargoed datasets: select_embargoed_datasets'
# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt &
# wait

# echo 'Embargoed data: Removing everything'
# cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt
# robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} &
# echo "VFBTIME:"
# date
# fi

# Wait for all background jobs to complete
wait
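# A bare 'wait' blocks until every background job exits but always returns
# success, so a failed wget above does not abort the script; missing files
# only surface in the later merge and extract steps.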

echo 'Merging all input ontologies.'
cd $VFB_DOWNLOAD_DIR
for i in *.owl; do
[ -f "$i" ] || break
echo "Merging: "$i
${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i"
echo "Merging: $i"
${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv -v "$i.tmp.owl" "$i" && echo "Finished: $i" &
done
for i in *.owl.gz; do
[ -f "$i" ] || break
echo "Merging: "$i
${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl"
echo "Merging: $i"
${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv -v "$i.tmp.owl" "$i.owl" && echo "Finished: $i" &
done
wait
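# Each iteration above chains 'robot merge && mv && echo', so a file is only
# replaced when its merge succeeds, and the whole chain runs in the background;
# 'wait' holds until every per-file merge has finished.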

echo 'Copy all OWL files to output directory..'
cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL &
cp $VFB_DOWNLOAD_DIR/*.owl $VFB_DEBUG_DIR &

echo 'Creating slices for external ontologies: Extracting seeds.'
cd $VFB_DOWNLOAD_DIR
for i in *.owl; do
[ -f "$i" ] || break
seedfile=$i"_terms.txt"
echo "Extracting seed from: "$i" to "$seedfile
echo "Extracting seed from: $i to $seedfile"
[ ! -f "$seedfile" ] || break
${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile && echo "Finished: $i" &
done
wait

cat *_terms.txt | sort | uniq > ${VFB_FINAL}/seed.txt
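# 'sort | uniq' dedups the combined per-ontology seed lists; 'sort -u' would be
# an equivalent one-step form.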

@@ -158,60 +188,95 @@ echo 'Creating slices for external ontologies: Extracting modules'
cd $VFB_SLICES_DIR
for i in *.owl; do
[ -f "$i" ] || break
echo "Processing: "$i
echo "Processing: $i"
mod=$i"_module.owl"
${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod && cp $mod $VFB_FINAL && cp $mod $VFB_DEBUG_DIR && echo "Finished: $i" &
done

wait
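# '--method BOT' asks ROBOT for a "bottom" SLME module: roughly, each seed term
# plus the axioms and ancestors needed to preserve its entailments, keeping the
# slices small relative to the full external ontologies.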

echo "VFBTIME:"
date

# Uncomment the following block if debugging files are needed
# echo 'Create debugging files for pipeline..'
# cd $VFB_DEBUG_DIR
# robot merge --inputs "*.owl" remove --axioms "disjoint" --output $VFB_FINAL_DEBUG/vfb-dependencies-merged.owl
# robot merge -i kb.owl -i fbbt.owl --output $VFB_FINAL_DEBUG/vfb-kb_fbbt.owl
# robot reason --reasoner ELK --input $VFB_FINAL_DEBUG/vfb-dependencies-merged.owl --output $VFB_FINAL_DEBUG/vfb-dependencies-reasoned.owl


if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then
echo 'Removing all possible sources for unsatisfiable classes and inconsistency...'
cd $VFB_FINAL

# Define the function to process each OWL file
process_owl_file() {
local owl_file="$1"

echo "Processing: $owl_file"

# Check if the file should be skipped
while read -r url_pattern; do
if [ "$url_pattern" == "$owl_file" ]; then
echo "Skipping $owl_file"
return
fi
done < ${WORKSPACE}/vfb_skip_axiom_checks.txt

# Remove axioms
for axiom_type in $UNSAT_AXIOM_TYPES; do
echo "Removing $axiom_type axioms from $i"
${WORKSPACE}/robot remove --input $i --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \
remove --axioms $axiom_type --preserve-structure false -o "$i.tmp.owl"
mv "$i.tmp.owl" "$i"
echo "Removing $axiom_type axioms from $owl_file"
${WORKSPACE}/robot remove --input "$owl_file" --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \
remove --axioms $axiom_type --preserve-structure false -o "$owl_file.tmp.owl"
mv "$owl_file.tmp.owl" "$owl_file"
done
echo "Finished: $owl_file"
}

# Export the function so it can be used in subshells
export -f process_owl_file
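# 'export -f' is only strictly needed when the function must be visible to
# separately spawned bash processes (e.g. via xargs or GNU parallel); the
# plain '&' below forks the current shell, which inherits functions anyway.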

# Process each OWL file in parallel
for i in *.owl; do
[ -f "$i" ] || continue
process_owl_file "$i" &
done

# Wait for all background jobs to complete
wait
fi

# Function to handle conversion and validation
process_owl_file() {
local owl_file="$1"
local ttl_file="${owl_file%.owl}.ttl"

echo "Processing: $owl_file"
${WORKSPACE}/robot convert --check false --input "$owl_file" -f ttl --output "$ttl_file"

# Perform validation if conditions are met
if [ "$owl_file" == "kb.owl" ] && [ "$VALIDATE" = true ] && [ "$VALIDATESHACL" = true ]; then
echo "Validating KB with SHACL for $ttl_file.."
shaclvalidate.sh -datafile "$ttl_file" -shapesfile $WORKSPACE/shacl/kb.shacl > "$VFB_FINAL/validation_$owl_file.txt"
fi

# Gzip the TTL file after validation
gzip -f "$ttl_file"
}

echo 'Converting all OWL files to gzipped TTL'
cd $VFB_FINAL
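# The '(process_owl_file "$owl_file") &' form below runs each conversion in its
# own subshell; the parentheses are redundant with '&' (which already forks a
# subshell) but harmless, and keep any future cd or variable changes isolated.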
# Loop through each OWL file and process it in parallel
for owl_file in *.owl; do
[ -f "$owl_file" ] || continue
# Run the process in a subshell and put it in the background
(process_owl_file "$owl_file") &
done

# Wait for all background processes to complete
wait

gzip -f *.ttl || :
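# 'gzip -f *.ttl || :' is a safety net: process_owl_file already gzips each TTL,
# so the glob may match nothing; '|| :' stops that case from yielding a failing
# exit status at the end of the script.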

echo "End: vfb-pipeline-collectdata"
echo "VFBTIME:"
