From cb0018c7c41e2a9bf0e7aa7d049133d1396ad916 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 15:53:36 +0100 Subject: [PATCH 01/14] allowing parallel processing where possible --- process.sh | 112 ++++++++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/process.sh b/process.sh index a0480ed..537c6f4 100644 --- a/process.sh +++ b/process.sh @@ -7,6 +7,7 @@ echo "Start: vfb-pipeline-collectdata" echo "VFBTIME:" date +# Define and export necessary variables VFB_FULL_DIR=/tmp/vfb_fullontologies VFB_SLICES_DIR=/tmp/vfb_slices VFB_DOWNLOAD_DIR=/tmp/vfb_download @@ -19,21 +20,20 @@ SHACL_DIR=${WORKSPACE}/shacl KB_FILE=$VFB_DOWNLOAD_DIR/kb.owl VFB_NEO4J_SRC=${WORKSPACE}/VFB_neo4j - export ROBOT_JAVA_ARGS=${ROBOT_ARGS} echo "** Collecting Data! **" - echo 'START' >> ${WORKSPACE}/tick.out -## tail -f ${WORKSPACE}/tick.out >&1 &>&1 -echo "** Updateing Neo4J VFB codebase **" +# Update Neo4J VFB codebase +echo "** Updating Neo4J VFB codebase **" cd $VFB_NEO4J_SRC git pull origin master git checkout ${GITBRANCH} git pull pip install -r requirements.txt +# Create temporary directories echo "** Creating temporary directories.. **" cd ${WORKSPACE} ls -l $VFB_FINAL @@ -44,44 +44,47 @@ mkdir $VFB_FULL_DIR $VFB_SLICES_DIR $VFB_DOWNLOAD_DIR $VFB_DEBUG_DIR $VFB_FINAL_ echo "VFBTIME:" date +# Parallel downloading and processing using xargs echo '** Downloading relevant ontologies.. **' echo '** in full: **' -while read -r url_pattern; do - echo $url_pattern - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" - done - else - wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" - fi -done < vfb_fullontologies.txt +cat vfb_fullontologies.txt | xargs -n 1 -P 4 -I {} sh -c ' + url_pattern="{}" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") + + for file in $file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" & + done + else + wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & + fi +' +wait echo '** in slices: **' -while read -r url_pattern; do - echo $url_pattern - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_SLICES_DIR" "$file_url" - done - else - wget -N -P "$VFB_SLICES_DIR" "$url_pattern" - fi -done < vfb_slices.txt +cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' + url_pattern="{}" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") + + for file in $file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_SLICES_DIR" "$file_url" & + done + else + wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & + fi +' +wait echo "VFBTIME:" 
date @@ -98,7 +101,6 @@ robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.ow echo "VFBTIME:" date - if [ "$REMOVE_EMBARGOED_DATA" = true ]; then echo '** Deleting embargoed data.. **' robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt @@ -126,14 +128,15 @@ echo 'Merging all input ontologies.' cd $VFB_DOWNLOAD_DIR for i in *.owl; do [ -f "$i" ] || break - echo "Merging: "$i - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i" + echo "Merging: $i" + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i" & done for i in *.owl.gz; do [ -f "$i" ] || break - echo "Merging: "$i - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl" + echo "Merging: $i" + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl" & done +wait echo 'Copy all OWL files to output directory..' cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL @@ -144,10 +147,11 @@ cd $VFB_DOWNLOAD_DIR for i in *.owl; do [ -f "$i" ] || break seedfile=$i"_terms.txt" - echo "Extracting seed from: "$i" to "$seedfile + echo "Extracting seed from: $i to $seedfile" [ ! -f "$seedfile" ] || break - ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile + ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile & done +wait cat *_terms.txt | sort | uniq > ${VFB_FINAL}/seed.txt @@ -158,23 +162,24 @@ echo 'Creating slices for external ontologies: Extracting modules' cd $VFB_SLICES_DIR for i in *.owl; do [ -f "$i" ] || break - echo "Processing: "$i + echo "Processing: $i" mod=$i"_module.owl" - ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod + ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod & cp $mod $VFB_FINAL cp $mod $VFB_DEBUG_DIR done +wait echo "VFBTIME:" date +# Uncomment the following block if debugging files are needed # echo 'Create debugging files for pipeline..' # cd $VFB_DEBUG_DIR # robot merge --inputs "*.owl" remove --axioms "disjoint" --output $VFB_FINAL_DEBUG/vfb-dependencies-merged.owl # robot merge -i kb.owl -i fbbt.owl --output $VFB_FINAL_DEBUG/vfb-kb_fbbt.owl # robot reason --reasoner ELK --input $VFB_FINAL_DEBUG/vfb-dependencies-merged.owl --output $VFB_FINAL_DEBUG/vfb-dependencies-reasoned.owl - if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then echo 'Removing all possible sources for unsatisfiable classes and inconsistency...' cd $VFB_FINAL @@ -191,25 +196,26 @@ if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then echo "Removing $axiom_type axioms from $i" ${WORKSPACE}/robot remove --input $i --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \ remove --axioms $axiom_type --preserve-structure false -o "$i.tmp.owl" - mv "$i.tmp.owl" "$i" + mv "$i.tmp.owl" "$i" & done done + wait fi echo 'Converting all OWL files to gzipped TTL' cd $VFB_FINAL for i in *.owl; do [ -f "$i" ] || break - echo "Processing: "$i - ${WORKSPACE}/robot convert --check false --input $i -f ttl --output $i".ttl" + echo "Processing: $i" + ${WORKSPACE}/robot convert --check false --input $i -f ttl --output $i".ttl" & if [ "$i" == "kb.owl" ] && [ "$VALIDATE" = true ]; then if [ "$VALIDATESHACL" = true ]; then echo "Validating KB with SHACL.." 
- shaclvalidate.sh -datafile "$i.ttl" -shapesfile $WORKSPACE/shacl/kb.shacl > $VFB_FINAL/validation.txt + shaclvalidate.sh -datafile "$i.ttl" -shapesfile $WORKSPACE/shacl/kb.shacl > $VFB_FINAL/validation.txt & fi fi done - +wait gzip -f *.ttl From 002b1ce11e1e39c0280d1281c966d7a753735030 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 18:12:41 +0100 Subject: [PATCH 02/14] can be done in parallel --- process.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/process.sh b/process.sh index 537c6f4..3f9a358 100644 --- a/process.sh +++ b/process.sh @@ -109,12 +109,13 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then head -10 ${VFB_FINAL}/embargoed_datasets.txt echo 'Embargoed datasets: select_embargoed_channels' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt + robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt & echo 'Embargoed datasets: select_embargoed_images' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt + robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt & echo 'Embargoed datasets: select_embargoed_datasets' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt - + robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt & + wait + echo 'Embargoed data: Removing everything' cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl @@ -139,8 +140,8 @@ done wait echo 'Copy all OWL files to output directory..' -cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL -cp $VFB_DOWNLOAD_DIR/*.owl $VFB_DEBUG_DIR +cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL & +cp $VFB_DOWNLOAD_DIR/*.owl $VFB_DEBUG_DIR & echo 'Creating slices for external ontologies: Extracting seeds.' 
cd $VFB_DOWNLOAD_DIR From 7f40b4fec64d78b49be9651e74277d947c7fd38a Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 18:18:54 +0100 Subject: [PATCH 03/14] can be run in background until next wait --- process.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/process.sh b/process.sh index 3f9a358..272ba3a 100644 --- a/process.sh +++ b/process.sh @@ -118,9 +118,7 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then echo 'Embargoed data: Removing everything' cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt - robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl - mv ${KB_FILE}.tmp.owl ${KB_FILE} - + robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} & echo "VFBTIME:" date fi From 2234c5f1c61a88e21e227f4a5c0c9f1f054d29a1 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 18:26:26 +0100 Subject: [PATCH 04/14] fix for validation --- process.sh | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/process.sh b/process.sh index 272ba3a..faebdad 100644 --- a/process.sh +++ b/process.sh @@ -201,19 +201,34 @@ if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then wait fi +# Function to handle conversion and validation +process_owl_file() { + local owl_file="$1" + local ttl_file="${owl_file%.owl}.ttl" + + echo "Processing: $owl_file" + ${WORKSPACE}/robot convert --check false --input "$owl_file" -f ttl --output "$ttl_file" + + # Perform validation if conditions are met + if [ "$owl_file" == "kb.owl" ] && [ "$VALIDATE" = true ] && [ "$VALIDATESHACL" = true ]; then + echo "Validating KB with SHACL for $ttl_file.." + shaclvalidate.sh -datafile "$ttl_file" -shapesfile $WORKSPACE/shacl/kb.shacl > "$VFB_FINAL/validation_$owl_file.txt" + fi + + # Gzip the TTL file after validation + gzip -f "$ttl_file" +} + echo 'Converting all OWL files to gzipped TTL' cd $VFB_FINAL -for i in *.owl; do - [ -f "$i" ] || break - echo "Processing: $i" - ${WORKSPACE}/robot convert --check false --input $i -f ttl --output $i".ttl" & - if [ "$i" == "kb.owl" ] && [ "$VALIDATE" = true ]; then - if [ "$VALIDATESHACL" = true ]; then - echo "Validating KB with SHACL.." 
-			shaclvalidate.sh -datafile "$i.ttl" -shapesfile $WORKSPACE/shacl/kb.shacl > $VFB_FINAL/validation.txt &
-		fi
-	fi
 done
+
+# Loop through each OWL file and process it in parallel
+for owl_file in *.owl; do
+    [ -f "$owl_file" ] || continue
+    # Run the process in a subshell and put it in the background
+    (process_owl_file "$owl_file") &
 done
+
+# Wait for all background processes to complete
 wait
 
 gzip -f *.ttl

From df3600595197df8f5fcbadc87a6ffc9ef9df1971 Mon Sep 17 00:00:00 2001
From: Rob Court
Date: Fri, 23 Aug 2024 20:02:36 +0100
Subject: [PATCH 05/14] Pre-removing embargo data to speed up export

---
 process.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/process.sh b/process.sh
index faebdad..cfdab2e 100644
--- a/process.sh
+++ b/process.sh
@@ -89,6 +89,19 @@ wait
 echo "VFBTIME:"
 date
 
+echo '** Removing embargoed data directly from KB before export **'
+echo 'Non Production Datasets:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:DataSet)<-[:has_source]-(i:Individual)<-[:depicts]-(ic:Individual) WHERE not n.production=[true] DETACH DELETE ic DETACH DELETE i DETACH DELETE n"}]}'
+echo 'Blocked Anatomical Individuals:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual)-[ir:in_register_with]->(tc:Template) WHERE exists(i.block) DETACH DELETE ic DETACH DELETE i"}]}'
+echo 'Blocked Images:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual)-[ir:in_register_with]->(tc:Template) WHERE exists(ir.block) DELETE ir"}]}'
+echo 'Clean Channels/Individuals with no Image:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual) WHERE not (ic)-[:in_register_with]->(:Template) and i.short_form starts with \"VFB_\" DETACH DELETE ic DETACH DELETE i"}]}'
+
+echo "VFBTIME:"
+date
+
 echo '** Exporting KB to OWL **'
 curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (c) REMOVE c.label_rdfs RETURN c"}]}' >> ${VFB_DEBUG_DIR}/neo4j_remove_rdfs_label.txt
 curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (p) WHERE EXISTS(p.label) SET p.label_rdfs=[] + p.label"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt

From 7861c9cbb4cb7dc6cff89c3f7c40c67aa4575083 Mon Sep 17 00:00:00 2001
From: Rob Court
Date: Fri, 23 Aug 2024 20:05:11 +0100
Subject: [PATCH 06/14] removing parts after merge

---
 process.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/process.sh b/process.sh
index cfdab2e..f6f5d44 100644
--- a/process.sh
+++ b/process.sh
@@ -110,6 +110,7 @@ curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword
 
 python3 ${SCRIPTS}neo4j_kb_export.py ${KBserver} ${KBuser} ${KBpassword} ${KB_FILE}
 robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.owl -i $VFB_DOWNLOAD_DIR/kb_part_2.owl -i $VFB_DOWNLOAD_DIR/kb_part_3.owl -i $VFB_DOWNLOAD_DIR/kb_rels.owl -o ${KB_FILE}
+rm -fv 
$VFB_DOWNLOAD_DIR/kb_*.owl echo "VFBTIME:" date From 6c559dcfdf8c29c4bdb53fdb456531a3e45099a5 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 20:19:09 +0100 Subject: [PATCH 07/14] can all happen in parallel until merging --- process.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/process.sh b/process.sh index f6f5d44..3384af0 100644 --- a/process.sh +++ b/process.sh @@ -64,7 +64,7 @@ cat vfb_fullontologies.txt | xargs -n 1 -P 4 -I {} sh -c ' wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & fi ' -wait + echo '** in slices: **' cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' @@ -84,7 +84,7 @@ cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & fi ' -wait + echo "VFBTIME:" date @@ -137,6 +137,8 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then date fi +wait + echo 'Merging all input ontologies.' cd $VFB_DOWNLOAD_DIR for i in *.owl; do From b51eccb76100857424cd10846a3b841088037b97 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:02:14 +0100 Subject: [PATCH 08/14] fixing downloads --- process.sh | 77 +++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/process.sh b/process.sh index 3384af0..d033bcb 100644 --- a/process.sh +++ b/process.sh @@ -44,47 +44,53 @@ mkdir $VFB_FULL_DIR $VFB_SLICES_DIR $VFB_DOWNLOAD_DIR $VFB_DEBUG_DIR $VFB_FINAL_ echo "VFBTIME:" date -# Parallel downloading and processing using xargs echo '** Downloading relevant ontologies.. **' echo '** in full: **' -cat vfb_fullontologies.txt | xargs -n 1 -P 4 -I {} sh -c ' - url_pattern="{}" - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" & - done - else - wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & - fi -' + +# Process each URL pattern in parallel +while read -r url_pattern; do + echo "Processing: $url_pattern" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') + + for file in $file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" & + done + else + wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & + fi +done < vfb_fullontologies.txt echo '** in slices: **' -cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' - url_pattern="{}" - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_SLICES_DIR" "$file_url" & - done - else - wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & - fi -' +# Process each URL pattern in slices in parallel +while read -r url_pattern; do + echo "Processing: $url_pattern" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') + + for file in 
$file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_SLICES_DIR" "$file_url" & + done + else + wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & + fi +done < vfb_slices.txt + + + +echo '** Downloads called. **' echo "VFBTIME:" date @@ -137,6 +143,7 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then date fi +# Wait for all background jobs to complete wait echo 'Merging all input ontologies.' From 0c5e3cb0c8483c5742be9b6f83b82891fffb6ce2 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:09:54 +0100 Subject: [PATCH 09/14] commenting out secondary embargo processing --- process.sh | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/process.sh b/process.sh index d033bcb..9662583 100644 --- a/process.sh +++ b/process.sh @@ -121,27 +121,27 @@ rm -fv $VFB_DOWNLOAD_DIR/kb_*.owl echo "VFBTIME:" date -if [ "$REMOVE_EMBARGOED_DATA" = true ]; then - echo '** Deleting embargoed data.. **' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt - - echo 'First 10 embargoed datasets: ' - head -10 ${VFB_FINAL}/embargoed_datasets.txt - - echo 'Embargoed datasets: select_embargoed_channels' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt & - echo 'Embargoed datasets: select_embargoed_images' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt & - echo 'Embargoed datasets: select_embargoed_datasets' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt & - wait +# if [ "$REMOVE_EMBARGOED_DATA" = true ]; then +# echo '** Deleting embargoed data.. 
**' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt + +# echo 'First 10 embargoed datasets: ' +# head -10 ${VFB_FINAL}/embargoed_datasets.txt + +# echo 'Embargoed datasets: select_embargoed_channels' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt & +# echo 'Embargoed datasets: select_embargoed_images' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt & +# echo 'Embargoed datasets: select_embargoed_datasets' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt & +# wait - echo 'Embargoed data: Removing everything' - cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt - robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} & - echo "VFBTIME:" - date -fi +# echo 'Embargoed data: Removing everything' +# cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt +# robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} & +# echo "VFBTIME:" +# date +# fi # Wait for all background jobs to complete wait From 5773f6b6ee70d590e3cc9d962f377d1b3695173b Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:39:26 +0100 Subject: [PATCH 10/14] adding section for merging --- process.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/process.sh b/process.sh index 9662583..ab0a918 100644 --- a/process.sh +++ b/process.sh @@ -114,6 +114,10 @@ curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:Entity) WHERE exists(n.block) DETACH DELETE n"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH ()-[r]-() WHERE exists(r.block) DELETE r"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt +echo "VFBTIME:" +date + +echo '** Merging parts into KB.OWL **' python3 ${SCRIPTS}neo4j_kb_export.py ${KBserver} ${KBuser} ${KBpassword} ${KB_FILE} robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.owl -i $VFB_DOWNLOAD_DIR/kb_part_2.owl -i $VFB_DOWNLOAD_DIR/kb_part_3.owl -i $VFB_DOWNLOAD_DIR/kb_rels.owl -o ${KB_FILE} rm -fv $VFB_DOWNLOAD_DIR/kb_*.owl From 2446da0763b1a917f1a792b3c41318c7c3fb1cde Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:43:33 +0100 Subject: [PATCH 11/14] adding progress output --- process.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/process.sh b/process.sh index ab0a918..fea606a 100644 --- a/process.sh +++ b/process.sh @@ -155,12 +155,12 @@ cd $VFB_DOWNLOAD_DIR for i in *.owl; do [ -f "$i" ] || break echo "Merging: $i" - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" 
"$i" & + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv -v "$i.tmp.owl" "$i" && echo "Finished: $i" & done for i in *.owl.gz; do [ -f "$i" ] || break echo "Merging: $i" - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl" & + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv -v "$i.tmp.owl" "$i.owl" && echo "Finished: $i" & done wait From 48955a51fdc21f643160588f022792e1045f988d Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:51:14 +0100 Subject: [PATCH 12/14] error handling --- process.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/process.sh b/process.sh index fea606a..0102a17 100644 --- a/process.sh +++ b/process.sh @@ -175,7 +175,7 @@ for i in *.owl; do seedfile=$i"_terms.txt" echo "Extracting seed from: $i to $seedfile" [ ! -f "$seedfile" ] || break - ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile & + ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile && echo "Finished: $i" & done wait @@ -258,7 +258,7 @@ done # Wait for all background processes to complete wait -gzip -f *.ttl +gzip -f *.ttl || : echo "End: vfb-pipeline-collectdata" echo "VFBTIME:" From 1917705e597378cdb27ff7133d80903eb354e4d8 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:58:18 +0100 Subject: [PATCH 13/14] syntax fix for slices --- process.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/process.sh b/process.sh index 0102a17..de0e7fa 100644 --- a/process.sh +++ b/process.sh @@ -190,10 +190,9 @@ for i in *.owl; do [ -f "$i" ] || break echo "Processing: $i" mod=$i"_module.owl" - ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod & - cp $mod $VFB_FINAL - cp $mod $VFB_DEBUG_DIR + ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod && cp $mod $VFB_FINAL && cp $mod $VFB_DEBUG_DIR && echo "Finished: $i" & done + wait echo "VFBTIME:" From f97f55cb15d090af51c0a8763e6975489d6f595e Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 22:44:09 +0100 Subject: [PATCH 14/14] parallel processing final part --- process.sh | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/process.sh b/process.sh index de0e7fa..e95133c 100644 --- a/process.sh +++ b/process.sh @@ -208,22 +208,41 @@ date if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then echo 'Removing all possible sources for unsatisfiable classes and inconsistency...' 
cd $VFB_FINAL - for i in *.owl; do - [ -f "$i" ] || break - echo "Processing: $i" + + # Define the function to process each OWL file + process_owl_file() { + local owl_file="$1" + + echo "Processing: $owl_file" + + # Check if the file should be skipped while read -r url_pattern; do - if [ $url_pattern == $i ]; then - echo "Skipping $i" - continue 2 + if [ "$url_pattern" == "$owl_file" ]; then + echo "Skipping $owl_file" + return fi done < ${WORKSPACE}/vfb_skip_axiom_checks.txt + + # Remove axioms for axiom_type in $UNSAT_AXIOM_TYPES; do - echo "Removing $axiom_type axioms from $i" - ${WORKSPACE}/robot remove --input $i --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \ - remove --axioms $axiom_type --preserve-structure false -o "$i.tmp.owl" - mv "$i.tmp.owl" "$i" & + echo "Removing $axiom_type axioms from $owl_file" + ${WORKSPACE}/robot remove --input "$owl_file" --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \ + remove --axioms $axiom_type --preserve-structure false -o "$owl_file.tmp.owl" + mv "$owl_file.tmp.owl" "$owl_file" done + echo "Finished: $owl_file" + } + + # Export the function so it can be used in subshells + export -f process_owl_file + + # Process each OWL file in parallel + for i in *.owl; do + [ -f "$i" ] || continue + process_owl_file "$i" & done + + # Wait for all background jobs to complete wait fi
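A note on the fan-out pattern these patches converge on: every `command &` / `wait` pair launches one background job per OWL file, so a directory with many ontologies can start an unbounded number of concurrent robot JVMs at once. Where memory becomes a concern, the same structure can be capped without going back to xargs. The helper below is a hypothetical sketch, not part of the patches above; it assumes bash 4.3 or later (for `wait -n`), and `MAX_JOBS` is an illustrative knob:

    # Hypothetical helper (assumes bash >= 4.3): launch a command in the
    # background, but only once fewer than MAX_JOBS jobs are running.
    MAX_JOBS=4
    run_limited() {
        while [ "$(jobs -rp | wc -l)" -ge "$MAX_JOBS" ]; do
            wait -n    # returns as soon as any single background job exits
        done
        "$@" &
    }

    # Usage sketch against the per-ontology merge loop:
    # for i in *.owl; do
    #     run_limited ${WORKSPACE}/robot merge --input "$i" -o "$i.tmp.owl"
    # done
    # wait    # final barrier, exactly as in the patches

This would give the bounded parallelism of the `xargs -n 1 -P 4` approach from PATCH 01 while keeping the plain `for`/`&`/`wait` structure that PATCH 08 reverted to.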