From cb0018c7c41e2a9bf0e7aa7d049133d1396ad916 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 15:53:36 +0100 Subject: [PATCH 01/14] allowing parallel processing where possible --- process.sh | 112 ++++++++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/process.sh b/process.sh index a0480ed..537c6f4 100644 --- a/process.sh +++ b/process.sh @@ -7,6 +7,7 @@ echo "Start: vfb-pipeline-collectdata" echo "VFBTIME:" date +# Define and export necessary variables VFB_FULL_DIR=/tmp/vfb_fullontologies VFB_SLICES_DIR=/tmp/vfb_slices VFB_DOWNLOAD_DIR=/tmp/vfb_download @@ -19,21 +20,20 @@ SHACL_DIR=${WORKSPACE}/shacl KB_FILE=$VFB_DOWNLOAD_DIR/kb.owl VFB_NEO4J_SRC=${WORKSPACE}/VFB_neo4j - export ROBOT_JAVA_ARGS=${ROBOT_ARGS} echo "** Collecting Data! **" - echo 'START' >> ${WORKSPACE}/tick.out -## tail -f ${WORKSPACE}/tick.out >&1 &>&1 -echo "** Updateing Neo4J VFB codebase **" +# Update Neo4J VFB codebase +echo "** Updating Neo4J VFB codebase **" cd $VFB_NEO4J_SRC git pull origin master git checkout ${GITBRANCH} git pull pip install -r requirements.txt +# Create temporary directories echo "** Creating temporary directories.. **" cd ${WORKSPACE} ls -l $VFB_FINAL @@ -44,44 +44,47 @@ mkdir $VFB_FULL_DIR $VFB_SLICES_DIR $VFB_DOWNLOAD_DIR $VFB_DEBUG_DIR $VFB_FINAL_ echo "VFBTIME:" date +# Parallel downloading and processing using xargs echo '** Downloading relevant ontologies.. **' echo '** in full: **' -while read -r url_pattern; do - echo $url_pattern - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" - done - else - wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" - fi -done < vfb_fullontologies.txt +cat vfb_fullontologies.txt | xargs -n 1 -P 4 -I {} sh -c ' + url_pattern="{}" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") + + for file in $file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" & + done + else + wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & + fi +' +wait echo '** in slices: **' -while read -r url_pattern; do - echo $url_pattern - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_SLICES_DIR" "$file_url" - done - else - wget -N -P "$VFB_SLICES_DIR" "$url_pattern" - fi -done < vfb_slices.txt +cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' + url_pattern="{}" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") + + for file in $file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_SLICES_DIR" "$file_url" & + done + else + wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & + fi +' +wait echo "VFBTIME:" 
date @@ -98,7 +101,6 @@ robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.ow echo "VFBTIME:" date - if [ "$REMOVE_EMBARGOED_DATA" = true ]; then echo '** Deleting embargoed data.. **' robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt @@ -126,14 +128,15 @@ echo 'Merging all input ontologies.' cd $VFB_DOWNLOAD_DIR for i in *.owl; do [ -f "$i" ] || break - echo "Merging: "$i - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i" + echo "Merging: $i" + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i" & done for i in *.owl.gz; do [ -f "$i" ] || break - echo "Merging: "$i - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl" + echo "Merging: $i" + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl" & done +wait echo 'Copy all OWL files to output directory..' cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL @@ -144,10 +147,11 @@ cd $VFB_DOWNLOAD_DIR for i in *.owl; do [ -f "$i" ] || break seedfile=$i"_terms.txt" - echo "Extracting seed from: "$i" to "$seedfile + echo "Extracting seed from: $i to $seedfile" [ ! -f "$seedfile" ] || break - ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile + ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile & done +wait cat *_terms.txt | sort | uniq > ${VFB_FINAL}/seed.txt @@ -158,23 +162,24 @@ echo 'Creating slices for external ontologies: Extracting modules' cd $VFB_SLICES_DIR for i in *.owl; do [ -f "$i" ] || break - echo "Processing: "$i + echo "Processing: $i" mod=$i"_module.owl" - ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod + ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod & cp $mod $VFB_FINAL cp $mod $VFB_DEBUG_DIR done +wait echo "VFBTIME:" date +# Uncomment the following block if debugging files are needed # echo 'Create debugging files for pipeline..' # cd $VFB_DEBUG_DIR # robot merge --inputs "*.owl" remove --axioms "disjoint" --output $VFB_FINAL_DEBUG/vfb-dependencies-merged.owl # robot merge -i kb.owl -i fbbt.owl --output $VFB_FINAL_DEBUG/vfb-kb_fbbt.owl # robot reason --reasoner ELK --input $VFB_FINAL_DEBUG/vfb-dependencies-merged.owl --output $VFB_FINAL_DEBUG/vfb-dependencies-reasoned.owl - if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then echo 'Removing all possible sources for unsatisfiable classes and inconsistency...' cd $VFB_FINAL @@ -191,25 +196,26 @@ if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then echo "Removing $axiom_type axioms from $i" ${WORKSPACE}/robot remove --input $i --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \ remove --axioms $axiom_type --preserve-structure false -o "$i.tmp.owl" - mv "$i.tmp.owl" "$i" + mv "$i.tmp.owl" "$i" & done done + wait fi echo 'Converting all OWL files to gzipped TTL' cd $VFB_FINAL for i in *.owl; do [ -f "$i" ] || break - echo "Processing: "$i - ${WORKSPACE}/robot convert --check false --input $i -f ttl --output $i".ttl" + echo "Processing: $i" + ${WORKSPACE}/robot convert --check false --input $i -f ttl --output $i".ttl" & if [ "$i" == "kb.owl" ] && [ "$VALIDATE" = true ]; then if [ "$VALIDATESHACL" = true ]; then echo "Validating KB with SHACL.." 
- shaclvalidate.sh -datafile "$i.ttl" -shapesfile $WORKSPACE/shacl/kb.shacl > $VFB_FINAL/validation.txt + shaclvalidate.sh -datafile "$i.ttl" -shapesfile $WORKSPACE/shacl/kb.shacl > $VFB_FINAL/validation.txt & fi fi done - +wait gzip -f *.ttl From 002b1ce11e1e39c0280d1281c966d7a753735030 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 18:12:41 +0100 Subject: [PATCH 02/14] can be done in parallel --- process.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/process.sh b/process.sh index 537c6f4..3f9a358 100644 --- a/process.sh +++ b/process.sh @@ -109,12 +109,13 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then head -10 ${VFB_FINAL}/embargoed_datasets.txt echo 'Embargoed datasets: select_embargoed_channels' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt + robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt & echo 'Embargoed datasets: select_embargoed_images' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt + robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt & echo 'Embargoed datasets: select_embargoed_datasets' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt - + robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt & + wait + echo 'Embargoed data: Removing everything' cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl @@ -139,8 +140,8 @@ done wait echo 'Copy all OWL files to output directory..' -cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL -cp $VFB_DOWNLOAD_DIR/*.owl $VFB_DEBUG_DIR +cp $VFB_DOWNLOAD_DIR/*.owl $VFB_FINAL & +cp $VFB_DOWNLOAD_DIR/*.owl $VFB_DEBUG_DIR & echo 'Creating slices for external ontologies: Extracting seeds.' 
cd $VFB_DOWNLOAD_DIR From 7f40b4fec64d78b49be9651e74277d947c7fd38a Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 18:18:54 +0100 Subject: [PATCH 03/14] can be run in background until next wait --- process.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/process.sh b/process.sh index 3f9a358..272ba3a 100644 --- a/process.sh +++ b/process.sh @@ -118,9 +118,7 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then echo 'Embargoed data: Removing everything' cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt - robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl - mv ${KB_FILE}.tmp.owl ${KB_FILE} - + robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} & echo "VFBTIME:" date fi From 2234c5f1c61a88e21e227f4a5c0c9f1f054d29a1 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 18:26:26 +0100 Subject: [PATCH 04/14] fix for validation --- process.sh | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/process.sh b/process.sh index 272ba3a..faebdad 100644 --- a/process.sh +++ b/process.sh @@ -201,19 +201,34 @@ if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then wait fi +# Function to handle conversion and validation +process_owl_file() { + local owl_file="$1" + local ttl_file="${owl_file%.owl}.ttl" + + echo "Processing: $owl_file" + ${WORKSPACE}/robot convert --check false --input "$owl_file" -f ttl --output "$ttl_file" + + # Perform validation if conditions are met + if [ "$owl_file" == "kb.owl" ] && [ "$VALIDATE" = true ] && [ "$VALIDATESHACL" = true ]; then + echo "Validating KB with SHACL for $ttl_file.." + shaclvalidate.sh -datafile "$ttl_file" -shapesfile $WORKSPACE/shacl/kb.shacl > "$VFB_FINAL/validation_$owl_file.txt" + fi + + # Gzip the TTL file after validation + gzip -f "$ttl_file" +} + echo 'Converting all OWL files to gzipped TTL' cd $VFB_FINAL -for i in *.owl; do - [ -f "$i" ] || break - echo "Processing: $i" - ${WORKSPACE}/robot convert --check false --input $i -f ttl --output $i".ttl" & - if [ "$i" == "kb.owl" ] && [ "$VALIDATE" = true ]; then - if [ "$VALIDATESHACL" = true ]; then - echo "Validating KB with SHACL.." 
-			shaclvalidate.sh -datafile "$i.ttl" -shapesfile $WORKSPACE/shacl/kb.shacl > $VFB_FINAL/validation.txt &
-		fi
-	fi
 done
+
+# Loop through each OWL file and process it in parallel
+for owl_file in *.owl; do
+    [ -f "$owl_file" ] || continue
+    # Run the process in a subshell and put it in the background
+    (process_owl_file "$owl_file") &
 done
+
+# Wait for all background processes to complete
 wait
 
 gzip -f *.ttl

From df3600595197df8f5fcbadc87a6ffc9ef9df1971 Mon Sep 17 00:00:00 2001
From: Rob Court
Date: Fri, 23 Aug 2024 20:02:36 +0100
Subject: [PATCH 05/14] Pre-removing embargo data to speed up export

---
 process.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/process.sh b/process.sh
index faebdad..cfdab2e 100644
--- a/process.sh
+++ b/process.sh
@@ -89,6 +89,19 @@ wait
 echo "VFBTIME:"
 date
 
+echo '** Removing embargoed data directly from KB before export **'
+echo 'Non Production Datasets:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:DataSet)<-[:has_source]-(i:Individual)<-[:depicts]-(ic:Individual) WHERE not n.production=[true] DETACH DELETE ic DETACH DELETE i DETACH DELETE n"}]}'
+echo 'Blocked Anatomical Individuals:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual)-[ir:in_register_with]->(tc:Template) WHERE exists(i.block) DETACH DELETE ic DETACH DELETE i"}]}'
+echo 'Blocked Images:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual)-[ir:in_register_with]->(tc:Template) WHERE exists(ir.block) DELETE ir"}]}'
+echo 'Clean Channels/Individuals with no Image:'
+curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (i:Individual)<-[:depicts]-(ic:Individual) WHERE not (ic)-[:in_register_with]->(:Template) and i.short_form starts with \"VFB_\" DETACH DELETE ic DETACH DELETE i"}]}'
+
+echo "VFBTIME:"
+date
+
 echo '** Exporting KB to OWL **'
 curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (c) REMOVE c.label_rdfs RETURN c"}]}' >> ${VFB_DEBUG_DIR}/neo4j_remove_rdfs_label.txt
 curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (p) WHERE EXISTS(p.label) SET p.label_rdfs=[] + p.label"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt

From 7861c9cbb4cb7dc6cff89c3f7c40c67aa4575083 Mon Sep 17 00:00:00 2001
From: Rob Court
Date: Fri, 23 Aug 2024 20:05:11 +0100
Subject: [PATCH 06/14] removing parts after merge

---
 process.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/process.sh b/process.sh
index cfdab2e..f6f5d44 100644
--- a/process.sh
+++ b/process.sh
@@ -110,6 +110,7 @@ curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword
 
 python3 ${SCRIPTS}neo4j_kb_export.py ${KBserver} ${KBuser} ${KBpassword} ${KB_FILE}
 robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.owl -i $VFB_DOWNLOAD_DIR/kb_part_2.owl -i $VFB_DOWNLOAD_DIR/kb_part_3.owl -i $VFB_DOWNLOAD_DIR/kb_rels.owl -o ${KB_FILE}
+rm -fv 
$VFB_DOWNLOAD_DIR/kb_*.owl echo "VFBTIME:" date From 6c559dcfdf8c29c4bdb53fdb456531a3e45099a5 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 20:19:09 +0100 Subject: [PATCH 07/14] can all happen in parallel until merging --- process.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/process.sh b/process.sh index f6f5d44..3384af0 100644 --- a/process.sh +++ b/process.sh @@ -64,7 +64,7 @@ cat vfb_fullontologies.txt | xargs -n 1 -P 4 -I {} sh -c ' wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & fi ' -wait + echo '** in slices: **' cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' @@ -84,7 +84,7 @@ cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & fi ' -wait + echo "VFBTIME:" date @@ -137,6 +137,8 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then date fi +wait + echo 'Merging all input ontologies.' cd $VFB_DOWNLOAD_DIR for i in *.owl; do From b51eccb76100857424cd10846a3b841088037b97 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:02:14 +0100 Subject: [PATCH 08/14] fixing downloads --- process.sh | 77 +++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/process.sh b/process.sh index 3384af0..d033bcb 100644 --- a/process.sh +++ b/process.sh @@ -44,47 +44,53 @@ mkdir $VFB_FULL_DIR $VFB_SLICES_DIR $VFB_DOWNLOAD_DIR $VFB_DEBUG_DIR $VFB_FINAL_ echo "VFBTIME:" date -# Parallel downloading and processing using xargs echo '** Downloading relevant ontologies.. **' echo '** in full: **' -cat vfb_fullontologies.txt | xargs -n 1 -P 4 -I {} sh -c ' - url_pattern="{}" - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" & - done - else - wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & - fi -' + +# Process each URL pattern in parallel +while read -r url_pattern; do + echo "Processing: $url_pattern" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') + + for file in $file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_DOWNLOAD_DIR" "$file_url" & + done + else + wget -N -P "$VFB_DOWNLOAD_DIR" "$url_pattern" & + fi +done < vfb_fullontologies.txt echo '** in slices: **' -cat vfb_slices.txt | xargs -n 1 -P 4 -I {} sh -c ' - url_pattern="{}" - if [[ "$url_pattern" == *"*"* ]]; then - base_url="${url_pattern%/*}/" - pattern="${url_pattern##*/}" - pattern="${pattern//\*/.*}" - page=$(curl -s "$base_url") - file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed "s/^href=\"//;s/\"$//") - - for file in $file_list; do - file_url="${base_url}${file}" - wget -N -P "$VFB_SLICES_DIR" "$file_url" & - done - else - wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & - fi -' +# Process each URL pattern in slices in parallel +while read -r url_pattern; do + echo "Processing: $url_pattern" + if [[ "$url_pattern" == *"*"* ]]; then + base_url="${url_pattern%/*}/" + pattern="${url_pattern##*/}" + pattern="${pattern//\*/.*}" + page=$(curl -s "$base_url") + file_list=$(echo "$page" | grep -Eo "href=\"$pattern\"" | sed 's/^href="//;s/"$//') + + for file in 
$file_list; do + file_url="${base_url}${file}" + wget -N -P "$VFB_SLICES_DIR" "$file_url" & + done + else + wget -N -P "$VFB_SLICES_DIR" "$url_pattern" & + fi +done < vfb_slices.txt + + + +echo '** Downloads called. **' echo "VFBTIME:" date @@ -137,6 +143,7 @@ if [ "$REMOVE_EMBARGOED_DATA" = true ]; then date fi +# Wait for all background jobs to complete wait echo 'Merging all input ontologies.' From 0c5e3cb0c8483c5742be9b6f83b82891fffb6ce2 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:09:54 +0100 Subject: [PATCH 09/14] commenting out secondary embargo processing --- process.sh | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/process.sh b/process.sh index d033bcb..9662583 100644 --- a/process.sh +++ b/process.sh @@ -121,27 +121,27 @@ rm -fv $VFB_DOWNLOAD_DIR/kb_*.owl echo "VFBTIME:" date -if [ "$REMOVE_EMBARGOED_DATA" = true ]; then - echo '** Deleting embargoed data.. **' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt - - echo 'First 10 embargoed datasets: ' - head -10 ${VFB_FINAL}/embargoed_datasets.txt - - echo 'Embargoed datasets: select_embargoed_channels' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt & - echo 'Embargoed datasets: select_embargoed_images' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt & - echo 'Embargoed datasets: select_embargoed_datasets' - robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt & - wait +# if [ "$REMOVE_EMBARGOED_DATA" = true ]; then +# echo '** Deleting embargoed data.. 
**' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql ${VFB_FINAL}/embargoed_datasets.txt + +# echo 'First 10 embargoed datasets: ' +# head -10 ${VFB_FINAL}/embargoed_datasets.txt + +# echo 'Embargoed datasets: select_embargoed_channels' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_channels_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt & +# echo 'Embargoed datasets: select_embargoed_images' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_images_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_images.txt & +# echo 'Embargoed datasets: select_embargoed_datasets' +# robot query -f csv -i ${KB_FILE} --query ${SPARQL_DIR}/select_embargoed_datasets_${STAGING}.sparql ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt & +# wait - echo 'Embargoed data: Removing everything' - cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt - robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} & - echo "VFBTIME:" - date -fi +# echo 'Embargoed data: Removing everything' +# cat ${VFB_DOWNLOAD_DIR}/embargoed_channels.txt ${VFB_DOWNLOAD_DIR}/embargoed_images.txt ${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt | sort | uniq > ${VFB_FINAL}/remove_embargoed.txt +# robot remove --input ${KB_FILE} --term-file ${VFB_FINAL}/remove_embargoed.txt --output ${KB_FILE}.tmp.owl && mv ${KB_FILE}.tmp.owl ${KB_FILE} & +# echo "VFBTIME:" +# date +# fi # Wait for all background jobs to complete wait From 5773f6b6ee70d590e3cc9d962f377d1b3695173b Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:39:26 +0100 Subject: [PATCH 10/14] adding section for merging --- process.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/process.sh b/process.sh index 9662583..ab0a918 100644 --- a/process.sh +++ b/process.sh @@ -114,6 +114,10 @@ curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH (n:Entity) WHERE exists(n.block) DETACH DELETE n"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt curl -i -X POST ${KBserver}/db/data/transaction/commit -u ${KBuser}:${KBpassword} -H 'Content-Type: application/json' -d '{"statements": [{"statement": "MATCH ()-[r]-() WHERE exists(r.block) DELETE r"}]}' >> ${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt +echo "VFBTIME:" +date + +echo '** Merging parts into KB.OWL **' python3 ${SCRIPTS}neo4j_kb_export.py ${KBserver} ${KBuser} ${KBpassword} ${KB_FILE} robot merge -i $VFB_DOWNLOAD_DIR/kb_part_0.owl -i $VFB_DOWNLOAD_DIR/kb_part_1.owl -i $VFB_DOWNLOAD_DIR/kb_part_2.owl -i $VFB_DOWNLOAD_DIR/kb_part_3.owl -i $VFB_DOWNLOAD_DIR/kb_rels.owl -o ${KB_FILE} rm -fv $VFB_DOWNLOAD_DIR/kb_*.owl From 2446da0763b1a917f1a792b3c41318c7c3fb1cde Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:43:33 +0100 Subject: [PATCH 11/14] adding progress output --- process.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/process.sh b/process.sh index ab0a918..fea606a 100644 --- a/process.sh +++ b/process.sh @@ -155,12 +155,12 @@ cd $VFB_DOWNLOAD_DIR for i in *.owl; do [ -f "$i" ] || break echo "Merging: $i" - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" 
"$i" & + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv -v "$i.tmp.owl" "$i" && echo "Finished: $i" & done for i in *.owl.gz; do [ -f "$i" ] || break echo "Merging: $i" - ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv "$i.tmp.owl" "$i.owl" & + ${WORKSPACE}/robot merge --input $i -o "$i.tmp.owl" && mv -v "$i.tmp.owl" "$i.owl" && echo "Finished: $i" & done wait From 48955a51fdc21f643160588f022792e1045f988d Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:51:14 +0100 Subject: [PATCH 12/14] error handling --- process.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/process.sh b/process.sh index fea606a..0102a17 100644 --- a/process.sh +++ b/process.sh @@ -175,7 +175,7 @@ for i in *.owl; do seedfile=$i"_terms.txt" echo "Extracting seed from: $i to $seedfile" [ ! -f "$seedfile" ] || break - ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile & + ${WORKSPACE}/robot query -f csv -i $i --query ${SPARQL_DIR}/terms.sparql $seedfile && echo "Finished: $i" & done wait @@ -258,7 +258,7 @@ done # Wait for all background processes to complete wait -gzip -f *.ttl +gzip -f *.ttl || : echo "End: vfb-pipeline-collectdata" echo "VFBTIME:" From 1917705e597378cdb27ff7133d80903eb354e4d8 Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 21:58:18 +0100 Subject: [PATCH 13/14] syntax fix for slices --- process.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/process.sh b/process.sh index 0102a17..de0e7fa 100644 --- a/process.sh +++ b/process.sh @@ -190,10 +190,9 @@ for i in *.owl; do [ -f "$i" ] || break echo "Processing: $i" mod=$i"_module.owl" - ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod & - cp $mod $VFB_FINAL - cp $mod $VFB_DEBUG_DIR + ${WORKSPACE}/robot extract -i $i -T ${VFB_FINAL}/seed.txt --method BOT -o $mod && cp $mod $VFB_FINAL && cp $mod $VFB_DEBUG_DIR && echo "Finished: $i" & done + wait echo "VFBTIME:" From f97f55cb15d090af51c0a8763e6975489d6f595e Mon Sep 17 00:00:00 2001 From: Rob Court Date: Fri, 23 Aug 2024 22:44:09 +0100 Subject: [PATCH 14/14] parallel processing final part --- process.sh | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/process.sh b/process.sh index de0e7fa..e95133c 100644 --- a/process.sh +++ b/process.sh @@ -208,22 +208,41 @@ date if [ "$REMOVE_UNSAT_CAUSING_AXIOMS" = true ]; then echo 'Removing all possible sources for unsatisfiable classes and inconsistency...' 
cd $VFB_FINAL - for i in *.owl; do - [ -f "$i" ] || break - echo "Processing: $i" + + # Define the function to process each OWL file + process_owl_file() { + local owl_file="$1" + + echo "Processing: $owl_file" + + # Check if the file should be skipped while read -r url_pattern; do - if [ $url_pattern == $i ]; then - echo "Skipping $i" - continue 2 + if [ "$url_pattern" == "$owl_file" ]; then + echo "Skipping $owl_file" + return fi done < ${WORKSPACE}/vfb_skip_axiom_checks.txt + + # Remove axioms for axiom_type in $UNSAT_AXIOM_TYPES; do - echo "Removing $axiom_type axioms from $i" - ${WORKSPACE}/robot remove --input $i --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \ - remove --axioms $axiom_type --preserve-structure false -o "$i.tmp.owl" - mv "$i.tmp.owl" "$i" & + echo "Removing $axiom_type axioms from $owl_file" + ${WORKSPACE}/robot remove --input "$owl_file" --term "http://www.w3.org/2002/07/owl#Nothing" --axioms logical --preserve-structure false \ + remove --axioms $axiom_type --preserve-structure false -o "$owl_file.tmp.owl" + mv "$owl_file.tmp.owl" "$owl_file" done + echo "Finished: $owl_file" + } + + # Export the function so it can be used in subshells + export -f process_owl_file + + # Process each OWL file in parallel + for i in *.owl; do + [ -f "$i" ] || continue + process_owl_file "$i" & done + + # Wait for all background jobs to complete wait fi
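A note on the fan-out pattern these patches converge on: every `command &` / `wait` pair launches one background job per OWL file, so a directory with many ontologies can start an unbounded number of concurrent robot JVMs at once. Where memory becomes a concern, the same structure can be capped without going back to xargs. The helper below is a hypothetical sketch, not part of the patches above; it assumes bash 4.3 or later (for `wait -n`), and `MAX_JOBS` is an illustrative knob:

    # Hypothetical helper (assumes bash >= 4.3): launch a command in the
    # background, but only once fewer than MAX_JOBS jobs are running.
    MAX_JOBS=4
    run_limited() {
        while [ "$(jobs -rp | wc -l)" -ge "$MAX_JOBS" ]; do
            wait -n    # returns as soon as any single background job exits
        done
        "$@" &
    }

    # Usage sketch against the per-ontology merge loop:
    # for i in *.owl; do
    #     run_limited ${WORKSPACE}/robot merge --input "$i" -o "$i.tmp.owl"
    # done
    # wait    # final barrier, exactly as in the patches

This would give the bounded parallelism of the `xargs -n 1 -P 4` approach from PATCH 01 while keeping the plain `for`/`&`/`wait` structure that PATCH 08 reverted to.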