-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.sh
172 lines (137 loc) · 6.2 KB
/
process.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/bin/bash
# VFB pipeline stage "vfb-pipeline-collectdata".
#
# Reads from the environment (none of these are defaulted in this script):
#   WORKSPACE                    root holding VFB_neo4j/, sparql/, shacl/, robot,
#                                tick.out and the vfb_*.txt download lists
#   ROBOT_ARGS                   re-exported below as ROBOT_JAVA_ARGS
#   GITBRANCH                    branch of VFB_neo4j to check out
#   KBserver, KBuser, KBpassword Neo4j KB endpoint and credentials
#   STAGING                      suffix selecting the embargo SPARQL queries
#   REMOVE_EMBARGOED_DATA, REMOVE_UNSAT_CAUSING_AXIOMS, VALIDATE, VALIDATESHACL
#                                stage switches, compared against the string "true"
#   UNSAT_AXIOM_TYPES            axiom selector passed to `robot remove`
#
# Abort on the first failing command. Note that commands on the left of `&&`
# are exempt from this rule.
set -e
echo "process started"
echo "Start: vfb-pipeline-collectdata"
# "VFBTIME:" followed by `date` is printed between stages as a timestamp marker.
echo "VFBTIME:"
date
# Scratch directories; all of them are deleted and recreated on every run.
# VFB_FULL_DIR is created but not otherwise used by this script.
VFB_FULL_DIR=/tmp/vfb_fullontologies
# Download target for vfb_slices.txt; modules are extracted from the files here.
VFB_SLICES_DIR=/tmp/vfb_slices
# Download target for vfb_fullontologies.txt; also receives the KB export.
VFB_DOWNLOAD_DIR=/tmp/vfb_download
# Receives curl response logs and copies of every OWL file for debugging.
VFB_DEBUG_DIR=/tmp/vfb_debugging
# Final output directory; its contents are wiped at the start of the run.
VFB_FINAL=/out
# Holds the merged / reasoned debugging ontologies.
VFB_FINAL_DEBUG=/out/vfb_debugging
# The trailing slash is required: the export step concatenates the script
# file name directly onto this value.
SCRIPTS=${WORKSPACE}/VFB_neo4j/src/uk/ac/ebi/vfb/neo4j/
SPARQL_DIR=${WORKSPACE}/sparql
# NOTE(review): SHACL_DIR is not referenced anywhere else in this script; the
# validation step spells out $WORKSPACE/shacl itself.
SHACL_DIR=${WORKSPACE}/shacl
# OWL export of the knowledge base.
KB_FILE=$VFB_DOWNLOAD_DIR/kb.owl
VFB_NEO4J_SRC=${WORKSPACE}/VFB_neo4j
# Presumably picked up by the robot launcher as JVM options - confirm against
# the robot wrapper in use.
export ROBOT_JAVA_ARGS=${ROBOT_ARGS}
# --- Stage: bring the VFB_neo4j checkout up to date --------------------------
# Reads WORKSPACE, VFB_NEO4J_SRC and GITBRANCH. Leaves the working directory
# inside the VFB_neo4j checkout.
echo "** Collecting Data! **"
# Append a start marker to the tick file.
echo 'START' >> "${WORKSPACE}/tick.out"
## tail -f ${WORKSPACE}/tick.out >&1 &>&1
echo "** Updateing Neo4J VFB codebase **"
cd "${VFB_NEO4J_SRC}"
# Refresh master first, then switch to the requested branch and refresh that.
git pull origin master
# ${VAR:+"..."} expands to nothing when GITBRANCH is unset or empty, so git is
# still called with no argument in that case instead of being handed "".
git checkout ${GITBRANCH:+"${GITBRANCH}"}
git pull
# --- Stage: reset output and scratch directories -----------------------------
# Empties the final output directory and recreates every scratch directory.
echo "** Creating temporary directories.. **"
cd "${WORKSPACE}"
ls -l "${VFB_FINAL}"
# ${VAR:?} aborts the script if a path variable is ever empty or unset, so a
# broken configuration can never turn this into `rm -rf /*`.
rm -rf -- "${VFB_FINAL:?}"/*
scratch_dirs=(
  "${VFB_FULL_DIR:?}"
  "${VFB_SLICES_DIR:?}"
  "${VFB_DOWNLOAD_DIR:?}"
  "${VFB_DEBUG_DIR:?}"
  "${VFB_FINAL_DEBUG:?}"
)
rm -rf -- "${scratch_dirs[@]}"
mkdir -- "${scratch_dirs[@]}"
echo "VFBTIME:"
date
# --- Stage: download ontologies ----------------------------------------------
# Each list file (resolved relative to the current directory) holds the URLs
# to fetch; files land in the matching scratch directory.
printf '%s\n' '** Downloading relevant ontologies.. **'
wget --timestamping \
  --directory-prefix="${VFB_DOWNLOAD_DIR}" \
  --input-file=vfb_fullontologies.txt
wget --timestamping \
  --directory-prefix="${VFB_SLICES_DIR}" \
  --input-file=vfb_slices.txt
printf '%s\n' 'VFBTIME:'
date
# --- Stage: prepare the KB and export it to OWL ------------------------------
echo '** Exporting KB to OWL **'

#######################################
# Run one Cypher statement against the KB through the Neo4j HTTP transaction
# endpoint and append the raw response (headers included, because of -i) to a
# debug log.
# Globals:   KBserver, KBuser, KBpassword (read)
# Arguments: $1 - Cypher statement; it is spliced verbatim into a JSON string,
#                 so it must not contain double quotes or backslashes
#            $2 - file the response is appended to
# Returns:   curl's exit status. curl is not run with --fail, so an HTTP error
#            status or a Cypher error is only visible in the log file.
#######################################
kb_run_cypher() {
  local statement=$1
  local response_log=$2
  # URL and credentials are quoted so that special characters in them cannot
  # be word-split or glob-expanded by the shell.
  curl -i -X POST "${KBserver}/db/data/transaction/commit" \
    -u "${KBuser}:${KBpassword}" \
    -H 'Content-Type: application/json' \
    -d "{\"statements\": [{\"statement\": \"${statement}\"}]}" \
    >> "${response_log}"
}

# 1. Drop the label_rdfs property from every node.
kb_run_cypher 'MATCH (c) REMOVE c.label_rdfs RETURN c' \
  "${VFB_DEBUG_DIR}/neo4j_remove_rdfs_label.txt"
# 2. Rebuild label_rdfs from label (as a list) on every node that has a label.
kb_run_cypher 'MATCH (p) WHERE EXISTS(p.label) SET p.label_rdfs=[] + p.label' \
  "${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt"
# 3. Delete Entity nodes carrying a `block` property, with their relationships.
#    (Steps 3 and 4 share the log file of step 2, as before.)
kb_run_cypher 'MATCH (n:Entity) WHERE exists(n.block) DETACH DELETE n' \
  "${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt"
# 4. Delete relationships carrying a `block` property.
kb_run_cypher 'MATCH ()-[r]-() WHERE exists(r.block) DELETE r' \
  "${VFB_DEBUG_DIR}/neo4j_change_label_to_rdfs.txt"

# SCRIPTS already ends in a slash, so the file name is appended directly.
python3 "${SCRIPTS}neo4j_kb_export.py" "${KBserver}" "${KBuser}" "${KBpassword}" "${KB_FILE}"
echo "VFBTIME:"
date
# --- Optional stage: strip embargoed data from the KB export -----------------
# Runs only when REMOVE_EMBARGOED_DATA is exactly "true". STAGING selects which
# variant of each SPARQL query is used. All expansions are quoted so that
# environment-supplied values cannot be word-split or glob-expanded.
if [[ "${REMOVE_EMBARGOED_DATA}" == true ]]; then
  echo '** Deleting embargoed data.. **'
  # Report of embargoed datasets; written to the output directory and previewed.
  robot query -f csv -i "${KB_FILE}" \
    --query "${SPARQL_DIR}/embargoed_datasets_${STAGING}.sparql" \
    "${VFB_FINAL}/embargoed_datasets.txt"
  echo 'First 10 embargoed datasets: '
  head -10 "${VFB_FINAL}/embargoed_datasets.txt"
  # One select query per entity kind; each writes embargoed_<kind>.txt.
  for embargo_kind in channels images datasets; do
    echo "Embargoed datasets: select_embargoed_${embargo_kind}"
    robot query -f csv -i "${KB_FILE}" \
      --query "${SPARQL_DIR}/select_embargoed_${embargo_kind}_${STAGING}.sparql" \
      "${VFB_DOWNLOAD_DIR}/embargoed_${embargo_kind}.txt"
  done
  echo 'Embargoed data: Removing everything'
  # Union of the three result files, de-duplicated, used as the removal list.
  cat "${VFB_DOWNLOAD_DIR}/embargoed_channels.txt" \
    "${VFB_DOWNLOAD_DIR}/embargoed_images.txt" \
    "${VFB_DOWNLOAD_DIR}/embargoed_datasets.txt" \
    | sort | uniq > "${VFB_FINAL}/remove_embargoed.txt"
  # Write to a temporary file first, then replace the KB export.
  robot remove --input "${KB_FILE}" \
    --term-file "${VFB_FINAL}/remove_embargoed.txt" \
    --output "${KB_FILE}.tmp.owl"
  mv -- "${KB_FILE}.tmp.owl" "${KB_FILE}"
  echo "VFBTIME:"
  date
fi
#######################################
# Run `robot merge` on one file and, on success, move the result to the target
# name. A failed merge is reported on stderr, its partial output is removed,
# and the source file is left untouched.
# Globals:   WORKSPACE (read) - directory containing the robot launcher
# Arguments: $1 - input ontology file
#            $2 - name the merged result is moved to
# Outputs:   progress line on stdout, warning on stderr when the merge fails
# Returns:   0 when the merge fails (a single bad ontology never aborted the
#            run before, and still does not); otherwise the status of mv
#######################################
merge_ontology() {
  local source_file=$1
  local target_file=$2
  local scratch_file="${source_file}.tmp.owl"
  echo "Merging: ${source_file}"
  if "${WORKSPACE}/robot" merge --input "${source_file}" -o "${scratch_file}"; then
    mv -- "${scratch_file}" "${target_file}"
  else
    # Previously `robot merge ... && mv ...` hid this failure completely,
    # because `set -e` ignores the left-hand side of `&&`.
    echo "WARNING: robot merge failed for ${source_file}; file left unmerged" >&2
    # Remove partial output so later *.owl globs do not pick it up.
    rm -f -- "${scratch_file}"
  fi
}

# --- Stage: merge every downloaded ontology ----------------------------------
echo 'Merging all input ontologies.'
cd "${VFB_DOWNLOAD_DIR}"
# Plain OWL files are merged in place.
for owl_file in *.owl; do
  # Skip the literal pattern (no match) and anything that is not a regular
  # file, without abandoning the remaining entries.
  [ -f "${owl_file}" ] || continue
  merge_ontology "${owl_file}" "${owl_file}"
done
# Gzipped inputs keep their original file; the merged result is written next
# to it as <name>.owl.gz.owl so that the later *.owl globs include it.
for gz_file in *.owl.gz; do
  [ -f "${gz_file}" ] || continue
  merge_ontology "${gz_file}" "${gz_file}.owl"
done
# --- Stage: publish the OWL files --------------------------------------------
# Every OWL file in the download directory goes to the final output directory
# and, as a second copy, to the debugging directory.
echo 'Copy all OWL files to output directory..'
for owl_target in "${VFB_FINAL}" "${VFB_DEBUG_DIR}"; do
  cp "${VFB_DOWNLOAD_DIR}"/*.owl "${owl_target}"
done
# --- Stage: collect seed terms -----------------------------------------------
# Runs terms.sparql against every OWL file in the download directory, writing
# <file>_terms.txt next to it, then unions all results into seed.txt.
echo 'Creating slices for external ontologies: Extracting seeds.'
cd "${VFB_DOWNLOAD_DIR}"
for ontology in *.owl; do
  # Skip the literal pattern (no match) and non-regular entries without
  # abandoning the remaining files.
  [ -f "${ontology}" ] || continue
  seedfile="${ontology}_terms.txt"
  echo "Extracting seed from: ${ontology}"
  "${WORKSPACE}/robot" query -f csv -i "${ontology}" \
    --query "${SPARQL_DIR}/terms.sparql" "${seedfile}"
done
# De-duplicated union of all per-ontology term lists.
cat -- *_terms.txt | sort | uniq > "${VFB_FINAL}/seed.txt"
echo "VFBTIME:"
date
# --- Stage: extract modules from the slice ontologies ------------------------
# For every OWL file in the slices directory, extracts a BOT module around the
# seed terms and copies it to the output and debugging directories.
echo 'Creating slices for external ontologies: Extracting modules'
cd "${VFB_SLICES_DIR}"
# The glob is expanded once, before the loop starts, so the *_module.owl files
# created below are not themselves processed in this pass.
for ontology in *.owl; do
  # Skip the literal pattern (no match) and non-regular entries without
  # abandoning the remaining files.
  [ -f "${ontology}" ] || continue
  echo "Processing: ${ontology}"
  mod="${ontology}_module.owl"
  "${WORKSPACE}/robot" extract -i "${ontology}" -T "${VFB_FINAL}/seed.txt" \
    --method BOT -o "${mod}"
  cp -- "${mod}" "${VFB_FINAL}"
  cp -- "${mod}" "${VFB_DEBUG_DIR}"
done
echo "VFBTIME:"
date
# --- Stage: build debugging ontologies ---------------------------------------
# Works inside the debugging directory, which holds a copy of every OWL file.
echo 'Create debugging files for pipeline..'
cd "${VFB_DEBUG_DIR}"
merged_owl="${VFB_FINAL_DEBUG}/vfb-dependencies-merged.owl"
kb_fbbt_owl="${VFB_FINAL_DEBUG}/vfb-kb_fbbt.owl"
reasoned_owl="${VFB_FINAL_DEBUG}/vfb-dependencies-reasoned.owl"
# Merge everything, with disjointness axioms removed from the result. The
# "*.owl" pattern is quoted on purpose: robot receives it as a pattern.
robot merge --inputs "*.owl" \
  remove --axioms "disjoint" \
  --output "${merged_owl}"
# KB and FBbt only.
robot merge -i kb.owl -i fbbt.owl \
  --output "${kb_fbbt_owl}"
# Classify the full merge with ELK.
robot reason --reasoner ELK \
  --input "${merged_owl}" \
  --output "${reasoned_owl}"
# --- Optional stage: remove axioms that can cause unsatisfiability -----------
# Runs only when REMOVE_UNSAT_CAUSING_AXIOMS is exactly "true". Each OWL file
# in the output directory is rewritten in place through a temporary file.
if [[ "${REMOVE_UNSAT_CAUSING_AXIOMS}" == true ]]; then
  echo 'Removing all possible sources for unsatisfiable classes and inconsistency...'
  cd "${VFB_FINAL}"
  for ontology in *.owl; do
    # Skip the literal pattern (no match) and non-regular entries without
    # abandoning the remaining files.
    [ -f "${ontology}" ] || continue
    echo "Processing: ${ontology}"
    # Two chained removals: logical axioms mentioning owl:Nothing first, then
    # every axiom of the types listed in UNSAT_AXIOM_TYPES.
    "${WORKSPACE}/robot" remove --input "${ontology}" \
      --term "http://www.w3.org/2002/07/owl#Nothing" \
      --axioms logical --preserve-structure false \
      remove --axioms "${UNSAT_AXIOM_TYPES}" --preserve-structure false \
      -o "${ontology}.tmp.owl"
    mv -- "${ontology}.tmp.owl" "${ontology}"
  done
fi
# --- Stage: convert to Turtle, optionally validate the KB, compress ----------
echo 'Converting all OWL files to gzipped TTL'
cd "${VFB_FINAL}"
for ontology in *.owl; do
  # Skip the literal pattern (no match) and non-regular entries without
  # abandoning the remaining files.
  [ -f "${ontology}" ] || continue
  echo "Processing: ${ontology}"
  "${WORKSPACE}/robot" convert --input "${ontology}" -f ttl --output "${ontology}.ttl"
  # SHACL validation applies to the KB export only, and only when both the
  # VALIDATE and VALIDATESHACL switches are exactly "true".
  if [[ "${ontology}" == "kb.owl" && "${VALIDATE}" == true && "${VALIDATESHACL}" == true ]]; then
    echo "Validating KB with SHACL.."
    # SHACL_DIR is ${WORKSPACE}/shacl, the same path that was spelled out here.
    shaclvalidate.sh -datafile "${ontology}.ttl" \
      -shapesfile "${SHACL_DIR}/kb.shacl" > "${VFB_FINAL}/validation.txt"
  fi
done
# -f overwrites any existing .gz of the same name.
gzip -f -- *.ttl
echo "End: vfb-pipeline-collectdata"
echo "VFBTIME:"
date
echo "process complete"