forked from openshift/origin-aggregated-logging
-
Notifications
You must be signed in to change notification settings - Fork 1
/
entrypoint.sh
executable file
·272 lines (243 loc) · 10.7 KB
/
entrypoint.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/bin/bash
# This script serves as a common entrypoint for CI infra
# as well as developers looking to run test suites for the
# project. The script can either set up a cluster to test
# or run against a cluster that is already up.
#
# Cluster end-to-end tests will be run first, followed by
# other test suites. If a specific suite or suites are req-
# uested with $SUITE, only that suite will be run.
#
# This script expects the following environment variables:
# - TEST_ONLY: do not set up a cluster. Must be paired with
# a $KUBECONFIG that points to the cluster to test
# - SUITE: a regex that will choose which test suites are
# run. Test suite entrypoints exist under hack/testing/
# with the test- prefix. The regex in $SUITE is a simple
# filter.
# - EXCLUDE_SUITE: a regex that will choose which test suites
# are not run. Test suite entrypoints exist under hack/testing/
# with the test- prefix. The regex in $EXCLUDE_SUITE is
# a simple filter like $SUITE only with opposite effect.
# - JUNIT_REPORT: generate a jUnit XML report for tests
# Bootstrap: load the shared test libraries and decide which namespace the
# logging stack runs in, then prepare the oc client log directory.
source "$(dirname "${BASH_SOURCE[0]}" )/../lib/init.sh"
source "${OS_O_A_L_DIR}/hack/testing/util.sh"
# we have to declare a suite start in order to use the os::cmd functions
os::test::junit::declare_suite_start "entrypoint"

# Prefer the legacy "logging" namespace only when the project exists AND it
# still contains deploymentconfigs; otherwise use the operator-managed one.
LOGGING_NS=openshift-logging
if oc get project logging -o name > /dev/null 2>&1 && [ "$(oc get dc -n logging -o name 2> /dev/null | wc -l)" -gt 0 ] ; then
    LOGGING_NS=logging
fi
export LOGGING_NS

# Directory for oc client logs; defaults under the artifact dir (or /tmp).
export OC_LOG_DIR=${OC_LOG_DIR:-${ARTIFACT_DIR:-/tmp}/oclogs}
# mkdir -p is a no-op when the directory already exists, so no -d test needed;
# quoting protects against whitespace in ARTIFACT_DIR.
mkdir -p "$OC_LOG_DIR"
# if using operators, turn off the managed state and shutdown the cluster-logging-operator
disable_cluster_logging_operator
fluentd_ds=$( get_fluentd_ds_name )
# save the pristine daemonset definition so the pre-test state can be inspected
oc get -n ${LOGGING_NS} $fluentd_ds -o yaml > "${ARTIFACT_DIR}/logging-fluentd-orig.yaml"
# patch fluentd and the node to make it easier to test in new environment
if oc -n ${LOGGING_NS} get clusterlogging instance > /dev/null 2>&1 ; then
# wait for kibana to return a node value
os::cmd::try_until_text "oc get -n ${LOGGING_NS} pods -l component=kibana -o jsonpath='{.items[0].spec.nodeName}'" "."
kibnode=$( oc get -n ${LOGGING_NS} pods -l component=kibana -o jsonpath='{.items[0].spec.nodeName}' )
# kibana never got scheduled: dump diagnostics, then abort via the bare
# `false` (presumably relies on `set -e` from init.sh - TODO confirm)
if [ -z "$kibnode" ] ; then
oc -n ${LOGGING_NS} get pods -o wide || :
for kpod in $( oc -n ${LOGGING_NS} get pods -l component=kibana -o jsonpath='{.items[*].metadata.name}' ) ; do
oc -n ${LOGGING_NS} describe pod $kpod > $ARTIFACT_DIR/$kpod.describe || :
oc -n ${LOGGING_NS} get pod $kpod -o yaml > $ARTIFACT_DIR/$kpod.yaml || :
done
oc describe node > $ARTIFACT_DIR/nodes || :
false
fi
# pin fluentd to the kibana node only: label the node and replace the
# daemonset's tolerations/nodeSelector so exactly one collector runs
oc label node $kibnode --overwrite logging-infra-fluentd=true
oc patch -n ${LOGGING_NS} $fluentd_ds --type=json --patch '[
{"op":"remove","path":"/spec/template/spec/tolerations"},
{"op":"replace","path":"/spec/template/spec/nodeSelector","value":{"logging-infra-fluentd":"true"}}]'
# make sure nodeSelectors are set correctly if restarted by clo
oc patch -n ${LOGGING_NS} clusterlogging instance --type=json --patch '[
{"op":"add","path":"/spec/collection/logs/fluentd/nodeSelector","value":{"logging-infra-fluentd":"true"}}]'
# wait until there is only 1 fluentd running on the kibana node
os::cmd::try_until_text "oc get -n ${LOGGING_NS} $fluentd_ds -o jsonpath='{ .status.numberReady }'" '^1$' $(( 2 * minute ))
os::cmd::try_until_text "oc get -n ${LOGGING_NS} pods -l component=fluentd -o jsonpath='{.items[0].spec.nodeName}'" "$kibnode" $(( 2 * minute ))
echo after patching fluentd
oc get -n ${LOGGING_NS} pods -o wide
oc get -n ${LOGGING_NS} $fluentd_ds -o yaml > $ARTIFACT_DIR/fluentd_daemonset.yaml
# using EXCLUDE_SUITE in Makefile instead - that will completely skip the test, rather
# than running it and getting the failure - if you fix a test, be sure to remove
# it from the list in Makefile
expected_failures=(
NONE
)
else
# non-operator (legacy) deployment: no suites are expected to fail
expected_failures=(
NONE
)
# some tests expect the node to be labeled as in the ci environment
kibnode=$( oc get pods -l component=kibana -o jsonpath='{.items[0].spec.nodeName}' )
oc label node $kibnode --overwrite logging-ci-test=true
fi
# stop fluentd while its daemonset is modified below
stop_fluentd
# HACK HACK HACK
#
# There seems to be some sort of performance problem - richm 2017-08-15 not
# sure what has changed, but now running an all-in-one for CI, with both
# openshift master and node running as systemd services logging to the
# journal, and the default/logging pods, and the os, are spewing too much for
# fluentd to keep up with when it has 100m cpu (default), on a aws m4.xlarge
# system for now, remove the limits on fluentd to unblock the tests
if [[ -z "${USE_DEFAULT_FLUENTD_CPU_LIMIT:-}" && -n "$(oc get -n ${LOGGING_NS} $fluentd_ds -o jsonpath={.spec.template.spec.containers[0].resources.limits.cpu})" ]] ; then
oc patch -n ${LOGGING_NS} $fluentd_ds --type=json --patch '[
{"op":"remove","path":"/spec/template/spec/containers/0/resources/limits/cpu"}]'
fi
# Make CI run with enabled debug logs for journald (BZ 1505602)
oc set -n ${LOGGING_NS} env $fluentd_ds COLLECT_JOURNAL_DEBUG_LOGS=true
# Make CI run with MUX_CLIENT_MODE off by default - individual tests will set
# MUX_CLIENT_MODE=maximal or minimal
# (the trailing "-" tells `oc set env` to remove the variable)
oc set -n ${LOGGING_NS} env $fluentd_ds MUX_CLIENT_MODE-
# Starting in 3.10, we can no longer mount /var/lib/docker/containers
# best-effort: `|| :` tolerates failure if the volume cannot be added
oc volumes -n ${LOGGING_NS} $fluentd_ds --overwrite --add -t hostPath \
--name=varlibdockercontainers -m /var/lib/docker --path=/var/lib/docker || :
# we're finished hacking fluentd - start it
start_fluentd
# start a fluentd performance monitor
# Continuously record `top` output from the running fluentd pod so its
# CPU/memory behavior during the test run can be inspected afterward.
# Loops forever; intended to be launched in a background subshell.
monitor_fluentd_top() {
    # work on a private copy of the kubeconfig so the `oc project` below
    # cannot change the context used by the main script
    cp $KUBECONFIG $ARTIFACT_DIR/monitor_fluentd_top.kubeconfig
    export KUBECONFIG=$ARTIFACT_DIR/monitor_fluentd_top.kubeconfig
    oc project ${LOGGING_NS} > /dev/null
    while true ; do
        local pod
        pod=$( get_running_pod fluentd 2> /dev/null ) || :
        if [ -z "$pod" ] ; then
            # fluentd pod is gone (e.g. restarting) - note it and retry shortly
            echo $( date --rfc-3339=ns ) fluentd is not running
            sleep 1
        else
            oc exec $pod -- top -b -d 1 || :
        fi
    done > $ARTIFACT_DIR/monitor_fluentd_top.log 2>&1
}
# Sample how far behind fluentd's journal read position is: every second,
# count the journal entries between fluentd's saved cursor and the journal
# head, and log "<epoch> <seconds-spent-counting> <entry-count>".
# Loops forever; intended to be launched in a background subshell.
monitor_fluentd_pos() {
    local cursor startts endts count
    while true ; do
        cursor=$( get_journal_pos_cursor )
        if [ -n "$cursor" ] ; then
            startts=$( date +%s )
            # quote the cursor: journal cursors contain ';' and '=' characters
            count=$( oal_sudo journalctl -m -c "$cursor" | wc -l )
            endts=$( date +%s )
            # arithmetic expansion instead of forking `expr`
            echo $endts $(( endts - startts )) $count
        else
            echo $( date --rfc-3339=ns ) no /var/log/journal.pos
        fi
        sleep 1
    done > $ARTIFACT_DIR/monitor_fluentd_pos.log 2>&1
}
# Every $interval seconds, log "<epoch> <count>" where count is the number of
# journal entries written during the previous interval - a rough journal log
# rate to correlate with fluentd's ability to keep up.
# Loops forever; intended to be launched in a background subshell.
monitor_journal_lograte() {
    local interval=60
    # declare the loop variable local so it does not leak into callers
    local count
    while true ; do
        count=$( oal_sudo journalctl -m -S "$( date +'%Y-%m-%d %H:%M:%S' --date="$interval seconds ago" )" | wc -l )
        echo $( date +%s ) $count
        sleep $interval
    done > $ARTIFACT_DIR/monitor_journal_lograte.log 2>&1
}
# Periodically capture Elasticsearch bulk thread pool statistics so indexing
# backpressure/rejections during the run can be diagnosed afterward.
# Loops forever; intended to be launched in a background subshell.
monitor_es_bulk_stats() {
    local interval=5
    # private kubeconfig copy so `oc project` does not disturb the main script
    cp $KUBECONFIG $ARTIFACT_DIR/monitor_es_bulk_stats.kubeconfig
    export KUBECONFIG=$ARTIFACT_DIR/monitor_es_bulk_stats.kubeconfig
    oc project ${LOGGING_NS} > /dev/null
    # wait for an es pod to exist before asking for its version
    local espod
    espod=$( get_es_pod es 2> /dev/null ) || :
    while [ -z "${espod}" ] ; do
        sleep 1
        espod=$( get_es_pod es 2> /dev/null ) || :
    done
    local es_ver
    es_ver=$( get_es_major_ver ) || :
    while [ -z "${es_ver}" ] ; do
        es_ver=$( get_es_major_ver ) || :
        sleep 1
    done
    local bulk_url
    bulk_url=$( get_bulk_thread_pool_url $es_ver "v" c r a q s qs )
    local essvc esopssvc
    while true ; do
        essvc=$( get_es_svc es 2> /dev/null ) || :
        esopssvc=$( get_es_svc es-ops 2> /dev/null ) || :
        if [ -n "${essvc}" ] ; then
            date -Ins >> $ARTIFACT_DIR/monitor_es_bulk_stats-es.log 2>&1
            curl_es $essvc "${bulk_url}" >> $ARTIFACT_DIR/monitor_es_bulk_stats-es.log 2>&1 || :
        fi
        # only query the ops cluster separately when it is a distinct service
        # ([[ ... && ... ]] instead of the deprecated `[ -a ]` form; the dead
        # esopspod assignment from the original was removed - it was never read)
        if [ -n "${esopssvc}" ] && [ "${essvc}" != "${esopssvc}" ] ; then
            date -Ins >> $ARTIFACT_DIR/monitor_es_bulk_stats-es-ops.log 2>&1
            curl_es $esopssvc "${bulk_url}" >> $ARTIFACT_DIR/monitor_es_bulk_stats-es-ops.log 2>&1 || :
        fi
        sleep $interval
    done
}
# Launch the monitors in the background, accumulating their PIDs in a single
# space-separated string; cleanup() word-splits this string when killing them.
monitor_fluentd_top & killpids=$!
monitor_fluentd_pos & killpids="$killpids $!"
monitor_journal_lograte & killpids="$killpids $!"
monitor_es_bulk_stats & killpids="$killpids $!"
# EXIT handler: preserve the script's exit status, stop the background
# monitors, run the common cleanup, and re-exit with the original status.
function cleanup() {
# must be the first statement so the triggering exit status is not clobbered
return_code=$?
# killpids is intentionally unquoted: it is a space-separated list of PIDs
kill $killpids
os::cleanup::all "${return_code}"
exit "${return_code}"
}
# from here on, any exit (normal or error) runs cleanup above
trap "cleanup" EXIT
# remove any stale htpasswd file left over from a previous run
rm -f ${OS_O_A_L_DIR}/temp/htpw.file
if [[ -z "${TEST_ONLY:-}" ]]; then
# no TEST_ONLY: stand up a fresh cluster to test against
"${OS_O_A_L_DIR}/hack/testing/setup.sh"
elif [[ -z "${KUBECONFIG:-}" ]]; then
# TEST_ONLY requires a kubeconfig pointing at the existing cluster
os::log::fatal "A \$KUBECONFIG must be specified with \$TEST_ONLY."
fi
# Run one test suite script, logging start/end and recording failures.
# Arguments: $1 - path to the suite script (hack/testing/{check,test}-*.sh)
# Globals read: LOGGING_NS, ENABLE_OPS_CLUSTER, expected_failures
# Globals written: failed (set to "true" on an unexpected suite failure)
function run_suite() {
    local test="$1"
    local suite_name
    suite_name="$( basename "${test}" '.sh' )"
    # re-login and select the project inside a junit sub-suite so the
    # os::cmd wrappers may be used
    os::test::junit::declare_suite_start "test/setup/${suite_name}"
    os::cmd::expect_success "oc login -u system:admin"
    os::cmd::expect_success "oc project $LOGGING_NS"
    os::test::junit::declare_suite_end
    os::log::info "Logging test suite ${suite_name} started at $( date )"
    local ops_cluster=${ENABLE_OPS_CLUSTER:-"true"}
    # run the suite with scrubbed env vars so it sets up its own dirs
    if OS_TMP_ENV_SET= LOG_DIR= ARTIFACT_DIR= "${test}" "${ops_cluster}"; then
        os::log::info "Logging test suite ${suite_name} succeeded at $( date )"
        # -F: match the suite name literally, not as a regex; -- guards
        # against names beginning with '-'
        if grep -qF -- "${suite_name}" <<<"${expected_failures[@]}"; then
            os::log::warning "Logging suite ${suite_name} is expected to fail"
        fi
    else
        os::log::warning "Logging test suite ${suite_name} failed at $( date )"
        if grep -qF -- "${suite_name}" <<<"${expected_failures[@]}"; then
            os::log::info "Logging suite ${suite_name} failure result ignored"
        else
            # unexpected failure - remember it so the script exits non-zero
            failed="true"
        fi
    fi
}
# done with entrypoint/bootstrapping - begin main tests
os::test::junit::declare_suite_end
# disable pathname expansion for SUITE and EXCLUDE_SUITE
# e.g. .* will expand to . .. .gitignore .travis.yml
# we do not want that
set -f
# default EXCLUDE_SUITE to '$^' - as an ERE that pattern can never match,
# so by default nothing is excluded
EXCLUDE_SUITE="${EXCLUDE_SUITE:-"$^"}"
# first pass: run every check-*.sh suite matching $SUITE but not $EXCLUDE_SUITE
for suite_selector in ${SUITE:-".*"} ; do
# re-enable globbing inside the loop so find/sort results behave normally
set +f
for test in $( find "${OS_O_A_L_DIR}/hack/testing" -type f -name 'check-*.sh' | grep -E "${suite_selector}" | grep -Ev "${EXCLUDE_SUITE}" | sort ); do
run_suite "${test}"
done
done
set -f
# second pass: run every test-*.sh suite with the same filters
for suite_selector in ${SUITE:-".*"} ; do
set +f
for test in $( find "${OS_O_A_L_DIR}/hack/testing" -type f -name 'test-*.sh' | grep -E "${suite_selector}" | grep -Ev "${EXCLUDE_SUITE}" | sort ); do
run_suite "${test}"
done
done
# Bundle the per-command oc client logs into a single tarball and remove the
# individual files to keep the artifact directory small.
# ([[ ... && ... ]] replaces the deprecated `[ -a ]` form.)
if [[ -n "${OC_LOG_DIR}" && -d "${OC_LOG_DIR}" ]] ; then
    pushd "$OC_LOG_DIR" > /dev/null
    # only create the tarball when the glob matches something - otherwise
    # tar would fail on the literal, unexpanded pattern
    if compgen -G 'oc.*.log.*' > /dev/null ; then
        tar cfz oclogs.tgz oc.*.log.*
        rm -f oc.*.log.*
    fi
    popd > /dev/null
fi
# exit non-zero if any suite failed unexpectedly (set by run_suite)
if [[ -n "${failed:-}" ]]; then
    exit 1
fi