From 134fe5e95e0fc67d8de6a6cbbd88bc242d6ab065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Fri, 28 Jul 2023 15:26:46 +0000 Subject: [PATCH] Add basic tracing capabilities This commit uses the OpenTelemetry SDK to produce tracing data. In the dev environment, tracing data is pushed to Grafana Agent via gRPC. Grafana Tempo is used to store traces and Grafana to consume the data from a web dashboard. --- .env | 1 + .gitignore | 1 + docker-compose.observability.yml | 36 +++++ enduro.toml | 5 + go.mod | 16 ++- go.sum | 21 +++ hack/etc/grafana-agent.yaml | 22 +++ hack/etc/grafana-datasources.yaml | 16 +++ hack/etc/grafana-tempo.yaml | 20 +++ hack/etc/prometheus.yaml | 11 ++ internal/api/api.go | 7 +- internal/batch/service.go | 4 +- internal/collection/goa.go | 4 +- internal/collection/workflow.go | 11 +- main.go | 132 +++++++++++++++--- .../en/docs/development/environment.md | 43 ++++++ .../en/docs/user-manual/configuration.md | 52 ++++++- 17 files changed, 374 insertions(+), 28 deletions(-) create mode 100644 .env create mode 100644 docker-compose.observability.yml create mode 100644 hack/etc/grafana-agent.yaml create mode 100644 hack/etc/grafana-datasources.yaml create mode 100644 hack/etc/grafana-tempo.yaml create mode 100644 hack/etc/prometheus.yaml diff --git a/.env b/.env new file mode 100644 index 00000000..b2e83160 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +COMPOSE_FILE=docker-compose.yml:docker-compose.observability.yml diff --git a/.gitignore b/.gitignore index 577f001d..63da1157 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/.env /build /dist /enduro diff --git a/docker-compose.observability.yml b/docker-compose.observability.yml new file mode 100644 index 00000000..d59c5148 --- /dev/null +++ b/docker-compose.observability.yml @@ -0,0 +1,36 @@ +services: + + grafana: + image: grafana/grafana:10.0.3 + volumes: + - ./hack/etc/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor + ports: + - "127.0.0.1:3000:3000" + + grafana-tempo: + image: "grafana/tempo:latest" + entrypoint: + - "/tempo" + command: + - "-config.file=/etc/grafana-tempo.yaml" + volumes: + - "./hack/etc/grafana-tempo.yaml:/etc/grafana-tempo.yaml" + - "./hack/stuff/tempo-data:/tmp/tempo" + ports: + - "127.0.0.1:4317:4317" + + grafana-agent: + image: "grafana/agent:latest" + entrypoint: + - "/bin/grafana-agent" + command: + - "-config.file=/etc/grafana-agent.yaml" + volumes: + - "./hack/etc/grafana-agent.yaml:/etc/grafana-agent.yaml" + ports: + - "127.0.0.1:12345:4317" diff --git a/enduro.toml b/enduro.toml index d9fc4dec..8510fb8f 100644 --- a/enduro.toml +++ b/enduro.toml @@ -4,6 +4,11 @@ verbosity = 2 debug = true debugListen = "127.0.0.1:9001" +[telemetry.traces] +enabled = true +address = "127.0.0.1:12345" +ratio = 1.0 + [temporal] namespace = "default" address = "127.0.0.1:7233" diff --git a/go.mod b/go.mod index 24222fb7..67ad5b45 100644 --- a/go.mod +++ b/go.mod @@ -29,12 +29,18 @@ require ( github.com/spf13/viper v1.16.0 github.com/stretchr/testify v1.8.4 go.artefactual.dev/tools v0.3.0 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.42.0 + go.opentelemetry.io/otel v1.16.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.16.0 + go.opentelemetry.io/otel/sdk v1.16.0 + go.opentelemetry.io/otel/trace v1.16.0 go.temporal.io/api v1.23.0 go.temporal.io/sdk v1.23.1 goa.design/goa/v3 v3.12.1 goa.design/plugins/v3 v3.12.1 gocloud.dev v0.30.0 golang.org/x/sync v0.3.0 + google.golang.org/grpc v1.56.1 gotest.tools/v3 v3.5.0 ) @@ -68,6 +74,8 @@ require ( github.com/dimfeld/httptreemux/v5 v5.5.0 // indirect github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect github.com/facebookgo/clock v0.0.0-20150410010913-600d898af40a // indirect + github.com/felixge/httpsnoop v1.0.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.2.4 // indirect github.com/gogo/googleapis v1.4.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -79,6 +87,7 @@ require ( github.com/google/wire v0.5.0 // indirect github.com/googleapis/gax-go/v2 v2.11.0 // indirect github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.15.2 // indirect github.com/hashicorp/errwrap v1.1.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect @@ -107,6 +116,10 @@ require ( github.com/yuin/gopher-lua v1.1.0 // indirect github.com/zach-klippenstein/goregen v0.0.0-20160303162051-795b5e3961ea // indirect go.opencensus.io v0.24.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.16.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.16.0 // indirect + go.opentelemetry.io/otel/metric v1.16.0 // indirect + go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.24.0 // indirect @@ -119,8 +132,9 @@ require ( golang.org/x/tools v0.10.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/api v0.128.0 // indirect + google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20230629202037-9506855d4529 // indirect - google.golang.org/grpc v1.56.1 // indirect google.golang.org/protobuf v1.31.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index e16ddd19..cf5deacd 100644 --- a/go.sum +++ b/go.sum @@ -1123,6 +1123,7 @@ github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYF github.com/fatih/color v1.14.1/go.mod h1:2oHN61fhTpgcxD3TSWCgKDiH1+x4OiDVVGH8WlgGZGg= github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/felixge/httpsnoop v1.0.2/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/flowstack/go-jsonschema v0.1.1/go.mod h1:yL7fNggx1o8rm9RlgXv7hTBWxdBM0rVwpMwimd3F3N0= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= @@ -1183,6 +1184,7 @@ github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbV github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/stdr v1.2.0/go.mod h1:YkVgnZu1ZjjL7xTxrfm/LLZBfkhTqSR1ydtm6jTKKwI= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo= github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA= @@ -1316,6 +1318,7 @@ github.com/golang-sql/sqlexp v0.1.0/go.mod h1:J4ad9Vo8ZCWQ2GMrC4UCQy1JpCbwU9m3EO github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= +github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= github.com/golang/glog v1.1.0/go.mod h1:pfYeQZ3JWZoXTV5sFc986z3HTpwQs9At6P4ImfuP3NQ= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -1496,6 +1499,7 @@ github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU0GSB0f8Nhgmxx0VIRUvaC0w= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.15.2 h1:gDLXvp5S9izjldquuoAhDzccbskOL6tDC5jMSyx3zxE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.15.2/go.mod h1:7pdNwVWBBHGiCxa9lAszqCJMbfTISJ7oMftp8+UGV08= github.com/hanwen/go-fuse/v2 v2.3.0/go.mod h1:xKwi1cF7nXAOBCXujD5ie0ZKsxc8GGSA1rlMJc+8IJs= github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q= @@ -2269,34 +2273,48 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.2 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.20.0/go.mod h1:2AboqHi0CiIZU0qwhtUfCYD1GeUzvvIXWNkhDt7ZMG4= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.31.0/go.mod h1:PFmBsWbldL1kiWZk9+0LBZz2brhByaGsvp6pRICMlPE= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.40.0/go.mod h1:pcQ3MM3SWvrA71U4GDqv9UFDJ3HQsW7y5ZO3tDTlUdI= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.42.0 h1:pginetY7+onl4qN1vl0xW/V/v6OBZ0vVdH+esuJgvmM= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.42.0/go.mod h1:XiYsayHc36K3EByOO6nbAXnAWbrUxdjUROCEeeROOH8= go.opentelemetry.io/otel v0.20.0/go.mod h1:Y3ugLH2oa81t5QO+Lty+zXf8zC9L26ax4Nzoxm/dooo= go.opentelemetry.io/otel v1.3.0/go.mod h1:PWIKzi6JCp7sM0k9yZ43VX+T345uNbAkDKwHVjb2PTs= go.opentelemetry.io/otel v1.6.0/go.mod h1:bfJD2DZVw0LBxghOTlgnlI0CV3hLDu9XF/QKOUXMTQQ= go.opentelemetry.io/otel v1.6.1/go.mod h1:blzUabWHkX6LJewxvadmzafgh/wnvBSDBdOuwkAtrWQ= go.opentelemetry.io/otel v1.11.1/go.mod h1:1nNhXBbWSD0nsL38H6btgnFN2k4i0sNLHNNMZMSbUGE= go.opentelemetry.io/otel v1.14.0/go.mod h1:o4buv+dJzx8rohcUeRmWUZhqupFvzWis188WlggnNeU= +go.opentelemetry.io/otel v1.16.0 h1:Z7GVAX/UkAXPKsy94IU+i6thsQS4nb7LviLpnaNeW8s= +go.opentelemetry.io/otel v1.16.0/go.mod h1:vl0h9NUa1D5s1nv3A5vZOYWn8av4K8Ml6JDeHrT/bx4= go.opentelemetry.io/otel/exporters/otlp v0.20.0/go.mod h1:YIieizyaN77rtLJra0buKiNBOm9XQfkPEKBeuhoMwAM= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.3.0/go.mod h1:VpP4/RMn8bv8gNo9uK7/IMY4mtWLELsS+JIP0inH0h4= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.6.1/go.mod h1:NEu79Xo32iVb+0gVNV8PMd7GoWqnyDXRlj04yFjqz40= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.14.0/go.mod h1:UFG7EBMRdXyFstOwH028U0sVf+AvukSGhF0g8+dmNG8= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.16.0 h1:t4ZwRPU+emrcvM2e9DHd0Fsf0JTPVcbfa/BhTDF03d0= +go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.16.0/go.mod h1:vLarbg68dH2Wa77g71zmKQqlQ8+8Rq3GRG31uc0WcWI= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.3.0/go.mod h1:hO1KLR7jcKaDDKDkvI9dP/FIhpmna5lkqPUQdEjFAM8= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.6.1/go.mod h1:YJ/JbY5ag/tSQFXzH3mtDmHqzF3aFn3DI/aB1n7pt4w= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.14.0/go.mod h1:HrbCVv40OOLTABmOn1ZWty6CHXkU8DK/Urc43tHug70= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.16.0 h1:cbsD4cUcviQGXdw8+bo5x2wazq10SKz8hEbtCRPcU78= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.16.0/go.mod h1:JgXSGah17croqhJfhByOLVY719k1emAXC8MVhCIJlRs= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.3.0/go.mod h1:keUU7UfnwWTWpJ+FWnyqmogPa82nuU5VUANFq49hlMY= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.6.1/go.mod h1:UJJXJj0rltNIemDMwkOJyggsvyMG9QHfJeFH0HS5JjM= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.14.0/go.mod h1:5w41DY6S9gZrbjuq6Y+753e96WfPha5IcsOSZTtullM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.16.0 h1:TVQp/bboR4mhZSav+MdgXB8FaRho1RC8UwVn3T0vjVc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.16.0/go.mod h1:I33vtIe0sR96wfrUcilIzLoA3mLHhRmz9S9Te0S3gDo= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.3.0/go.mod h1:QNX1aly8ehqqX1LEa6YniTU7VY9I6R3X/oPxhGdTceE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.6.1/go.mod h1:DAKwdo06hFLc0U88O10x4xnb5sc7dDRDqRuiN+io8JE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.14.0/go.mod h1:+N7zNjIJv4K+DeX67XXET0P+eIciESgaFDBqh+ZJFS4= go.opentelemetry.io/otel/metric v0.20.0/go.mod h1:598I5tYlH1vzBjn+BTuhzTCSb/9debfNp6R3s7Pr1eU= go.opentelemetry.io/otel/metric v0.28.0/go.mod h1:TrzsfQAmQaB1PDcdhBauLMk7nyyg9hm+GoQq/ekE9Iw= go.opentelemetry.io/otel/metric v0.37.0/go.mod h1:DmdaHfGt54iV6UKxsV9slj2bBRJcKC1B1uvDLIioc1s= +go.opentelemetry.io/otel/metric v1.16.0 h1:RbrpwVG1Hfv85LgnZ7+txXioPDoh6EdbZHo26Q3hqOo= +go.opentelemetry.io/otel/metric v1.16.0/go.mod h1:QE47cpOmkwipPiefDwo2wDzwJrlfxxNYodqc4xnGCo4= go.opentelemetry.io/otel/oteltest v0.20.0/go.mod h1:L7bgKf9ZB7qCwT9Up7i9/pn0PWIa9FqQ2IQ8LoxiGnw= go.opentelemetry.io/otel/sdk v0.20.0/go.mod h1:g/IcepuwNsoiX5Byy2nNV0ySUF1em498m7hBWC279Yc= go.opentelemetry.io/otel/sdk v1.3.0/go.mod h1:rIo4suHNhQwBIPg9axF8V9CA72Wz2mKF1teNrup8yzs= go.opentelemetry.io/otel/sdk v1.6.1/go.mod h1:IVYrddmFZ+eJqu2k38qD3WezFR2pymCzm8tdxyh3R4E= go.opentelemetry.io/otel/sdk v1.11.1/go.mod h1:/l3FE4SupHJ12TduVjUkZtlfFqDCQJlOlithYrdktys= go.opentelemetry.io/otel/sdk v1.14.0/go.mod h1:bwIC5TjrNG6QDCHNWvW4HLHtUQ4I+VQDsnjhvyZCALM= +go.opentelemetry.io/otel/sdk v1.16.0 h1:Z1Ok1YsijYL0CSJpHt4cS3wDDh7p572grzNrBMiMWgE= +go.opentelemetry.io/otel/sdk v1.16.0/go.mod h1:tMsIuKXuuIWPBAOrH+eHtvhTL+SntFtXF9QD68aP6p4= go.opentelemetry.io/otel/sdk/export/metric v0.20.0/go.mod h1:h7RBNMsDJ5pmI1zExLi+bJK+Dr8NQCh0qGhm1KDnNlE= go.opentelemetry.io/otel/sdk/metric v0.20.0/go.mod h1:knxiS8Xd4E/N+ZqKmUPf3gTTZ4/0TjTXukfxjzSTpHE= go.opentelemetry.io/otel/trace v0.20.0/go.mod h1:6GjCW8zgDjwGHGa6GkyeB8+/5vjT16gUEi0Nf1iBdgw= @@ -2305,10 +2323,13 @@ go.opentelemetry.io/otel/trace v1.6.0/go.mod h1:qs7BrU5cZ8dXQHBGxHMOxwME/27YH2qE go.opentelemetry.io/otel/trace v1.6.1/go.mod h1:RkFRM1m0puWIq10oxImnGEduNBzxiN7TXluRBtE+5j0= go.opentelemetry.io/otel/trace v1.11.1/go.mod h1:f/Q9G7vzk5u91PhbmKbg1Qn0rzH1LJ4vbPHFGkTPtOk= go.opentelemetry.io/otel/trace v1.14.0/go.mod h1:8avnQLK+CG77yNLUae4ea2JDQ6iT+gozhnZjy/rw9G8= +go.opentelemetry.io/otel/trace v1.16.0 h1:8JRpaObFoW0pxuVPapkgH8UhHQj+bJW8jJsCZEu5MQs= +go.opentelemetry.io/otel/trace v1.16.0/go.mod h1:Yt9vYq1SdNz3xdjZZK7wcXv1qv2pwLkqr2QVwea0ef0= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.11.0/go.mod h1:QpEjXPrNQzrFDZgoTo49dgHR9RYRSrg3NAKnUGl9YpQ= go.opentelemetry.io/proto/otlp v0.12.1/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= +go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJPI1Nnw= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.temporal.io/api v1.21.0/go.mod h1:xlsUEakkN2vU2/WV7e5NqMG4N93nfuNfvbXdaXUpU8w= go.temporal.io/api v1.23.0 h1:4y9mTQjEHsE0Du0WJ2ExJUcP/1/a+B/UefzIDm4ALTE= diff --git a/hack/etc/grafana-agent.yaml b/hack/etc/grafana-agent.yaml new file mode 100644 index 00000000..23b7f03c --- /dev/null +++ b/hack/etc/grafana-agent.yaml @@ -0,0 +1,22 @@ +server: + log_level: "info" + +traces: + configs: + - name: "default" + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + remote_write: + - endpoint: "grafana-tempo:4317" + insecure: true + batch: + timeout: "1s" + send_batch_size: 100 + automatic_logging: + backend: "stdout" + spans: true + processes: true + roots: true diff --git a/hack/etc/grafana-datasources.yaml b/hack/etc/grafana-datasources.yaml new file mode 100644 index 00000000..16fe24b3 --- /dev/null +++ b/hack/etc/grafana-datasources.yaml @@ -0,0 +1,16 @@ +apiVersion: 1 + +datasources: +- name: "Tempo" + type: "tempo" + access: "proxy" + orgId: 1 + url: "http://grafana-tempo:3200" + basicAuth: false + isDefault: true + version: 1 + editable: false + apiVersion: 1 + uid: "tempo" + jsonData: + httpMethod: "GET" diff --git a/hack/etc/grafana-tempo.yaml b/hack/etc/grafana-tempo.yaml new file mode 100644 index 00000000..385429c3 --- /dev/null +++ b/hack/etc/grafana-tempo.yaml @@ -0,0 +1,20 @@ +server: + http_listen_port: 3200 + graceful_shutdown_timeout: "1s" + +distributor: + receivers: + otlp: + protocols: + grpc: + log_received_spans: + enabled: true + include_all_attributes: true + +storage: + trace: + backend: "local" + wal: + path: "/tmp/tempo/wal" + local: + path: "/tmp/tempo/blocks" diff --git a/hack/etc/prometheus.yaml b/hack/etc/prometheus.yaml new file mode 100644 index 00000000..fc8e4ad2 --- /dev/null +++ b/hack/etc/prometheus.yaml @@ -0,0 +1,11 @@ +global: + scrape_interval: "15s" + evaluation_interval: "15s" + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + - job_name: "tempo" + static_configs: + - targets: ["tempo:3200"] diff --git a/internal/api/api.go b/internal/api/api.go index f5555dc9..2dea820a 100644 --- a/internal/api/api.go +++ b/internal/api/api.go @@ -19,6 +19,8 @@ import ( "github.com/go-logr/logr" "github.com/gorilla/websocket" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "go.opentelemetry.io/otel/trace" goahttp "goa.design/goa/v3/http" goahttpmwr "goa.design/goa/v3/http/middleware" "goa.design/goa/v3/middleware" @@ -37,7 +39,9 @@ import ( ) func HTTPServer( - logger logr.Logger, config *Config, + logger logr.Logger, + tp trace.TracerProvider, + config *Config, pipesvc intpipe.Service, batchsvc intbatch.Service, colsvc intcol.Service, @@ -81,6 +85,7 @@ func HTTPServer( // Global middlewares. var handler http.Handler = mux + handler = otelhttp.NewHandler(handler, "enduro/internal/api", otelhttp.WithTracerProvider(tp)) handler = goahttpmwr.RequestID()(handler) handler = versionHeaderMiddleware(config.AppVersion)(handler) if config.Debug { diff --git a/internal/batch/service.go b/internal/batch/service.go index 510f24a1..b8abff95 100644 --- a/internal/batch/service.go +++ b/internal/batch/service.go @@ -9,6 +9,7 @@ import ( "github.com/go-logr/logr" "go.artefactual.dev/tools/ref" + "go.opentelemetry.io/otel/trace" temporalapi_enums "go.temporal.io/api/enums/v1" temporalapi_serviceerror "go.temporal.io/api/serviceerror" temporalsdk_client "go.temporal.io/sdk/client" @@ -134,7 +135,8 @@ func (s *batchImpl) Hints(ctx context.Context) (*goabatch.BatchHintsResult, erro func (s *batchImpl) InitProcessingWorkflow(ctx context.Context, req *collection.ProcessingWorkflowRequest) error { req.ValidationConfig = validation.Config{} - err := collection.InitProcessingWorkflow(ctx, s.cc, s.taskQueue, req) + tr := trace.NewNoopTracerProvider().Tracer("") + err := collection.InitProcessingWorkflow(ctx, tr, s.cc, s.taskQueue, req) if err != nil { s.logger.Error(err, "Error initializing processing workflow.") } diff --git a/internal/collection/goa.go b/internal/collection/goa.go index 07eb435d..87362d98 100644 --- a/internal/collection/goa.go +++ b/internal/collection/goa.go @@ -13,6 +13,7 @@ import ( "strings" "time" + "go.opentelemetry.io/otel/trace" temporalapi_common "go.temporal.io/api/common/v1" temporalapi_enums "go.temporal.io/api/enums/v1" temporalapi_serviceerror "go.temporal.io/api/serviceerror" @@ -265,7 +266,8 @@ func (w *goaWrapper) Retry(ctx context.Context, payload *goacollection.RetryPayl req.WorkflowID = *goacol.WorkflowID req.CollectionID = goacol.ID - if err := InitProcessingWorkflow(ctx, w.cc, w.taskQueue, req); err != nil { + tr := trace.NewNoopTracerProvider().Tracer("") + if err := InitProcessingWorkflow(ctx, tr, w.cc, w.taskQueue, req); err != nil { return fmt.Errorf("error starting the new workflow instance: %w", err) } diff --git a/internal/collection/workflow.go b/internal/collection/workflow.go index 81593259..0b903fbc 100644 --- a/internal/collection/workflow.go +++ b/internal/collection/workflow.go @@ -6,6 +6,8 @@ import ( "time" "github.com/google/uuid" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" temporalsdk_api_enums "go.temporal.io/api/enums/v1" temporalsdk_client "go.temporal.io/sdk/client" @@ -60,7 +62,10 @@ type ProcessingWorkflowRequest struct { RejectDuplicates bool } -func InitProcessingWorkflow(ctx context.Context, c temporalsdk_client.Client, taskQueue string, req *ProcessingWorkflowRequest) error { +func InitProcessingWorkflow(ctx context.Context, tr trace.Tracer, c temporalsdk_client.Client, taskQueue string, req *ProcessingWorkflowRequest) error { + _, span := tr.Start(ctx, "InitProcessingWorkflow") + defer span.End() + if req.WorkflowID == "" { req.WorkflowID = fmt.Sprintf("processing-workflow-%s", uuid.New().String()) } @@ -74,6 +79,10 @@ func InitProcessingWorkflow(ctx context.Context, c temporalsdk_client.Client, ta WorkflowIDReusePolicy: temporalsdk_api_enums.WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE, } _, err := c.ExecuteWorkflow(ctx, opts, ProcessingWorkflowName, req) + if err != nil { + span.SetStatus(codes.Error, "ExecuteWorkflow failed") + span.RecordError(err) + } return err } diff --git a/main.go b/main.go index 00432060..6337a780 100644 --- a/main.go +++ b/main.go @@ -13,15 +13,25 @@ import ( "syscall" "time" + "github.com/go-logr/logr" "github.com/oklog/run" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/spf13/pflag" "github.com/spf13/viper" "go.artefactual.dev/tools/log" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.17.0" + "go.opentelemetry.io/otel/trace" temporalsdk_activity "go.temporal.io/sdk/activity" temporalsdk_client "go.temporal.io/sdk/client" temporalsdk_worker "go.temporal.io/sdk/worker" temporalsdk_workflow "go.temporal.io/sdk/workflow" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" "github.com/artefactual-labs/enduro/internal/api" "github.com/artefactual-labs/enduro/internal/batch" @@ -92,12 +102,28 @@ func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // Set up tracing. + tp, shutdown, err := initTracerProvider(ctx, logger, config.Telemetry) + if err != nil { + logger.Error(err, "Error creating tracer provider.") + os.Exit(1) + } + defer func() { _ = shutdown(ctx) }() + tracer := tp.Tracer("enduro") + database, err := db.Connect(config.Database.DSN) if err != nil { logger.Error(err, "Database configuration failed.") os.Exit(1) } - _ = database.Ping() + _, span := tracer.Start(ctx, "db-ping") + span.SetAttributes(attribute.String("db.driver", "mysql")) + if err := database.Ping(); err != nil { + span.SetStatus(codes.Error, "ping failed") + span.RecordError(err) + } + span.AddEvent("Connected!") + span.End() temporalClient, err := temporalsdk_client.Dial(temporalsdk_client.Options{ Namespace: config.Temporal.Namespace, @@ -152,7 +178,7 @@ func main() { g.Add( func() error { - srv = api.HTTPServer(logger, &config.API, pipesvc, batchsvc, colsvc) + srv = api.HTTPServer(logger, tp, &config.API, pipesvc, batchsvc, colsvc) return srv.ListenAndServe() }, func(err error) { @@ -166,8 +192,8 @@ func main() { // Watchers, where each watcher is a group actor. { for _, w := range wsvc.Watchers() { + w := w done := make(chan struct{}) - cur := w g.Add( func() error { for { @@ -175,30 +201,36 @@ func main() { case <-done: return nil default: - event, err := cur.Watch(ctx) + event, err := w.Watch(ctx) if err != nil { if !errors.Is(err, watcher.ErrWatchTimeout) { - logger.Error(err, "Error monitoring watcher interface.", "watcher", cur) + logger.Error(err, "Error monitoring watcher interface.", "watcher", w) } continue } + ctx, span := tracer.Start(ctx, "Watcher") + span.SetAttributes( + attribute.String("watcher", event.WatcherName), + attribute.String("bucket", event.Bucket), + attribute.String("key", event.Key), + attribute.Bool("dir", event.IsDir), + ) logger.V(1).Info("Starting new workflow", "watcher", event.WatcherName, "bucket", event.Bucket, "key", event.Key, "dir", event.IsDir) - go func() { - req := collection.ProcessingWorkflowRequest{ - WatcherName: event.WatcherName, - PipelineNames: event.PipelineName, - RetentionPeriod: event.RetentionPeriod, - CompletedDir: event.CompletedDir, - StripTopLevelDir: event.StripTopLevelDir, - RejectDuplicates: event.RejectDuplicates, - Key: event.Key, - IsDir: event.IsDir, - ValidationConfig: config.Validation, - } - if err := collection.InitProcessingWorkflow(ctx, temporalClient, config.Temporal.TaskQueue, &req); err != nil { - logger.Error(err, "Error initializing processing workflow.") - } - }() + req := collection.ProcessingWorkflowRequest{ + WatcherName: event.WatcherName, + PipelineNames: event.PipelineName, + RetentionPeriod: event.RetentionPeriod, + CompletedDir: event.CompletedDir, + StripTopLevelDir: event.StripTopLevelDir, + RejectDuplicates: event.RejectDuplicates, + Key: event.Key, + IsDir: event.IsDir, + ValidationConfig: config.Validation, + } + if err := collection.InitProcessingWorkflow(ctx, tracer, temporalClient, config.Temporal.TaskQueue, &req); err != nil { + logger.Error(err, "Error initializing processing workflow.") + } + span.End() } } }, @@ -346,6 +378,7 @@ type configuration struct { Watcher watcher.Config Pipeline []pipeline.Config Validation validation.Config + Telemetry TelemetryConfig // This is a workaround for client-specific functionality. // Simple mechanism to support an arbitrary number of hooks and parameters. @@ -393,3 +426,60 @@ func readConfig(v *viper.Viper, config *configuration, configFile string) (found return found, nil } + +type TelemetryConfig struct { + Traces struct { + Enabled bool + Address string + SamplingRatio *float64 + } +} + +func initTracerProvider(ctx context.Context, logger logr.Logger, cfg TelemetryConfig) (trace.TracerProvider, func(context.Context) error, error) { + if !cfg.Traces.Enabled || cfg.Traces.Address == "" { + logger.V(1).Info("Tracing system is disabled.", "enabled", cfg.Traces.Enabled, "addr", cfg.Traces.Address) + shutdown := func(context.Context) error { return nil } + return trace.NewNoopTracerProvider(), shutdown, nil + } + + conn, err := grpc.DialContext( + ctx, + cfg.Traces.Address, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + ) + if err != nil { + return nil, nil, fmt.Errorf("can't connect to telemetry data collector: %v", err) + } + + exporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithGRPCConn(conn)) + if err != nil { + return nil, nil, fmt.Errorf("can't create gRPC telemetry data exporter: %v", err) + } + + resource, _ := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(appName), + semconv.ServiceVersion(version), + ), + ) + + var ratio float64 = 1 + if cfg.Traces.SamplingRatio != nil { + ratio = *cfg.Traces.SamplingRatio + } + sampler := sdktrace.ParentBased(sdktrace.TraceIDRatioBased(ratio)) + + tp := sdktrace.NewTracerProvider( + sdktrace.WithSampler(sampler), + sdktrace.WithResource(resource), + sdktrace.WithBatcher(exporter), + ) + shutdown := func(context.Context) error { return tp.Shutdown(ctx) } + + logger.V(1).Info("Using OTel gRPC tracer provider.", "addr", cfg.Traces.Address, "ratio", ratio) + + return tp, shutdown, nil +} diff --git a/website/content/en/docs/development/environment.md b/website/content/en/docs/development/environment.md index be37e475..9b6cc045 100644 --- a/website/content/en/docs/development/environment.md +++ b/website/content/en/docs/development/environment.md @@ -117,6 +117,49 @@ You can enable it in Visual Studio Code as follows: } ``` +## Enable tracing + +Enable the observability services by editing the root `.env` file as follows: + + COMPOSE_FILE=docker-compose.yml:docker-compose.observability.yml + +Start all services again: + + docker compose up -d + +Three new services should be ready: + +* Grafana (listens on `127.0.0.1:3000`), +* Grafana Agent (listens on `127.0.0.1:12345`), and +* Grafana Tempo (listens on `127.0.0.1:4317`). + +Enable tracing in `enduro.toml`: + +```toml +[telemetry.traces] +enabled = true +address = "127.0.0.1:12345" +ratio = 1.0 +``` + +Run enduro: + + make run + +Things that you can do: + +* Observe that `grafana-agent` logs received traces: + + docker compose logs -f grafana-agent + +* Observe that `grafana-tempo` logs stored traces: + + docker compose logs -f grafana-tempo + +* Use the [Grafana explorer](http://127.0.0.1:3000/explore) to search and + visualize traces. + + [docker-engine]: https://docs.docker.com/engine/install/ [mc]: https://docs.min.io/docs/minio-client-quickstart-guide.html [go]: https://golang.org/doc/install diff --git a/website/content/en/docs/user-manual/configuration.md b/website/content/en/docs/user-manual/configuration.md index 91df7b44..730e36f9 100644 --- a/website/content/en/docs/user-manual/configuration.md +++ b/website/content/en/docs/user-manual/configuration.md @@ -23,10 +23,16 @@ file using the optional argument `--config=example.toml`. Main configuration attributes that do not belong to a specific section. +#### `verbosity` (Int) + +Chattiness of log operations. `0` is the default and best for production. + +E.g.: `0` + #### `debug` (Bool) -When enabled, the application logger will be configured with increased -verbosity and a colored formatter. +When enabled, the application logger will be configured with a human-readable +format and a colored log record formatter. E.g.: `false` @@ -37,6 +43,48 @@ data. E.g.: `"127.0.0.1:9001"` +### `[telemetry]` + +Telemetry configuration details. + +#### `[telemetry.traces]` + +Tracing configuration. + +For example: + +```toml +[telemetry.traces] +enabled = false +address = "127.0.0.1:12345" +ratio = 1.0 +``` + +#### `enabled` (Bool) + +When enabled, traces will be delivered to the tracing data collector. It is +disabled by default. + +E.g.: `false` + +#### `address` (String) + +Address of the OpenTelemetry tracing data collector (gRPC), e.g. Grafana Agent +or OpenTelemetry Collector. + +E.g.: `"127.0.0.1:12345"` + +#### `ratio` (Float64) + +Sampling ratio. A sampling ratio of 0.25 means that, on average, one out of +every four traces will be sampled. The default is to sample every transfer. + +E.g.: `0.25` + +#### `[telemetry.metrics]` + +Not available yet. + ### `[temporal]` Connection details with the Temporal server.