From dec238f94d879857667ed1aafc60e16d98060fc8 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 13:46:39 -0800 Subject: [PATCH 01/47] get build with new fluent-bit --- .pipelines/azure-pipeline-build.yml | 5 +- otelcollector/build/linux/Dockerfile | 194 +++++++++++------------ otelcollector/fluent-bit/fluent-bit.conf | 172 ++------------------ otelcollector/scripts/setup.sh | 5 +- 4 files changed, 117 insertions(+), 259 deletions(-) diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml index 5e2ba7f78..489cb8a5d 100644 --- a/.pipelines/azure-pipeline-build.yml +++ b/.pipelines/azure-pipeline-build.yml @@ -2,6 +2,7 @@ trigger: branches: include: - main + - grace/remove-telegraf pr: autoCancel: true @@ -23,7 +24,7 @@ variables: NODE_EXPORTER_IMAGE: 'mcr.microsoft.com/oss/prometheus/node-exporter:v1.6.0' IS_PR: $[eq(variables['Build.Reason'], 'PullRequest')] IS_MAIN_BRANCH: $[eq(variables['Build.SourceBranchName'], 'main')] - BUILD_WINDOWS: true + BUILD_WINDOWS: false Codeql.Enabled: true GOLANG_VERSION: '1.22.7' @@ -634,7 +635,7 @@ stages: docker buildx create --name dockerbuilder --driver docker-container --driver-opt image=mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1 --use docker buildx inspect --bootstrap - docker buildx build . --platform=linux/amd64,linux/arm64 --file ./build/linux/Dockerfile -t $(LINUX_FULL_IMAGE_NAME) --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --push # --cache-to type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector,mode=max --cache-from type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector + docker buildx build . --platform=linux/amd64 --file ./build/linux/Dockerfile -t $(LINUX_FULL_IMAGE_NAME) --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --push # --cache-to type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector,mode=max --cache-from type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector docker pull $(LINUX_FULL_IMAGE_NAME) docker system prune --all -f workingDirectory: $(Build.SourcesDirectory)/otelcollector/ diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 9ea4e7139..be5eebe98 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -123,106 +123,106 @@ RUN chmod 775 $tmpdir/*.sh; RUN sync; RUN $tmpdir/setup.sh ${TARGETARCH} # If wanting to run without distroless, uncomment this line and comment everything after -# CMD [ "/opt/main.sh" ] +ENTRYPOINT ["./opt/main"] -FROM mcr.microsoft.com/cbl-mariner/distroless/base:2.0 -# Below is for ContainerInsightsPrometheusCollector-Prod AppInsights Resource -ENV APPLICATIONINSIGHTS_AUTH_PUBLIC MWNkYTMxMTItYWY1Ni00ZmNiLWI4MDQtZjg5NDVhYTFjYjMy -# Below is for ContainerInsightsPrometheusCollector-Fairfax AppInsights Resource -ENV APPLICATIONINSIGHTS_AUTH_USGOVERNMENT ZmRjMTE0MmUtY2U0YS1mNTFmLWE4M2EtODBjM2ZjNDYwNGE5 -# Below is for ContainerInsightsPrometheusCollector-Mooncake AppInsights Resource -ENV APPLICATIONINSIGHTS_AUTH_CHINACLOUD ZTcyY2ZjOTYtNjY3Zi1jZGYwLTkwOWMtNzhiZjAwZjQ0NDg4 -# Below is for ContainerInsightsPrometheusCollector-USSec AppInsights Resource -ENV APPLICATIONINSIGHTS_AUTH_USSEC ZTg4MzFlZGYtNWQ1ZC0wYjZmLTk3MGUtNDkxNTgyYjliMDFl -# Below is for ContainerInsightsPrometheusCollector-USNat AppInsights Resource -ENV APPLICATIONINSIGHTS_AUTH_USNAT ZTliNjRmZmUtZDZlYi0xYjczLThjYWQtNDU2OTFjN2FhNzIw -# Set environment variables for mdsd -ENV MDSD_LOG="/opt/microsoft/linuxmonagent" -ENV SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -ENV MDSD_FLUENT_SOCKET_PORT="28230" -ENV ENABLE_MCS="true" -ENV MONITORING_USE_GENEVA_CONFIG_SERVICE="false" -ENV MDSD_USE_LOCAL_PERSISTENCY="false" -ENV SSL_CERT_FILE="/etc/pki/tls/certs/ca-bundle.crt" -ENV TELEMETRY_DISABLED false -# Needed for ME, see https://github.com/microsoft/cpprestsdk/issues/1481 -ENV MALLOC_ARENA_MAX=2 -ENV PATH="/busybin:${PATH}" -ENV OS_TYPE "linux" +# FROM mcr.microsoft.com/cbl-mariner/distroless/base:2.0 +# # Below is for ContainerInsightsPrometheusCollector-Prod AppInsights Resource +# ENV APPLICATIONINSIGHTS_AUTH_PUBLIC MWNkYTMxMTItYWY1Ni00ZmNiLWI4MDQtZjg5NDVhYTFjYjMy +# # Below is for ContainerInsightsPrometheusCollector-Fairfax AppInsights Resource +# ENV APPLICATIONINSIGHTS_AUTH_USGOVERNMENT ZmRjMTE0MmUtY2U0YS1mNTFmLWE4M2EtODBjM2ZjNDYwNGE5 +# # Below is for ContainerInsightsPrometheusCollector-Mooncake AppInsights Resource +# ENV APPLICATIONINSIGHTS_AUTH_CHINACLOUD ZTcyY2ZjOTYtNjY3Zi1jZGYwLTkwOWMtNzhiZjAwZjQ0NDg4 +# # Below is for ContainerInsightsPrometheusCollector-USSec AppInsights Resource +# ENV APPLICATIONINSIGHTS_AUTH_USSEC ZTg4MzFlZGYtNWQ1ZC0wYjZmLTk3MGUtNDkxNTgyYjliMDFl +# # Below is for ContainerInsightsPrometheusCollector-USNat AppInsights Resource +# ENV APPLICATIONINSIGHTS_AUTH_USNAT ZTliNjRmZmUtZDZlYi0xYjczLThjYWQtNDU2OTFjN2FhNzIw +# # Set environment variables for mdsd +# ENV MDSD_LOG="/opt/microsoft/linuxmonagent" +# ENV SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" +# ENV MDSD_FLUENT_SOCKET_PORT="28230" +# ENV ENABLE_MCS="true" +# ENV MONITORING_USE_GENEVA_CONFIG_SERVICE="false" +# ENV MDSD_USE_LOCAL_PERSISTENCY="false" +# ENV SSL_CERT_FILE="/etc/pki/tls/certs/ca-bundle.crt" +# ENV TELEMETRY_DISABLED false +# # Needed for ME, see https://github.com/microsoft/cpprestsdk/issues/1481 +# ENV MALLOC_ARENA_MAX=2 +# ENV PATH="/busybin:${PATH}" +# ENV OS_TYPE "linux" -# files -COPY --from=builder /opt /opt -COPY --from=builder /etc /etc -COPY --from=builder /busybin /busybin -COPY --from=builder /static/react /static/react -COPY --from=builder /usr/sbin/me.config /usr/sbin/me_internal.config /usr/sbin/me_ds.config /usr/sbin/me_ds_internal.config /usr/sbin/ -COPY --from=builder /var/opt/microsoft /var/opt/microsoft -COPY --from=builder /var/lib/logrotate /var/lib/logrotate -COPY --from=builder /var/spool/cron /var/spool/cron -COPY --from=builder /usr/share/p11-kit /usr/share/p11-kit -COPY --from=builder /usr/share/pki/ /usr/share/pki -COPY --from=builder /opt/microsoft/liveness /opt/microsoft/liveness -COPY --from=builder /opt/microsoft/configmapparser /opt/microsoft/configmapparser +# # files +# COPY --from=builder /opt /opt +# COPY --from=builder /etc /etc +# COPY --from=builder /busybin /busybin +# COPY --from=builder /static/react /static/react +# COPY --from=builder /usr/sbin/me.config /usr/sbin/me_internal.config /usr/sbin/me_ds.config /usr/sbin/me_ds_internal.config /usr/sbin/ +# COPY --from=builder /var/opt/microsoft /var/opt/microsoft +# COPY --from=builder /var/lib/logrotate /var/lib/logrotate +# COPY --from=builder /var/spool/cron /var/spool/cron +# COPY --from=builder /usr/share/p11-kit /usr/share/p11-kit +# COPY --from=builder /usr/share/pki/ /usr/share/pki +# COPY --from=builder /opt/microsoft/liveness /opt/microsoft/liveness +# COPY --from=builder /opt/microsoft/configmapparser /opt/microsoft/configmapparser -# executables -COPY --from=builder /usr/sbin/MetricsExtension /usr/sbin/MetricsExtension -COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait -COPY --from=builder /usr/bin/bash /usr/bin/bash -COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox -COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit -COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf -COPY --from=builder /usr/sbin/crond /usr/sbin/crond -COPY --from=builder /usr/bin/vim /usr/bin/vim -COPY --from=builder /usr/share/vim /usr/share/vim -COPY --from=builder /usr/sbin/mdsd /usr/sbin/mdsd -COPY --from=builder /usr/sbin/logrotate /usr/sbin/logrotate -COPY --from=builder /usr/bin/gzip /usr/bin/ -COPY --from=builder /usr/bin/curl /usr/bin/ -COPY --from=builder /usr/bin/update-ca-trust /usr/bin -COPY --from=builder /bin/sh /bin/sh -COPY --from=builder /usr/bin/p11-kit /usr/bin -COPY --from=builder /usr/bin/trust /usr/bin +# # executables +# COPY --from=builder /usr/sbin/MetricsExtension /usr/sbin/MetricsExtension +# COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait +# COPY --from=builder /usr/bin/bash /usr/bin/bash +# COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox +# COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit +# COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf +# COPY --from=builder /usr/sbin/crond /usr/sbin/crond +# COPY --from=builder /usr/bin/vim /usr/bin/vim +# COPY --from=builder /usr/share/vim /usr/share/vim +# COPY --from=builder /usr/sbin/mdsd /usr/sbin/mdsd +# COPY --from=builder /usr/sbin/logrotate /usr/sbin/logrotate +# COPY --from=builder /usr/bin/gzip /usr/bin/ +# COPY --from=builder /usr/bin/curl /usr/bin/ +# COPY --from=builder /usr/bin/update-ca-trust /usr/bin +# COPY --from=builder /bin/sh /bin/sh +# COPY --from=builder /usr/bin/p11-kit /usr/bin +# COPY --from=builder /usr/bin/trust /usr/bin -# bash dependencies -COPY --from=builder /lib/libreadline.so.8 /lib/ -COPY --from=builder /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /usr/lib/ -# inotifywait dependencies -COPY --from=builder /lib/libinotifytools.so.0 /lib/ -# crond dependencies -COPY --from=builder /lib/libselinux.so.1 /lib/libpam.so.0 /lib/libc.so.6 /lib/libpcre.so.1 /lib/libaudit.so.1 /lib/libcap-ng.so.0/ /lib/ -# vim dependencies -COPY --from=builder /lib/libm.so.6 /lib/libtinfo.so.6 /lib/ -# metricsextension dependencies -# libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -COPY --from=builder /lib/libboost_filesystem.so.1.76.0 /lib/libcpprest.so.2.10 /lib/libstdc++.so.6 /lib/libm.so.6 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/libbrotlidec.so.1 /lib/libbrotlienc.so.1 /lib/libz.so.1 /lib/libbrotlicommon.so.1 /lib/ -COPY --from=builder /lib64/libuuid.so.1 /lib64 -# fluent-bit dependencies -# libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ -# telegraf dependencies -COPY --from=builder /lib/libc.so.6 /lib/ -# mdsd dependencies -COPY --from=builder /usr/lib/libdl.so.2 /usr/lib/librt.so.1 /usr/lib/libpthread.so.0 /usr/lib/libm.so.6 /usr/lib/libstdc++.so.6 /usr/lib/libgcc_s.so.1 /usr/lib/ -# logrotate dependencies -COPY --from=builder /lib/libselinux.so.1 /lib/libpopt.so.0 /lib/libpcre.so.1 /lib/ -# curl dependencies -# libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -COPY --from=builder /lib/libcurl.so.4 /lib/libz.so.1 /lib/libc.so.6 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libzstd.so.1 /lib/ -COPY --from=builder /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /usr/lib/libresolv.so.2 /usr/lib/ -# sh dependencies -COPY --from=builder /lib/libreadline.so.8 /lib/libc.so.6 /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /lib/ -# update-ca-trust dependencies -COPY --from=builder /usr/lib64/pkcs11 /usr/lib64 -COPY --from=builder /usr/lib/pkcs11 /usr/lib/ -COPY --from=builder /usr/libexec/p11-kit /usr/libexec -COPY --from=builder /lib/libp11-kit.so.0 /lib/libtasn1.so.6 /lib/libc.so.6 /lib/libffi.so.8 /lib/ -COPY --from=builder /usr/lib/p11-kit-trust.so /usr/lib/p11-kit-proxy.so /usr/lib/libp11-kit.so.0.3.0 /usr/lib/libnssckbi.so /usr/lib/ -COPY --from=builder /usr/lib/pkcs11/p11-kit-trust.so /usr/lib/pkcs11/ +# # bash dependencies +# COPY --from=builder /lib/libreadline.so.8 /lib/ +# COPY --from=builder /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /usr/lib/ +# # inotifywait dependencies +# COPY --from=builder /lib/libinotifytools.so.0 /lib/ +# # crond dependencies +# COPY --from=builder /lib/libselinux.so.1 /lib/libpam.so.0 /lib/libc.so.6 /lib/libpcre.so.1 /lib/libaudit.so.1 /lib/libcap-ng.so.0/ /lib/ +# # vim dependencies +# COPY --from=builder /lib/libm.so.6 /lib/libtinfo.so.6 /lib/ +# # metricsextension dependencies +# # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures +# COPY --from=builder /lib/libboost_filesystem.so.1.76.0 /lib/libcpprest.so.2.10 /lib/libstdc++.so.6 /lib/libm.so.6 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/libbrotlidec.so.1 /lib/libbrotlienc.so.1 /lib/libz.so.1 /lib/libbrotlicommon.so.1 /lib/ +# COPY --from=builder /lib64/libuuid.so.1 /lib64 +# # fluent-bit dependencies +# # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures +# COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ +# # telegraf dependencies +# COPY --from=builder /lib/libc.so.6 /lib/ +# # mdsd dependencies +# COPY --from=builder /usr/lib/libdl.so.2 /usr/lib/librt.so.1 /usr/lib/libpthread.so.0 /usr/lib/libm.so.6 /usr/lib/libstdc++.so.6 /usr/lib/libgcc_s.so.1 /usr/lib/ +# # logrotate dependencies +# COPY --from=builder /lib/libselinux.so.1 /lib/libpopt.so.0 /lib/libpcre.so.1 /lib/ +# # curl dependencies +# # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures +# COPY --from=builder /lib/libcurl.so.4 /lib/libz.so.1 /lib/libc.so.6 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libzstd.so.1 /lib/ +# COPY --from=builder /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /usr/lib/libresolv.so.2 /usr/lib/ +# # sh dependencies +# COPY --from=builder /lib/libreadline.so.8 /lib/libc.so.6 /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /lib/ +# # update-ca-trust dependencies +# COPY --from=builder /usr/lib64/pkcs11 /usr/lib64 +# COPY --from=builder /usr/lib/pkcs11 /usr/lib/ +# COPY --from=builder /usr/libexec/p11-kit /usr/libexec +# COPY --from=builder /lib/libp11-kit.so.0 /lib/libtasn1.so.6 /lib/libc.so.6 /lib/libffi.so.8 /lib/ +# COPY --from=builder /usr/lib/p11-kit-trust.so /usr/lib/p11-kit-proxy.so /usr/lib/libp11-kit.so.0.3.0 /usr/lib/libnssckbi.so /usr/lib/ +# COPY --from=builder /usr/lib/pkcs11/p11-kit-trust.so /usr/lib/pkcs11/ -RUN [ "/bin/bash", "-c", "chmod 644 /etc/crontab" ] -RUN [ "/bin/bash", "-c", "chown root.root /etc/crontab" ] -RUN [ "/bin/bash", "-c", "chmod 755 /etc/cron.daily/logrotate" ] -RUN [ "/bin/bash", "-c", "chmod 644 /etc/logrotate.d/prometheus-collector" ] +# RUN [ "/bin/bash", "-c", "chmod 644 /etc/crontab" ] +# RUN [ "/bin/bash", "-c", "chown root.root /etc/crontab" ] +# RUN [ "/bin/bash", "-c", "chmod 755 /etc/cron.daily/logrotate" ] +# RUN [ "/bin/bash", "-c", "chmod 644 /etc/logrotate.d/prometheus-collector" ] -# Run the Go executable, entrypoint -ENTRYPOINT ["./opt/main"] \ No newline at end of file +# # Run the Go executable, entrypoint +# ENTRYPOINT ["./opt/main"] \ No newline at end of file diff --git a/otelcollector/fluent-bit/fluent-bit.conf b/otelcollector/fluent-bit/fluent-bit.conf index 1405ad756..418bb7c43 100644 --- a/otelcollector/fluent-bit/fluent-bit.conf +++ b/otelcollector/fluent-bit/fluent-bit.conf @@ -1,165 +1,21 @@ [SERVICE] - Flush 15 - HTTP_Server Off + # Flush 15 + # HTTP_Server Off Daemon Off - storage.path /var/opt/microsoft/state/flbstore/ - storage.sync normal - storage.checksum off - storage.backlog.mem_limit 10M - Log_Level info - Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf - Log_File /opt/fluent-bit/fluent-bit.log + # storage.path state/flbstore/ + # storage.sync normal + # storage.checksum off + # storage.backlog.mem_limit 10M + Log_Level debug + #Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf + # Log_File fluent-bit.log -# prometheus-collector container logs [INPUT] - Name tail - Tag prometheus.log.prometheuscollectorcontainer - Path /var/log/containers/*prometheus-collector*prometheus-collector*.log,/var/log/containers/*ama-metrics*prometheus-collector*.log - Exclude_Path /var/log/containers/*prometheus-collector-node*.log,/var/log/containers/*ama-metrics-node*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# token-adapter container logs -[INPUT] - Name tail - Tag prometheus.log.addontokenadapter - Path /var/log/containers/*prometheus-collector*addon-token-adapter*.log,/var/log/containers/*ama-metrics*addon-token-adapter*.log - Exclude_Path /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# otelcollector is logging at warn level -[INPUT] - Name tail - Tag prometheus.otelcollector - Path /opt/microsoft/otelcollector/collector-log.txt - DB /var/opt/microsoft/state/otelcollector.db - DB.Sync Off - Parser collector-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# metrics extension logs at info level to be able to get processed metrics count -[INPUT] - Name tail - Tag prometheus.metricsextension - Path /MetricsExtensionConsoleDebugLog.log - DB /var/opt/microsoft/state/metricsextension.db - DB.Sync Off - Parser me-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# Only tailing mdsd error log file -[INPUT] - Name tail - Tag prometheus.mdsd - Path /opt/microsoft/linuxmonagent/mdsd.err - DB /var/opt/microsoft/state/mdsd.db - DB.Sync Off - Parser mdsd-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -[INPUT] - Name tail - Tag prometheus.log.noconfiguration - Path /dev/write-to-traces - Read_from_Head true - DB /var/opt/microsoft/state/no-configuration.db - DB.Sync Off - Parser no-config-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# Send log lines that contain the telemetry we want to a different tag -# to then send to customMetrics table -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*ProcessedCount.* prometheus.log.processedcount false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*EtwEventsDropped.* prometheus.log.diagnosticheartbeat false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*EventsProcessedLastPeriod.* prometheus.log.eventsprocessedlastperiod false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*\(infinite\).* prometheus.log.infinitemetric false - -[FILTER] - Name rewrite_tag - Match prometheus.otelcollector - Rule $msg .*Exporting\sfailed.* prometheus.log.exportingfailed true - -# Send ME errors to stdout of container -[FILTER] - name grep - match prometheus.metricsextension - regex level (Error|Fatal) - -# Send otelcollector errors to stdout of container -[FILTER] - name grep - match prometheus.otelcollector - regex level (error|fatal) - -[FILTER] - Name grep - Match prometheus.log.addontokenadapter - regex stream stderr - -[OUTPUT] - Name appinsights - Match prometheus.log.* - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.metricsextension - -[OUTPUT] - Name stdout - Format json_lines - json_date_key false - Match prometheus.otelcollector - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.mdsd + name process_exporter_metrics + tag prometheus.log.process_exporter_metrics + metrics cpu,memory + process_include_pattern otelcollector|MetricsExtension [OUTPUT] Name stdout - Format json_lines - json_date_key time - Match prometheus.log.noconfiguration + Match prometheus.log.* \ No newline at end of file diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index f803944c9..a92e5c81c 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -56,8 +56,9 @@ sudo tdnf install telegraf-1.29.4 -y sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.txt # Install fluent-bit -echo "Installing fluent-bit..." -sudo tdnf install fluent-bit-2.1.10 -y +# echo "Installing fluent-bit..." +# sudo tdnf install fluent-bit-2.1.10 -y +curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh # Setup hourly cron for logrotate cp /etc/cron.daily/logrotate /etc/cron.hourly/ From ed96e0b190193ae0832680d66d8f2b456c607444 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 13:47:06 -0800 Subject: [PATCH 02/47] update branch name --- .pipelines/azure-pipeline-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml index 489cb8a5d..c1b42173a 100644 --- a/.pipelines/azure-pipeline-build.yml +++ b/.pipelines/azure-pipeline-build.yml @@ -2,7 +2,7 @@ trigger: branches: include: - main - - grace/remove-telegraf + - grace/telegraf-removal-2024 pr: autoCancel: true From 890a1ad61b77a4d91546d7a620aeaf9aa7f9fc00 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 13:52:14 -0800 Subject: [PATCH 03/47] support mariner --- otelcollector/scripts/setup.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index a92e5c81c..2f7464a16 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -58,7 +58,8 @@ sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.tx # Install fluent-bit # echo "Installing fluent-bit..." # sudo tdnf install fluent-bit-2.1.10 -y -curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh +curl https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm +sudo tdnf install -y fluent-bit-3.2.2-1.x86_64.rpm # Setup hourly cron for logrotate cp /etc/cron.daily/logrotate /etc/cron.hourly/ From bd514640127424268593bc6553d1c2082554fb7c Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 13:57:11 -0800 Subject: [PATCH 04/47] fix download --- otelcollector/scripts/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index 2f7464a16..f39ef05bc 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -58,7 +58,7 @@ sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.tx # Install fluent-bit # echo "Installing fluent-bit..." # sudo tdnf install fluent-bit-2.1.10 -y -curl https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm +curl https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm --output fluent-bit-3.2.2-1.x86_64.rpm sudo tdnf install -y fluent-bit-3.2.2-1.x86_64.rpm # Setup hourly cron for logrotate From a421f17dd7b47317b6dd2090a262e8c25e07b47a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 14:26:57 -0800 Subject: [PATCH 05/47] install normally --- otelcollector/scripts/setup.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index f39ef05bc..d4842b40f 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -56,10 +56,10 @@ sudo tdnf install telegraf-1.29.4 -y sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.txt # Install fluent-bit -# echo "Installing fluent-bit..." -# sudo tdnf install fluent-bit-2.1.10 -y -curl https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm --output fluent-bit-3.2.2-1.x86_64.rpm -sudo tdnf install -y fluent-bit-3.2.2-1.x86_64.rpm +echo "Installing fluent-bit..." +sudo tdnf install fluent-bit-2.1.10 -y +# wget https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm +# sudo tdnf install -y fluent-bit-3.2.2-1.x86_64.rpm # Setup hourly cron for logrotate cp /etc/cron.daily/logrotate /etc/cron.hourly/ From 8c0d65e7be10e32a034e3d6d19cce7d9056e926e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 14:30:03 -0800 Subject: [PATCH 06/47] telegraf to file --- ...egraf-prometheus-collector-ta-enabled.conf | 143 +++++++++--------- 1 file changed, 74 insertions(+), 69 deletions(-) diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf index 815bd44ad..d4e5a91ab 100644 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf +++ b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf @@ -98,9 +98,9 @@ # OUTPUT PLUGINS # ############################################################################### -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" ## Timeout for closing (default: 5s). # timeout = "5s" @@ -108,6 +108,11 @@ ## Enable additional diagnostic logging. # enable_diagnostic_logging = false +# Send telegraf metrics to file(s) +[[outputs.file]] + ## Files to write to, "stdout" is a specially handled file. + files = ["stdout", "metrics.out"] + ############################################################################### # PROCESSOR PLUGINS # @@ -141,13 +146,13 @@ ############################################################################### # AGGREGATOR PLUGINS # ############################################################################### -[[aggregators.quantile]] - period = "5m" - drop_original = true - quantiles = [0.50,0.95] - algorithm = "t-digest" - compression = 100.0 - namepass = ["otelcollector", "metricsextension"] +# [[aggregators.quantile]] +# period = "5m" +# drop_original = true +# quantiles = [0.50,0.95] +# algorithm = "t-digest" +# compression = 100.0 +# namepass = ["otelcollector", "metricsextension"] ############################################################################### # INPUT PLUGINS # @@ -178,41 +183,41 @@ pid_tag = true name_override = "otelcollector" fieldpass = ["cpu_usage", "memory_rss"] - [inputs.procstat.tags] -# Computer = "$NODE_NAME" -# NodeIp = "$NODE_IP" - cpulimit = "$CONTAINER_CPU_LIMIT" - memlimit = "$CONTAINER_MEMORY_LIMIT" +# [inputs.procstat.tags] +# # Computer = "$NODE_NAME" +# # NodeIp = "$NODE_IP" +# cpulimit = "$CONTAINER_CPU_LIMIT" +# memlimit = "$CONTAINER_MEMORY_LIMIT" - defaultscrapekubelet = "$AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED" - defaultscrapecoreDns = "$AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED" - defaultscrapecadvisor = "$AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED" - defaultscrapekubeproxy = "$AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED" - defaultscrapeapiserver = "$AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED" - defaultscrapekubestate = "$AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED" - defaultscrapenodeexporter = "$AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED" - defaultscrapecollectorhealth = "$AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED" - defaultscrapewindowsexporter = "$AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED" - defaultscrapewindowskubeproxy = "$AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED" - defaultscrapepodannotations = "$AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED" - podannotationns = "$AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX" - defaultscrapekappiebasic = "$AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED" - nodeexportertargetport= "$NODE_EXPORTER_TARGETPORT" - nodeexportername = "$NODE_EXPORTER_NAME" - kubestatename = "$KUBE_STATE_NAME" - kubestateversion = "$KUBE_STATE_VERSION" - operatortargetstaimgversion = "$OPERATOR_TARGETS_TA_IMG_VERSION" - operatortargetscfgreaderimgversion = "$OPERATOR_TARGETS_CFG_READER_IMG_VERSION" - nodeexporterversion = "$NODE_EXPORTER_VERSION" - akvauth = "$AKVAUTH" - debugmodeenabled = "$DEBUG_MODE_ENABLED" - kubestatemetriclabelsallowlist = "$KUBE_STATE_METRIC_LABELS_ALLOWLIST" - kubestatemetricannotationsallowlist = "$KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST" - httpproxyenabled = "$HTTP_PROXY_ENABLED" - tadapterh="$tokenadapterHealthyAfterSecs" - tadapterf="$tokenadapterUnhealthyAfterSecs" - setGlobalSettings="$AZMON_SET_GLOBAL_SETTINGS" - globalSettingsConfigured="$AZMON_GLOBAL_SETTINGS_CONFIGURED" +# defaultscrapekubelet = "$AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED" +# defaultscrapecoreDns = "$AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED" +# defaultscrapecadvisor = "$AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED" +# defaultscrapekubeproxy = "$AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED" +# defaultscrapeapiserver = "$AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED" +# defaultscrapekubestate = "$AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED" +# defaultscrapenodeexporter = "$AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED" +# defaultscrapecollectorhealth = "$AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED" +# defaultscrapewindowsexporter = "$AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED" +# defaultscrapewindowskubeproxy = "$AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED" +# defaultscrapepodannotations = "$AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED" +# podannotationns = "$AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX" +# defaultscrapekappiebasic = "$AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED" +# nodeexportertargetport= "$NODE_EXPORTER_TARGETPORT" +# nodeexportername = "$NODE_EXPORTER_NAME" +# kubestatename = "$KUBE_STATE_NAME" +# kubestateversion = "$KUBE_STATE_VERSION" +# operatortargetstaimgversion = "$OPERATOR_TARGETS_TA_IMG_VERSION" +# operatortargetscfgreaderimgversion = "$OPERATOR_TARGETS_CFG_READER_IMG_VERSION" +# nodeexporterversion = "$NODE_EXPORTER_VERSION" +# akvauth = "$AKVAUTH" +# debugmodeenabled = "$DEBUG_MODE_ENABLED" +# kubestatemetriclabelsallowlist = "$KUBE_STATE_METRIC_LABELS_ALLOWLIST" +# kubestatemetricannotationsallowlist = "$KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST" +# httpproxyenabled = "$HTTP_PROXY_ENABLED" +# tadapterh="$tokenadapterHealthyAfterSecs" +# tadapterf="$tokenadapterUnhealthyAfterSecs" +# setGlobalSettings="$AZMON_SET_GLOBAL_SETTINGS" +# globalSettingsConfigured="$AZMON_GLOBAL_SETTINGS_CONFIGURED" [[inputs.procstat]] exe = "MetricsExtension" @@ -222,28 +227,28 @@ name_override = "metricsextension" fieldpass = ["cpu_usage", "memory_rss"] -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:8888/metrics"] - fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] - tagexclude = ["service_instance_id"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:9090/metrics"] - fieldpass = ["prometheus_sd_http_failures_total"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://ama-metrics-operator-targets.kube-system.svc.cluster.local/metrics"] - fieldpass = ["opentelemetry_allocator_targets","opentelemetry_allocator_collectors_discovered"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - name_override = "target_allocator" +# [[inputs.prometheus]] +# interval = "5m" +# urls = ["http://localhost:8888/metrics"] +# fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] +# tagexclude = ["service_instance_id"] +# metric_version = 2 +# url_tag = "scrapeUrl" +# timeout = "15s" + +# [[inputs.prometheus]] +# interval = "5m" +# urls = ["http://localhost:9090/metrics"] +# fieldpass = ["prometheus_sd_http_failures_total"] +# metric_version = 2 +# url_tag = "scrapeUrl" +# timeout = "15s" + +# [[inputs.prometheus]] +# interval = "5m" +# urls = ["http://ama-metrics-operator-targets.kube-system.svc.cluster.local/metrics"] +# fieldpass = ["opentelemetry_allocator_targets","opentelemetry_allocator_collectors_discovered"] +# metric_version = 2 +# url_tag = "scrapeUrl" +# timeout = "15s" +# name_override = "target_allocator" From 9c03ffa0ba782cb772616d8075d7b3a5c01df3b1 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 2 Dec 2024 15:19:58 -0800 Subject: [PATCH 07/47] get latest fluent-bit --- otelcollector/scripts/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index d4842b40f..fa0f4c7d6 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -57,7 +57,7 @@ sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.tx # Install fluent-bit echo "Installing fluent-bit..." -sudo tdnf install fluent-bit-2.1.10 -y +sudo tdnf install fluent-bit -y # wget https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm # sudo tdnf install -y fluent-bit-3.2.2-1.x86_64.rpm From c3aeb9f4fa4ac1d3f3bdfa0846a813671a647304 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 3 Dec 2024 15:01:55 -0800 Subject: [PATCH 08/47] add process metrics from fluent-bit --- otelcollector/build/linux/Dockerfile | 194 +++++++++--------- otelcollector/fluent-bit/fluent-bit.conf | 172 ++++++++++++++-- otelcollector/fluent-bit/src/go.mod | 10 +- otelcollector/fluent-bit/src/go.sum | 23 +++ .../fluent-bit/src/out_appinsights.go | 7 +- otelcollector/fluent-bit/src/process_stats.go | 125 +++++++++++ otelcollector/fluent-bit/src/utils.go | 28 +++ 7 files changed, 446 insertions(+), 113 deletions(-) create mode 100644 otelcollector/fluent-bit/src/process_stats.go diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index be5eebe98..053bb3d94 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -123,106 +123,106 @@ RUN chmod 775 $tmpdir/*.sh; RUN sync; RUN $tmpdir/setup.sh ${TARGETARCH} # If wanting to run without distroless, uncomment this line and comment everything after -ENTRYPOINT ["./opt/main"] +# ENTRYPOINT ["./opt/main"] -# FROM mcr.microsoft.com/cbl-mariner/distroless/base:2.0 -# # Below is for ContainerInsightsPrometheusCollector-Prod AppInsights Resource -# ENV APPLICATIONINSIGHTS_AUTH_PUBLIC MWNkYTMxMTItYWY1Ni00ZmNiLWI4MDQtZjg5NDVhYTFjYjMy -# # Below is for ContainerInsightsPrometheusCollector-Fairfax AppInsights Resource -# ENV APPLICATIONINSIGHTS_AUTH_USGOVERNMENT ZmRjMTE0MmUtY2U0YS1mNTFmLWE4M2EtODBjM2ZjNDYwNGE5 -# # Below is for ContainerInsightsPrometheusCollector-Mooncake AppInsights Resource -# ENV APPLICATIONINSIGHTS_AUTH_CHINACLOUD ZTcyY2ZjOTYtNjY3Zi1jZGYwLTkwOWMtNzhiZjAwZjQ0NDg4 -# # Below is for ContainerInsightsPrometheusCollector-USSec AppInsights Resource -# ENV APPLICATIONINSIGHTS_AUTH_USSEC ZTg4MzFlZGYtNWQ1ZC0wYjZmLTk3MGUtNDkxNTgyYjliMDFl -# # Below is for ContainerInsightsPrometheusCollector-USNat AppInsights Resource -# ENV APPLICATIONINSIGHTS_AUTH_USNAT ZTliNjRmZmUtZDZlYi0xYjczLThjYWQtNDU2OTFjN2FhNzIw -# # Set environment variables for mdsd -# ENV MDSD_LOG="/opt/microsoft/linuxmonagent" -# ENV SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -# ENV MDSD_FLUENT_SOCKET_PORT="28230" -# ENV ENABLE_MCS="true" -# ENV MONITORING_USE_GENEVA_CONFIG_SERVICE="false" -# ENV MDSD_USE_LOCAL_PERSISTENCY="false" -# ENV SSL_CERT_FILE="/etc/pki/tls/certs/ca-bundle.crt" -# ENV TELEMETRY_DISABLED false -# # Needed for ME, see https://github.com/microsoft/cpprestsdk/issues/1481 -# ENV MALLOC_ARENA_MAX=2 -# ENV PATH="/busybin:${PATH}" -# ENV OS_TYPE "linux" +FROM mcr.microsoft.com/cbl-mariner/distroless/base:2.0 +# Below is for ContainerInsightsPrometheusCollector-Prod AppInsights Resource +ENV APPLICATIONINSIGHTS_AUTH_PUBLIC MWNkYTMxMTItYWY1Ni00ZmNiLWI4MDQtZjg5NDVhYTFjYjMy +# Below is for ContainerInsightsPrometheusCollector-Fairfax AppInsights Resource +ENV APPLICATIONINSIGHTS_AUTH_USGOVERNMENT ZmRjMTE0MmUtY2U0YS1mNTFmLWE4M2EtODBjM2ZjNDYwNGE5 +# Below is for ContainerInsightsPrometheusCollector-Mooncake AppInsights Resource +ENV APPLICATIONINSIGHTS_AUTH_CHINACLOUD ZTcyY2ZjOTYtNjY3Zi1jZGYwLTkwOWMtNzhiZjAwZjQ0NDg4 +# Below is for ContainerInsightsPrometheusCollector-USSec AppInsights Resource +ENV APPLICATIONINSIGHTS_AUTH_USSEC ZTg4MzFlZGYtNWQ1ZC0wYjZmLTk3MGUtNDkxNTgyYjliMDFl +# Below is for ContainerInsightsPrometheusCollector-USNat AppInsights Resource +ENV APPLICATIONINSIGHTS_AUTH_USNAT ZTliNjRmZmUtZDZlYi0xYjczLThjYWQtNDU2OTFjN2FhNzIw +# Set environment variables for mdsd +ENV MDSD_LOG="/opt/microsoft/linuxmonagent" +ENV SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" +ENV MDSD_FLUENT_SOCKET_PORT="28230" +ENV ENABLE_MCS="true" +ENV MONITORING_USE_GENEVA_CONFIG_SERVICE="false" +ENV MDSD_USE_LOCAL_PERSISTENCY="false" +ENV SSL_CERT_FILE="/etc/pki/tls/certs/ca-bundle.crt" +ENV TELEMETRY_DISABLED false +# Needed for ME, see https://github.com/microsoft/cpprestsdk/issues/1481 +ENV MALLOC_ARENA_MAX=2 +ENV PATH="/busybin:${PATH}" +ENV OS_TYPE "linux" -# # files -# COPY --from=builder /opt /opt -# COPY --from=builder /etc /etc -# COPY --from=builder /busybin /busybin -# COPY --from=builder /static/react /static/react -# COPY --from=builder /usr/sbin/me.config /usr/sbin/me_internal.config /usr/sbin/me_ds.config /usr/sbin/me_ds_internal.config /usr/sbin/ -# COPY --from=builder /var/opt/microsoft /var/opt/microsoft -# COPY --from=builder /var/lib/logrotate /var/lib/logrotate -# COPY --from=builder /var/spool/cron /var/spool/cron -# COPY --from=builder /usr/share/p11-kit /usr/share/p11-kit -# COPY --from=builder /usr/share/pki/ /usr/share/pki -# COPY --from=builder /opt/microsoft/liveness /opt/microsoft/liveness -# COPY --from=builder /opt/microsoft/configmapparser /opt/microsoft/configmapparser +# files +COPY --from=builder /opt /opt +COPY --from=builder /etc /etc +COPY --from=builder /busybin /busybin +COPY --from=builder /static/react /static/react +COPY --from=builder /usr/sbin/me.config /usr/sbin/me_internal.config /usr/sbin/me_ds.config /usr/sbin/me_ds_internal.config /usr/sbin/ +COPY --from=builder /var/opt/microsoft /var/opt/microsoft +COPY --from=builder /var/lib/logrotate /var/lib/logrotate +COPY --from=builder /var/spool/cron /var/spool/cron +COPY --from=builder /usr/share/p11-kit /usr/share/p11-kit +COPY --from=builder /usr/share/pki/ /usr/share/pki +COPY --from=builder /opt/microsoft/liveness /opt/microsoft/liveness +COPY --from=builder /opt/microsoft/configmapparser /opt/microsoft/configmapparser -# # executables -# COPY --from=builder /usr/sbin/MetricsExtension /usr/sbin/MetricsExtension -# COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait -# COPY --from=builder /usr/bin/bash /usr/bin/bash -# COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox -# COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit -# COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf -# COPY --from=builder /usr/sbin/crond /usr/sbin/crond -# COPY --from=builder /usr/bin/vim /usr/bin/vim -# COPY --from=builder /usr/share/vim /usr/share/vim -# COPY --from=builder /usr/sbin/mdsd /usr/sbin/mdsd -# COPY --from=builder /usr/sbin/logrotate /usr/sbin/logrotate -# COPY --from=builder /usr/bin/gzip /usr/bin/ -# COPY --from=builder /usr/bin/curl /usr/bin/ -# COPY --from=builder /usr/bin/update-ca-trust /usr/bin -# COPY --from=builder /bin/sh /bin/sh -# COPY --from=builder /usr/bin/p11-kit /usr/bin -# COPY --from=builder /usr/bin/trust /usr/bin +# executables +COPY --from=builder /usr/sbin/MetricsExtension /usr/sbin/MetricsExtension +COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait +COPY --from=builder /usr/bin/bash /usr/bin/bash +COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox +COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit +COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf +COPY --from=builder /usr/sbin/crond /usr/sbin/crond +COPY --from=builder /usr/bin/vim /usr/bin/vim +COPY --from=builder /usr/share/vim /usr/share/vim +COPY --from=builder /usr/sbin/mdsd /usr/sbin/mdsd +COPY --from=builder /usr/sbin/logrotate /usr/sbin/logrotate +COPY --from=builder /usr/bin/gzip /usr/bin/ +COPY --from=builder /usr/bin/curl /usr/bin/ +COPY --from=builder /usr/bin/update-ca-trust /usr/bin +COPY --from=builder /bin/sh /bin/sh +COPY --from=builder /usr/bin/p11-kit /usr/bin +COPY --from=builder /usr/bin/trust /usr/bin -# # bash dependencies -# COPY --from=builder /lib/libreadline.so.8 /lib/ -# COPY --from=builder /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /usr/lib/ -# # inotifywait dependencies -# COPY --from=builder /lib/libinotifytools.so.0 /lib/ -# # crond dependencies -# COPY --from=builder /lib/libselinux.so.1 /lib/libpam.so.0 /lib/libc.so.6 /lib/libpcre.so.1 /lib/libaudit.so.1 /lib/libcap-ng.so.0/ /lib/ -# # vim dependencies -# COPY --from=builder /lib/libm.so.6 /lib/libtinfo.so.6 /lib/ -# # metricsextension dependencies -# # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -# COPY --from=builder /lib/libboost_filesystem.so.1.76.0 /lib/libcpprest.so.2.10 /lib/libstdc++.so.6 /lib/libm.so.6 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/libbrotlidec.so.1 /lib/libbrotlienc.so.1 /lib/libz.so.1 /lib/libbrotlicommon.so.1 /lib/ -# COPY --from=builder /lib64/libuuid.so.1 /lib64 -# # fluent-bit dependencies -# # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -# COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ -# # telegraf dependencies -# COPY --from=builder /lib/libc.so.6 /lib/ -# # mdsd dependencies -# COPY --from=builder /usr/lib/libdl.so.2 /usr/lib/librt.so.1 /usr/lib/libpthread.so.0 /usr/lib/libm.so.6 /usr/lib/libstdc++.so.6 /usr/lib/libgcc_s.so.1 /usr/lib/ -# # logrotate dependencies -# COPY --from=builder /lib/libselinux.so.1 /lib/libpopt.so.0 /lib/libpcre.so.1 /lib/ -# # curl dependencies -# # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -# COPY --from=builder /lib/libcurl.so.4 /lib/libz.so.1 /lib/libc.so.6 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libzstd.so.1 /lib/ -# COPY --from=builder /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /usr/lib/libresolv.so.2 /usr/lib/ -# # sh dependencies -# COPY --from=builder /lib/libreadline.so.8 /lib/libc.so.6 /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /lib/ -# # update-ca-trust dependencies -# COPY --from=builder /usr/lib64/pkcs11 /usr/lib64 -# COPY --from=builder /usr/lib/pkcs11 /usr/lib/ -# COPY --from=builder /usr/libexec/p11-kit /usr/libexec -# COPY --from=builder /lib/libp11-kit.so.0 /lib/libtasn1.so.6 /lib/libc.so.6 /lib/libffi.so.8 /lib/ -# COPY --from=builder /usr/lib/p11-kit-trust.so /usr/lib/p11-kit-proxy.so /usr/lib/libp11-kit.so.0.3.0 /usr/lib/libnssckbi.so /usr/lib/ -# COPY --from=builder /usr/lib/pkcs11/p11-kit-trust.so /usr/lib/pkcs11/ +# bash dependencies +COPY --from=builder /lib/libreadline.so.8 /lib/ +COPY --from=builder /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /usr/lib/ +# inotifywait dependencies +COPY --from=builder /lib/libinotifytools.so.0 /lib/ +# crond dependencies +COPY --from=builder /lib/libselinux.so.1 /lib/libpam.so.0 /lib/libc.so.6 /lib/libpcre.so.1 /lib/libaudit.so.1 /lib/libcap-ng.so.0/ /lib/ +# vim dependencies +COPY --from=builder /lib/libm.so.6 /lib/libtinfo.so.6 /lib/ +# metricsextension dependencies +# libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures +COPY --from=builder /lib/libboost_filesystem.so.1.76.0 /lib/libcpprest.so.2.10 /lib/libstdc++.so.6 /lib/libm.so.6 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/libbrotlidec.so.1 /lib/libbrotlienc.so.1 /lib/libz.so.1 /lib/libbrotlicommon.so.1 /lib/ +COPY --from=builder /lib64/libuuid.so.1 /lib64 +# fluent-bit dependencies +# libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures +COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ +# telegraf dependencies +COPY --from=builder /lib/libc.so.6 /lib/ +# mdsd dependencies +COPY --from=builder /usr/lib/libdl.so.2 /usr/lib/librt.so.1 /usr/lib/libpthread.so.0 /usr/lib/libm.so.6 /usr/lib/libstdc++.so.6 /usr/lib/libgcc_s.so.1 /usr/lib/ +# logrotate dependencies +COPY --from=builder /lib/libselinux.so.1 /lib/libpopt.so.0 /lib/libpcre.so.1 /lib/ +# curl dependencies +# libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures +COPY --from=builder /lib/libcurl.so.4 /lib/libz.so.1 /lib/libc.so.6 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libzstd.so.1 /lib/ +COPY --from=builder /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /usr/lib/libresolv.so.2 /usr/lib/ +# sh dependencies +COPY --from=builder /lib/libreadline.so.8 /lib/libc.so.6 /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /lib/ +# update-ca-trust dependencies +COPY --from=builder /usr/lib64/pkcs11 /usr/lib64 +COPY --from=builder /usr/lib/pkcs11 /usr/lib/ +COPY --from=builder /usr/libexec/p11-kit /usr/libexec +COPY --from=builder /lib/libp11-kit.so.0 /lib/libtasn1.so.6 /lib/libc.so.6 /lib/libffi.so.8 /lib/ +COPY --from=builder /usr/lib/p11-kit-trust.so /usr/lib/p11-kit-proxy.so /usr/lib/libp11-kit.so.0.3.0 /usr/lib/libnssckbi.so /usr/lib/ +COPY --from=builder /usr/lib/pkcs11/p11-kit-trust.so /usr/lib/pkcs11/ -# RUN [ "/bin/bash", "-c", "chmod 644 /etc/crontab" ] -# RUN [ "/bin/bash", "-c", "chown root.root /etc/crontab" ] -# RUN [ "/bin/bash", "-c", "chmod 755 /etc/cron.daily/logrotate" ] -# RUN [ "/bin/bash", "-c", "chmod 644 /etc/logrotate.d/prometheus-collector" ] +RUN [ "/bin/bash", "-c", "chmod 644 /etc/crontab" ] +RUN [ "/bin/bash", "-c", "chown root.root /etc/crontab" ] +RUN [ "/bin/bash", "-c", "chmod 755 /etc/cron.daily/logrotate" ] +RUN [ "/bin/bash", "-c", "chmod 644 /etc/logrotate.d/prometheus-collector" ] -# # Run the Go executable, entrypoint -# ENTRYPOINT ["./opt/main"] \ No newline at end of file +# Run the Go executable, entrypoint +ENTRYPOINT ["./opt/main"] \ No newline at end of file diff --git a/otelcollector/fluent-bit/fluent-bit.conf b/otelcollector/fluent-bit/fluent-bit.conf index 418bb7c43..70e47011d 100644 --- a/otelcollector/fluent-bit/fluent-bit.conf +++ b/otelcollector/fluent-bit/fluent-bit.conf @@ -1,21 +1,165 @@ [SERVICE] - # Flush 15 - # HTTP_Server Off + Flush 15 + HTTP_Server Off Daemon Off - # storage.path state/flbstore/ - # storage.sync normal - # storage.checksum off - # storage.backlog.mem_limit 10M - Log_Level debug - #Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf - # Log_File fluent-bit.log + storage.path /var/opt/microsoft/state/flbstore/ + storage.sync normal + storage.checksum off + storage.backlog.mem_limit 10M + Log_Level info + Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf + Log_File /opt/fluent-bit/fluent-bit.log +# prometheus-collector container logs [INPUT] - name process_exporter_metrics - tag prometheus.log.process_exporter_metrics - metrics cpu,memory - process_include_pattern otelcollector|MetricsExtension + Name tail + Tag prometheus.log.prometheuscollectorcontainer + Path /var/log/containers/*prometheus-collector*prometheus-collector*.log,/var/log/containers/*ama-metrics*prometheus-collector*.log + Exclude_Path /var/log/containers/*prometheus-collector-node*.log,/var/log/containers/*ama-metrics-node*.log + DB /var/opt/microsoft/state/prometheus-collector-ai.db + DB.Sync Off + Parser cri + Read_from_Head true + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +# token-adapter container logs +[INPUT] + Name tail + Tag prometheus.log.addontokenadapter + Path /var/log/containers/*prometheus-collector*addon-token-adapter*.log,/var/log/containers/*ama-metrics*addon-token-adapter*.log + Exclude_Path /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log + DB /var/opt/microsoft/state/prometheus-collector-ai.db + DB.Sync Off + Parser cri + Read_from_Head true + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +# otelcollector is logging at warn level +[INPUT] + Name tail + Tag prometheus.otelcollector + Path /opt/microsoft/otelcollector/collector-log.txt + DB /var/opt/microsoft/state/otelcollector.db + DB.Sync Off + Parser collector-parser + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +# metrics extension logs at info level to be able to get processed metrics count +[INPUT] + Name tail + Tag prometheus.metricsextension + Path /MetricsExtensionConsoleDebugLog.log + DB /var/opt/microsoft/state/metricsextension.db + DB.Sync Off + Parser me-parser + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +# Only tailing mdsd error log file +[INPUT] + Name tail + Tag prometheus.mdsd + Path /opt/microsoft/linuxmonagent/mdsd.err + DB /var/opt/microsoft/state/mdsd.db + DB.Sync Off + Parser mdsd-parser + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +[INPUT] + Name tail + Tag prometheus.log.noconfiguration + Path /dev/write-to-traces + Read_from_Head true + DB /var/opt/microsoft/state/no-configuration.db + DB.Sync Off + Parser no-config-parser + Mem_Buf_Limit 1m + Path_Key filepath + Skip_Long_Lines On + Ignore_Older 2m + +# Send log lines that contain the telemetry we want to a different tag +# to then send to customMetrics table +[FILTER] + Name rewrite_tag + Match prometheus.metricsextension + Rule $message .*ProcessedCount.* prometheus.log.processedcount false + +[FILTER] + Name rewrite_tag + Match prometheus.metricsextension + Rule $message .*EtwEventsDropped.* prometheus.log.diagnosticheartbeat false + +[FILTER] + Name rewrite_tag + Match prometheus.metricsextension + Rule $message .*EventsProcessedLastPeriod.* prometheus.log.eventsprocessedlastperiod false + +[FILTER] + Name rewrite_tag + Match prometheus.metricsextension + Rule $message .*\(infinite\).* prometheus.log.infinitemetric false + +[FILTER] + Name rewrite_tag + Match prometheus.otelcollector + Rule $msg .*Exporting\sfailed.* prometheus.log.exportingfailed true + +# Send ME errors to stdout of container +[FILTER] + name grep + match prometheus.metricsextension + regex level (Error|Fatal) + +# Send otelcollector errors to stdout of container +[FILTER] + name grep + match prometheus.otelcollector + regex level (error|fatal) + +[FILTER] + Name grep + Match prometheus.log.addontokenadapter + regex stream stderr + +[OUTPUT] + Name appinsights + Match prometheus.log.* + +[OUTPUT] + Name stdout + Format json_lines + json_date_key time + Match prometheus.metricsextension + +[OUTPUT] + Name stdout + Format json_lines + json_date_key false + Match prometheus.otelcollector + +[OUTPUT] + Name stdout + Format json_lines + json_date_key time + Match prometheus.mdsd [OUTPUT] Name stdout - Match prometheus.log.* \ No newline at end of file + Format json_lines + json_date_key time + Match prometheus.log.noconfiguration \ No newline at end of file diff --git a/otelcollector/fluent-bit/src/go.mod b/otelcollector/fluent-bit/src/go.mod index df7fbac4f..beb9c6b8b 100644 --- a/otelcollector/fluent-bit/src/go.mod +++ b/otelcollector/fluent-bit/src/go.mod @@ -6,6 +6,7 @@ require ( github.com/fluent/fluent-bit-go v0.0.0-20220311094233-780004bf5562 github.com/microsoft/ApplicationInsights-Go v0.4.4 github.com/prometheus/client_golang v1.18.0 + github.com/shirou/gopsutil/v4 v4.24.11 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gopkg.in/yaml.v2 v2.4.0 k8s.io/apimachinery v0.29.4 @@ -17,8 +18,10 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/ebitengine/purego v0.8.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/go-logr/logr v1.3.0 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.22.3 // indirect @@ -30,19 +33,24 @@ require ( github.com/google/uuid v1.3.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect github.com/ugorji/go/codec v1.1.7 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect - golang.org/x/sys v0.18.0 // indirect + golang.org/x/sys v0.26.0 // indirect golang.org/x/term v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect golang.org/x/time v0.3.0 // indirect diff --git a/otelcollector/fluent-bit/src/go.sum b/otelcollector/fluent-bit/src/go.sum index bb39182eb..869d3afee 100644 --- a/otelcollector/fluent-bit/src/go.sum +++ b/otelcollector/fluent-bit/src/go.sum @@ -8,6 +8,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/ebitengine/purego v0.8.1 h1:sdRKd6plj7KYW33EH5As6YKfe8m9zbN9JMrOjNVF/BE= +github.com/ebitengine/purego v0.8.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/fluent/fluent-bit-go v0.0.0-20220311094233-780004bf5562 h1:x2JSiiQ0Q7JCyQXOYnTWdYYiSAuKs/8KwEFBdtxLfKg= @@ -15,6 +17,8 @@ github.com/fluent/fluent-bit-go v0.0.0-20220311094233-780004bf5562/go.mod h1:L92 github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= @@ -33,6 +37,7 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= @@ -58,6 +63,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= @@ -81,6 +88,8 @@ github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= @@ -91,6 +100,8 @@ github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/shirou/gopsutil/v4 v4.24.11 h1:WaU9xqGFKvFfsUv94SXcUPD7rCkU0vr/asVdQOBZNj8= +github.com/shirou/gopsutil/v4 v4.24.11/go.mod h1:s4D/wg+ag4rG0WO7AiTj2BeYCRhym0vM7DHbZRxnIT8= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -103,11 +114,17 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc/go.mod h1:eyZnKCc955uh98WQvzOm0dgAeLnf2O0Rz0LPoC5ze+0= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v1.1.7 h1:2SvQaVZ1ouYrrKKwoSk2pzd4A9evlKJb9oTL+OaLUSs= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -130,9 +147,15 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index 140d3dc72..ac073352b 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -16,9 +16,10 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { return output.FLBPluginRegister(ctx, "appinsights", "AppInsights GO!") } -//export FLBPluginInit // (fluentbit will call this) // ctx (context) pointer to fluentbit context (state/ c code) +// +//export FLBPluginInit func FLBPluginInit(ctx unsafe.Pointer) int { // This will not load the plugin instance. FLBPluginFlush won't be called. @@ -47,6 +48,10 @@ func FLBPluginInit(ctx unsafe.Pointer) int { go SendContainersCpuMemoryToAppInsightsMetrics() } + // Collect, aggregate, and send CPU and Memory usage telemetry for the processes below + processAggregations := InitProcessAggregations([]string{"otelcollector", "MetricsExtension", "fluent-bit", "mdsd", "telegraf"}) + processAggregations.Run() + go PushMEProcessedAndReceivedCountToAppInsightsMetrics() return output.FLB_OK diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go new file mode 100644 index 000000000..fad80952b --- /dev/null +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -0,0 +1,125 @@ +package main + +import ( + "fmt" + "math" + "sort" + "strings" + "sync" + "time" + + "github.com/microsoft/ApplicationInsights-Go/appinsights" + stats "github.com/shirou/gopsutil/v4/process" +) + +type Process struct { + processName string + processPID int32 + cpuValues sort.Float64Slice + memValues sort.Float64Slice + process *stats.Process +} + +type ProcessAggregations struct { + processMap map[string]*Process + mu sync.Mutex +} + +func InitProcessAggregations(processName []string) *ProcessAggregations { + fmt.Printf("Starting process aggregations") + + processAggregationsMap := make(map[string]*Process) + for _, processName := range processName { + pids, err := findPIDFromExe(processName) + if err != nil || len(pids) == 0 { + fmt.Printf("Error getting PID for process %s\n", processName) + continue + } + + process, err := stats.NewProcess(pids[0]) + if err != nil { + fmt.Printf("Error tracking process %s\n", processName) + continue + } + + p := Process{ + processName: processName, + processPID: pids[0], + process: process, + } + + processAggregationsMap[processName] = &p + } + + return &ProcessAggregations{ + processMap: processAggregationsMap, + } +} + +func (pa *ProcessAggregations) Run() { + go pa.CollectStats() + go pa.SendToAppInsights() +} + +func (pa *ProcessAggregations) CollectStats() { + ticker := time.NewTicker(time.Second * time.Duration(10)) + for ; true; <-ticker.C { + pa.mu.Lock() + + for _, p := range pa.processMap { + cpu, err := p.process.Percent(0) + if err == nil { + p.cpuValues = append(p.cpuValues, cpu) + p.cpuValues.Sort() + } + mem, err := p.process.MemoryPercent() + if err == nil { + p.memValues = append(p.memValues, float64(mem)) + p.memValues.Sort() + } + + fmt.Printf("cpu: %f, mem: %f\n", cpu, mem) + } + + pa.mu.Unlock() + } +} + +func (pa *ProcessAggregations) SendToAppInsights() { + ticker := time.NewTicker(time.Second * time.Duration(300)) + for ; true; <-ticker.C { + pa.mu.Lock() + + for processName, p := range pa.processMap { + for _, percentile := range []int{50, 95} { + if len(p.cpuValues) > 0 { + cpuMetric := appinsights.NewMetricTelemetry( + fmt.Sprintf("fluent_%s_cpu_usage_0%d", strings.ToLower(processName), percentile), + float64(p.cpuValues[int(math.Round(float64(len(p.cpuValues)-1)*float64(percentile)/100.0))]), + ) + fmt.Printf("cpuMetric: %v\n", cpuMetric) + fmt.Printf("cpuValues: %v\n", p.cpuValues) + fmt.Printf("index: %d\n", int(math.Round(float64(len(p.cpuValues)-1)*float64(percentile)/100.0))) + TelemetryClient.Track(cpuMetric) + } + + if len(p.memValues) > 0 { + memMetric := appinsights.NewMetricTelemetry( + fmt.Sprintf("fluent_%s_memory_rss_0%d", strings.ToLower(processName), percentile), + float64(p.memValues[int(math.Round(float64(len(p.memValues)-1)*float64(percentile)/100.0))]), + ) + fmt.Printf("memMetric: %v\n", memMetric) + fmt.Printf("memValues: %v\n", p.memValues) + fmt.Printf("index: %d\n", int(math.Round(float64(len(p.memValues)-1)*float64(percentile)/100.0))) + TelemetryClient.Track(memMetric) + } + } + + // Clear values for next aggregation period + p.cpuValues = sort.Float64Slice{} + p.memValues = sort.Float64Slice{} + } + + pa.mu.Unlock() + } +} diff --git a/otelcollector/fluent-bit/src/utils.go b/otelcollector/fluent-bit/src/utils.go index 896438ade..bf7b1161c 100644 --- a/otelcollector/fluent-bit/src/utils.go +++ b/otelcollector/fluent-bit/src/utils.go @@ -2,7 +2,10 @@ package main import ( "errors" + "fmt" "io/ioutil" + "os/exec" + "strconv" "strings" ) @@ -29,3 +32,28 @@ func ReadFileContents(fullPathToFileName string) (string, error) { return strings.TrimSpace(string(content)), nil } } + +// From telegraf codebase +func findPIDFromExe(process string) ([]int32, error) { + buf, err := exec.Command("pgrep", process).Output() + if err != nil { + return nil, fmt.Errorf("error running %w", err) + } + out := string(buf) + + fields := strings.Fields(out) + + fmt.Printf("fields: %v\n", fields) + pids := make([]int32, 0, len(fields)) + for _, field := range fields { + pid, err := strconv.ParseInt(field, 10, 32) + if err != nil { + return nil, err + } + pids = append(pids, int32(pid)) + } + + fmt.Printf("pids: %v\n", pids) + + return pids, nil +} From 2cc0010a2b90a330dcce67146d57b53667bd1b12 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 3 Dec 2024 15:05:04 -0800 Subject: [PATCH 09/47] add back in telegraf --- ...egraf-prometheus-collector-ta-enabled.conf | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf index d4e5a91ab..f0121f592 100644 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf +++ b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf @@ -98,20 +98,20 @@ # OUTPUT PLUGINS # ############################################################################### -# [[outputs.application_insights]] -# ## Instrumentation key of the Application Insights resource. -# instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" +[[outputs.application_insights]] + ## Instrumentation key of the Application Insights resource. + instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" - ## Timeout for closing (default: 5s). - # timeout = "5s" + # Timeout for closing (default: 5s). + timeout = "5s" - ## Enable additional diagnostic logging. - # enable_diagnostic_logging = false + # Enable additional diagnostic logging. + enable_diagnostic_logging = false # Send telegraf metrics to file(s) -[[outputs.file]] - ## Files to write to, "stdout" is a specially handled file. - files = ["stdout", "metrics.out"] +# [[outputs.file]] +# ## Files to write to, "stdout" is a specially handled file. +# files = ["stdout", "metrics.out"] ############################################################################### @@ -146,13 +146,13 @@ ############################################################################### # AGGREGATOR PLUGINS # ############################################################################### -# [[aggregators.quantile]] -# period = "5m" -# drop_original = true -# quantiles = [0.50,0.95] -# algorithm = "t-digest" -# compression = 100.0 -# namepass = ["otelcollector", "metricsextension"] +[[aggregators.quantile]] + period = "5m" + drop_original = true + quantiles = [0.50,0.95] + algorithm = "t-digest" + compression = 100.0 + namepass = ["otelcollector", "metricsextension"] ############################################################################### # INPUT PLUGINS # @@ -227,28 +227,28 @@ name_override = "metricsextension" fieldpass = ["cpu_usage", "memory_rss"] -# [[inputs.prometheus]] -# interval = "5m" -# urls = ["http://localhost:8888/metrics"] -# fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] -# tagexclude = ["service_instance_id"] -# metric_version = 2 -# url_tag = "scrapeUrl" -# timeout = "15s" - -# [[inputs.prometheus]] -# interval = "5m" -# urls = ["http://localhost:9090/metrics"] -# fieldpass = ["prometheus_sd_http_failures_total"] -# metric_version = 2 -# url_tag = "scrapeUrl" -# timeout = "15s" - -# [[inputs.prometheus]] -# interval = "5m" -# urls = ["http://ama-metrics-operator-targets.kube-system.svc.cluster.local/metrics"] -# fieldpass = ["opentelemetry_allocator_targets","opentelemetry_allocator_collectors_discovered"] -# metric_version = 2 -# url_tag = "scrapeUrl" -# timeout = "15s" -# name_override = "target_allocator" +[[inputs.prometheus]] + interval = "5m" + urls = ["http://localhost:8888/metrics"] + fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] + tagexclude = ["service_instance_id"] + metric_version = 2 + url_tag = "scrapeUrl" + timeout = "15s" + +[[inputs.prometheus]] + interval = "5m" + urls = ["http://localhost:9090/metrics"] + fieldpass = ["prometheus_sd_http_failures_total"] + metric_version = 2 + url_tag = "scrapeUrl" + timeout = "15s" + +[[inputs.prometheus]] + interval = "5m" + urls = ["http://ama-metrics-operator-targets.kube-system.svc.cluster.local/metrics"] + fieldpass = ["opentelemetry_allocator_targets","opentelemetry_allocator_collectors_discovered"] + metric_version = 2 + url_tag = "scrapeUrl" + timeout = "15s" + name_override = "target_allocator" From 759e16787686147408e1f0e39af17bb8ec5126e4 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 3 Dec 2024 16:50:41 -0800 Subject: [PATCH 10/47] use correct mem value --- otelcollector/fluent-bit/src/process_stats.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index fad80952b..580759f59 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -72,9 +72,9 @@ func (pa *ProcessAggregations) CollectStats() { p.cpuValues = append(p.cpuValues, cpu) p.cpuValues.Sort() } - mem, err := p.process.MemoryPercent() + mem, err := p.process.MemoryInfo() if err == nil { - p.memValues = append(p.memValues, float64(mem)) + p.memValues = append(p.memValues, float64(mem.RSS)) p.memValues.Sort() } From beb425fe247e26937d9c127cc40695287bca3a63 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 4 Dec 2024 17:14:08 -0800 Subject: [PATCH 11/47] add in extra dimensions --- otelcollector/fluent-bit/src/process_stats.go | 119 ++++++++++++++---- 1 file changed, 97 insertions(+), 22 deletions(-) diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 580759f59..618890ae3 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -2,7 +2,9 @@ package main import ( "fmt" + "maps" "math" + "os" "sort" "strings" "sync" @@ -12,12 +14,56 @@ import ( stats "github.com/shirou/gopsutil/v4/process" ) +var replicasetDimensionsNameToEnvVar = map[string]string{ + "cpulimit": "CONTAINER_CPU_LIMIT", + "memlimit": "CONTAINER_MEMORY_LIMIT", + "defaultscrapekubelet": "AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED", + "defaultscrapecoreDns": "AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED", + "defaultscrapecadvisor": "AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED", + "defaultscrapekubeproxy": "AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED", + "defaultscrapeapiserver": "AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED", + "defaultscrapekubestate": "AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED", + "defaultscrapenodeexporter": "AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED", + "defaultscrapecollectorhealth": "AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED", + "defaultscrapewindowsexporter": "AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED", + "defaultscrapewindowskubeproxy": "AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED", + "defaultscrapepodannotations": "AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED", + "podannotationns": "AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX", + "defaultscrapekappiebasic": "AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED", + "defaultscrapenetworkobservabilityRetina": "AZMON_PROMETHEUS_NETWORKOBSERVABILITYRETINA_SCRAPING_ENABLED", + "defaultscrapenetworkobservabilityHubble": "AZMON_PROMETHEUS_NETWORKOBSERVABILITYHUBBLE_SCRAPING_ENABLED", + "defaultscrapenetworkobservabilityCilium": "AZMON_PROMETHEUS_NETWORKOBSERVABILITYCILIUM_SCRAPING_ENABLED", + "nodeexportertargetport": "NODE_EXPORTER_TARGETPORT", + "nodeexportername": "NODE_EXPORTER_NAME", + "kubestatename": "KUBE_STATE_NAME", + "kubestateversion": "KUBE_STATE_VERSION", + "nodeexporterversion": "NODE_EXPORTER_VERSION", + "akvauth": "AKVAUTH", + "debugmodeenabled": "DEBUG_MODE_ENABLED", + "kubestatemetriclabelsallowlist": "KUBE_STATE_METRIC_LABELS_ALLOWLIST", + "kubestatemetricannotationsallowlist": "KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST", + "httpproxyenabled": "HTTP_PROXY_ENABLED", + "tadapterh": "tokenadapterHealthyAfterSecs", + "tadapterf": "tokenadapterUnhealthyAfterSecs", + "setGlobalSettings": "AZMON_SET_GLOBAL_SETTINGS", + "globalSettingsConfigured": "AZMON_GLOBAL_SETTINGS_CONFIGURED", +} + +var daemonsetDimensionsNameToEnvVar = map[string]string{ + "cpulimit": "CONTAINER_CPU_LIMIT", + "memlimit": "CONTAINER_MEMORY_LIMIT", + "debugmodeenabled": "DEBUG_MODE_ENABLED", + "tadapterh": "tokenadapterHealthyAfterSecs", + "tadapterf": "tokenadapterUnhealthyAfterSecs", +} + type Process struct { - processName string - processPID int32 - cpuValues sort.Float64Slice - memValues sort.Float64Slice - process *stats.Process + processName string + processPID int32 + cpuValues sort.Float64Slice + memValues sort.Float64Slice + process *stats.Process + telemetryDimensions map[string]string } type ProcessAggregations struct { @@ -43,9 +89,10 @@ func InitProcessAggregations(processName []string) *ProcessAggregations { } p := Process{ - processName: processName, - processPID: pids[0], - process: process, + processName: processName, + processPID: pids[0], + process: process, + telemetryDimensions: getExtraDimensions(processName), // Set dimensions from env vars once } processAggregationsMap[processName] = &p @@ -67,11 +114,14 @@ func (pa *ProcessAggregations) CollectStats() { pa.mu.Lock() for _, p := range pa.processMap { + + // 0 means to use the delta with the previous CPU seconds reading cpu, err := p.process.Percent(0) if err == nil { p.cpuValues = append(p.cpuValues, cpu) p.cpuValues.Sort() } + mem, err := p.process.MemoryInfo() if err == nil { p.memValues = append(p.memValues, float64(mem.RSS)) @@ -90,27 +140,21 @@ func (pa *ProcessAggregations) SendToAppInsights() { for ; true; <-ticker.C { pa.mu.Lock() + // For each process, send 50th and 95th percentile CPU and Memory usage for processName, p := range pa.processMap { for _, percentile := range []int{50, 95} { + if len(p.cpuValues) > 0 { - cpuMetric := appinsights.NewMetricTelemetry( - fmt.Sprintf("fluent_%s_cpu_usage_0%d", strings.ToLower(processName), percentile), - float64(p.cpuValues[int(math.Round(float64(len(p.cpuValues)-1)*float64(percentile)/100.0))]), - ) - fmt.Printf("cpuMetric: %v\n", cpuMetric) - fmt.Printf("cpuValues: %v\n", p.cpuValues) - fmt.Printf("index: %d\n", int(math.Round(float64(len(p.cpuValues)-1)*float64(percentile)/100.0))) + cpuMetric := createProcessMetric(processName, "cpu_usage", percentile, p.cpuValues) + + // Add telemetry dimensions to the metric properties + maps.Copy(cpuMetric.Properties, p.telemetryDimensions) + TelemetryClient.Track(cpuMetric) } if len(p.memValues) > 0 { - memMetric := appinsights.NewMetricTelemetry( - fmt.Sprintf("fluent_%s_memory_rss_0%d", strings.ToLower(processName), percentile), - float64(p.memValues[int(math.Round(float64(len(p.memValues)-1)*float64(percentile)/100.0))]), - ) - fmt.Printf("memMetric: %v\n", memMetric) - fmt.Printf("memValues: %v\n", p.memValues) - fmt.Printf("index: %d\n", int(math.Round(float64(len(p.memValues)-1)*float64(percentile)/100.0))) + memMetric := createProcessMetric(processName, "memory_usage", percentile, p.memValues) TelemetryClient.Track(memMetric) } } @@ -123,3 +167,34 @@ func (pa *ProcessAggregations) SendToAppInsights() { pa.mu.Unlock() } } + +func getExtraDimensions(processName string) map[string]string { + extraDimensions := make(map[string]string) + + if processName == "otelcollector" { + var dimensionNamesToEnvVar map[string]string + + controllerType := os.Getenv(envControllerType) + if controllerType == "ReplicaSet" { + dimensionNamesToEnvVar = replicasetDimensionsNameToEnvVar + } else if controllerType == "DaemonSet" { + dimensionNamesToEnvVar = daemonsetDimensionsNameToEnvVar + } + + for dimensionName, envVarName := range dimensionNamesToEnvVar { + envVarValue := os.Getenv(envVarName) + if envVarValue != "" { + extraDimensions[dimensionName] = envVarValue + } + } + } + + return extraDimensions +} + +func createProcessMetric(processName string, metricName string, percentile int, values sort.Float64Slice) *appinsights.MetricTelemetry { + return appinsights.NewMetricTelemetry( + fmt.Sprintf("%s_%s_0%d", strings.ToLower(processName), metricName, percentile), + float64(values[int(math.Round(float64(len(values)-1)*float64(percentile)/100.0))]), + ) +} From 2b5102bdd7f0c2570114b2ed8fb708db5ef5b279 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 4 Dec 2024 17:15:56 -0800 Subject: [PATCH 12/47] remove telegraf cpu/mem --- .../telegraf-prometheus-collector-ds.conf | 44 ++++----- ...egraf-prometheus-collector-ta-enabled.conf | 90 +++++++++---------- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf index 417343f4c..75be1e653 100644 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf +++ b/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf @@ -160,29 +160,29 @@ # fieldpass = ["used_percent", "cluster", "node","host","device"] # taginclude = ["cluster","node"] -[[inputs.procstat]] - exe = "otelcollector" - interval = "10s" - pid_finder = "pgrep" - pid_tag = true - name_override = "otelcollector" - fieldpass = ["cpu_usage", "memory_rss"] - [inputs.procstat.tags] -# Computer = "$NODE_NAME" -# NodeIp = "$NODE_IP" - cpulimit = "$CONTAINER_CPU_LIMIT" - memlimit = "$CONTAINER_MEMORY_LIMIT" - debugmodeenabled = "$DEBUG_MODE_ENABLED" - tadapterh="$tokenadapterHealthyAfterSecs" - tadapterf="$tokenadapterUnhealthyAfterSecs" +# [[inputs.procstat]] +# exe = "otelcollector" +# interval = "10s" +# pid_finder = "pgrep" +# pid_tag = true +# name_override = "otelcollector" +# fieldpass = ["cpu_usage", "memory_rss"] +# [inputs.procstat.tags] +# # Computer = "$NODE_NAME" +# # NodeIp = "$NODE_IP" +# cpulimit = "$CONTAINER_CPU_LIMIT" +# memlimit = "$CONTAINER_MEMORY_LIMIT" +# debugmodeenabled = "$DEBUG_MODE_ENABLED" +# tadapterh="$tokenadapterHealthyAfterSecs" +# tadapterf="$tokenadapterUnhealthyAfterSecs" -[[inputs.procstat]] - exe = "MetricsExtension" - interval = "10s" - pid_finder = "pgrep" - pid_tag = true - name_override = "metricsextension" - fieldpass = ["cpu_usage", "memory_rss"] +# [[inputs.procstat]] +# exe = "MetricsExtension" +# interval = "10s" +# pid_finder = "pgrep" +# pid_tag = true +# name_override = "metricsextension" +# fieldpass = ["cpu_usage", "memory_rss"] [[inputs.prometheus]] interval = "5m" diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf index f0121f592..0641c14e5 100644 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf +++ b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf @@ -176,56 +176,56 @@ # fieldpass = ["used_percent", "cluster", "node","host","device"] # taginclude = ["cluster","node"] -[[inputs.procstat]] - exe = "otelcollector" - interval = "10s" - pid_finder = "pgrep" - pid_tag = true - name_override = "otelcollector" - fieldpass = ["cpu_usage", "memory_rss"] +# [[inputs.procstat]] +# exe = "otelcollector" +# interval = "10s" +# pid_finder = "pgrep" +# pid_tag = true +# name_override = "otelcollector" +# fieldpass = ["cpu_usage", "memory_rss"] # [inputs.procstat.tags] # # Computer = "$NODE_NAME" # # NodeIp = "$NODE_IP" -# cpulimit = "$CONTAINER_CPU_LIMIT" -# memlimit = "$CONTAINER_MEMORY_LIMIT" + # cpulimit = "$CONTAINER_CPU_LIMIT" + # memlimit = "$CONTAINER_MEMORY_LIMIT" -# defaultscrapekubelet = "$AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED" -# defaultscrapecoreDns = "$AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED" -# defaultscrapecadvisor = "$AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED" -# defaultscrapekubeproxy = "$AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED" -# defaultscrapeapiserver = "$AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED" -# defaultscrapekubestate = "$AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED" -# defaultscrapenodeexporter = "$AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED" -# defaultscrapecollectorhealth = "$AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED" -# defaultscrapewindowsexporter = "$AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED" -# defaultscrapewindowskubeproxy = "$AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED" -# defaultscrapepodannotations = "$AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED" -# podannotationns = "$AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX" -# defaultscrapekappiebasic = "$AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED" -# nodeexportertargetport= "$NODE_EXPORTER_TARGETPORT" -# nodeexportername = "$NODE_EXPORTER_NAME" -# kubestatename = "$KUBE_STATE_NAME" -# kubestateversion = "$KUBE_STATE_VERSION" -# operatortargetstaimgversion = "$OPERATOR_TARGETS_TA_IMG_VERSION" -# operatortargetscfgreaderimgversion = "$OPERATOR_TARGETS_CFG_READER_IMG_VERSION" -# nodeexporterversion = "$NODE_EXPORTER_VERSION" -# akvauth = "$AKVAUTH" -# debugmodeenabled = "$DEBUG_MODE_ENABLED" -# kubestatemetriclabelsallowlist = "$KUBE_STATE_METRIC_LABELS_ALLOWLIST" -# kubestatemetricannotationsallowlist = "$KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST" -# httpproxyenabled = "$HTTP_PROXY_ENABLED" -# tadapterh="$tokenadapterHealthyAfterSecs" -# tadapterf="$tokenadapterUnhealthyAfterSecs" -# setGlobalSettings="$AZMON_SET_GLOBAL_SETTINGS" -# globalSettingsConfigured="$AZMON_GLOBAL_SETTINGS_CONFIGURED" + # defaultscrapekubelet = "$AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED" + # defaultscrapecoreDns = "$AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED" + # defaultscrapecadvisor = "$AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED" + # defaultscrapekubeproxy = "$AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED" + # defaultscrapeapiserver = "$AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED" + # defaultscrapekubestate = "$AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED" + # defaultscrapenodeexporter = "$AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED" + # defaultscrapecollectorhealth = "$AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED" + # defaultscrapewindowsexporter = "$AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED" + # defaultscrapewindowskubeproxy = "$AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED" + # defaultscrapepodannotations = "$AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED" + # podannotationns = "$AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX" + # defaultscrapekappiebasic = "$AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED" + # nodeexportertargetport= "$NODE_EXPORTER_TARGETPORT" + # nodeexportername = "$NODE_EXPORTER_NAME" + # kubestatename = "$KUBE_STATE_NAME" + # kubestateversion = "$KUBE_STATE_VERSION" + # operatortargetstaimgversion = "$OPERATOR_TARGETS_TA_IMG_VERSION" + # operatortargetscfgreaderimgversion = "$OPERATOR_TARGETS_CFG_READER_IMG_VERSION" + # nodeexporterversion = "$NODE_EXPORTER_VERSION" + # akvauth = "$AKVAUTH" + # debugmodeenabled = "$DEBUG_MODE_ENABLED" + # kubestatemetriclabelsallowlist = "$KUBE_STATE_METRIC_LABELS_ALLOWLIST" + # kubestatemetricannotationsallowlist = "$KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST" + # httpproxyenabled = "$HTTP_PROXY_ENABLED" + # tadapterh="$tokenadapterHealthyAfterSecs" + # tadapterf="$tokenadapterUnhealthyAfterSecs" + # setGlobalSettings="$AZMON_SET_GLOBAL_SETTINGS" + # globalSettingsConfigured="$AZMON_GLOBAL_SETTINGS_CONFIGURED" -[[inputs.procstat]] - exe = "MetricsExtension" - interval = "10s" - pid_finder = "pgrep" - pid_tag = true - name_override = "metricsextension" - fieldpass = ["cpu_usage", "memory_rss"] +# [[inputs.procstat]] +# exe = "MetricsExtension" +# interval = "10s" +# pid_finder = "pgrep" +# pid_tag = true +# name_override = "metricsextension" +# fieldpass = ["cpu_usage", "memory_rss"] [[inputs.prometheus]] interval = "5m" From a5228b2f04d65d22d3f82137731a0cc17902887b Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 5 Dec 2024 09:43:53 -0800 Subject: [PATCH 13/47] dont start telegraf --- otelcollector/main/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/main/main.go b/otelcollector/main/main.go index 401f7dc8a..b791286cf 100644 --- a/otelcollector/main/main.go +++ b/otelcollector/main/main.go @@ -191,7 +191,7 @@ func main() { shared.EchoVar("FLUENT_BIT_VERSION", string(fluentBitVersion)) } - shared.StartTelegraf() + //shared.StartTelegraf() } From b07f6aad1b3cab0aae2edd9fcc7eedf89ebb7e93 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 5 Dec 2024 11:30:02 -0800 Subject: [PATCH 14/47] unrelated telemetry fixes --- otelcollector/fluent-bit/src/process_stats.go | 2 +- otelcollector/fluent-bit/src/telemetry.go | 24 ++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 618890ae3..b9083bb34 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -154,7 +154,7 @@ func (pa *ProcessAggregations) SendToAppInsights() { } if len(p.memValues) > 0 { - memMetric := createProcessMetric(processName, "memory_usage", percentile, p.memValues) + memMetric := createProcessMetric(processName, "memory_rss", percentile, p.memValues) TelemetryClient.Track(memMetric) } } diff --git a/otelcollector/fluent-bit/src/telemetry.go b/otelcollector/fluent-bit/src/telemetry.go index e3af87cff..b6e221f30 100644 --- a/otelcollector/fluent-bit/src/telemetry.go +++ b/otelcollector/fluent-bit/src/telemetry.go @@ -72,6 +72,12 @@ var ( AcstorCapacityProvisionerKeepListRegex string // ACStor Metrics Exporter keep list regex AcstorMetricsExporterKeepListRegex string + // Network Observability Cilium metrics keep list regex + NetworkObservabilityCiliumKeepListRegex string + // Network Observability Hubble metrics keep list regex + NetworkObservabilityHubbleKeepListRegex string + // Network Observability Retina metrics keep list regex + NetworkObservabilityRetinaKeepListRegex string // Kubelet scrape interval KubeletScrapeInterval string @@ -97,10 +103,16 @@ var ( PodAnnotationScrapeInterval string // Kappie Basic scrape interval KappieBasicScrapeInterval string - // ACStor Capacity Provisioner keep list regex + // ACStor Capacity Provisioner scrape interval AcstorCapacityProvisionerScrapeInterval string - // ACStor Metrics Exporter keep list regex + // ACStor Metrics Exporter scrape interval AcstorMetricsExporterScrapeInterval string + // Network Observability Cilium metrics scrape interval + NetworkObservabilityCiliumScrapeInterval string + // Network Observability Hubble metrics scrape interval + NetworkObservabilityHubbleScrapeInterval string + // Network Observability Retina metrics scrape interval + NetworkObservabilityRetinaScrapeInterval string // meMetricsProcessedCount map, which holds references to metrics per metric account meMetricsProcessedCountMap = make(map[string]*meMetricsProcessedCount) @@ -302,6 +314,9 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { KappieBasicKeepListRegex = regexHash["KAPPIEBASIC_METRICS_KEEP_LIST_REGEX"] AcstorCapacityProvisionerKeepListRegex = regexHash["ACSTORCAPACITYPROVISONER_KEEP_LIST_REGEX"] AcstorMetricsExporterKeepListRegex = regexHash["ACSTORMETRICSEXPORTER_KEEP_LIST_REGEX"] + NetworkObservabilityCiliumKeepListRegex = regexHash["NETWORKOBSERVABILITYCILIUM_METRICS_KEEP_LIST_REGEX"] + NetworkObservabilityHubbleKeepListRegex = regexHash["NETWORKOBSERVABILITYHUBBLE_METRICS_KEEP_LIST_REGEX"] + NetworkObservabilityRetinaKeepListRegex = regexHash["NETWORKOBSERVABILITYRETINA_METRICS_KEEP_LIST_REGEX"] } } @@ -329,7 +344,10 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { PodAnnotationScrapeInterval = intervalHash["POD_ANNOTATION_SCRAPE_INTERVAL"] KappieBasicScrapeInterval = intervalHash["KAPPIEBASIC_SCRAPE_INTERVAL"] AcstorCapacityProvisionerScrapeInterval = intervalHash["ACSTORCAPACITYPROVISIONER_SCRAPE_INTERVAL"] - AcstorMetricsExporterScrapeInterval = intervalHash["ACSTORMETRICSEXPORTER_KEEP_LIST_REGEX"] + AcstorMetricsExporterScrapeInterval = intervalHash["ACSTORMETRICSEXPORTER_SCRAPE_INTERVAL"] + NetworkObservabilityCiliumScrapeInterval = intervalHash["NETWORKOBSERVABILITYCILIUM_SCRAPE_INTERVAL"] + NetworkObservabilityHubbleScrapeInterval = intervalHash["NETWORKOBSERVABILITYHUBBLE_SCRAPE_INTERVAL"] + NetworkObservabilityRetinaScrapeInterval = intervalHash["NETWORKOBSERVABILITYRETINA_SCRAPE_INTERVAL"] } } From 9a0bff344ccb311c9ecdc63b97bbcc9e8574450e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 13:16:27 -0800 Subject: [PATCH 15/47] build custom fluent-bit --- otelcollector/build/linux/Dockerfile | 43 ++++- .../fluent-bit/plugins_options.cmake | 153 ++++++++++++++++++ otelcollector/scripts/setup.sh | 6 +- 3 files changed, 197 insertions(+), 5 deletions(-) create mode 100644 otelcollector/fluent-bit/plugins_options.cmake diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 053bb3d94..0e87a74df 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -68,6 +68,39 @@ RUN apt-get update && apt-get install gcc-aarch64-linux-gnu -y ARG TARGETOS TARGETARCH RUN if [ "$TARGETARCH" = "arm64" ] ; then CC=aarch64-linux-gnu-gcc CGO_ENABLED=1 GOOS=$TARGETOS GOARCH=$TARGETARCH go build -buildmode=exe -ldflags '-linkmode external -extldflags=-Wl,-z,now' -o main.exe ./main.go ; else CGO_ENABLED=1 GOOS=$TARGETOS GOARCH=$TARGETARCH go build -buildmode=exe -ldflags '-linkmode external -extldflags=-Wl,-z,now' -o main.exe ./main.go ; fi +ARG TARGETARCH +FROM mcr.microsoft.com/cbl-mariner/base/core:2.0 AS fluent-bit-binary-builder +WORKDIR / +# Install with the same exact dependencies and code source that Mariner uses +RUN tdnf install wget tar ca-certificates bison cmake cyrus-sasl-devel doxygen flex gcc-c++ \ + gnutls-devel graphviz libpq-devel libyaml-devel luajit-devel make openssl-devel pkgconfig \ + systemd-devel systemd-rpm-macros zlib-devel build-essential -y +ARG FLUENT_BIT_VERSION +RUN wget https://github.com/fluent/fluent-bit/archive/refs/tags/v${FLUENT_BIT_VERSION}.tar.gz +RUN tar -xvf v${FLUENT_BIT_VERSION}.tar.gz +# Add a file with settings to build only the plugins we use +COPY ./plugins_options.cmake /fluent-bit-${FLUENT_BIT_VERSION}/cmake/plugins_options.cmake +# Make a change that allows Fluent-Bit metrics to flow to our Go output plugin +RUN sed -i '/out->type = FLB_OUTPUT_PLUGIN_PROXY;/a \ \ \ \ out->event_type = FLB_OUTPUT_LOGS | FLB_OUTPUT_METRICS;' /fluent-bit-${FLUENT_BIT_VERSION}/src/flb_plugin_proxy.c +WORKDIR /fluent-bit-${FLUENT_BIT_VERSION}/build +# Run cmake with the same flags that Mariner uses +RUN cmake \ +-DCMAKE_BUILD_TYPE=RelWithDebInfo \ +-DFLB_EXAMPLES=Off \ +-DFLB_OUT_SLACK=Off \ +-DFLB_IN_SYSTEMD=On \ +-DFLB_OUT_TD=Off \ +-DFLB_OUT_ES=Off \ +-DFLB_SHARED_LIB=On \ +-DFLB_RELEASE=On \ +-DFLB_DEBUG=Off \ +-DFLB_TLS=On \ +-DFLB_JEMALLOC=On \ +-DFLB_PREFER_SYSTEM_LIBS=On \ +-DFLB_PROXY_GO=On ../ +RUN make +RUN make install + FROM mcr.microsoft.com/cbl-mariner/base/core:2.0 as builder LABEL description="Azure Monitor Prometheus metrics collector" LABEL maintainer="ciprometheus@microsoft.com" @@ -106,6 +139,9 @@ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricex COPY ./telegraf/ $tmpdir/telegraf/ COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/ +COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit +COPY --from=fluent-bit-binary-builder /usr/local/etc/fluent-bit /usr/local/etc/fluent-bit +COPY --from=fluent-bit-binary-builder /usr/local/lib/fluent-bit /usr/local/etc/fluent-bit COPY ./react /static/react COPY ./LICENSE $tmpdir/microsoft COPY ./NOTICE $tmpdir/microsoft @@ -183,6 +219,10 @@ COPY --from=builder /bin/sh /bin/sh COPY --from=builder /usr/bin/p11-kit /usr/bin COPY --from=builder /usr/bin/trust /usr/bin +COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit +COPY --from=fluent-bit-binary-builder /usr/local/etc/fluent-bit /usr/local/etc/fluent-bit +COPY --from=fluent-bit-binary-builder /usr/local/lib/fluent-bit /usr/local/etc/fluent-bit + # bash dependencies COPY --from=builder /lib/libreadline.so.8 /lib/ COPY --from=builder /usr/lib/libncursesw.so.6 /usr/lib/libtinfo.so.6 /usr/lib/ @@ -198,7 +238,8 @@ COPY --from=builder /lib/libboost_filesystem.so.1.76.0 /lib/libcpprest.so.2.10 COPY --from=builder /lib64/libuuid.so.1 /lib64 # fluent-bit dependencies # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ +# COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ +COPY --from=fluent-bit-binary-builder /lib/libluajit-5.1.so.2 /lib/libssl.so.1.1 /lib/libcrypto.so.1.1 /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/libzstd.so.1 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libgpg-error.so.0 /lib/ # telegraf dependencies COPY --from=builder /lib/libc.so.6 /lib/ # mdsd dependencies diff --git a/otelcollector/fluent-bit/plugins_options.cmake b/otelcollector/fluent-bit/plugins_options.cmake new file mode 100644 index 000000000..c4551d36d --- /dev/null +++ b/otelcollector/fluent-bit/plugins_options.cmake @@ -0,0 +1,153 @@ +macro(DEFINE_OPTION option_name description default_value) + set(temp_value ${default_value}) + if(FLB_MINIMAL) + set(temp_value OFF) + endif() + option(${option_name} "${description}" ${temp_value}) +endmacro() + +# Add the FLB_MINIMAL option +option(FLB_MINIMAL "Enable minimal build configuration" No) + +# Inputs (sources, data collectors) +# ================================= +DEFINE_OPTION(FLB_IN_BLOB "Enable Blob input plugin" OFF) +DEFINE_OPTION(FLB_IN_CALYPTIA_FLEET "Enable Calyptia Fleet input plugin" OFF) +DEFINE_OPTION(FLB_IN_COLLECTD "Enable Collectd input plugin" OFF) +DEFINE_OPTION(FLB_IN_CPU "Enable CPU input plugin" OFF) +DEFINE_OPTION(FLB_IN_DISK "Enable Disk input plugin" OFF) +DEFINE_OPTION(FLB_IN_DOCKER "Enable Docker input plugin" OFF) +DEFINE_OPTION(FLB_IN_DOCKER_EVENTS "Enable Docker events input plugin" OFF) +DEFINE_OPTION(FLB_IN_DUMMY "Enable Dummy input plugin" OFF) +DEFINE_OPTION(FLB_IN_ELASTICSEARCH "Enable Elasticsearch (Bulk API) input plugin" OFF) +# Necessary for rewrite_tag filter plugin +DEFINE_OPTION(FLB_IN_EMITTER "Enable emitter input plugin" ON) +DEFINE_OPTION(FLB_IN_EVENT_TEST "Enable event test plugin" OFF) +DEFINE_OPTION(FLB_IN_EVENT_TYPE "Enable event type plugin" OFF) +DEFINE_OPTION(FLB_IN_EXEC "Enable Exec input plugin" OFF) +DEFINE_OPTION(FLB_IN_EXEC_WASI "Enable Exec WASI input plugin" OFF) +DEFINE_OPTION(FLB_IN_FLUENTBIT_METRICS "Enable Fluent Bit metrics plugin" OFF) +DEFINE_OPTION(FLB_IN_FORWARD "Enable Forward input plugin" OFF) +DEFINE_OPTION(FLB_IN_HEAD "Enable Head input plugin" OFF) +DEFINE_OPTION(FLB_IN_HEALTH "Enable Health input plugin" OFF) +DEFINE_OPTION(FLB_IN_HTTP "Enable HTTP input plugin" OFF) +DEFINE_OPTION(FLB_IN_KAFKA "Enable Kafka input plugin" OFF) +DEFINE_OPTION(FLB_IN_KMSG "Enable Kernel log input plugin" OFF) +DEFINE_OPTION(FLB_IN_KUBERNETES_EVENTS "Enable Kubernetes Events plugin" OFF) +DEFINE_OPTION(FLB_IN_LIB "Enable library mode input plugin" OFF) +DEFINE_OPTION(FLB_IN_MEM "Enable Memory input plugin" OFF) +DEFINE_OPTION(FLB_IN_MQTT "Enable MQTT Broker input plugin" OFF) +DEFINE_OPTION(FLB_IN_NETIF "Enable NetworkIF input plugin" OFF) +DEFINE_OPTION(FLB_IN_NGINX_EXPORTER_METRICS "Enable Nginx Metrics input plugin" OFF) +DEFINE_OPTION(FLB_IN_NODE_EXPORTER_METRICS "Enable node exporter metrics input plugin" OFF) +DEFINE_OPTION(FLB_IN_OPENTELEMETRY "Enable OpenTelemetry input plugin" OFF) +DEFINE_OPTION(FLB_IN_PODMAN_METRICS "Enable Podman Metrics input plugin" OFF) +DEFINE_OPTION(FLB_IN_PROCESS_EXPORTER_METRICS "Enable process exporter metrics input plugin" OFF) +DEFINE_OPTION(FLB_IN_PROC "Enable Process input plugin" OFF) +DEFINE_OPTION(FLB_IN_PROMETHEUS_REMOTE_WRITE "Enable prometheus remote write input plugin" OFF) +DEFINE_OPTION(FLB_IN_PROMETHEUS_SCRAPE "Enable Prometheus Scrape input plugin" OFF) +DEFINE_OPTION(FLB_IN_RANDOM "Enable random input plugin" OFF) +DEFINE_OPTION(FLB_IN_SERIAL "Enable Serial input plugin" OFF) +DEFINE_OPTION(FLB_IN_SPLUNK "Enable Splunk HTTP HEC input plugin" OFF) +DEFINE_OPTION(FLB_IN_STATSD "Enable StatsD input plugin" OFF) +DEFINE_OPTION(FLB_IN_STDIN "Enable Standard input plugin" OFF) +DEFINE_OPTION(FLB_IN_STORAGE_BACKLOG "Enable storage backlog input plugin" OFF) +DEFINE_OPTION(FLB_IN_SYSLOG "Enable Syslog input plugin" OFF) +DEFINE_OPTION(FLB_IN_SYSTEMD "Enable Systemd input plugin" OFF) +DEFINE_OPTION(FLB_IN_TAIL "Enable Tail input plugin" ON) +DEFINE_OPTION(FLB_IN_TCP "Enable TCP input plugin" OFF) +DEFINE_OPTION(FLB_IN_THERMAL "Enable Thermal plugin" OFF) +DEFINE_OPTION(FLB_IN_UDP "Enable UDP input plugin" OFF) +DEFINE_OPTION(FLB_IN_UNIX_SOCKET "Enable Unix socket input plugin" OFF) +DEFINE_OPTION(FLB_IN_WINLOG "Enable Windows Log input plugin" OFF) +DEFINE_OPTION(FLB_IN_WINDOWS_EXPORTER_METRICS "Enable windows exporter metrics input plugin" OFF) +DEFINE_OPTION(FLB_IN_WINEVTLOG "Enable Windows EvtLog input plugin" OFF) +DEFINE_OPTION(FLB_IN_WINSTAT "Enable Windows Stat input plugin" OFF) +DEFINE_OPTION(FLB_IN_EBPF "Enable Linux eBPF input plugin" OFF) + +# Processors +# ========== +DEFINE_OPTION(FLB_PROCESSOR_CONTENT_MODIFIER "Enable content modifier processor" OFF) +DEFINE_OPTION(FLB_PROCESSOR_LABELS "Enable metrics label manipulation processor" OFF) +DEFINE_OPTION(FLB_PROCESSOR_METRICS_SELECTOR "Enable metrics selector processor" OFF) +DEFINE_OPTION(FLB_PROCESSOR_SQL "Enable SQL processor" OFF) +DEFINE_OPTION(FLB_PROCESSOR_OPENTELEMETRY_ENVELOPE "Enable OpenTelemetry envelope processor" OFF) + +# Filters +# ======= +DEFINE_OPTION(FLB_FILTER_ALTER_SIZE "Enable alter_size filter" OFF) +DEFINE_OPTION(FLB_FILTER_AWS "Enable aws filter" OFF) +DEFINE_OPTION(FLB_FILTER_CHECKLIST "Enable checklist filter" OFF) +DEFINE_OPTION(FLB_FILTER_ECS "Enable AWS ECS filter" OFF) +DEFINE_OPTION(FLB_FILTER_EXPECT "Enable expect filter" OFF) +DEFINE_OPTION(FLB_FILTER_GEOIP2 "Enable geoip2 filter" OFF) +DEFINE_OPTION(FLB_FILTER_GREP "Enable grep filter" ON) +DEFINE_OPTION(FLB_FILTER_KUBERNETES "Enable kubernetes filter" OFF) +DEFINE_OPTION(FLB_FILTER_LOG_TO_METRICS "Enable log-derived metrics filter" OFF) +DEFINE_OPTION(FLB_FILTER_LUA "Enable Lua scripting filter" OFF) +DEFINE_OPTION(FLB_FILTER_LUA_USE_MPACK "Enable mpack on the lua filter" OFF) +DEFINE_OPTION(FLB_FILTER_MODIFY "Enable modify filter" OFF) +DEFINE_OPTION(FLB_FILTER_MULTILINE "Enable multiline filter" OFF) +DEFINE_OPTION(FLB_FILTER_NEST "Enable nest filter" OFF) +DEFINE_OPTION(FLB_FILTER_NIGHTFALL "Enable Nightfall filter" OFF) +DEFINE_OPTION(FLB_FILTER_PARSER "Enable parser filter" ON) +DEFINE_OPTION(FLB_FILTER_RECORD_MODIFIER "Enable record_modifier filter" OFF) +DEFINE_OPTION(FLB_FILTER_REWRITE_TAG "Enable tag rewrite filter" ON) +DEFINE_OPTION(FLB_FILTER_STDOUT "Enable stdout filter" OFF) +DEFINE_OPTION(FLB_FILTER_SYSINFO "Enable sysinfo filter" OFF) +DEFINE_OPTION(FLB_FILTER_THROTTLE "Enable throttle filter" OFF) +DEFINE_OPTION(FLB_FILTER_THROTTLE_SIZE "Enable throttle size filter" OFF) +DEFINE_OPTION(FLB_FILTER_TYPE_CONVERTER "Enable type converter filter" OFF) +DEFINE_OPTION(FLB_FILTER_TENSORFLOW "Enable tensorflow filter" OFF) +DEFINE_OPTION(FLB_FILTER_WASM "Enable WASM filter" OFF) + +# Outputs (destinations) +# ====================== +DEFINE_OPTION(FLB_OUT_AZURE "Enable Azure output plugin" OFF) +DEFINE_OPTION(FLB_OUT_AZURE_BLOB "Enable Azure output plugin" OFF) +DEFINE_OPTION(FLB_OUT_AZURE_KUSTO "Enable Azure Kusto output plugin" OFF) +DEFINE_OPTION(FLB_OUT_AZURE_LOGS_INGESTION "Enable Azure Logs Ingestion output plugin" OFF) +DEFINE_OPTION(FLB_OUT_BIGQUERY "Enable BigQuery output plugin" OFF) +DEFINE_OPTION(FLB_OUT_CALYPTIA "Enable Calyptia monitoring plugin" OFF) +DEFINE_OPTION(FLB_OUT_CHRONICLE "Enable Google Chronicle output plugin" OFF) +DEFINE_OPTION(FLB_OUT_CLOUDWATCH_LOGS "Enable AWS CloudWatch output plugin" OFF) +DEFINE_OPTION(FLB_OUT_COUNTER "Enable Counter output plugin" OFF) +DEFINE_OPTION(FLB_OUT_DATADOG "Enable DataDog output plugin" OFF) +DEFINE_OPTION(FLB_OUT_ES "Enable Elasticsearch output plugin" OFF) +DEFINE_OPTION(FLB_OUT_EXIT "Enable Exit output plugin" OFF) +DEFINE_OPTION(FLB_OUT_FILE "Enable file output plugin" OFF) +DEFINE_OPTION(FLB_OUT_FLOWCOUNTER "Enable flowcount output plugin" OFF) +DEFINE_OPTION(FLB_OUT_FORWARD "Enable Forward output plugin" OFF) +DEFINE_OPTION(FLB_OUT_GELF "Enable GELF output plugin" OFF) +DEFINE_OPTION(FLB_OUT_HTTP "Enable HTTP output plugin" OFF) +DEFINE_OPTION(FLB_OUT_INFLUXDB "Enable InfluxDB output plugin" OFF) +DEFINE_OPTION(FLB_OUT_KAFKA "Enable Kafka output plugin" OFF) +DEFINE_OPTION(FLB_OUT_KAFKA_REST "Enable Kafka Rest output plugin" OFF) +DEFINE_OPTION(FLB_OUT_KINESIS_FIREHOSE "Enable AWS Firehose output plugin" OFF) +DEFINE_OPTION(FLB_OUT_KINESIS_STREAMS "Enable AWS Kinesis output plugin" OFF) +DEFINE_OPTION(FLB_OUT_LIB "Enable library mode output plugin" OFF) +DEFINE_OPTION(FLB_OUT_LOGDNA "Enable LogDNA output plugin" OFF) +DEFINE_OPTION(FLB_OUT_LOKI "Enable Loki output plugin" OFF) +DEFINE_OPTION(FLB_OUT_NATS "Enable NATS output plugin" OFF) +DEFINE_OPTION(FLB_OUT_NRLOGS "Enable New Relic output plugin" OFF) +DEFINE_OPTION(FLB_OUT_NULL "Enable dev null output plugin" OFF) +DEFINE_OPTION(FLB_OUT_OPENSEARCH "Enable OpenSearch output plugin" OFF) +DEFINE_OPTION(FLB_OUT_OPENTELEMETRY "Enable OpenTelemetry plugin" OFF) +DEFINE_OPTION(FLB_OUT_ORACLE_LOG_ANALYTICS "Enable Oracle Cloud Infrastructure Logging analytics plugin" OFF) +DEFINE_OPTION(FLB_OUT_PGSQL "Enable PostgreSQL output plugin" OFF) +DEFINE_OPTION(FLB_OUT_PLOT "Enable Plot output plugin" OFF) +DEFINE_OPTION(FLB_OUT_PROMETHEUS_EXPORTER "Enable Prometheus exporter plugin" OFF) +DEFINE_OPTION(FLB_OUT_PROMETHEUS_REMOTE_WRITE "Enable Prometheus remote write plugin" OFF) +DEFINE_OPTION(FLB_OUT_RETRY "Enable Retry test output plugin" OFF) +DEFINE_OPTION(FLB_OUT_S3 "Enable AWS S3 output plugin" OFF) +DEFINE_OPTION(FLB_OUT_SKYWALKING "Enable Apache SkyWalking output plugin" OFF) +DEFINE_OPTION(FLB_OUT_SLACK "Enable Slack output plugin" OFF) +DEFINE_OPTION(FLB_OUT_SPLUNK "Enable Splunk output plugin" OFF) +DEFINE_OPTION(FLB_OUT_STACKDRIVER "Enable Stackdriver output plugin" OFF) +DEFINE_OPTION(FLB_OUT_STDOUT "Enable STDOUT output plugin" ON) +DEFINE_OPTION(FLB_OUT_SYSLOG "Enable Syslog output plugin" OFF) +DEFINE_OPTION(FLB_OUT_TD "Enable Treasure Data output plugin" OFF) +DEFINE_OPTION(FLB_OUT_TCP "Enable TCP output plugin" OFF) +DEFINE_OPTION(FLB_OUT_UDP "Enable UDP output plugin" OFF) +DEFINE_OPTION(FLB_OUT_VIVO_EXPORTER "Enable Vivo exporter output plugin" OFF) +DEFINE_OPTION(FLB_OUT_WEBSOCKET "Enable Websocket output plugin" OFF) diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index fa0f4c7d6..3c9ebef03 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -56,10 +56,8 @@ sudo tdnf install telegraf-1.29.4 -y sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.txt # Install fluent-bit -echo "Installing fluent-bit..." -sudo tdnf install fluent-bit -y -# wget https://packages.fluentbit.io/centos/7/fluent-bit-3.2.2-1.x86_64.rpm -# sudo tdnf install -y fluent-bit-3.2.2-1.x86_64.rpm +# echo "Installing fluent-bit..." +# sudo tdnf install fluent-bit -y # Setup hourly cron for logrotate cp /etc/cron.daily/logrotate /etc/cron.hourly/ From 562b7ccaf9d65306ba579cbea84dd64ff6515db3 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 13:40:47 -0800 Subject: [PATCH 16/47] add in metrics --- otelcollector/fluent-bit/fluent-bit.conf | 28 ++- .../fluent-bit/plugins_options.cmake | 2 +- .../fluent-bit/src/cmetrics_decoder.go | 229 ++++++++++++++++++ otelcollector/fluent-bit/src/go.mod | 3 +- otelcollector/fluent-bit/src/go.sum | 8 +- .../fluent-bit/src/out_appinsights.go | 4 +- 6 files changed, 265 insertions(+), 9 deletions(-) create mode 100644 otelcollector/fluent-bit/src/cmetrics_decoder.go diff --git a/otelcollector/fluent-bit/fluent-bit.conf b/otelcollector/fluent-bit/fluent-bit.conf index 70e47011d..a7581ed8a 100644 --- a/otelcollector/fluent-bit/fluent-bit.conf +++ b/otelcollector/fluent-bit/fluent-bit.conf @@ -6,7 +6,7 @@ storage.sync normal storage.checksum off storage.backlog.mem_limit 10M - Log_Level info + Log_Level debug Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf Log_File /opt/fluent-bit/fluent-bit.log @@ -92,6 +92,30 @@ Skip_Long_Lines On Ignore_Older 2m +[INPUT] + name prometheus_scrape + host 0.0.0.0 + port 8888 + tag prometheus.metrics.otelcollector + metrics_path /metrics + scrape_interval 1m + +[INPUT] + name prometheus_scrape + host 0.0.0.0 + port 9090 + tag prometheus.metrics.prometheus + metrics_path /metrics + scrape_interval 1m + +[INPUT] + name prometheus_scrape + host ama-metrics-operator-targets.kube-system.svc.cluster.local + port 80 + tag prometheus.metrics.targetallocator + metrics_path /metrics + scrape_interval 1m + # Send log lines that contain the telemetry we want to a different tag # to then send to customMetrics table [FILTER] @@ -138,7 +162,7 @@ [OUTPUT] Name appinsights - Match prometheus.log.* + Match_regex prometheus.log.*|prometheus.metrics.* [OUTPUT] Name stdout diff --git a/otelcollector/fluent-bit/plugins_options.cmake b/otelcollector/fluent-bit/plugins_options.cmake index c4551d36d..10d2d1f63 100644 --- a/otelcollector/fluent-bit/plugins_options.cmake +++ b/otelcollector/fluent-bit/plugins_options.cmake @@ -45,7 +45,7 @@ DEFINE_OPTION(FLB_IN_PODMAN_METRICS "Enable Podman Metrics input plugi DEFINE_OPTION(FLB_IN_PROCESS_EXPORTER_METRICS "Enable process exporter metrics input plugin" OFF) DEFINE_OPTION(FLB_IN_PROC "Enable Process input plugin" OFF) DEFINE_OPTION(FLB_IN_PROMETHEUS_REMOTE_WRITE "Enable prometheus remote write input plugin" OFF) -DEFINE_OPTION(FLB_IN_PROMETHEUS_SCRAPE "Enable Prometheus Scrape input plugin" OFF) +DEFINE_OPTION(FLB_IN_PROMETHEUS_SCRAPE "Enable Prometheus Scrape input plugin" ON) DEFINE_OPTION(FLB_IN_RANDOM "Enable random input plugin" OFF) DEFINE_OPTION(FLB_IN_SERIAL "Enable Serial input plugin" OFF) DEFINE_OPTION(FLB_IN_SPLUNK "Enable Splunk HTTP HEC input plugin" OFF) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go new file mode 100644 index 000000000..6f621265f --- /dev/null +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -0,0 +1,229 @@ +package main + +import ( + "C" + "encoding/binary" + "fmt" + "reflect" + "strings" + "time" + "unsafe" + + "github.com/fluent/fluent-bit-go/output" + "github.com/microsoft/ApplicationInsights-Go/appinsights" + "github.com/mitchellh/mapstructure" + "github.com/ugorji/go/codec" +) + +// Taken from fluent-bit-go to modify +type FLBDecoder struct { + handle *codec.MsgpackHandle + mpdec *codec.Decoder +} + +type FLBTime struct { + time.Time +} + +func (f FLBTime) WriteExt(interface{}) []byte { + panic("unsupported") +} + +func (f FLBTime) ReadExt(i interface{}, b []byte) { + out := i.(*FLBTime) + sec := binary.BigEndian.Uint32(b) + usec := binary.BigEndian.Uint32(b[4:]) + out.Time = time.Unix(int64(sec), int64(usec)) +} + +func (f FLBTime) ConvertExt(v interface{}) interface{} { + return nil +} + +func (f FLBTime) UpdateExt(dest interface{}, v interface{}) { + panic("unsupported") +} + +type AggregationType int64 + +const ( + UNSPECIFIED AggregationType = 0 + DELTA AggregationType = 1 + CUMMULATIVE AggregationType = 2 +) + +func (at AggregationType) String() string { + switch at { + case UNSPECIFIED: + return "unspecified" + case DELTA: + return "delta" + case CUMMULATIVE: + return "cumulative" + default: + return "" + } +} + +type MetricType int64 + +const ( + COUNTER MetricType = 0 + GAUGE MetricType = 1 + HISTOGRAM MetricType = 2 + SUMMARY MetricType = 3 + UNTYPED MetricType = 4 +) + +func (mt MetricType) String() string { + switch mt { + case COUNTER: + return "counter" + case GAUGE: + return "gauge" + case HISTOGRAM: + return "histogram" + case SUMMARY: + return "summary" + case UNTYPED: + return "untyped" + default: + return "" + } +} + +type CMetrics struct { + Meta struct { + Cmetrics map[string]interface{} `mapstructure:"cmetrics"` + External map[string]interface{} `mapstructure:"external"` + Processing struct { + StaticLabels []interface{} `mapstructure:"static_labels"` + } `mapstructure:"processing"` + } `mapstructure:"meta"` + Metrics []struct { + Meta struct { + AggregationType AggregationType `mapstructure:"aggregation_type"` + Labels []string `mapstructure:"labels"` + Opts struct { + Desc string `mapstructure:"desc"` + Name string `mapstructure:"name"` + Namespace string `mapstructure:"ns"` + Subsystem string `mapstructure:"ss"` + } `mapstructure:"opts"` + Type MetricType `mapstructure:"type"` + Ver int `mapstructure:"ver"` + } `mapstructure:"meta"` + Values []struct { + Hash int64 `mapstructure:"hash"` + Labels []string `mapstructure:"labels"` + Ts int64 `mapstructure:"ts"` + Value float64 `mapstructure:"value"` + } `mapstructure:"values"` + } `mapstructure:"metrics"` +} + +func (cm CMetrics) String() string { + var ret strings.Builder + + for _, metric := range cm.Metrics { + ret.WriteString(fmt.Sprintf("# HELP %s %s\n", metric.Meta.Opts.Name, metric.Meta.Opts.Desc)) + ret.WriteString(fmt.Sprintf("# TYPE %s %s\n", metric.Meta.Opts.Name, metric.Meta.Type)) + + for _, value := range metric.Values { + ret.WriteString(fmt.Sprintf("%s{", metric.Meta.Opts.Name)) + for i, labelName := range metric.Meta.Labels { + ret.WriteString(fmt.Sprintf("%s=%s", labelName, value.Labels[i])) + if i < len(metric.Meta.Labels)-1 { + ret.WriteString(",") + } + } + ret.WriteString(fmt.Sprintf("} %.0f\n", value.Value)) + } + } + + return ret.String() +} + +func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}) int { + for _, record := range records { + cMetrics := ConvertRecordToCMetrics(record) + for _, metric := range cMetrics.Metrics { + for _, value := range metric.Values { + metricTelemetryItem := appinsights.NewMetricTelemetry(metric.Meta.Opts.Name, value.Value) + for i, labelName := range metric.Meta.Labels { + metricTelemetryItem.Properties[labelName] = fmt.Sprintf("%d", value.Labels[i]) + } + TelemetryClient.Track(metricTelemetryItem) + } + } + } + return output.FLB_OK +} + +func ConvertRecordToCMetrics(record map[interface{}]interface{}) (cMetrics CMetrics) { + var result CMetrics + mapstructure.WeakDecode(record, &result) + return result +} + +func NewDecoder(data unsafe.Pointer, length int) *FLBDecoder { + var b []byte + + dec := new(FLBDecoder) + dec.handle = new(codec.MsgpackHandle) + dec.handle.SetBytesExt(reflect.TypeOf(FLBTime{}), 0, &FLBTime{}) + + b = C.GoBytes(data, C.int(length)) + dec.mpdec = codec.NewDecoderBytes(b, dec.handle) + + return dec +} + +func GetRecord(dec *FLBDecoder) (ret int, ts interface{}, rec map[interface{}]interface{}) { + var check error + var m interface{} + + check = dec.mpdec.Decode(&m) + if check != nil { + return -1, 0, nil + } + + i := reflect.ValueOf(m) + if i.Len() != 2 { + return -2, 0, nil + } + + switch i.Kind() { + case reflect.Map: // Metrics + map_data := i.Interface().(map[interface{}]interface{}) + return 0, 0, map_data + case reflect.Slice: // Logs + var t interface{} + ts = i.Index(0).Interface() + switch ty := ts.(type) { + case FLBTime: + t = ty + case uint64: + t = ty + case []interface{}: // for Fluent Bit V2 metadata type of format + s := reflect.ValueOf(ty) + if s.Kind() != reflect.Slice || s.Len() < 2 { + return -4, 0, nil + } + t = s.Index(0).Interface() + default: + return -5, 0, nil + } + data := i.Index(1) + + map_data, ok := data.Interface().(map[interface{}]interface{}) + if !ok { + return -3, 0, nil + } + + return 0, t, map_data + + default: + return -2, 0, nil + } +} diff --git a/otelcollector/fluent-bit/src/go.mod b/otelcollector/fluent-bit/src/go.mod index beb9c6b8b..8b42c0ce5 100644 --- a/otelcollector/fluent-bit/src/go.mod +++ b/otelcollector/fluent-bit/src/go.mod @@ -5,8 +5,10 @@ go 1.22 require ( github.com/fluent/fluent-bit-go v0.0.0-20220311094233-780004bf5562 github.com/microsoft/ApplicationInsights-Go v0.4.4 + github.com/mitchellh/mapstructure v1.5.0 github.com/prometheus/client_golang v1.18.0 github.com/shirou/gopsutil/v4 v4.24.11 + github.com/ugorji/go/codec v1.1.7 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gopkg.in/yaml.v2 v2.4.0 k8s.io/apimachinery v0.29.4 @@ -46,7 +48,6 @@ require ( github.com/rogpeppe/go-internal v1.11.0 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect - github.com/ugorji/go/codec v1.1.7 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.12.0 // indirect diff --git a/otelcollector/fluent-bit/src/go.sum b/otelcollector/fluent-bit/src/go.sum index 869d3afee..db161166f 100644 --- a/otelcollector/fluent-bit/src/go.sum +++ b/otelcollector/fluent-bit/src/go.sum @@ -71,6 +71,8 @@ github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvls github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= github.com/microsoft/ApplicationInsights-Go v0.4.4 h1:G4+H9WNs6ygSCe6sUyxRc2U81TI5Es90b2t/MwX5KqY= github.com/microsoft/ApplicationInsights-Go v0.4.4/go.mod h1:fKRUseBqkw6bDiXTs3ESTiU/4YTIHsQS4W3fP2ieF4U= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -111,8 +113,8 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tedsuo/ifrit v0.0.0-20180802180643-bea94bb476cc/go.mod h1:eyZnKCc955uh98WQvzOm0dgAeLnf2O0Rz0LPoC5ze+0= github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= @@ -152,8 +154,6 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= -golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.18.0 h1:FcHjZXDMxI8mM3nwhX9HlKop4C0YQvCVCdwYl2wOtE8= diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index ac073352b..8dff9308c 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -64,7 +64,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { var records []map[interface{}]interface{} // Create Fluent Bit decoder - dec := output.NewDecoder(data, int(length)) + dec := NewDecoder(data, int(length)) // Iterate Records for { @@ -90,6 +90,8 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { return PushInfiniteMetricLogToAppInsightsEvents(records) case fluentbitExportingFailedTag: return RecordExportingFailed(records) + case "prometheus.metrics.otelcollector", "prometheus.metrics.prometheus", "prometheus.metrics.targetallocator": + return SendPrometheusMetricsToAppInsights(records) default: // Error messages from metrics extension and otelcollector return PushLogErrorsToAppInsightsTraces(records, appinsights.Information, incomingTag) From 78efb86221c3bddd3f7e8547421051ffc8ca115c Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 13:43:31 -0800 Subject: [PATCH 17/47] fix path to cmake options --- otelcollector/build/linux/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 0e87a74df..67d823b54 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -79,7 +79,7 @@ ARG FLUENT_BIT_VERSION RUN wget https://github.com/fluent/fluent-bit/archive/refs/tags/v${FLUENT_BIT_VERSION}.tar.gz RUN tar -xvf v${FLUENT_BIT_VERSION}.tar.gz # Add a file with settings to build only the plugins we use -COPY ./plugins_options.cmake /fluent-bit-${FLUENT_BIT_VERSION}/cmake/plugins_options.cmake +COPY ./fluent-bit/plugins_options.cmake /fluent-bit-${FLUENT_BIT_VERSION}/cmake/plugins_options.cmake # Make a change that allows Fluent-Bit metrics to flow to our Go output plugin RUN sed -i '/out->type = FLB_OUTPUT_PLUGIN_PROXY;/a \ \ \ \ out->event_type = FLB_OUTPUT_LOGS | FLB_OUTPUT_METRICS;' /fluent-bit-${FLUENT_BIT_VERSION}/src/flb_plugin_proxy.c WORKDIR /fluent-bit-${FLUENT_BIT_VERSION}/build From 44d3b267634c5db23a75cb70d7efd54c7f396dc4 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 13:57:28 -0800 Subject: [PATCH 18/47] missed file --- otelcollector/fluent-bit/src/out_appinsights.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index 8dff9308c..0b7b936d2 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -69,7 +69,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // Iterate Records for { // Extract Record - ret, _, record = output.GetRecord(dec) + ret, _, record = GetRecord(dec) if ret != 0 { break } From 7036b98fc18e315ea0909e266f3385619fdb01f0 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 14:08:32 -0800 Subject: [PATCH 19/47] fluent-bit version --- .pipelines/azure-pipeline-build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml index c1b42173a..b523e57e2 100644 --- a/.pipelines/azure-pipeline-build.yml +++ b/.pipelines/azure-pipeline-build.yml @@ -27,6 +27,7 @@ variables: BUILD_WINDOWS: false Codeql.Enabled: true GOLANG_VERSION: '1.22.7' + FLUENT_BIT_VERSION: '3.2.2' stages: - stage: Build @@ -635,7 +636,7 @@ stages: docker buildx create --name dockerbuilder --driver docker-container --driver-opt image=mcr.microsoft.com/azuremonitor/containerinsights/cidev/prometheus-collector/images:buildx-stable-1 --use docker buildx inspect --bootstrap - docker buildx build . --platform=linux/amd64 --file ./build/linux/Dockerfile -t $(LINUX_FULL_IMAGE_NAME) --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --push # --cache-to type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector,mode=max --cache-from type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector + docker buildx build . --platform=linux/amd64 --file ./build/linux/Dockerfile -t $(LINUX_FULL_IMAGE_NAME) --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" --build-arg "FLUENT_BIT_VERSION=$(FLUENT_BIT_VERSION)" --metadata-file $(Build.ArtifactStagingDirectory)/linux/metadata.json --push # --cache-to type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector,mode=max --cache-from type=registry,ref=$(ACR_REGISTRY)$(ACR_REPOSITORY)/cache:prometheuscollector docker pull $(LINUX_FULL_IMAGE_NAME) docker system prune --all -f workingDirectory: $(Build.SourcesDirectory)/otelcollector/ From 3710e9f86a31bd3e75d5a03f39a9aac219d2884d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 14:33:07 -0800 Subject: [PATCH 20/47] remove other fluent-bit reference --- otelcollector/build/linux/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 67d823b54..7a13aa414 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -205,7 +205,7 @@ COPY --from=builder /usr/sbin/MetricsExtension /usr/sbin/MetricsExtension COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait COPY --from=builder /usr/bin/bash /usr/bin/bash COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox -COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit +#COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf COPY --from=builder /usr/sbin/crond /usr/sbin/crond COPY --from=builder /usr/bin/vim /usr/bin/vim From a0436cef4ea83d5ab30ac9de8d5ec62266b1e6d1 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 15:00:19 -0800 Subject: [PATCH 21/47] remove process printing, add metrics printing --- otelcollector/fluent-bit/src/cmetrics_decoder.go | 1 + otelcollector/fluent-bit/src/process_stats.go | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index 6f621265f..e08382182 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -147,6 +147,7 @@ func (cm CMetrics) String() string { func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}) int { for _, record := range records { cMetrics := ConvertRecordToCMetrics(record) + fmt.Printf("cMetrics: %v\n", cMetrics) for _, metric := range cMetrics.Metrics { for _, value := range metric.Values { metricTelemetryItem := appinsights.NewMetricTelemetry(metric.Meta.Opts.Name, value.Value) diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index b9083bb34..92fd42053 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -127,8 +127,6 @@ func (pa *ProcessAggregations) CollectStats() { p.memValues = append(p.memValues, float64(mem.RSS)) p.memValues.Sort() } - - fmt.Printf("cpu: %f, mem: %f\n", cpu, mem) } pa.mu.Unlock() From 9e4ce53336702b2b4ba0c63d5d002bbd5d572be6 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 15:03:02 -0800 Subject: [PATCH 22/47] settings --- otelcollector/fluent-bit/fluent-bit-daemonset.conf | 6 +----- otelcollector/fluent-bit/fluent-bit.conf | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.conf b/otelcollector/fluent-bit/fluent-bit-daemonset.conf index 5540619f1..6b6dce41f 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.conf +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.conf @@ -2,11 +2,7 @@ Flush 15 HTTP_Server Off Daemon Off - storage.path /var/opt/microsoft/state/flbstore/ - storage.sync normal - storage.checksum off - storage.backlog.mem_limit 10M - Log_Level info + Log_Level debug Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf Log_File /opt/fluent-bit/fluent-bit.log diff --git a/otelcollector/fluent-bit/fluent-bit.conf b/otelcollector/fluent-bit/fluent-bit.conf index a7581ed8a..43b34202e 100644 --- a/otelcollector/fluent-bit/fluent-bit.conf +++ b/otelcollector/fluent-bit/fluent-bit.conf @@ -2,10 +2,6 @@ Flush 15 HTTP_Server Off Daemon Off - storage.path /var/opt/microsoft/state/flbstore/ - storage.sync normal - storage.checksum off - storage.backlog.mem_limit 10M Log_Level debug Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf Log_File /opt/fluent-bit/fluent-bit.log From b29e4d098880438c6d5070d35b2c16a89de22222 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 15:49:41 -0800 Subject: [PATCH 23/47] remove db --- otelcollector/fluent-bit/fluent-bit-daemonset.conf | 14 -------------- otelcollector/fluent-bit/fluent-bit.conf | 12 ------------ 2 files changed, 26 deletions(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.conf b/otelcollector/fluent-bit/fluent-bit-daemonset.conf index 6b6dce41f..6abf4d266 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.conf +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.conf @@ -11,8 +11,6 @@ Name tail Tag prometheus.log.prometheuscollectorcontainer Path /var/log/containers/*prometheus-collector-node*prometheus-collector*.log,/var/log/containers/*ama-metrics-node*prometheus-collector*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -25,8 +23,6 @@ Name tail Tag prometheus.log.kubestatemetricscontainer Path /var/log/containers/ama-metrics-ksm*kube-system*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -39,8 +35,6 @@ Name tail Tag prometheus.log.targetallocator.tacontainer Path /var/log/containers/ama-metrics-*operator-targets*kube-system*targetallocator*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -52,8 +46,6 @@ Name tail Tag prometheus.log.targetallocator.configreader Path /var/log/containers/ama-metrics-*operator-targets*kube-system*config-reader*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -65,8 +57,6 @@ Name tail Tag prometheus.log.addontokenadapter Path /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -79,8 +69,6 @@ Name tail Tag prometheus.otelcollector Path /opt/microsoft/otelcollector/collector-log.txt - DB /var/opt/microsoft/state/otelcollector.db - DB.Sync Off Parser collector-parser Mem_Buf_Limit 1m Path_Key filepath @@ -92,8 +80,6 @@ Name tail Tag prometheus.metricsextension Path /MetricsExtensionConsoleDebugLog.log - DB /var/opt/microsoft/state/metricsextension.db - DB.Sync Off Parser me-parser Mem_Buf_Limit 1m Path_Key filepath diff --git a/otelcollector/fluent-bit/fluent-bit.conf b/otelcollector/fluent-bit/fluent-bit.conf index 43b34202e..0848269b0 100644 --- a/otelcollector/fluent-bit/fluent-bit.conf +++ b/otelcollector/fluent-bit/fluent-bit.conf @@ -12,8 +12,6 @@ Tag prometheus.log.prometheuscollectorcontainer Path /var/log/containers/*prometheus-collector*prometheus-collector*.log,/var/log/containers/*ama-metrics*prometheus-collector*.log Exclude_Path /var/log/containers/*prometheus-collector-node*.log,/var/log/containers/*ama-metrics-node*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -27,8 +25,6 @@ Tag prometheus.log.addontokenadapter Path /var/log/containers/*prometheus-collector*addon-token-adapter*.log,/var/log/containers/*ama-metrics*addon-token-adapter*.log Exclude_Path /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log - DB /var/opt/microsoft/state/prometheus-collector-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -41,8 +37,6 @@ Name tail Tag prometheus.otelcollector Path /opt/microsoft/otelcollector/collector-log.txt - DB /var/opt/microsoft/state/otelcollector.db - DB.Sync Off Parser collector-parser Mem_Buf_Limit 1m Path_Key filepath @@ -54,8 +48,6 @@ Name tail Tag prometheus.metricsextension Path /MetricsExtensionConsoleDebugLog.log - DB /var/opt/microsoft/state/metricsextension.db - DB.Sync Off Parser me-parser Mem_Buf_Limit 1m Path_Key filepath @@ -67,8 +59,6 @@ Name tail Tag prometheus.mdsd Path /opt/microsoft/linuxmonagent/mdsd.err - DB /var/opt/microsoft/state/mdsd.db - DB.Sync Off Parser mdsd-parser Mem_Buf_Limit 1m Path_Key filepath @@ -80,8 +70,6 @@ Tag prometheus.log.noconfiguration Path /dev/write-to-traces Read_from_Head true - DB /var/opt/microsoft/state/no-configuration.db - DB.Sync Off Parser no-config-parser Mem_Buf_Limit 1m Path_Key filepath From ed39e3cc914c8624818400406acf415e63d175b3 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 6 Dec 2024 16:36:35 -0800 Subject: [PATCH 24/47] remove printing metrics --- otelcollector/fluent-bit/src/cmetrics_decoder.go | 1 - 1 file changed, 1 deletion(-) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index e08382182..6f621265f 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -147,7 +147,6 @@ func (cm CMetrics) String() string { func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}) int { for _, record := range records { cMetrics := ConvertRecordToCMetrics(record) - fmt.Printf("cMetrics: %v\n", cMetrics) for _, metric := range cMetrics.Metrics { for _, value := range metric.Values { metricTelemetryItem := appinsights.NewMetricTelemetry(metric.Meta.Opts.Name, value.Value) From 827becebd30d8503452d6fe5f764cfdd8eb67a3e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 11:38:24 -0800 Subject: [PATCH 25/47] use metrics selector --- otelcollector/build/linux/Dockerfile | 2 +- otelcollector/fluent-bit/fluent-bit.yaml | 162 ++++++++++++++++++ .../fluent-bit/plugins_options.cmake | 2 +- .../fluent-bit/src/cmetrics_decoder.go | 1 + otelcollector/shared/helpers.go | 4 +- 5 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 otelcollector/fluent-bit/fluent-bit.yaml diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 7a13aa414..aee92f2a2 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -137,7 +137,7 @@ COPY --from=main-builder --chmod=777 /main/main.exe $tmpdir/main COPY ./scripts/*.sh $tmpdir/ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/ COPY ./telegraf/ $tmpdir/telegraf/ -COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ +COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/ COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit COPY --from=fluent-bit-binary-builder /usr/local/etc/fluent-bit /usr/local/etc/fluent-bit diff --git a/otelcollector/fluent-bit/fluent-bit.yaml b/otelcollector/fluent-bit/fluent-bit.yaml new file mode 100644 index 000000000..6e7596592 --- /dev/null +++ b/otelcollector/fluent-bit/fluent-bit.yaml @@ -0,0 +1,162 @@ +service: + flush: 15 + http_server: Off + daemon: Off + log_level: debug + parsers_file: /opt/fluent-bit/fluent-bit-parsers.conf + log_file: /opt/fluent-bit/fluent-bit.log + +pipeline: + inputs: + - name: tail + tag: prometheus.log.prometheuscollectorcontainer + path: /var/log/containers/*prometheus-collector*prometheus-collector*.log,/var/log/containers/*ama-metrics*prometheus-collector*.log + exclude_path: /var/log/containers/*prometheus-collector-node*.log,/var/log/containers/*ama-metrics-node*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.log.addontokenadapter + path: /var/log/containers/*prometheus-collector*addon-token-adapter*.log,/var/log/containers/*ama-metrics*addon-token-adapter*.log + exclude_path: /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.otelcollector + path: /opt/microsoft/otelcollector/collector-log.txt + parser: collector-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.metricsextension + path: /MetricsExtensionConsoleDebugLog.log + parser: me-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.mdsd + path: /opt/microsoft/linuxmonagent/mdsd.err + parser: mdsd-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.log.noconfiguration + path: /dev/write-to-traces + read_from_head: true + parser: no-config-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: prometheus_scrape + host: 0.0.0.0 + port: 8888 + tag: prometheus.metrics.otelcollector + metrics_path: /metrics + scrape_interval: 1m + processors: + metrics: + - name: metrics_selector + metric_name: /otelcol_processor_dropped_metric_points|otelcol_receiver_refused_metric_points|otelcol_receiver_accepted_metric_points|otelcol_exporter_sent_metric_points|otelcol_exporter_queue_size|otelcol_exporter_send_failed_metric_points|otelcol_process_memory_rss|otelcol_processor_batch_batch_send_size_bytes_sum|otelcol_processor_batch_batch_send_size_bytes_count/ + action: include + + - name: prometheus_scrape + host: 0.0.0.0 + port: 9090 + tag: prometheus.metrics.prometheus + metrics_path: /metrics + scrape_interval: 1m + processors: + metrics: + - name: metrics_selector + metric_name: /prometheus_sd_http_failures_total/ + action: include + + - name: prometheus_scrape + host: ama-metrics-operator-targets.kube-system.svc.cluster.local + port: 80 + tag: prometheus.metrics.targetallocator + metrics_path: /metrics + scrape_interval: 1m + processors: + metrics: + - name: metrics_selector + metric_name: /opentelemetry_allocator_targets|opentelemetry_allocator_collectors_discovered/ + action: include + + + filters: + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*ProcessedCount.* prometheus.log.processedcount false + + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*EtwEventsDropped.* prometheus.log.diagnosticheartbeat false + + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*EventsProcessedLastPeriod.* prometheus.log.eventsprocessedlastperiod false + + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*\(infinite\).* prometheus.log.infinitemetric false + + - name: rewrite_tag + match: prometheus.otelcollector + rule: $msg .*Exporting\sfailed.* prometheus.log.exportingfailed true + + - name: grep + match: prometheus.metricsextension + regex: level (Error|Fatal) + + - name: grep + match: prometheus.otelcollector + regex: level (error|fatal) + + - name: grep + match: prometheus.log.addontokenadapter + regex: stream stderr + + outputs: + - name: appinsights + match_regex: prometheus.log.*|prometheus.metrics.* + + - name: stdout + format: json_lines + json_date_key: time + match: prometheus.metricsextension + + - name: stdout + format: json_lines + json_date_key: false + match: prometheus.otelcollector + + - name: stdout + format: json_lines + json_date_key: time + match: prometheus.mdsd + + - name: stdout + format: json_lines + json_date_key: time + match: prometheus.log.noconfiguration \ No newline at end of file diff --git a/otelcollector/fluent-bit/plugins_options.cmake b/otelcollector/fluent-bit/plugins_options.cmake index 10d2d1f63..4ec1d8e1f 100644 --- a/otelcollector/fluent-bit/plugins_options.cmake +++ b/otelcollector/fluent-bit/plugins_options.cmake @@ -69,7 +69,7 @@ DEFINE_OPTION(FLB_IN_EBPF "Enable Linux eBPF input plugin" # ========== DEFINE_OPTION(FLB_PROCESSOR_CONTENT_MODIFIER "Enable content modifier processor" OFF) DEFINE_OPTION(FLB_PROCESSOR_LABELS "Enable metrics label manipulation processor" OFF) -DEFINE_OPTION(FLB_PROCESSOR_METRICS_SELECTOR "Enable metrics selector processor" OFF) +DEFINE_OPTION(FLB_PROCESSOR_METRICS_SELECTOR "Enable metrics selector processor" ON) DEFINE_OPTION(FLB_PROCESSOR_SQL "Enable SQL processor" OFF) DEFINE_OPTION(FLB_PROCESSOR_OPENTELEMETRY_ENVELOPE "Enable OpenTelemetry envelope processor" OFF) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index 6f621265f..e08382182 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -147,6 +147,7 @@ func (cm CMetrics) String() string { func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}) int { for _, record := range records { cMetrics := ConvertRecordToCMetrics(record) + fmt.Printf("cMetrics: %v\n", cMetrics) for _, metric := range cMetrics.Metrics { for _, value := range metric.Values { metricTelemetryItem := appinsights.NewMetricTelemetry(metric.Meta.Opts.Name, value.Value) diff --git a/otelcollector/shared/helpers.go b/otelcollector/shared/helpers.go index 3984fb270..cba41a30e 100644 --- a/otelcollector/shared/helpers.go +++ b/otelcollector/shared/helpers.go @@ -38,14 +38,14 @@ func DetermineConfigFiles(controllerType, clusterOverride string) (string, strin switch { case strings.ToLower(controllerType) == "replicaset": - fluentBitConfigFile = "/opt/fluent-bit/fluent-bit.conf" + fluentBitConfigFile = "/opt/fluent-bit/fluent-bit.yaml" if clusterOverride == "true" { meConfigFile = "/usr/sbin/me_internal.config" } else { meConfigFile = "/usr/sbin/me.config" } case os.Getenv("OS_TYPE") != "windows": - fluentBitConfigFile = "/opt/fluent-bit/fluent-bit.conf" + fluentBitConfigFile = "/opt/fluent-bit/fluent-bit-daemonset.conf" if clusterOverride == "true" { meConfigFile = "/usr/sbin/me_ds_internal.config" } else { From 6e69eca10f82c630d5591c61af2936a1b7237027 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 14:25:43 -0800 Subject: [PATCH 26/47] fix decoding to have ns and ss --- otelcollector/fluent-bit/src/cmetrics_decoder.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index e08382182..bf9bb57df 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -104,7 +104,8 @@ type CMetrics struct { Meta struct { AggregationType AggregationType `mapstructure:"aggregation_type"` Labels []string `mapstructure:"labels"` - Opts struct { + /* Formatted full qualified metric name is: namespace_subsystem_name */ + Opts struct { Desc string `mapstructure:"desc"` Name string `mapstructure:"name"` Namespace string `mapstructure:"ns"` @@ -126,11 +127,12 @@ func (cm CMetrics) String() string { var ret strings.Builder for _, metric := range cm.Metrics { - ret.WriteString(fmt.Sprintf("# HELP %s %s\n", metric.Meta.Opts.Name, metric.Meta.Opts.Desc)) - ret.WriteString(fmt.Sprintf("# TYPE %s %s\n", metric.Meta.Opts.Name, metric.Meta.Type)) + fullMetricName := fmt.Sprintf("%s_%s_%s", metric.Meta.Opts.Namespace, metric.Meta.Opts.Subsystem, metric.Meta.Opts.Name) + ret.WriteString(fmt.Sprintf("# HELP %s %s\n", fullMetricName, metric.Meta.Opts.Desc)) + ret.WriteString(fmt.Sprintf("# TYPE %s %s\n", fullMetricName, metric.Meta.Type)) for _, value := range metric.Values { - ret.WriteString(fmt.Sprintf("%s{", metric.Meta.Opts.Name)) + ret.WriteString(fmt.Sprintf("%s{", fullMetricName)) for i, labelName := range metric.Meta.Labels { ret.WriteString(fmt.Sprintf("%s=%s", labelName, value.Labels[i])) if i < len(metric.Meta.Labels)-1 { @@ -150,7 +152,10 @@ func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}) i fmt.Printf("cMetrics: %v\n", cMetrics) for _, metric := range cMetrics.Metrics { for _, value := range metric.Values { - metricTelemetryItem := appinsights.NewMetricTelemetry(metric.Meta.Opts.Name, value.Value) + metricTelemetryItem := appinsights.NewMetricTelemetry( + fmt.Sprintf("%s_%s_%s", metric.Meta.Opts.Namespace, metric.Meta.Opts.Subsystem, metric.Meta.Opts.Name), + value.Value, + ) for i, labelName := range metric.Meta.Labels { metricTelemetryItem.Properties[labelName] = fmt.Sprintf("%d", value.Labels[i]) } From 6be86804fb8962675acb3a7fd667f42aa332a0ef Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 14:41:17 -0800 Subject: [PATCH 27/47] change daemonset --- otelcollector/build/linux/Dockerfile | 2 +- .../fluent-bit/fluent-bit-daemonset.yaml | 167 ++++++++++++++++++ otelcollector/shared/helpers.go | 2 +- 3 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 otelcollector/fluent-bit/fluent-bit-daemonset.yaml diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index aee92f2a2..a65a1ca99 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -137,7 +137,7 @@ COPY --from=main-builder --chmod=777 /main/main.exe $tmpdir/main COPY ./scripts/*.sh $tmpdir/ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/ COPY ./telegraf/ $tmpdir/telegraf/ -COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ +COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-daemonset.yaml ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/ COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit COPY --from=fluent-bit-binary-builder /usr/local/etc/fluent-bit /usr/local/etc/fluent-bit diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml new file mode 100644 index 000000000..be89507bc --- /dev/null +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -0,0 +1,167 @@ +service: + flush: 15 + http_server: Off + daemon: Off + log_level: debug + parsers_file: /opt/fluent-bit/fluent-bit-parsers.conf + log_file: /opt/fluent-bit/fluent-bit.log + +pipeline: + inputs: + - name: tail + tag: prometheus.log.prometheuscollectorcontainer + path: /var/log/containers/*prometheus-collector-node*prometheus-collector*.log,/var/log/containers/*ama-metrics-node*prometheus-collector*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.log.kubestatemetricscontainer + path: /var/log/containers/ama-metrics-ksm*kube-system*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.log.targetallocator.tacontainer + path: /var/log/containers/ama-metrics-*operator-targets*kube-system*targetallocator*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + + - name: tail + tag: prometheus.log.targetallocator.configreader + path: /var/log/containers/ama-metrics-*operator-targets*kube-system*config-reader*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + + - name: tail + tag: prometheus.log.addontokenadapter + path: /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log + parser: cri + read_from_head: true + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.otelcollector + path: /opt/microsoft/otelcollector/collector-log.txt + parser: collector-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.metricsextension + path: /MetricsExtensionConsoleDebugLog.log + parser: me-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.mdsd + path: /opt/microsoft/linuxmonagent/mdsd.err + db: /var/opt/microsoft/state/mdsd.db + db_sync: Off + parser: mdsd-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: tail + tag: prometheus.log.noconfiguration + path: /dev/write-to-traces + read_from_head: true + db: /var/opt/microsoft/state/no-configuration.db + db_sync: Off + parser: no-config-parser + mem_buf_limit: 1m + path_key: filepath + skip_long_lines: On + ignore_older: 2m + + - name: prometheus_scrape + host: 0.0.0.0 + port: 8888 + tag: prometheus.metrics.otelcollector + metrics_spath: /metrics + scrape_interval: 1m + processors: + metrics: + - name: metrics_selector + metric_name: /otelcol_processor_dropped_metric_points|otelcol_receiver_refused_metric_points|otelcol_receiver_accepted_metric_points|otelcol_exporter_sent_metric_points|otelcol_exporter_queue_size|otelcol_exporter_send_failed_metric_points|otelcol_process_memory_rss|otelcol_processor_batch_batch_send_size_bytes_sum|otelcol_processor_batch_batch_send_size_bytes_count/ + action: include + +filters: + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*ProcessedCount.* prometheus.log.processedcount false + + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*EtwEventsDropped.* prometheus.log.diagnosticheartbeat false + + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*EventsProcessedLastPeriod.* prometheus.log.eventsprocessedlastperiod false + + - name: rewrite_tag + match: prometheus.metricsextension + rule: $message .*\(infinite\).* prometheus.log.infinitemetric false + + - name: rewrite_tag + match: prometheus.otelcollector + rule: $msg .*Exporting\sfailed.* prometheus.log.exportingfailed true + + - name: grep + match: prometheus.metricsextension + regex: level (Error|Fatal) + + - name: grep + match: prometheus.otelcollector + regex: level (error|fatal) + + - name: grep + match: prometheus.log.addontokenadapter + regex: stream stderr + +outputs: + - name: appinsights + match: prometheus.log.* + + - name: stdout + format: json_lines + json_date_key: time + match: prometheus.metricsextension + + - name: stdout + format: json_lines + json_date_key: false + match: prometheus.otelcollector + + - name: stdout + format: json_lines + json_date_key: time + match: prometheus.mdsd + + - name: stdout + format: json_lines + json_date_key: time + match: prometheus.log.noconfiguration diff --git a/otelcollector/shared/helpers.go b/otelcollector/shared/helpers.go index cba41a30e..5bd8b5f94 100644 --- a/otelcollector/shared/helpers.go +++ b/otelcollector/shared/helpers.go @@ -45,7 +45,7 @@ func DetermineConfigFiles(controllerType, clusterOverride string) (string, strin meConfigFile = "/usr/sbin/me.config" } case os.Getenv("OS_TYPE") != "windows": - fluentBitConfigFile = "/opt/fluent-bit/fluent-bit-daemonset.conf" + fluentBitConfigFile = "/opt/fluent-bit/fluent-bit-daemonset.yaml" if clusterOverride == "true" { meConfigFile = "/usr/sbin/me_ds_internal.config" } else { From 4ba0db883ab65e612d869434c1644cfb5481c25f Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 14:47:09 -0800 Subject: [PATCH 28/47] match exact telemetry name --- otelcollector/fluent-bit/src/cmetrics_decoder.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index bf9bb57df..a629e140c 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -146,14 +146,18 @@ func (cm CMetrics) String() string { return ret.String() } -func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}) int { +func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}, tag string) int { + telemetryPrefix := "prometheus" + if tag == "prometheus.metrics.targetallocator" { + telemetryPrefix = "target_allocator" + } for _, record := range records { cMetrics := ConvertRecordToCMetrics(record) fmt.Printf("cMetrics: %v\n", cMetrics) for _, metric := range cMetrics.Metrics { for _, value := range metric.Values { metricTelemetryItem := appinsights.NewMetricTelemetry( - fmt.Sprintf("%s_%s_%s", metric.Meta.Opts.Namespace, metric.Meta.Opts.Subsystem, metric.Meta.Opts.Name), + fmt.Sprintf("%s_%s_%s_%s", telemetryPrefix, metric.Meta.Opts.Namespace, metric.Meta.Opts.Subsystem, metric.Meta.Opts.Name), value.Value, ) for i, labelName := range metric.Meta.Labels { From 118cd887fd4d785ebb2548245640b9015538a122 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 14:57:53 -0800 Subject: [PATCH 29/47] include all changed files --- otelcollector/fluent-bit/fluent-bit-daemonset.yaml | 8 ++++++++ otelcollector/fluent-bit/fluent-bit.yaml | 8 ++++++++ otelcollector/fluent-bit/src/out_appinsights.go | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml index be89507bc..0ba250880 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -108,6 +108,14 @@ pipeline: - name: metrics_selector metric_name: /otelcol_processor_dropped_metric_points|otelcol_receiver_refused_metric_points|otelcol_receiver_accepted_metric_points|otelcol_exporter_sent_metric_points|otelcol_exporter_queue_size|otelcol_exporter_send_failed_metric_points|otelcol_process_memory_rss|otelcol_processor_batch_batch_send_size_bytes_sum|otelcol_processor_batch_batch_send_size_bytes_count/ action: include + - name: labels + delete: service_instance_id + - name: labels + delete: service_name + - name: labels + delete: service_version + - name: labels + delete: transport filters: - name: rewrite_tag diff --git a/otelcollector/fluent-bit/fluent-bit.yaml b/otelcollector/fluent-bit/fluent-bit.yaml index 6e7596592..cd1ef803c 100644 --- a/otelcollector/fluent-bit/fluent-bit.yaml +++ b/otelcollector/fluent-bit/fluent-bit.yaml @@ -78,6 +78,14 @@ pipeline: - name: metrics_selector metric_name: /otelcol_processor_dropped_metric_points|otelcol_receiver_refused_metric_points|otelcol_receiver_accepted_metric_points|otelcol_exporter_sent_metric_points|otelcol_exporter_queue_size|otelcol_exporter_send_failed_metric_points|otelcol_process_memory_rss|otelcol_processor_batch_batch_send_size_bytes_sum|otelcol_processor_batch_batch_send_size_bytes_count/ action: include + - name: labels + delete: service_instance_id + - name: labels + delete: service_name + - name: labels + delete: service_version + - name: labels + delete: transport - name: prometheus_scrape host: 0.0.0.0 diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index 0b7b936d2..8a2eff7f1 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -91,7 +91,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { case fluentbitExportingFailedTag: return RecordExportingFailed(records) case "prometheus.metrics.otelcollector", "prometheus.metrics.prometheus", "prometheus.metrics.targetallocator": - return SendPrometheusMetricsToAppInsights(records) + return SendPrometheusMetricsToAppInsights(records, incomingTag) default: // Error messages from metrics extension and otelcollector return PushLogErrorsToAppInsightsTraces(records, appinsights.Information, incomingTag) From 10c6d1bbd8a73a656f0f81dba61241ea19fdd8fd Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 15:04:35 -0800 Subject: [PATCH 30/47] completely remove telegraf --- otelcollector/build/linux/Dockerfile | 4 ++-- otelcollector/fluent-bit/src/out_appinsights.go | 2 +- otelcollector/scripts/setup.sh | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index a65a1ca99..9c6009e38 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -136,7 +136,7 @@ COPY --from=main-builder --chmod=777 /main/main.exe $tmpdir/main COPY ./scripts/*.sh $tmpdir/ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/ -COPY ./telegraf/ $tmpdir/telegraf/ +#COPY ./telegraf/ $tmpdir/telegraf/ COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-daemonset.yaml ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/ COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit @@ -206,7 +206,7 @@ COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait COPY --from=builder /usr/bin/bash /usr/bin/bash COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox #COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit -COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf +#COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf COPY --from=builder /usr/sbin/crond /usr/sbin/crond COPY --from=builder /usr/bin/vim /usr/bin/vim COPY --from=builder /usr/share/vim /usr/share/vim diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index 8a2eff7f1..5997f2767 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -49,7 +49,7 @@ func FLBPluginInit(ctx unsafe.Pointer) int { } // Collect, aggregate, and send CPU and Memory usage telemetry for the processes below - processAggregations := InitProcessAggregations([]string{"otelcollector", "MetricsExtension", "fluent-bit", "mdsd", "telegraf"}) + processAggregations := InitProcessAggregations([]string{"otelcollector", "MetricsExtension"}) processAggregations.Run() go PushMEProcessedAndReceivedCountToAppInsightsMetrics() diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index 3c9ebef03..cde21bdd6 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -51,9 +51,9 @@ cp -f $TMPDIR/envmdsd /etc/mdsd.d mkdir /opt/microsoft/linuxmonagent # Install telegraf -echo "Installing telegraf..." -sudo tdnf install telegraf-1.29.4 -y -sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.txt +# echo "Installing telegraf..." +# sudo tdnf install telegraf-1.29.4 -y +# sudo tdnf list installed | grep telegraf | awk '{print $2}' > telegrafversion.txt # Install fluent-bit # echo "Installing fluent-bit..." From ce3180ef65703e3fb89c9b1eaabb4412727d5ff5 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 15:11:12 -0800 Subject: [PATCH 31/47] fix daemonset spacing --- otelcollector/fluent-bit/fluent-bit-daemonset.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml index 0ba250880..f75836e6a 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -117,7 +117,7 @@ pipeline: - name: labels delete: transport -filters: + filters: - name: rewrite_tag match: prometheus.metricsextension rule: $message .*ProcessedCount.* prometheus.log.processedcount false @@ -150,7 +150,7 @@ filters: match: prometheus.log.addontokenadapter regex: stream stderr -outputs: + outputs: - name: appinsights match: prometheus.log.* From f250ff3915b1c92a17ffa8df028042743523a846 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 15:35:35 -0800 Subject: [PATCH 32/47] change labels to string --- otelcollector/fluent-bit/src/cmetrics_decoder.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index a629e140c..da1d17f36 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -161,7 +161,7 @@ func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}, t value.Value, ) for i, labelName := range metric.Meta.Labels { - metricTelemetryItem.Properties[labelName] = fmt.Sprintf("%d", value.Labels[i]) + metricTelemetryItem.Properties[labelName] = fmt.Sprintf("%s", value.Labels[i]) } TelemetryClient.Track(metricTelemetryItem) } From 6365603eefd665964d6320bc491f5eb6287a4731 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 15:43:59 -0800 Subject: [PATCH 33/47] remove db references --- otelcollector/fluent-bit/fluent-bit-daemonset.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml index f75836e6a..85e442bf1 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -77,8 +77,6 @@ pipeline: - name: tail tag: prometheus.mdsd path: /opt/microsoft/linuxmonagent/mdsd.err - db: /var/opt/microsoft/state/mdsd.db - db_sync: Off parser: mdsd-parser mem_buf_limit: 1m path_key: filepath @@ -89,8 +87,6 @@ pipeline: tag: prometheus.log.noconfiguration path: /dev/write-to-traces read_from_head: true - db: /var/opt/microsoft/state/no-configuration.db - db_sync: Off parser: no-config-parser mem_buf_limit: 1m path_key: filepath From 98c4fca9cfb3b01cbf5b4a0778b9767619a5bbeb Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 15:58:45 -0800 Subject: [PATCH 34/47] install newer fluent-bit --- otelcollector/build/windows/scripts/setup.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/build/windows/scripts/setup.ps1 b/otelcollector/build/windows/scripts/setup.ps1 index 6a0695534..43b2d4056 100644 --- a/otelcollector/build/windows/scripts/setup.ps1 +++ b/otelcollector/build/windows/scripts/setup.ps1 @@ -32,7 +32,7 @@ Write-Host ('Installing Fluent Bit'); try { # Keep version in sync with linux in setup.sh file # $fluentBitUri = 'https://github.com/microsoft/OMS-docker/releases/download/winakslogagent/td-agent-bit-1.4.0-win64.zip' - $fluentBitUri = 'https://releases.fluentbit.io/2.1/fluent-bit-2.1.10-win64.zip' + $fluentBitUri = 'https://releases.fluentbit.io/3.0/fluent-bit-3.0.7-win64.zip' Invoke-WebRequest -Uri $fluentBitUri -OutFile /installation/fluent-bit.zip Expand-Archive -Path /installation/fluent-bit.zip -Destination /installation/fluent-bit Move-Item -Path /installation/fluent-bit/*/bin/* -Destination /opt/fluent-bit/bin/ -ErrorAction SilentlyContinue From 61145b91a2386bb17c5d5f21b7e01ef41c91fc97 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 16:32:30 -0800 Subject: [PATCH 35/47] typo fix --- otelcollector/fluent-bit/fluent-bit-daemonset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml index 85e442bf1..532a7f8cf 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -97,7 +97,7 @@ pipeline: host: 0.0.0.0 port: 8888 tag: prometheus.metrics.otelcollector - metrics_spath: /metrics + metrics_path: /metrics scrape_interval: 1m processors: metrics: From 600cd5c2cb36e96a2a6864f0121c32801ac0f23a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 16:38:25 -0800 Subject: [PATCH 36/47] update to build windows --- .pipelines/azure-pipeline-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml index b523e57e2..a428f870c 100644 --- a/.pipelines/azure-pipeline-build.yml +++ b/.pipelines/azure-pipeline-build.yml @@ -24,7 +24,7 @@ variables: NODE_EXPORTER_IMAGE: 'mcr.microsoft.com/oss/prometheus/node-exporter:v1.6.0' IS_PR: $[eq(variables['Build.Reason'], 'PullRequest')] IS_MAIN_BRANCH: $[eq(variables['Build.SourceBranchName'], 'main')] - BUILD_WINDOWS: false + BUILD_WINDOWS: true Codeql.Enabled: true GOLANG_VERSION: '1.22.7' FLUENT_BIT_VERSION: '3.2.2' From 0a7116dee16525435800367da454f40d2fabd611 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 16:57:14 -0800 Subject: [PATCH 37/47] add all env vars --- otelcollector/fluent-bit/fluent-bit-windows.conf | 8 -------- otelcollector/fluent-bit/src/process_stats.go | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/otelcollector/fluent-bit/fluent-bit-windows.conf b/otelcollector/fluent-bit/fluent-bit-windows.conf index 6e7de7d51..bbc24e70a 100644 --- a/otelcollector/fluent-bit/fluent-bit-windows.conf +++ b/otelcollector/fluent-bit/fluent-bit-windows.conf @@ -11,8 +11,6 @@ Name tail Tag prometheus.log.prometheuscollectorcontainer Path C:\\var\\log\\containers\\*ama-metrics*prometheus-collector*.log - DB C:\\opt\\state\\prometheus-collector-win-ai.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -25,8 +23,6 @@ Name tail Tag prometheus.log.addontokenadapter Path C:\\var\\log\\containers\\*ama-metrics*addon-token-adapter-win*.log - DB C:\\opt\\state\\addon-token-adapter.db - DB.Sync Off Parser cri Read_from_Head true Mem_Buf_Limit 1m @@ -39,8 +35,6 @@ Name tail Tag prometheus.otelcollector Path C:\\opt\\microsoft\\otelcollector\\collector-log.txt - DB C:\\opt\\state\\otelcollector.db - DB.Sync Off Parser collector-parser Mem_Buf_Limit 1m Path_Key filepath @@ -52,8 +46,6 @@ Name tail Tag prometheus.metricsextension Path C:\\Users\\ContainerAdministrator\\Documents\\MetricsExtensionConsoleDebugLog.log - DB C:\\opt\\state\\metricsextension.db - DB.Sync Off Parser me-parser Mem_Buf_Limit 1m Path_Key filepath diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 92fd42053..750307572 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -47,6 +47,13 @@ var replicasetDimensionsNameToEnvVar = map[string]string{ "tadapterf": "tokenadapterUnhealthyAfterSecs", "setGlobalSettings": "AZMON_SET_GLOBAL_SETTINGS", "globalSettingsConfigured": "AZMON_GLOBAL_SETTINGS_CONFIGURED", + "calias": "AZMON_CLUSTER_ALIAS", + "clabel": "AZMON_CLUSTER_LABEL", + "mip": "MINIMAL_INGESTION_PROFILE", + "operatormodel": "AZMON_OPERATOR_ENABLED", + "operatormodelcfgmapsetting": "AZMON_OPERATOR_ENABLED_CFG_MAP_SETTING", + "operatormodelchartsetting": "AZMON_OPERATOR_ENABLED_CHART_SETTING", + "collectorHpaEnabled": "AZMON_COLLECTOR_HPA_ENABLED", } var daemonsetDimensionsNameToEnvVar = map[string]string{ From 23b1f4062470c77b3b5a21127f4f2c7dab7de8c0 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 17:09:31 -0800 Subject: [PATCH 38/47] add in missing telemetry --- otelcollector/fluent-bit/src/telemetry.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/otelcollector/fluent-bit/src/telemetry.go b/otelcollector/fluent-bit/src/telemetry.go index b6e221f30..fc10e39df 100644 --- a/otelcollector/fluent-bit/src/telemetry.go +++ b/otelcollector/fluent-bit/src/telemetry.go @@ -727,6 +727,15 @@ func PushMEProcessedAndReceivedCountToAppInsightsMetrics() { if AcstorMetricsExporterKeepListRegex != "" { metric.Properties["AcstorMetricsExporterRegex"] = AcstorMetricsExporterKeepListRegex } + if NetworkObservabilityCiliumKeepListRegex != "" { + metric.Properties["NetworkObservabilityCiliumRegex"] = NetworkObservabilityCiliumKeepListRegex + } + if NetworkObservabilityHubbleKeepListRegex != "" { + metric.Properties["NetworkObservabilityHubbleRegex"] = NetworkObservabilityHubbleKeepListRegex + } + if NetworkObservabilityRetinaKeepListRegex != "" { + metric.Properties["NetworkObservabilityRetinaRegex"] = NetworkObservabilityRetinaKeepListRegex + } if KubeletScrapeInterval != "" { metric.Properties["KubeletScrapeInterval"] = KubeletScrapeInterval @@ -770,6 +779,15 @@ func PushMEProcessedAndReceivedCountToAppInsightsMetrics() { if AcstorMetricsExporterScrapeInterval != "" { metric.Properties["AcstorMetricsExporterScrapeInterval"] = AcstorMetricsExporterScrapeInterval } + if NetworkObservabilityCiliumScrapeInterval != "" { + metric.Properties["NetworkObservabilityCiliumScrapeInterval"] = NetworkObservabilityCiliumScrapeInterval + } + if NetworkObservabilityHubbleScrapeInterval != "" { + metric.Properties["NetworkObservabilityHubbleScrapeInterval"] = NetworkObservabilityHubbleScrapeInterval + } + if NetworkObservabilityRetinaScrapeInterval != "" { + metric.Properties["NetworkObservabilityRetinaScrapeInterval"] = NetworkObservabilityRetinaScrapeInterval + } } TelemetryClient.Track(metric) From c8f74a6f07b359df34b2780063ea30718fccbb87 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 12 Dec 2024 17:31:06 -0800 Subject: [PATCH 39/47] turn on label processor --- otelcollector/fluent-bit/plugins_options.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/plugins_options.cmake b/otelcollector/fluent-bit/plugins_options.cmake index 4ec1d8e1f..262283b78 100644 --- a/otelcollector/fluent-bit/plugins_options.cmake +++ b/otelcollector/fluent-bit/plugins_options.cmake @@ -68,7 +68,7 @@ DEFINE_OPTION(FLB_IN_EBPF "Enable Linux eBPF input plugin" # Processors # ========== DEFINE_OPTION(FLB_PROCESSOR_CONTENT_MODIFIER "Enable content modifier processor" OFF) -DEFINE_OPTION(FLB_PROCESSOR_LABELS "Enable metrics label manipulation processor" OFF) +DEFINE_OPTION(FLB_PROCESSOR_LABELS "Enable metrics label manipulation processor" ON) DEFINE_OPTION(FLB_PROCESSOR_METRICS_SELECTOR "Enable metrics selector processor" ON) DEFINE_OPTION(FLB_PROCESSOR_SQL "Enable SQL processor" OFF) DEFINE_OPTION(FLB_PROCESSOR_OPENTELEMETRY_ENVELOPE "Enable OpenTelemetry envelope processor" OFF) From 280f86ce8a89ed1ac60fd68e85cab7754c863c3e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 13 Dec 2024 14:36:00 -0800 Subject: [PATCH 40/47] accept for daemonset too --- otelcollector/fluent-bit/fluent-bit-daemonset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml index 532a7f8cf..21b23bee8 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -148,7 +148,7 @@ pipeline: outputs: - name: appinsights - match: prometheus.log.* + match: prometheus.log.*|prometheus.metrics.* - name: stdout format: json_lines From 9f3bdbc285cfb75e185754132b465f51a0454561 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 13 Dec 2024 15:37:59 -0800 Subject: [PATCH 41/47] fix windows pid finder --- otelcollector/fluent-bit/src/out_appinsights.go | 7 ++++++- otelcollector/fluent-bit/src/process_stats.go | 4 ++-- otelcollector/fluent-bit/src/utils.go | 10 ++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index 5997f2767..04e69d010 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -49,7 +49,12 @@ func FLBPluginInit(ctx unsafe.Pointer) int { } // Collect, aggregate, and send CPU and Memory usage telemetry for the processes below - processAggregations := InitProcessAggregations([]string{"otelcollector", "MetricsExtension"}) + osType := os.Getenv("OSTYPE") + processNames := []string{"otelcollector", "MetricsExtension"} + if osType == "windows" { + processNames = []string{"otelcollector", "MetricsExtension.Native"} + } + processAggregations := InitProcessAggregations(processNames, osType) processAggregations.Run() go PushMEProcessedAndReceivedCountToAppInsightsMetrics() diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 750307572..6042a964f 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -78,12 +78,12 @@ type ProcessAggregations struct { mu sync.Mutex } -func InitProcessAggregations(processName []string) *ProcessAggregations { +func InitProcessAggregations(processName []string, os string) *ProcessAggregations { fmt.Printf("Starting process aggregations") processAggregationsMap := make(map[string]*Process) for _, processName := range processName { - pids, err := findPIDFromExe(processName) + pids, err := findPIDFromExe(processName, os) if err != nil || len(pids) == 0 { fmt.Printf("Error getting PID for process %s\n", processName) continue diff --git a/otelcollector/fluent-bit/src/utils.go b/otelcollector/fluent-bit/src/utils.go index bf7b1161c..796aa9256 100644 --- a/otelcollector/fluent-bit/src/utils.go +++ b/otelcollector/fluent-bit/src/utils.go @@ -34,8 +34,14 @@ func ReadFileContents(fullPathToFileName string) (string, error) { } // From telegraf codebase -func findPIDFromExe(process string) ([]int32, error) { - buf, err := exec.Command("pgrep", process).Output() +func findPIDFromExe(process string, os string) ([]int32, error) { + var command *exec.Cmd + if os == "windows" { + command = exec.Command("powershell", "-Command", fmt.Sprintf("Get-Process -Name %s | Select-Object -Expand Id", process)) + } else { + command = exec.Command("pgrep", process) + } + buf, err := command.Output() if err != nil { return nil, fmt.Errorf("error running %w", err) } From 667c5b555d61dbc6059796cb7c84a4856341ed2b Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 13 Dec 2024 16:00:23 -0800 Subject: [PATCH 42/47] cleanup --- otelcollector/build/linux/Dockerfile | 8 +- otelcollector/build/windows/Dockerfile | 1 - .../fluent-bit/fluent-bit-daemonset.conf | 185 ------------- otelcollector/fluent-bit/fluent-bit.conf | 173 ------------ .../fluent-bit/src/cmetrics_decoder.go | 1 - otelcollector/fluent-bit/src/process_stats.go | 2 +- otelcollector/fluent-bit/src/utils.go | 1 - .../telegraf-prometheus-collector-ds.conf | 194 ------------- ...egraf-prometheus-collector-ta-enabled.conf | 254 ------------------ ...telegraf-prometheus-collector-windows.conf | 178 ------------ .../telegraf-prometheus-collector.conf | 218 --------------- 11 files changed, 2 insertions(+), 1213 deletions(-) delete mode 100644 otelcollector/fluent-bit/fluent-bit-daemonset.conf delete mode 100644 otelcollector/fluent-bit/fluent-bit.conf delete mode 100644 otelcollector/telegraf/telegraf-prometheus-collector-ds.conf delete mode 100644 otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf delete mode 100644 otelcollector/telegraf/telegraf-prometheus-collector-windows.conf delete mode 100644 otelcollector/telegraf/telegraf-prometheus-collector.conf diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 9c6009e38..44ccff2be 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -136,7 +136,6 @@ COPY --from=main-builder --chmod=777 /main/main.exe $tmpdir/main COPY ./scripts/*.sh $tmpdir/ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/ -#COPY ./telegraf/ $tmpdir/telegraf/ COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-daemonset.yaml ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/ COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit @@ -205,8 +204,6 @@ COPY --from=builder /usr/sbin/MetricsExtension /usr/sbin/MetricsExtension COPY --from=builder /usr/bin/inotifywait /usr/bin/inotifywait COPY --from=builder /usr/bin/bash /usr/bin/bash COPY --from=builder /usr/sbin/busybox /usr/sbin/busybox -#COPY --from=builder /usr/bin/fluent-bit /usr/bin/fluent-bit -#COPY --from=builder /usr/bin/telegraf /usr/bin/telegraf COPY --from=builder /usr/sbin/crond /usr/sbin/crond COPY --from=builder /usr/bin/vim /usr/bin/vim COPY --from=builder /usr/share/vim /usr/share/vim @@ -238,10 +235,7 @@ COPY --from=builder /lib/libboost_filesystem.so.1.76.0 /lib/libcpprest.so.2.10 COPY --from=builder /lib64/libuuid.so.1 /lib64 # fluent-bit dependencies # libssl.so.1.1 & libcrypto.so.1.1 are already available with openssl in distroless and copying them over causes FIPS HMAC verification failures -# COPY --from=builder /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libcurl.so.4 /lib/libm.so.6 /lib/libz.so.1 /lib/libzstd.so.1 /lib/libsasl2.so.3 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libnghttp2.so.14 /lib/libssh2.so.1 /lib/libgssapi_krb5.so.2 /lib/libresolv.so.2 /lib/libgpg-error.so.0 /usr/lib/libkrb5.so.3 /usr/lib/libk5crypto.so.3 /usr/lib/libcom_err.so.2 /usr/lib/libkrb5support.so.0 /lib/ -COPY --from=fluent-bit-binary-builder /lib/libluajit-5.1.so.2 /lib/libssl.so.1.1 /lib/libcrypto.so.1.1 /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/libzstd.so.1 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libgpg-error.so.0 /lib/ -# telegraf dependencies -COPY --from=builder /lib/libc.so.6 /lib/ +COPY --from=fluent-bit-binary-builder /lib/libluajit-5.1.so.2 /lib/libyaml-0.so.2 /lib/libsystemd.so.0 /lib/libgcc_s.so.1 /lib/libc.so.6 /lib/liblzma.so.5 /lib/libzstd.so.1 /lib/liblz4.so.1 /lib/libcap.so.2 /lib/libgcrypt.so.20 /lib/libgpg-error.so.0 /lib/ # mdsd dependencies COPY --from=builder /usr/lib/libdl.so.2 /usr/lib/librt.so.1 /usr/lib/libpthread.so.0 /usr/lib/libm.so.6 /usr/lib/libstdc++.so.6 /usr/lib/libgcc_s.so.1 /usr/lib/ # logrotate dependencies diff --git a/otelcollector/build/windows/Dockerfile b/otelcollector/build/windows/Dockerfile index d1f1fafb4..ec47162f8 100644 --- a/otelcollector/build/windows/Dockerfile +++ b/otelcollector/build/windows/Dockerfile @@ -30,7 +30,6 @@ COPY ./configmapparser/default-prom-configs/*.yml $tmpdir/microsoft/otelcollecto COPY ./opentelemetry-collector-builder/otelcollector.exe ./opentelemetry-collector-builder/collector-config-default.yml ./opentelemetry-collector-builder/collector-config-template.yml $tmpdir/microsoft/otelcollector/ COPY ./prom-config-validator-builder/promconfigvalidator.exe $tmpdir/ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_win.config ./metricextension/me_ds_internal.config ./metricextension/me_ds_internal_win.config $tmpdir/metricextension/ -COPY ./telegraf/telegraf-prometheus-collector-windows.conf $tmpdir/telegraf/ COPY ./fluent-bit/fluent-bit-windows.conf $tmpdir/fluent-bit/ COPY ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY ./fluent-bit/src/out_appinsights.so $tmpdir/fluent-bit/bin/ diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.conf b/otelcollector/fluent-bit/fluent-bit-daemonset.conf deleted file mode 100644 index 6abf4d266..000000000 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.conf +++ /dev/null @@ -1,185 +0,0 @@ -[SERVICE] - Flush 15 - HTTP_Server Off - Daemon Off - Log_Level debug - Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf - Log_File /opt/fluent-bit/fluent-bit.log - -# prometheus-collector container logs -[INPUT] - Name tail - Tag prometheus.log.prometheuscollectorcontainer - Path /var/log/containers/*prometheus-collector-node*prometheus-collector*.log,/var/log/containers/*ama-metrics-node*prometheus-collector*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# kube-state-metrics container logs -[INPUT] - Name tail - Tag prometheus.log.kubestatemetricscontainer - Path /var/log/containers/ama-metrics-ksm*kube-system*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# targetallocator targetallocator container logs -[INPUT] - Name tail - Tag prometheus.log.targetallocator.tacontainer - Path /var/log/containers/ama-metrics-*operator-targets*kube-system*targetallocator*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - -# targetallocator config-reader container logs -[INPUT] - Name tail - Tag prometheus.log.targetallocator.configreader - Path /var/log/containers/ama-metrics-*operator-targets*kube-system*config-reader*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - -# addon-token-adapter container logs -[INPUT] - Name tail - Tag prometheus.log.addontokenadapter - Path /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# otelcollector is logging at warn level -[INPUT] - Name tail - Tag prometheus.otelcollector - Path /opt/microsoft/otelcollector/collector-log.txt - Parser collector-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# metrics extension logs at info level to be able to get processed metrics count -[INPUT] - Name tail - Tag prometheus.metricsextension - Path /MetricsExtensionConsoleDebugLog.log - Parser me-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# Only tailing mdsd error log file -[INPUT] - Name tail - Tag prometheus.mdsd - Path /opt/microsoft/linuxmonagent/mdsd.err - DB /var/opt/microsoft/state/mdsd.db - DB.Sync Off - Parser mdsd-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -[INPUT] - Name tail - Tag prometheus.log.noconfiguration - Path /dev/write-to-traces - Read_from_Head true - DB /var/opt/microsoft/state/no-configuration.db - DB.Sync Off - Parser no-config-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# Send log lines that contain the telemetry we want to a different tag -# to then send to customMetrics table -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*ProcessedCount.* prometheus.log.processedcount false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*EtwEventsDropped.* prometheus.log.diagnosticheartbeat false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*EventsProcessedLastPeriod.* prometheus.log.eventsprocessedlastperiod false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*\(infinite\).* prometheus.log.infinitemetric false - -[FILTER] - Name rewrite_tag - Match prometheus.otelcollector - Rule $msg .*Exporting\sfailed.* prometheus.log.exportingfailed true - -# Send ME errors to stdout of container -[FILTER] - name grep - match prometheus.metricsextension - regex level (Error|Fatal) - -# Send otelcollector errors to stdout of container -[FILTER] - name grep - match prometheus.otelcollector - regex level (error|fatal) - -[FILTER] - Name grep - Match prometheus.log.addontokenadapter - regex stream stderr - -[OUTPUT] - Name appinsights - Match prometheus.log.* - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.metricsextension - -[OUTPUT] - Name stdout - Format json_lines - json_date_key false - Match prometheus.otelcollector - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.mdsd - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.log.noconfiguration diff --git a/otelcollector/fluent-bit/fluent-bit.conf b/otelcollector/fluent-bit/fluent-bit.conf deleted file mode 100644 index 0848269b0..000000000 --- a/otelcollector/fluent-bit/fluent-bit.conf +++ /dev/null @@ -1,173 +0,0 @@ -[SERVICE] - Flush 15 - HTTP_Server Off - Daemon Off - Log_Level debug - Parsers_File /opt/fluent-bit/fluent-bit-parsers.conf - Log_File /opt/fluent-bit/fluent-bit.log - -# prometheus-collector container logs -[INPUT] - Name tail - Tag prometheus.log.prometheuscollectorcontainer - Path /var/log/containers/*prometheus-collector*prometheus-collector*.log,/var/log/containers/*ama-metrics*prometheus-collector*.log - Exclude_Path /var/log/containers/*prometheus-collector-node*.log,/var/log/containers/*ama-metrics-node*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# token-adapter container logs -[INPUT] - Name tail - Tag prometheus.log.addontokenadapter - Path /var/log/containers/*prometheus-collector*addon-token-adapter*.log,/var/log/containers/*ama-metrics*addon-token-adapter*.log - Exclude_Path /var/log/containers/*prometheus-collector-node*addon-token-adapter*.log,/var/log/containers/*ama-metrics-node*addon-token-adapter*.log - Parser cri - Read_from_Head true - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# otelcollector is logging at warn level -[INPUT] - Name tail - Tag prometheus.otelcollector - Path /opt/microsoft/otelcollector/collector-log.txt - Parser collector-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# metrics extension logs at info level to be able to get processed metrics count -[INPUT] - Name tail - Tag prometheus.metricsextension - Path /MetricsExtensionConsoleDebugLog.log - Parser me-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -# Only tailing mdsd error log file -[INPUT] - Name tail - Tag prometheus.mdsd - Path /opt/microsoft/linuxmonagent/mdsd.err - Parser mdsd-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -[INPUT] - Name tail - Tag prometheus.log.noconfiguration - Path /dev/write-to-traces - Read_from_Head true - Parser no-config-parser - Mem_Buf_Limit 1m - Path_Key filepath - Skip_Long_Lines On - Ignore_Older 2m - -[INPUT] - name prometheus_scrape - host 0.0.0.0 - port 8888 - tag prometheus.metrics.otelcollector - metrics_path /metrics - scrape_interval 1m - -[INPUT] - name prometheus_scrape - host 0.0.0.0 - port 9090 - tag prometheus.metrics.prometheus - metrics_path /metrics - scrape_interval 1m - -[INPUT] - name prometheus_scrape - host ama-metrics-operator-targets.kube-system.svc.cluster.local - port 80 - tag prometheus.metrics.targetallocator - metrics_path /metrics - scrape_interval 1m - -# Send log lines that contain the telemetry we want to a different tag -# to then send to customMetrics table -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*ProcessedCount.* prometheus.log.processedcount false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*EtwEventsDropped.* prometheus.log.diagnosticheartbeat false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*EventsProcessedLastPeriod.* prometheus.log.eventsprocessedlastperiod false - -[FILTER] - Name rewrite_tag - Match prometheus.metricsextension - Rule $message .*\(infinite\).* prometheus.log.infinitemetric false - -[FILTER] - Name rewrite_tag - Match prometheus.otelcollector - Rule $msg .*Exporting\sfailed.* prometheus.log.exportingfailed true - -# Send ME errors to stdout of container -[FILTER] - name grep - match prometheus.metricsextension - regex level (Error|Fatal) - -# Send otelcollector errors to stdout of container -[FILTER] - name grep - match prometheus.otelcollector - regex level (error|fatal) - -[FILTER] - Name grep - Match prometheus.log.addontokenadapter - regex stream stderr - -[OUTPUT] - Name appinsights - Match_regex prometheus.log.*|prometheus.metrics.* - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.metricsextension - -[OUTPUT] - Name stdout - Format json_lines - json_date_key false - Match prometheus.otelcollector - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.mdsd - -[OUTPUT] - Name stdout - Format json_lines - json_date_key time - Match prometheus.log.noconfiguration \ No newline at end of file diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index da1d17f36..65ca4c9f7 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -153,7 +153,6 @@ func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}, t } for _, record := range records { cMetrics := ConvertRecordToCMetrics(record) - fmt.Printf("cMetrics: %v\n", cMetrics) for _, metric := range cMetrics.Metrics { for _, value := range metric.Values { metricTelemetryItem := appinsights.NewMetricTelemetry( diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 6042a964f..3122909d1 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -79,7 +79,7 @@ type ProcessAggregations struct { } func InitProcessAggregations(processName []string, os string) *ProcessAggregations { - fmt.Printf("Starting process aggregations") + fmt.Println("Starting process aggregations") processAggregationsMap := make(map[string]*Process) for _, processName := range processName { diff --git a/otelcollector/fluent-bit/src/utils.go b/otelcollector/fluent-bit/src/utils.go index 796aa9256..fc84eb1eb 100644 --- a/otelcollector/fluent-bit/src/utils.go +++ b/otelcollector/fluent-bit/src/utils.go @@ -49,7 +49,6 @@ func findPIDFromExe(process string, os string) ([]int32, error) { fields := strings.Fields(out) - fmt.Printf("fields: %v\n", fields) pids := make([]int32, 0, len(fields)) for _, field := range fields { pid, err := strconv.ParseInt(field, 10, 32) diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf deleted file mode 100644 index 75be1e653..000000000 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ds.conf +++ /dev/null @@ -1,194 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - - -# Global tags can be specified here in key="value" format. -[global_tags] - #Below are entirely used for telemetry - agentversion = "$AGENT_VERSION" - cluster = "$customResourceId" - calias = "$AZMON_CLUSTER_ALIAS" - clabel = "$AZMON_CLUSTER_LABEL" - Region = "$AKSREGION" - computer = "$NODE_NAME" - nodeip = "$NODE_IP" - mode = "$MODE" - winmode = "$WINMODE" - macmode = "$MAC" - controllertype = "$CONTROLLER_TYPE" - defaultmetricaccountname = "$AZMON_DEFAULT_METRIC_ACCOUNT_NAME" - namespace = "$POD_NAMESPACE" - podname = "$POD_NAME" - ostype = "$OS_TYPE" - mip="$MINIMAL_INGESTION_PROFILE" - setGlobalSettings = "$AZMON_SET_GLOBAL_SETTINGS" - globalSettingsConfigured = "$AZMON_GLOBAL_SETTINGS_CONFIGURED" - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = false - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "30s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "15s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "15s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = true - ## Specify the log file name. The empty string means to log to stderr. - logfile = "" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = true - - -############################################################################### -# OUTPUT PLUGINS # -############################################################################### - -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" - - ## Timeout for closing (default: 5s). - # timeout = "5s" - - ## Enable additional diagnostic logging. - # enable_diagnostic_logging = false - -############################################################################### -# PROCESSOR PLUGINS # -############################################################################### -[[processors.converter]] - [processors.converter.fields] - float = ["*"] - -############################################################################### -# AGGREGATOR PLUGINS # -############################################################################### -[[aggregators.quantile]] - period = "5m" - drop_original = true - quantiles = [0.50,0.95] - algorithm = "t-digest" - compression = 100.0 - namepass = ["otelcollector", "metricsextension"] - -# Keep the aggregate basicstats of each metric passing through. -[[aggregators.basicstats]] - namepass = ["opentelemetry_allocator_targets"] - - ## The period on which to flush & clear the aggregator. - period = "5m" - - ## If true, the original metric will be dropped by the - ## aggregator and will not get sent to the output plugins. - drop_original = true - - ## Configures which basic stats to push as fields - stats = ["count"] - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -# Read metrics about cpu usage -#[[inputs.cpu]] - ## Whether to report per-cpu stats or not -# percpu = false - ## Whether to report total system cpu stats or not -# totalcpu = true - ## If true, collect raw CPU time metrics. -# collect_cpu_time = false - ## If true, compute and report the sum of all non-idle CPU states. -# report_active = true -# fieldpass = ["usage_active","cluster","node","host","device"] -# taginclude = ["cluster","cpu","node"] - -# Read metrics about memory usage -#[[inputs.mem]] -# fieldpass = ["used_percent", "cluster", "node","host","device"] -# taginclude = ["cluster","node"] - -# [[inputs.procstat]] -# exe = "otelcollector" -# interval = "10s" -# pid_finder = "pgrep" -# pid_tag = true -# name_override = "otelcollector" -# fieldpass = ["cpu_usage", "memory_rss"] -# [inputs.procstat.tags] -# # Computer = "$NODE_NAME" -# # NodeIp = "$NODE_IP" -# cpulimit = "$CONTAINER_CPU_LIMIT" -# memlimit = "$CONTAINER_MEMORY_LIMIT" -# debugmodeenabled = "$DEBUG_MODE_ENABLED" -# tadapterh="$tokenadapterHealthyAfterSecs" -# tadapterf="$tokenadapterUnhealthyAfterSecs" - -# [[inputs.procstat]] -# exe = "MetricsExtension" -# interval = "10s" -# pid_finder = "pgrep" -# pid_tag = true -# name_override = "metricsextension" -# fieldpass = ["cpu_usage", "memory_rss"] - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:8888/metrics"] - fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] - tagexclude = ["service_instance_id"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf deleted file mode 100644 index 0641c14e5..000000000 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf +++ /dev/null @@ -1,254 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - - -# Global tags can be specified here in key="value" format. -[global_tags] - #Below are entirely used for telemetry - agentversion = "$AGENT_VERSION" - cluster = "$customResourceId" - calias = "$AZMON_CLUSTER_ALIAS" - clabel = "$AZMON_CLUSTER_LABEL" - Region = "$AKSREGION" - computer = "$NODE_NAME" - nodeip = "$NODE_IP" - mode = "$MODE" - winmode = "$WINMODE" - macmode = "$MAC" - controllertype = "$CONTROLLER_TYPE" - defaultmetricaccountname = "$AZMON_DEFAULT_METRIC_ACCOUNT_NAME" - namespace = "$POD_NAMESPACE" - podname = "$POD_NAME" - ostype = "$OS_TYPE" - mip = "$MINIMAL_INGESTION_PROFILE" - operatormodel = "$AZMON_OPERATOR_ENABLED" - operatormodelcfgmapsetting = "$AZMON_OPERATOR_ENABLED_CFG_MAP_SETTING" - operatormodelchartsetting = "$AZMON_OPERATOR_ENABLED_CHART_SETTING" - collectorHpaEnabled = "$AZMON_COLLECTOR_HPA_ENABLED" - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = false - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "30s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "15s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "15s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = true - ## Specify the log file name. The empty string means to log to stderr. - logfile = "" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = true - - -############################################################################### -# OUTPUT PLUGINS # -############################################################################### - -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" - - # Timeout for closing (default: 5s). - timeout = "5s" - - # Enable additional diagnostic logging. - enable_diagnostic_logging = false - -# Send telegraf metrics to file(s) -# [[outputs.file]] -# ## Files to write to, "stdout" is a specially handled file. -# files = ["stdout", "metrics.out"] - - -############################################################################### -# PROCESSOR PLUGINS # -############################################################################### -[[processors.converter]] - [processors.converter.fields] - float = ["*"] - -# Transforms tag and field values as well as measurement, tag and field names with regex pattern -# [[processors.regex]] -# namepass = ["target_allocator"] - -# # Tag and field conversions defined in a separate sub-tables -# [[processors.regex.tags]] -# ## Tag to change, "*" will change every tag -# key = "job_name" -# ## Regular expression to match on a tag value - -# # Group 1: match a string starting with "podMonitor/" or "serviceMonitor/" -# # The format of a CR name is podMonitor// or serviceMonitor// -# # Group 2: match the string "podMonitor" or "serviceMonitor" -# # Group 3: match any string. We want all job names to be matched so that we aren't collecting the job name. -# pattern = '(^(podMonitor|serviceMonitor)\/.*)|(.*)' -# ## Matches of the pattern will be replaced with this string. Use ${1} -# ## notation to use the text of the first submatch. - -# ## "Job" is necessary or else configmap jobs would not be replaced and would keep their -# ## original name -# replacement = "${2}Job" - -############################################################################### -# AGGREGATOR PLUGINS # -############################################################################### -[[aggregators.quantile]] - period = "5m" - drop_original = true - quantiles = [0.50,0.95] - algorithm = "t-digest" - compression = 100.0 - namepass = ["otelcollector", "metricsextension"] - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -# Read metrics about cpu usage -#[[inputs.cpu]] - ## Whether to report per-cpu stats or not -# percpu = false - ## Whether to report total system cpu stats or not -# totalcpu = true - ## If true, collect raw CPU time metrics. -# collect_cpu_time = false - ## If true, compute and report the sum of all non-idle CPU states. -# report_active = true -# fieldpass = ["usage_active","cluster","node","host","device"] -# taginclude = ["cluster","cpu","node"] - -# Read metrics about memory usage -#[[inputs.mem]] -# fieldpass = ["used_percent", "cluster", "node","host","device"] -# taginclude = ["cluster","node"] - -# [[inputs.procstat]] -# exe = "otelcollector" -# interval = "10s" -# pid_finder = "pgrep" -# pid_tag = true -# name_override = "otelcollector" -# fieldpass = ["cpu_usage", "memory_rss"] -# [inputs.procstat.tags] -# # Computer = "$NODE_NAME" -# # NodeIp = "$NODE_IP" - # cpulimit = "$CONTAINER_CPU_LIMIT" - # memlimit = "$CONTAINER_MEMORY_LIMIT" - - # defaultscrapekubelet = "$AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED" - # defaultscrapecoreDns = "$AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED" - # defaultscrapecadvisor = "$AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED" - # defaultscrapekubeproxy = "$AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED" - # defaultscrapeapiserver = "$AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED" - # defaultscrapekubestate = "$AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED" - # defaultscrapenodeexporter = "$AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED" - # defaultscrapecollectorhealth = "$AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED" - # defaultscrapewindowsexporter = "$AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED" - # defaultscrapewindowskubeproxy = "$AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED" - # defaultscrapepodannotations = "$AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED" - # podannotationns = "$AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX" - # defaultscrapekappiebasic = "$AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED" - # nodeexportertargetport= "$NODE_EXPORTER_TARGETPORT" - # nodeexportername = "$NODE_EXPORTER_NAME" - # kubestatename = "$KUBE_STATE_NAME" - # kubestateversion = "$KUBE_STATE_VERSION" - # operatortargetstaimgversion = "$OPERATOR_TARGETS_TA_IMG_VERSION" - # operatortargetscfgreaderimgversion = "$OPERATOR_TARGETS_CFG_READER_IMG_VERSION" - # nodeexporterversion = "$NODE_EXPORTER_VERSION" - # akvauth = "$AKVAUTH" - # debugmodeenabled = "$DEBUG_MODE_ENABLED" - # kubestatemetriclabelsallowlist = "$KUBE_STATE_METRIC_LABELS_ALLOWLIST" - # kubestatemetricannotationsallowlist = "$KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST" - # httpproxyenabled = "$HTTP_PROXY_ENABLED" - # tadapterh="$tokenadapterHealthyAfterSecs" - # tadapterf="$tokenadapterUnhealthyAfterSecs" - # setGlobalSettings="$AZMON_SET_GLOBAL_SETTINGS" - # globalSettingsConfigured="$AZMON_GLOBAL_SETTINGS_CONFIGURED" - -# [[inputs.procstat]] -# exe = "MetricsExtension" -# interval = "10s" -# pid_finder = "pgrep" -# pid_tag = true -# name_override = "metricsextension" -# fieldpass = ["cpu_usage", "memory_rss"] - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:8888/metrics"] - fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] - tagexclude = ["service_instance_id"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:9090/metrics"] - fieldpass = ["prometheus_sd_http_failures_total"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://ama-metrics-operator-targets.kube-system.svc.cluster.local/metrics"] - fieldpass = ["opentelemetry_allocator_targets","opentelemetry_allocator_collectors_discovered"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - name_override = "target_allocator" diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-windows.conf b/otelcollector/telegraf/telegraf-prometheus-collector-windows.conf deleted file mode 100644 index 14c406fdc..000000000 --- a/otelcollector/telegraf/telegraf-prometheus-collector-windows.conf +++ /dev/null @@ -1,178 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - - -# Global tags can be specified here in key="value" format. -[global_tags] - #Below are entirely used for telemetry - agentversion = "$AGENT_VERSION" - cluster = "$customResourceId" - calias = "$AZMON_CLUSTER_ALIAS" - clabel = "$AZMON_CLUSTER_LABEL" - Region = "$AKSREGION" - computer = "$NODE_NAME" - nodeip = "$NODE_IP" - mode = "$MODE" - winmode = "$WINMODE" - macmode = "$MAC" - opt = "true" - controllertype = "$CONTROLLER_TYPE" - defaultmetricaccountname = "$AZMON_DEFAULT_METRIC_ACCOUNT_NAME" - namespace = "$POD_NAMESPACE" - podname = "$POD_NAME" - ostype = "$OS_TYPE" - debugmodeenabled = "$DEBUG_MODE_ENABLED" - windowsVersion = "$windowsVersion" - setGlobalSettings = "$AZMON_SET_GLOBAL_SETTINGS" - globalSettingsConfigured = "$AZMON_GLOBAL_SETTINGS_CONFIGURED" - - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = false - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "30s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "30s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "15s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = true - ## Specify the log file name. The empty string means to log to stderr. - logfile = "" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = true - - -############################################################################### -# OUTPUT PLUGINS # -############################################################################### - -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" - - ## Timeout for closing (default: 5s). - # timeout = "5s" - - ## Enable additional diagnostic logging. - # enable_diagnostic_logging = false - -############################################################################### -# PROCESSOR PLUGINS # -############################################################################### -[[processors.converter]] - [processors.converter.fields] - float = ["*"] - -############################################################################### -# AGGREGATOR PLUGINS # -############################################################################### -[[aggregators.quantile]] - period = "5m" - drop_original = true - quantiles = [0.50,0.95] - algorithm = "t-digest" - compression = 100.0 - namepass = ["win_proc"] - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -# Read metrics about cpu usage -#[[inputs.cpu]] - ## Whether to report per-cpu stats or not -# percpu = false - ## Whether to report total system cpu stats or not -# totalcpu = true - ## If true, collect raw CPU time metrics. -# collect_cpu_time = false - ## If true, compute and report the sum of all non-idle CPU states. -# report_active = true -# fieldpass = ["usage_active","cluster","node","host","device"] -# taginclude = ["cluster","cpu","node"] - -# Read metrics about memory usage -#[[inputs.mem]] -# fieldpass = ["used_percent", "cluster", "node","host","device"] -# taginclude = ["cluster","node"] - -[[inputs.win_perf_counters]] - [inputs.win_perf_counters.tags] - cpulimit = "$CONTAINER_CPU_LIMIT" - memlimit = "$CONTAINER_MEMORY_LIMIT" - - [[inputs.win_perf_counters.object]] - ObjectName = "Process" - Counters = ["% Processor Time","Working Set"] - Instances = ["otelcollector"] - Measurement = "win_proc" - -[[inputs.win_perf_counters]] - [[inputs.win_perf_counters.object]] - # Process metrics, in this case for IIS only - ObjectName = "Process" - Counters = ["% Processor Time","Working Set"] - Instances = ["MetricsExtension.Native"] - Measurement = "win_proc" - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:8888/metrics"] - fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] - tagexclude = ["service_instance_id"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" diff --git a/otelcollector/telegraf/telegraf-prometheus-collector.conf b/otelcollector/telegraf/telegraf-prometheus-collector.conf deleted file mode 100644 index 315ad58be..000000000 --- a/otelcollector/telegraf/telegraf-prometheus-collector.conf +++ /dev/null @@ -1,218 +0,0 @@ -# Telegraf Configuration -# -# Telegraf is entirely plugin driven. All metrics are gathered from the -# declared inputs, and sent to the declared outputs. -# -# Plugins must be declared in here to be active. -# To deactivate a plugin, comment out the name and any variables. -# -# Use 'telegraf -config telegraf.conf -test' to see what metrics a config -# file would generate. -# -# Environment variables can be used anywhere in this config file, simply prepend -# them with $. For strings the variable must be within quotes (ie, "$STR_VAR"), -# for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) - - -# Global tags can be specified here in key="value" format. -[global_tags] - #Below are entirely used for telemetry - agentversion = "$AGENT_VERSION" - cluster = "$customResourceId" - calias = "$AZMON_CLUSTER_ALIAS" - clabel = "$AZMON_CLUSTER_LABEL" - Region = "$AKSREGION" - computer = "$NODE_NAME" - nodeip = "$NODE_IP" - mode = "$MODE" - winmode = "$WINMODE" - macmode = "$MAC" - opt = "true" - controllertype = "$CONTROLLER_TYPE" - defaultmetricaccountname = "$AZMON_DEFAULT_METRIC_ACCOUNT_NAME" - namespace = "$POD_NAMESPACE" - podname = "$POD_NAME" - ostype = "$OS_TYPE" - mip = "$MINIMAL_INGESTION_PROFILE" - operatormodel = "$AZMON_OPERATOR_ENABLED" - operatormodelcfgmapsetting = "$AZMON_OPERATOR_ENABLED_CFG_MAP_SETTING" - operatormodelchartsetting = "$AZMON_OPERATOR_ENABLED_CHART_SETTING" - collectorHpaEnabled = "$AZMON_COLLECTOR_HPA_ENABLED" - -# Configuration for telegraf agent -[agent] - ## Default data collection interval for all inputs - interval = "60s" - ## Rounds collection interval to 'interval' - ## ie, if interval="10s" then always collect on :00, :10, :20, etc. - round_interval = false - - ## Telegraf will send metrics to outputs in batches of at most - ## metric_batch_size metrics. - ## This controls the size of writes that Telegraf sends to output plugins. - metric_batch_size = 1000 - - ## For failed writes, telegraf will cache metric_buffer_limit metrics for each - ## output, and will flush this buffer on a successful write. Oldest metrics - ## are dropped first when this buffer fills. - ## This buffer only fills when writes fail to output plugin(s). - metric_buffer_limit = 10000 - - ## Collection jitter is used to jitter the collection by a random amount. - ## Each plugin will sleep for a random time within jitter before collecting. - ## This can be used to avoid many plugins querying things like sysfs at the - ## same time, which can have a measurable effect on the system. - collection_jitter = "30s" - - ## Default flushing interval for all outputs. You shouldn't set this below - ## interval. Maximum flush_interval will be flush_interval + flush_jitter - flush_interval = "30s" - ## Jitter the flush interval by a random amount. This is primarily to avoid - ## large write spikes for users running a large number of telegraf instances. - ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s - flush_jitter = "15s" - - ## By default or when set to "0s", precision will be set to the same - ## timestamp order as the collection interval, with the maximum being 1s. - ## ie, when interval = "10s", precision will be "1s" - ## when interval = "250ms", precision will be "1ms" - ## Precision will NOT be used for service inputs. It is up to each individual - ## service input to set the timestamp at the appropriate precision. - ## Valid time units are "ns", "us" (or "µs"), "ms", "s". - precision = "" - - ## Logging configuration: - ## Run telegraf with debug log messages. - debug = false - ## Run telegraf in quiet mode (error log messages only). - quiet = true - ## Specify the log file name. The empty string means to log to stderr. - logfile = "" - - ## Override default hostname, if empty use os.Hostname() - #hostname = "placeholder_hostname" - ## If set to true, do no set the "host" tag in the telegraf agent. - omit_hostname = true - - -############################################################################### -# OUTPUT PLUGINS # -############################################################################### - -[[outputs.application_insights]] - ## Instrumentation key of the Application Insights resource. - instrumentation_key = "$TELEMETRY_APPLICATIONINSIGHTS_KEY" - - ## Timeout for closing (default: 5s). - # timeout = "5s" - - ## Enable additional diagnostic logging. - # enable_diagnostic_logging = false - - -############################################################################### -# PROCESSOR PLUGINS # -############################################################################### -[[processors.converter]] - [processors.converter.fields] - float = ["*"] - -############################################################################### -# AGGREGATOR PLUGINS # -############################################################################### -[[aggregators.quantile]] - period = "5m" - drop_original = true - quantiles = [0.50,0.95] - algorithm = "t-digest" - compression = 100.0 - namepass = ["otelcollector", "metricsextension"] - -############################################################################### -# INPUT PLUGINS # -############################################################################### - -# Read metrics about cpu usage -#[[inputs.cpu]] - ## Whether to report per-cpu stats or not -# percpu = false - ## Whether to report total system cpu stats or not -# totalcpu = true - ## If true, collect raw CPU time metrics. -# collect_cpu_time = false - ## If true, compute and report the sum of all non-idle CPU states. -# report_active = true -# fieldpass = ["usage_active","cluster","node","host","device"] -# taginclude = ["cluster","cpu","node"] - -# Read metrics about memory usage -#[[inputs.mem]] -# fieldpass = ["used_percent", "cluster", "node","host","device"] -# taginclude = ["cluster","node"] - -[[inputs.procstat]] - exe = "otelcollector" - interval = "10s" - pid_finder = "pgrep" - pid_tag = true - name_override = "otelcollector" - fieldpass = ["cpu_usage", "memory_rss"] - [inputs.procstat.tags] -# Computer = "$NODE_NAME" -# NodeIp = "$NODE_IP" - cpulimit = "$CONTAINER_CPU_LIMIT" - memlimit = "$CONTAINER_MEMORY_LIMIT" - - defaultscrapekubelet = "$AZMON_PROMETHEUS_KUBELET_SCRAPING_ENABLED" - defaultscrapecoreDns = "$AZMON_PROMETHEUS_COREDNS_SCRAPING_ENABLED" - defaultscrapecadvisor = "$AZMON_PROMETHEUS_CADVISOR_SCRAPING_ENABLED" - defaultscrapekubeproxy = "$AZMON_PROMETHEUS_KUBEPROXY_SCRAPING_ENABLED" - defaultscrapeapiserver = "$AZMON_PROMETHEUS_APISERVER_SCRAPING_ENABLED" - defaultscrapekubestate = "$AZMON_PROMETHEUS_KUBESTATE_SCRAPING_ENABLED" - defaultscrapenodeexporter = "$AZMON_PROMETHEUS_NODEEXPORTER_SCRAPING_ENABLED" - defaultscrapecollectorhealth = "$AZMON_PROMETHEUS_COLLECTOR_HEALTH_SCRAPING_ENABLED" - defaultscrapewindowsexporter = "$AZMON_PROMETHEUS_WINDOWSEXPORTER_SCRAPING_ENABLED" - defaultscrapewindowskubeproxy = "$AZMON_PROMETHEUS_WINDOWSKUBEPROXY_SCRAPING_ENABLED" - defaultscrapepodannotations = "$AZMON_PROMETHEUS_POD_ANNOTATION_SCRAPING_ENABLED" - podannotationns = "$AZMON_PROMETHEUS_POD_ANNOTATION_NAMESPACES_REGEX" - defaultscrapekappiebasic = "$AZMON_PROMETHEUS_KAPPIEBASIC_SCRAPING_ENABLED" - defaultscrapenetworkobservabilityRetina = "$AZMON_PROMETHEUS_NETWORKOBSERVABILITYRETINA_SCRAPING_ENABLED" - defaultscrapenetworkobservabilityHubble = "$AZMON_PROMETHEUS_NETWORKOBSERVABILITYHUBBLE_SCRAPING_ENABLED" - defaultscrapenetworkobservabilityCilium = "$AZMON_PROMETHEUS_NETWORKOBSERVABILITYCILIUM_SCRAPING_ENABLED" - nodeexportertargetport= "$NODE_EXPORTER_TARGETPORT" - nodeexportername = "$NODE_EXPORTER_NAME" - kubestatename = "$KUBE_STATE_NAME" - kubestateversion = "$KUBE_STATE_VERSION" - nodeexporterversion = "$NODE_EXPORTER_VERSION" - akvauth = "$AKVAUTH" - debugmodeenabled = "$DEBUG_MODE_ENABLED" - kubestatemetriclabelsallowlist = "$KUBE_STATE_METRIC_LABELS_ALLOWLIST" - kubestatemetricannotationsallowlist = "$KUBE_STATE_METRIC_ANNOTATIONS_ALLOWLIST" - httpproxyenabled = "$HTTP_PROXY_ENABLED" - tadapterh="$tokenadapterHealthyAfterSecs" - tadapterf="$tokenadapterUnhealthyAfterSecs" - -[[inputs.procstat]] - exe = "MetricsExtension" - interval = "10s" - pid_finder = "pgrep" - pid_tag = true - name_override = "metricsextension" - fieldpass = ["cpu_usage", "memory_rss"] - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:8888/metrics"] - fieldpass = ["otelcol_processor_dropped_metric_points", "otelcol_receiver_refused_metric_points", "otelcol_receiver_accepted_metric_points", "otelcol_exporter_sent_metric_points", "otelcol_exporter_queue_size", "otelcol_exporter_send_failed_metric_points", "otelcol_process_memory_rss", "otelcol_processor_batch_batch_send_size_bytes_sum", "otelcol_processor_batch_batch_send_size_bytes_count"] - tagexclude = ["service_instance_id"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" - -[[inputs.prometheus]] - interval = "5m" - urls = ["http://localhost:9090/metrics"] - fieldpass = ["prometheus_sd_http_failures_total"] - metric_version = 2 - url_tag = "scrapeUrl" - timeout = "15s" From fdde68f30a8902b8d157aa411bd9e943465f3d17 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 13 Dec 2024 16:38:04 -0800 Subject: [PATCH 43/47] fixes --- otelcollector/build/linux/Dockerfile | 2 +- otelcollector/fluent-bit/src/process_stats.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/otelcollector/build/linux/Dockerfile b/otelcollector/build/linux/Dockerfile index 44ccff2be..14070e3b5 100644 --- a/otelcollector/build/linux/Dockerfile +++ b/otelcollector/build/linux/Dockerfile @@ -136,7 +136,7 @@ COPY --from=main-builder --chmod=777 /main/main.exe $tmpdir/main COPY ./scripts/*.sh $tmpdir/ COPY ./metricextension/me.config ./metricextension/me_internal.config ./metricextension/me_ds.config ./metricextension/me_ds_internal.config /usr/sbin/ -COPY ./fluent-bit/fluent-bit.conf ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.conf ./fluent-bit/fluent-bit-daemonset.yaml ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ +COPY ./fluent-bit/fluent-bit.yaml ./fluent-bit/fluent-bit-daemonset.yaml ./fluent-bit/fluent-bit-parsers.conf $tmpdir/fluent-bit/ COPY --from=fluent-bit-builder /src/out_appinsights.so $tmpdir/fluent-bit/bin/ COPY --from=fluent-bit-binary-builder /usr/local/bin/fluent-bit /usr/local/bin/fluent-bit COPY --from=fluent-bit-binary-builder /usr/local/etc/fluent-bit /usr/local/etc/fluent-bit diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 3122909d1..22625edc1 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -85,7 +85,7 @@ func InitProcessAggregations(processName []string, os string) *ProcessAggregatio for _, processName := range processName { pids, err := findPIDFromExe(processName, os) if err != nil || len(pids) == 0 { - fmt.Printf("Error getting PID for process %s\n", processName) + fmt.Printf("Error getting PID for process %s: %s\n", processName, err.Error()) continue } From e11b82b1b3c4a86728f3ebb1e63a4c4590f1599b Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 13 Dec 2024 16:39:15 -0800 Subject: [PATCH 44/47] fix OS_TYPE --- otelcollector/fluent-bit/src/out_appinsights.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index 04e69d010..bfdb5803b 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -49,7 +49,7 @@ func FLBPluginInit(ctx unsafe.Pointer) int { } // Collect, aggregate, and send CPU and Memory usage telemetry for the processes below - osType := os.Getenv("OSTYPE") + osType := os.Getenv("OS_TYPE") processNames := []string{"otelcollector", "MetricsExtension"} if osType == "windows" { processNames = []string{"otelcollector", "MetricsExtension.Native"} From 520d3f702f457fd50326f573e85953a42fef07f0 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Sun, 15 Dec 2024 15:32:29 -0800 Subject: [PATCH 45/47] add debugging logs --- otelcollector/fluent-bit/src/cmetrics_decoder.go | 1 + otelcollector/fluent-bit/src/out_appinsights.go | 3 +++ otelcollector/fluent-bit/src/process_stats.go | 6 +++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/src/cmetrics_decoder.go b/otelcollector/fluent-bit/src/cmetrics_decoder.go index 65ca4c9f7..d506740eb 100644 --- a/otelcollector/fluent-bit/src/cmetrics_decoder.go +++ b/otelcollector/fluent-bit/src/cmetrics_decoder.go @@ -163,6 +163,7 @@ func SendPrometheusMetricsToAppInsights(records []map[interface{}]interface{}, t metricTelemetryItem.Properties[labelName] = fmt.Sprintf("%s", value.Labels[i]) } TelemetryClient.Track(metricTelemetryItem) + Log(fmt.Sprintf("Sent telemetry for %s_%s_%s_%s", telemetryPrefix, metric.Meta.Opts.Namespace, metric.Meta.Opts.Subsystem, metric.Meta.Opts.Name)) } } } diff --git a/otelcollector/fluent-bit/src/out_appinsights.go b/otelcollector/fluent-bit/src/out_appinsights.go index bfdb5803b..e11918933 100644 --- a/otelcollector/fluent-bit/src/out_appinsights.go +++ b/otelcollector/fluent-bit/src/out_appinsights.go @@ -1,6 +1,8 @@ package main import ( + "fmt" + "github.com/fluent/fluent-bit-go/output" "github.com/microsoft/ApplicationInsights-Go/appinsights" ) @@ -84,6 +86,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { incomingTag := strings.ToLower(C.GoString(tag)) // Metrics Extension logs with metrics received, dropped, and processed counts + Log(fmt.Sprintf("Received %d records. Tag: %s", len(records), incomingTag)) switch incomingTag { case fluentbitEventsProcessedLastPeriodTag: return UpdateMEReceivedMetricsCount(records) diff --git a/otelcollector/fluent-bit/src/process_stats.go b/otelcollector/fluent-bit/src/process_stats.go index 22625edc1..bc8af2004 100644 --- a/otelcollector/fluent-bit/src/process_stats.go +++ b/otelcollector/fluent-bit/src/process_stats.go @@ -116,7 +116,7 @@ func (pa *ProcessAggregations) Run() { } func (pa *ProcessAggregations) CollectStats() { - ticker := time.NewTicker(time.Second * time.Duration(10)) + ticker := time.NewTicker(time.Second * time.Duration(5)) for ; true; <-ticker.C { pa.mu.Lock() @@ -136,6 +136,8 @@ func (pa *ProcessAggregations) CollectStats() { } } + Log("Collected process stats") + pa.mu.Unlock() } } @@ -164,6 +166,8 @@ func (pa *ProcessAggregations) SendToAppInsights() { } } + Log(fmt.Sprintf("Sent telemetry for process %s", processName)) + // Clear values for next aggregation period p.cpuValues = sort.Float64Slice{} p.memValues = sort.Float64Slice{} From a2ca7ed3413186bfa023c3cbf962175087ecab79 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Sun, 15 Dec 2024 15:39:48 -0800 Subject: [PATCH 46/47] extra logging --- otelcollector/fluent-bit/src/telemetry.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/src/telemetry.go b/otelcollector/fluent-bit/src/telemetry.go index fc10e39df..fb317e256 100644 --- a/otelcollector/fluent-bit/src/telemetry.go +++ b/otelcollector/fluent-bit/src/telemetry.go @@ -812,6 +812,8 @@ func PushMEProcessedAndReceivedCountToAppInsightsMetrics() { meMetricsReceivedCountMapMutex.Unlock() } + Log("Sent ME Metrics Processed Count to App Insights") + } } @@ -868,7 +870,7 @@ func UpdateMEReceivedMetricsCount(records []map[interface{}]interface{}) int { TimeseriesVolumeMutex.Unlock() } - + Log("Updated ME Metrics Received Count") } } } From 3108e4e5bad0421d2e97d94d675dd0c9ff3a79a6 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Sun, 15 Dec 2024 16:02:17 -0800 Subject: [PATCH 47/47] fix --- otelcollector/fluent-bit/fluent-bit-daemonset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml index 21b23bee8..a67f78223 100644 --- a/otelcollector/fluent-bit/fluent-bit-daemonset.yaml +++ b/otelcollector/fluent-bit/fluent-bit-daemonset.yaml @@ -148,7 +148,7 @@ pipeline: outputs: - name: appinsights - match: prometheus.log.*|prometheus.metrics.* + match_regex: prometheus.log.*|prometheus.metrics.* - name: stdout format: json_lines