diff --git a/docs/tutorials/agent_tracing/data-prepper-config.yaml b/docs/tutorials/agent_tracing/data-prepper-config.yaml new file mode 100644 index 0000000000..0fa5a62ab1 --- /dev/null +++ b/docs/tutorials/agent_tracing/data-prepper-config.yaml @@ -0,0 +1,3 @@ +# Sample Data Prepper configuration for agent tracing +# This is an example configuration for development purposes +ssl: false \ No newline at end of file diff --git a/docs/tutorials/agent_tracing/docker-compose.yml b/docs/tutorials/agent_tracing/docker-compose.yml new file mode 100644 index 0000000000..e0216f3fee --- /dev/null +++ b/docs/tutorials/agent_tracing/docker-compose.yml @@ -0,0 +1,97 @@ +# Sample Docker Compose configuration for agent tracing setup +# This is an example configuration for development purposes +version: '3' +services: + opensearch-node1: + image: opensearchproject/opensearch:3.1.0 # NOTE: Ensure versions are consistent across all OpenSearch services (OpenSearch, Dashboards, plugins) + container_name: opensearch-node1 + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch-node1 + - discovery.type=single-node + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m -Dopensearch.experimental.feature.telemetry.enabled=true" + - OPENSEARCH_INITIAL_ADMIN_PASSWORD= + - DISABLE_SECURITY_PLUGIN=true + - opensearch.experimental.feature.telemetry.enabled=true + - telemetry.feature.tracer.enabled=true + - telemetry.tracer.enabled=true + - telemetry.tracer.sampler.probability=1.0 + - telemetry.otel.tracer.span.exporter.class=io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter + - plugins.ml_commons.tracing_enabled=true + - plugins.ml_commons.agent_tracing_enabled=true + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems + hard: 65536 + volumes: + - 
opensearch-data1:/usr/share/opensearch/data + # USE THE BELOW TO MOUNT YOUR LOCAL BUILD TO AN INSTANCE + - /plugin/build/distributions:/usr/share/opensearch/ + command: > + bash -c " + bin/opensearch-plugin remove opensearch-skills; + bin/opensearch-plugin remove opensearch-ml; + bin/opensearch-plugin install --batch telemetry-otel; + bin/opensearch-plugin install --batch file:///usr/share/opensearch//opensearch-ml-3.1.0.0-SNAPSHOT.zip; + OPENSEARCH_LOG4J_CONFIG_FILE=/usr/share/opensearch/config/telemetry-log4j2.xml ./opensearch-docker-entrypoint.sh opensearch" + ports: + - 9200:9200 + - 9600:9600 # required for Performance Analyzer + networks: + - opensearch-net + depends_on: + - otel-collector + extra_hosts: + - "localhost:172.17.0.1" # This maps localhost to the Docker host IP + + opensearch-dashboards: + image: opensearchproject/opensearch-dashboards:3.1.0 # Must match OpenSearch version + container_name: opensearch-dashboards + ports: + - 5601:5601 + expose: + - "5601" + environment: + OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]' + DISABLE_SECURITY_DASHBOARDS_PLUGIN: "true" + networks: + - opensearch-net + depends_on: + - opensearch-node1 + + data-prepper: + restart: unless-stopped + container_name: data-prepper + image: opensearchproject/data-prepper:2 + volumes: + - ./data-prepper-config.yaml:/usr/share/data-prepper/config/data-prepper-config.yaml + - ./pipelines.yaml:/usr/share/data-prepper/pipelines/pipelines.yaml + - /ml_agent_trace.json:/usr/share/data-prepper/ml_agent_trace.json + ports: + - "21890:21890" + networks: + - opensearch-net + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + ports: + - "4317:4317" # OTLP gRPC port + - "4318:4318" # OTLP HTTP port + networks: + - opensearch-net + depends_on: + - data-prepper + +volumes: + opensearch-data1: + 
+networks: + opensearch-net: \ No newline at end of file diff --git a/docs/tutorials/agent_tracing/images/trace_hierarchical_view.png b/docs/tutorials/agent_tracing/images/trace_hierarchical_view.png new file mode 100644 index 0000000000..544e9b25a4 Binary files /dev/null and b/docs/tutorials/agent_tracing/images/trace_hierarchical_view.png differ diff --git a/docs/tutorials/agent_tracing/images/trace_table_view.png b/docs/tutorials/agent_tracing/images/trace_table_view.png new file mode 100644 index 0000000000..b0ed74e26e Binary files /dev/null and b/docs/tutorials/agent_tracing/images/trace_table_view.png differ diff --git a/docs/tutorials/agent_tracing/images/trace_timeline_view.png b/docs/tutorials/agent_tracing/images/trace_timeline_view.png new file mode 100644 index 0000000000..a83f73eb35 Binary files /dev/null and b/docs/tutorials/agent_tracing/images/trace_timeline_view.png differ diff --git a/docs/tutorials/agent_tracing/images/vega_trace_graph.png b/docs/tutorials/agent_tracing/images/vega_trace_graph.png new file mode 100644 index 0000000000..01e1abd270 Binary files /dev/null and b/docs/tutorials/agent_tracing/images/vega_trace_graph.png differ diff --git a/docs/tutorials/agent_tracing/ml_agent_trace.json b/docs/tutorials/agent_tracing/ml_agent_trace.json new file mode 100644 index 0000000000..0ee884882c --- /dev/null +++ b/docs/tutorials/agent_tracing/ml_agent_trace.json @@ -0,0 +1,175 @@ +{ + "version": 1, + "template": { + "mappings": { + "date_detection": false, + "_source": { + "enabled": true + }, + "properties": { + "traceId": { + "ignore_above": 256, + "type": "keyword" + }, + "spanId": { + "ignore_above": 256, + "type": "keyword" + }, + "parentSpanId": { + "ignore_above": 256, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "traceGroup": { + "ignore_above": 1024, + "type": "keyword" + }, + "traceGroupFields": { + "properties": { + "endTime": { + "type": "date_nanos" + }, + "durationInNanos": { + "type": 
"long" + }, + "statusCode": { + "type": "integer" + } + } + }, + "kind": { + "ignore_above": 128, + "type": "keyword" + }, + "startTime": { + "type": "date_nanos" + }, + "endTime": { + "type": "date_nanos" + }, + "status": { + "properties": { + "code": { + "type": "integer" + }, + "message": { + "type": "keyword" + } + } + }, + "serviceName": { + "type": "keyword" + }, + "durationInNanos": { + "type": "long" + }, + "events": { + "type": "nested", + "properties": { + "time": { + "type": "date_nanos" + } + } + }, + "links": { + "type": "nested" + }, + "resource": { + "properties": { + "attributes": { + "properties": { + "service.name": { + "type": "keyword" + } + } + } + } + }, + "span": { + "properties": { + "attributes": { + "properties": { + "service@type": { + "type": "keyword" + }, + "thread@name": { + "type": "keyword" + }, + "gen_ai@operation@name": { + "type": "keyword" + }, + "gen_ai@system": { + "type": "keyword" + }, + "gen_ai@agent@name": { + "type": "text" + }, + "gen_ai@agent@task": { + "type": "text" + }, + "gen_ai@agent@result": { + "type": "text" + }, + "gen_ai@agent@phase": { + "type": "keyword" + }, + "gen_ai@agent@step@number": { + "type": "integer" + }, + "gen_ai@agent@latency": { + "type": "integer" + }, + "gen_ai@request@model": { + "type": "keyword" + }, + "gen_ai@usage@input_tokens": { + "type": "integer" + }, + "gen_ai@usage@output_tokens": { + "type": "integer" + }, + "gen_ai@usage@total_tokens": { + "type": "integer" + }, + "gen_ai@system@message": { + "type": "text" + }, + "gen_ai@tool@name": { + "type": "keyword" + }, + "gen_ai@tool@description": { + "type": "text" + }, + "ml@connector@id": { + "type": "keyword" + }, + "ml@connector@name": { + "type": "keyword" + }, + "ml@model@id": { + "type": "keyword" + }, + "ml@model@name": { + "type": "keyword" + }, + "ml@model@request_body": { + "type": "text" + } + } + } + } + }, + "instrumentationScope": { + "properties": { + "name": { + "type": "keyword" + } + } + } + } + } + } + } \ No newline 
at end of file diff --git a/docs/tutorials/agent_tracing/otel-collector-config.yaml b/docs/tutorials/agent_tracing/otel-collector-config.yaml new file mode 100644 index 0000000000..b2a8f1e8bd --- /dev/null +++ b/docs/tutorials/agent_tracing/otel-collector-config.yaml @@ -0,0 +1,36 @@ +# Sample OpenTelemetry Collector configuration for agent tracing +# This is an example configuration for development purposes +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + filter/traces: + spans: + include: + match_type: regexp + attributes: + - key: service.type + value: tracer + batch/traces: + timeout: 5s + send_batch_size: 50 + +exporters: + debug: + verbosity: detailed + otlp/data-prepper: + endpoint: data-prepper:21890 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [filter/traces, batch/traces] + exporters: [debug, otlp/data-prepper] \ No newline at end of file diff --git a/docs/tutorials/agent_tracing/pipelines.yaml b/docs/tutorials/agent_tracing/pipelines.yaml new file mode 100644 index 0000000000..dd8420fbbf --- /dev/null +++ b/docs/tutorials/agent_tracing/pipelines.yaml @@ -0,0 +1,66 @@ +# Sample Data Prepper pipelines configuration for agent tracing +# This is an example configuration for development purposes +entry-pipeline: + workers: 8 + source: + otel_trace_source: + ssl: false + authentication: + unauthenticated: + buffer: + bounded_blocking: + buffer_size: 512 + batch_size: 8 + sink: + - pipeline: + name: "raw-trace-pipeline" + - pipeline: + name: "service-map-pipeline" + +raw-trace-pipeline: + workers: 8 + source: + pipeline: + name: "entry-pipeline" + buffer: + bounded_blocking: + buffer_size: 512 + batch_size: 64 + processor: + - otel_trace_raw: + # flatten_attributes: false + - otel_trace_group: + hosts: [ "http://opensearch-node1:9200" ] + username: "admin" + password: "" + sink: + - opensearch: + hosts: ["http://opensearch-node1:9200"] + insecure: 
true + username: admin + password: "" + # index_type: trace-analytics-raw # Uses default index mappings + index_type: custom + index: otel-v1-apm-span-agent + template_file: ml_agent_trace.json + template_type: index-template + +service-map-pipeline: + workers: 8 + source: + pipeline: + name: "entry-pipeline" + processor: + - service_map_stateful: + window_duration: 360 + buffer: + bounded_blocking: + buffer_size: 512 + batch_size: 8 + sink: + - opensearch: + hosts: ["http://opensearch-node1:9200"] + insecure: true + username: admin + password: "" + index_type: trace-analytics-service-map \ No newline at end of file diff --git a/docs/tutorials/agent_tracing/setup_agent_tracing_and_visualization.md b/docs/tutorials/agent_tracing/setup_agent_tracing_and_visualization.md new file mode 100644 index 0000000000..92a25b0d05 --- /dev/null +++ b/docs/tutorials/agent_tracing/setup_agent_tracing_and_visualization.md @@ -0,0 +1,358 @@ +# Setting Up Agent Tracing and Visualization in OpenSearch + +This tutorial explains how to configure and set up agent tracing and visualization in OpenSearch using OpenTelemetry, Data Prepper, and the OpenTelemetry Collector. + + +## Prerequisites +- Docker and Docker Compose installed +- Basic understanding of OpenSearch and OpenTelemetry concepts + +**⚠️ Security Note**: This tutorial uses `ssl: false` configurations throughout for development convenience. These settings are **NOT recommended for production environments**. For production deployments, you should configure proper TLS certificates and authentication mechanisms. + + +## 1. 
Configuration Setup + +### Create Configuration Directory +```bash +mkdir opensearch-tracing +cd opensearch-tracing +``` + +### Create Configuration Files +Create the following configuration files in your directory: +- [`docker-compose.yml`](docker-compose.yml): Defines the services +- [`otel-collector-config.yaml`](otel-collector-config.yaml): Configures the OpenTelemetry collector +- [`pipelines.yaml`](pipelines.yaml): Configures Data Prepper pipelines +- [`data-prepper-config.yaml`](data-prepper-config.yaml): Basic Data Prepper configuration +- [`ml_agent_trace.json`](ml_agent_trace.json): Custom index mapping for agent traces + + +## 2. Docker Compose File Setup + +The [`docker-compose.yml`](docker-compose.yml) file orchestrates all the services required for agent tracing and visualization, including OpenSearch, OpenSearch Dashboards, Data Prepper, and the OpenTelemetry Collector. + +**⚠️ Development Configuration**: The configuration files in this tutorial use `ssl: false` and basic authentication for development simplicity. For production use, you must configure proper TLS certificates and security settings. + +### Key Points +- **Version Consistency:** + - Ensure all OpenSearch services use the same version (e.g., 3.1.0 for OpenSearch, Dashboards, and plugins) + - Version mismatches can cause compatibility issues and unexpected behavior +- **Volume Mounts:** + - Data Prepper expects its configuration files to be mounted directly to `/usr/share/data-prepper/config/data-prepper-config.yaml` and `/usr/share/data-prepper/pipelines/pipelines.yaml` inside the container. + - Incorrect mount paths (such as mounting into subdirectories) will cause Data Prepper to fail to start. 
+- **Environment Variables:** + - For OpenSearch, ensure the following environment variables are set to enable tracing features: + - `opensearch.experimental.feature.telemetry.enabled=true` + - `telemetry.feature.tracer.enabled=true` + - `telemetry.tracer.enabled=true` + - `telemetry.tracer.sampler.probability=1.0` + - `telemetry.otel.tracer.span.exporter.class=io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter` + - `plugins.ml_commons.tracing_enabled=true` + - `plugins.ml_commons.agent_tracing_enabled=true` + +### Configuration Details +The Docker Compose file includes: +- **OpenSearch Node**: Configured with telemetry features enabled +- **OpenSearch Dashboards**: For trace visualization +- **Data Prepper**: For trace processing and storage +- **OpenTelemetry Collector**: For trace collection and forwarding + +**Note:** +- Adjust the file paths on the left side of the `volumes` section to match your actual directory structure. +- The above example assumes you are running `docker-compose` from the directory containing your config files. +- The environment variables for OpenSearch are critical for enabling agent tracing features. + + +## 3. Configure OpenTelemetry Collector + +The OpenTelemetry Collector receives traces from OpenSearch agents and forwards them to Data Prepper. + +The [`otel-collector-config.yaml`](otel-collector-config.yaml) file configures: +- **Receivers**: OTLP gRPC and HTTP endpoints +- **Processors**: Trace filtering and batching +- **Exporters**: Debug output and Data Prepper forwarding +- **Service Pipelines**: Trace processing pipeline + + +## 4. Configure Data Prepper + +Data Prepper processes and transforms the traces before sending them to OpenSearch. 
+ +### Pipeline Configuration +The [`pipelines.yaml`](pipelines.yaml) file defines: +- **Entry Pipeline**: Receives traces from OpenTelemetry Collector +- **Raw Trace Pipeline**: Processes and stores traces in OpenSearch +- **Service Map Pipeline**: Creates service dependency maps + +### Data Prepper Configuration +The [`data-prepper-config.yaml`](data-prepper-config.yaml) file contains basic configuration settings. + + +## 5. Index Mapping Configuration + +### Custom Index Mapping File + +The [`ml_agent_trace.json`](ml_agent_trace.json) file contains the custom index mapping for agent traces and connector traces. This file defines the structure and field types for the `otel-v1-apm-span-agent` index where your agent traces will be stored. + +#### Important Notes + +- **File Location**: The file must be mounted at `/usr/share/data-prepper/ml_agent_trace.json` inside the Data Prepper container +- **Template Reference**: The `pipelines.yaml` references this file in the `template_file` field +- **Index Creation**: Data Prepper will use this template to create the `otel-v1-apm-span-agent` index with the proper mapping +- **Field Compatibility**: Ensure your custom mapping includes all required fields for proper trace visualization in OpenSearch Dashboards + + +## 6. Start the Services + +Start all services: +```bash +docker-compose up -d +``` + +**Note 1: Initial Connection Warnings** +When Data Prepper starts, you may see connection warnings like this: +``` +[raw-trace-pipeline-sink-worker-4-thread-1] WARN org.opensearch.dataprepper.plugins.sink.opensearch.OpenSearchSink - Failed to initialize OpenSearch sink, retrying: Connection refused +``` + +This is normal behavior. Data Prepper needs to wait for the OpenSearch cluster to be fully up and running. Eventually, you should see this success message: +``` +[entry-pipeline-sink-worker-2-thread-1] INFO org.opensearch.dataprepper.plugins.source.oteltrace.OTelTraceSource - Started otel_trace_source on port 21890... 
+``` + +**Note 2: Expected Error Logs** +You may see error logs like this in Data Prepper: +``` +[raw-trace-pipeline-processor-worker-3-thread-1] ERROR org.opensearch.dataprepper.plugins.processor.oteltracegroup.OTelTraceGroupProcessor - Search request for traceGroup failed for traceIds: [] due to OpenSearch exception [type=index_not_found_exception, reason=no such index [otel-v1-apm-span]] +``` + +These errors are expected and can be safely ignored. They occur because we're using custom static mappings instead of the default Data Prepper index mappings. The system will work correctly despite these warnings. There is no current robust solution to correct these errors, but remember these error messages are harmless and don't affect functionality. + + +Verify services are running: +```bash +docker-compose ps +``` + + +## 7. Enable/Disable Agent Tracing at Runtime + +You can enable or disable agent tracing and connector tracing at runtime using the OpenSearch cluster settings API: + +```bash +curl -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d' +{ + "persistent": { + "plugins.ml_commons.agent_tracing_enabled": true, + "plugins.ml_commons.connector_tracing_enabled": true, + "plugins.ml_commons.model_tracing_enabled": true + } +}' +``` +Note: The `plugins.ml_commons.agent_tracing_enabled` setting only takes effect if the static setting `plugins.ml_commons.tracing_enabled` is already enabled in your OpenSearch configuration. + +Use this API to turn agent tracing on or off without restarting your cluster, as long as tracing is enabled globally. + + +## 8. Run an Agent to Generate Traces + +When you build and run agents normally, they automatically create traces as long as the tracing settings are enabled. 
For example, when you create a plan-execute-reflect agent following the [tutorial](https://docs.opensearch.org/latest/tutorials/gen-ai/agents/build-plan-execute-reflect-agent/) or run it as shown in the [intelligent troubleshooting blog post](https://opensearch.org/blog/intelligent-troubleshooting-using-opensearch-3-0s-plan-execute-reflect-agent/), the agent will generate comprehensive traces that capture every step of its execution. After execution, traces should appear in OpenSearch Dashboards within 3-5 minutes. + +## 9. View Traces + +- Open OpenSearch Dashboards at [http://localhost:5601](http://localhost:5601) +- Navigate to **Observability → Traces** +- You should see agent traces about 3-5 minutes after agent execution + + +## 10. Understanding the Visualizations + +The traces visualization provides several main views to help you analyze and understand your trace data: + +### Table View +![Table View](images/trace_table_view.png) +*Table View: This view displays each span as a row in a table, with columns such as Span ID, Trace ID, Parent Span ID, Duration, and Status. It allows you to include and exclude different columns as well. Shown above is a customization that might be useful for agent trace understanding. The columns include `Span ID`, `Duration (ms)`, `span.attributes.gen_ai@operation@name`, `span.attributes.gen_ai@usage@total_tokens`, `span.attributes.gen_ai@agent@task`, and `span.attributes.gen_ai@agent@result`.* + +After clicking into one specific trace (agent execution) you arrive at the Trace Analytics visualizations shown below. + +### Timeline/List View +![Timeline View](images/trace_timeline_view.png) +*Timeline/Waterfall View: The timeline or waterfall view presents traces in chronological order, showing the sequence of spans as they occurred.
Each entry is also clickable which creates a pop-up of all the information stored within that span.* + +### Hierarchical List View +![Hierarchical View](images/trace_hierarchical_view.png) +*Hierarchical Tree View: This view organizes spans in a tree-like structure, reflecting the parent-child relationships between operations. It helps you visualize the call hierarchy and understand how different services and components interact within a trace.* + +--- + +### Vega Visualization: Trace Graph + +In addition to Trace Analytics, OpenSearch Dashboards also supports [Vega](https://vega.github.io/vega/) visualizations for advanced, custom graphing. You can use Vega to create interactive graphs of your trace data, such as service dependency graphs or span relationships. + +The [`trace-graph-vega.json`](trace-graph-vega.json) file contains a Vega specification for creating trace graphs. Simply copy this file into the Vega editor in OpenSearch Dashboards and change the `traceId` to be the traceId of the run that was just executed. + +> **How to use:** +> 1. Go to **OpenSearch Dashboards → Visualize → Create visualization → Vega**. +> 2. Paste the Vega spec from [`trace-graph-vega.json`](trace-graph-vega.json) into the editor. +> 3. Adjust the data and layout as needed for your trace data. + +![Vega Graph View](images/vega_trace_graph.png) +*Vega Graph View: This custom graph visualization shows the relationships between services and spans in your trace data, helping you understand dependencies and flow at a glance.* + + +## 11. Setup Notes for Non-Docker Environments + +If you're setting up agent tracing in a production environment without Docker Compose, consider these important notes: + +### 1. Install the Telemetry-OTEL Plugin + +Ensure the telemetry-OTEL plugin is installed in your OpenSearch instance: + +```bash +# For OpenSearch 3.1.0 +bin/opensearch-plugin install telemetry-otel +``` + +### 2. 
Configuration Format Differences + +When using configuration files instead of environment variables, note that some settings use colons (`:`) instead of equals (`=`): + +**Environment Variables (Docker):** +```yaml +environment: + - opensearch.experimental.feature.telemetry.enabled=true + - telemetry.feature.tracer.enabled=true + - telemetry.tracer.enabled=true + - telemetry.tracer.sampler.probability=1.0 + - telemetry.otel.tracer.span.exporter.class=io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter + - plugins.ml_commons.tracing_enabled=true + - plugins.ml_commons.agent_tracing_enabled=true +``` + +**Configuration Files (Non-Docker):** +```yaml +opensearch.experimental.feature.telemetry.enabled: true +telemetry.feature.tracer.enabled: true +telemetry.tracer.enabled: true +telemetry.tracer.sampler.probability: 1.0 +telemetry.otel.tracer.span.exporter.class: io.opentelemetry.exporter.otlp.trace.OtlpGrpcSpanExporter +plugins.ml_commons.tracing_enabled: true +plugins.ml_commons.agent_tracing_enabled: true + +``` + +### 3. 
OpenTelemetry Collector Installation and Setup + +For non-Docker setups, you need to install and configure the OpenTelemetry Collector: + +#### Download and Install OpenTelemetry Collector +```bash +# Download the OpenTelemetry Collector (v0.131.1 shown; check the releases page for a newer version) +wget https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.131.1/otelcol-contrib_0.131.1_linux_amd64.tar.gz + +# Extract the archive +tar -xzf otelcol-contrib_0.131.1_linux_amd64.tar.gz + +# Move to system path and make executable +sudo mv otelcol-contrib /usr/local/bin/ +sudo chmod +x /usr/local/bin/otelcol-contrib + +# Create configuration directory +mkdir -p ~/otel-collector +cd ~/otel-collector +``` + +#### Create OpenTelemetry Collector Configuration +```bash +cat > otel-collector-config.yaml << 'EOF' +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + filter/traces: + spans: + include: + match_type: regexp + attributes: + - key: service.type + value: tracer + batch/traces: + timeout: 5s + send_batch_size: 50 + +exporters: + debug: + verbosity: detailed + otlp/data-prepper: + endpoint: localhost:21890 + tls: + insecure: true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [filter/traces, batch/traces] + exporters: [debug, otlp/data-prepper] +EOF +``` + +#### Run OpenTelemetry Collector +```bash +otelcol-contrib --config="$HOME/otel-collector/otel-collector-config.yaml" +``` + +### 4.
Configuration File Locations + +For non-Docker setups, you'll need to place configuration files in specific locations: + +#### Data Prepper Configuration Files +- **Main Config**: [`data-prepper-config.yaml`](data-prepper-config.yaml) +- **Location**: `/usr/share/data-prepper/config/data-prepper-config.yaml` +- **Pipelines Config**: [`pipelines.yaml`](pipelines.yaml) +- **Location**: `/usr/share/data-prepper/pipelines/pipelines.yaml` +- **Index Mapping**: [`ml_agent_trace.json`](ml_agent_trace.json) +- **Location**: `/usr/share/data-prepper/ml_agent_trace.json` + +### 5. Startup Order + +**Critical**: The components must be started in the following order to ensure proper initialization: + +1. **Data Prepper** - Start first to ensure the trace processing pipeline is ready +2. **OpenTelemetry Collector** - Start after Data Prepper to establish the trace forwarding connection +3. **OpenSearch Cluster** - Start after both Data Prepper and Collector are running +4. **OpenSearch Dashboards** - Start last after the cluster is fully operational + +**Important Note**: Data Prepper also depends on the OpenSearch cluster being available, as it needs to write processed traces to OpenSearch indices. This is why Data Prepper is configured with `restart: unless-stopped` in the Docker Compose file - it will automatically restart and retry connections until OpenSearch is available.
+ +## Troubleshooting + +If traces aren't appearing: +- Check OpenTelemetry Collector logs: `docker-compose logs otel-collector` +- Check Data Prepper logs: `docker-compose logs data-prepper` +- Verify index creation: `curl localhost:9200/_cat/indices` +- Verify index template: `curl localhost:9200/_index_template` + +**Common issues:** +- Incorrect endpoint configurations +- Missing permissions +- Disabled tracing settings + +--- + +## Additional Notes +- Traces are stored in the `otel-v1-apm-span-agent` index +- The sampling rate is set to 100% (`1.0`) for development +- For production, consider adjusting sampling rate and buffer sizes + +--- + +## References +- [OpenSearch Observability documentation](https://opensearch.org/docs/latest/observing-your-data/) +- [OpenTelemetry documentation](https://opentelemetry.io/docs/) +- [Data Prepper documentation](https://opensearch.org/docs/latest/data-prepper/) \ No newline at end of file diff --git a/docs/tutorials/agent_tracing/trace-graph-vega.json b/docs/tutorials/agent_tracing/trace-graph-vega.json new file mode 100644 index 0000000000..784fcf805c --- /dev/null +++ b/docs/tutorials/agent_tracing/trace-graph-vega.json @@ -0,0 +1,451 @@ +{ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "padding": 10, + "autosize": "fit", + + "signals": [ + + { + "name": "clicked", + "value": null, + "on": [ + {"events": "symbol:click", "update": "datum"}, + {"events": "@closeButton:click", "update": "null"} + ] + }, + { + "name": "zoom", + "value": 1, + "on": [ + { + "events": {"type": "wheel", "consume": true}, + "update": "clamp(zoom * pow(1.001, -event.deltaY * pow(16, event.deltaMode)), 0.1, 5)" + } + ] + }, + { + "name": "tx", + "value": 0, + "on": [ + { + "events": {"type": "mousedown", "consume": true}, + "update": "tx" + }, + { + "events": {"type": "[mousedown, window:mouseup] > window:mousemove", "consume": true}, + "update": "tx + event.dx / zoom" + } + ] + }, + { + "name": "ty", + "value": 0, + "on": [ + { +
"events": {"type": "mousedown", "consume": true}, + "update": "ty" + }, + { + "events": {"type": "[mousedown, window:mouseup] > window:mousemove", "consume": true}, + "update": "ty + event.dy / zoom" + } + ] + }, + { + "name": "showAll", + "value": false, + "on": [ + {"events": "@showAllButton:click", "update": "!showAll"} + ] + }, + { + "name": "expanded", + "value": [""], + "on": [ + { + "events": {"type": "click", "marktype": "symbol"}, + "update": "expanded + [datum.spanId]" + } + ] + } + ], + + "data": [ + { + "name": "spans", + "url": { + "index": "otel-v1-apm-span-agent", + "body": { + "size": 10000, + "query": { + "term": { + "traceId": "" + } + } + } + }, + "format": {"property": "hits.hits"}, + "transform": [ + {"type": "formula", "expr": "datum._source.spanId", "as": "spanId"}, + {"type": "formula", "expr": "datum._source.parentSpanId", "as": "parentSpanId"}, + {"type": "formula", "expr": "replace(datum._source.name, 'agent.', '')", "as": "name"}, + {"type": "formula", "expr": "datum._source.startTime", "as": "startTime"}, + {"type": "formula", "expr": "datum._source.durationInNanos / 1e9", "as": "durationSeconds"}, + {"type": "formula", "expr": "datum._source['span.attributes.gen_ai@agent@task']", "as": "agentTask"}, + {"type": "formula", "expr": "datum._source['span.attributes.gen_ai@agent@result']", "as": "agentResult"}, + {"type": "formula", "expr": "datum._source['span.attributes.gen_ai@tool@name'] || 'default'", "as": "toolName"}, + {"type": "formula", "expr": "datum._source['span.attributes.gen_ai@agent@phase']", "as": "phase"}, + { + "type": "formula", + "expr": "test(/^(plan|execute_step_|reflect_step_)/, datum.name) ? datum.phase : (test(/^tool_call_/, datum.name) ? datum.toolName : (test(/^llm_call/, datum.name) ? 'llm_call' : datum.name))", + "as": "colorKey" + }, + {"type": "formula", "expr": "datum.name == 'task' ? 
'' : datum.parentSpanId", "as": "parentSpanId"}, + { + "type": "collect", + "sort": { + "field": "startTime", + "order": "ascending" + } + }, + {"type": "stratify", "key": "spanId", "parentKey": "parentSpanId"}, + {"type": "tree", "size": [{"signal": "width"}, {"signal": "height - 100"}], "separation": true, "as": ["x", "y", "depth", "children"]} + ] + }, + { + "name": "visibleSpans", + "source": "spans", + "transform": [ + { + "type": "filter", + "expr": "showAll || datum.parentSpanId == '' || datum.depth <= 1 || indexof(expanded, datum.parentSpanId) >= 0" + } + ] + }, + { + "name": "visibleLinks", + "source": "visibleSpans", + "transform": [ + {"type": "treelinks"}, + {"type": "linkpath", "orient": "vertical", "shape": "line"} + ] + } + ], + + "scales": [ + { + "name": "color", + "type": "ordinal", + "domain": {"data": "spans", "field": "colorKey"}, + "range": {"scheme": "category20"} + } + ], + + "marks": [ + { + "type": "group", + "encode": { + "update": { + "width": {"signal": "width"}, + "height": {"signal": "height"}, + "clip": {"value": true}, + "cursor": {"value": "move"} + } + }, + "marks": [ + { + "type": "path", + "from": {"data": "visibleLinks"}, + "encode": { + "update": { + "path": {"field": "path"}, + "stroke": {"value": "#ccc"}, + "strokeWidth": {"value": 1.5}, + "x": {"signal": "tx"}, + "y": {"signal": "ty"}, + "scaleX": {"signal": "zoom"}, + "scaleY": {"signal": "zoom"} + } + } + }, + { + "type": "symbol", + "from": {"data": "visibleSpans"}, + "encode": { + "enter": { + "size": {"value": 1000}, + "cursor": {"value": "pointer"} + }, + "update": { + "x": {"signal": "(datum.x + tx) * zoom"}, + "y": {"signal": "(datum.y + ty) * zoom"}, + "fill": {"scale": "color", "field": "colorKey"}, + "stroke": {"value": "white"}, + "strokeWidth": {"value": 1}, + "zindex": {"value": 1} + }, + "hover": { + "strokeWidth": {"value": 2}, + "stroke": {"value": "#000"}, + "zindex": {"value": 2} + } + } + }, + { + "type": "text", + "from": {"data": "visibleSpans"}, + 
"encode": { + "enter": { + "fontSize": {"value": 12}, + "baseline": {"value": "middle"}, + "fill": {"value": "#333"} + }, + "update": { + "x": {"signal": "(datum.x + tx) * zoom"}, + "y": {"signal": "((datum.y + ty) * zoom) + 25"}, + "text": {"field": "name"}, + "align": {"value": "center"}, + "fontSize": {"signal": "12 * zoom"} + } + } + } + ] + }, + { + "type": "rect", + "name": "showAllButton", + "encode": { + "enter": { + "x": {"value": 130}, + "y": {"value": 5}, + "width": {"value": 80}, + "height": {"value": 30}, + "fill": {"value": "#4CAF50"}, + "cornerRadius": {"value": 5}, + "cursor": {"value": "pointer"} + }, + "update": { + "fill": {"signal": "showAll ? '#45a049' : '#4CAF50'"} + } + } + }, + { + "type": "text", + "encode": { + "enter": { + "x": {"value": 170}, + "y": {"value": 20}, + "align": {"value": "center"}, + "baseline": {"value": "middle"}, + "fill": {"value": "white"}, + "text": {"signal": "showAll ? 'Collapse' : 'Show All'"}, + "fontSize": {"value": 14}, + "fontWeight": {"value": "bold"} + } + } + }, + { + "type": "group", + "encode": { + "update": { + "x": {"value": 10}, + "y": {"value": 40}, + "width": {"value": 560}, + "height": {"value": 200}, + "fill": {"value": "white"}, + "fillOpacity": {"value": 0.95}, + "stroke": {"value": "#aaa"}, + "strokeWidth": {"value": 0.5}, + "opacity": {"signal": "clicked ? 1 : 0"} + } + }, + "marks": [ + { + "type": "rect", + "name": "closeButton", + "encode": { + "enter": { + "x": {"value": 535}, + "y": {"value": 10}, + "width": {"value": 20}, + "height": {"value": 20}, + "fill": {"value": "#f0f0f0"}, + "stroke": {"value": "#ccc"}, + "strokeWidth": {"value": 1}, + "cursor": {"value": "pointer"} + }, + "update": { + "opacity": {"signal": "clicked ? 
1 : 0"} + }, + "hover": { + "fill": {"value": "#e0e0e0"} + } + } + }, + { + "type": "text", + "encode": { + "enter": { + "x": {"value": 545}, + "y": {"value": 25}, + "text": {"value": "✕"}, + "fill": {"value": "#666"}, + "fontSize": {"value": 14}, + "fontWeight": {"value": "bold"}, + "align": {"value": "center"}, + "baseline": {"value": "middle"} + }, + "update": { + "opacity": {"signal": "clicked ? 1 : 0"} + }, + "hover": { + "fill": {"value": "#000"} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 20}, + "text": {"signal": "clicked ? 'Span ID: ' + clicked.spanId : ''"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12}, + "fontWeight": {"value": "bold"} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 35}, + "text": {"signal": "clicked ? 'Operation: ' + clicked.name : ''"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 50}, + "text": {"signal": "clicked ? 'Tool: ' + (clicked.toolName || 'N/A') : ''"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 65}, + "text": {"signal": "clicked ? 'Phase: ' + (clicked.phase || 'N/A') : ''"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 80}, + "text": {"signal": "clicked ? 'Start Time: ' + clicked.startTime : ''"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 95}, + "text": {"signal": "clicked ? 
'Duration: ' + format(clicked.durationSeconds, '.3f') + ' seconds' : ''"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 110}, + "text": {"value": "Input:"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12}, + "fontWeight": {"value": "bold"} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 125}, + "width": {"value": 540}, + "text": {"signal": "clicked && clicked.agentTask ? substring(clicked.agentTask, 0, 100) : 'N/A'"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12}, + "lineBreak": {"value": true} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 155}, + "text": {"value": "Output:"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12}, + "fontWeight": {"value": "bold"} + } + } + }, + { + "type": "text", + "encode": { + "update": { + "x": {"value": 6}, + "y": {"value": 170}, + "width": {"value": 540}, + "text": {"signal": "clicked && clicked.agentResult ? substring(clicked.agentResult, 0, 100) : 'N/A'"}, + "fill": {"value": "black"}, + "fontSize": {"value": 12}, + "lineBreak": {"value": true} + } + } + } + ] + }, + { + "type": "text", + "encode": { + "enter": { + "x": {"value": 10}, + "y": {"value": 20}, + "fill": {"value": "black"}, + "fontSize": {"value": 14}, + "fontWeight": {"value": "bold"} + }, + "update": { + "text": {"signal": "'Total Spans: ' + length(data('spans'))"} + } + } + } + ] + } \ No newline at end of file