Skip to content

Commit aa5bdad

Browse files
Bug 1990006 - Create an initial ETL workflow for gecko-trace component
This patch adds an initial ETL workflow for processing traces collected by the [gecko-trace component][1] from various Gecko-based Firefox products. [1]: https://searchfox.org/firefox-main/source/toolkit/components/gecko-trace
1 parent e19db6c commit aa5bdad

File tree

25 files changed

+1018
-0
lines changed

25 files changed

+1018
-0
lines changed

dags.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2487,3 +2487,26 @@ bqetl_firefox_enterprise:
24872487
retry_delay: 30m
24882488
tags:
24892489
- impact/tier_3
2490+
2491+
bqetl_gecko_trace:
2492+
catchup: false
2493+
default_args:
2494+
depends_on_past: false
2495+
email:
2496+
2497+
2498+
email_on_failure: true
2499+
email_on_retry: true
2500+
end_date: null
2501+
max_active_tis_per_dag: null
2502+
2503+
retries: 2
2504+
retry_delay: 30m
2505+
start_date: "2025-09-26"
2506+
description: |
2507+
Processes gecko trace data across multiple Firefox applications.
2508+
repo: bigquery-etl
2509+
schedule_interval: 0 9 * * *
2510+
tags:
2511+
- impact/tier_3
2512+
- repo/bigquery-etl
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# gecko_trace.build_root_span
2+
3+
Builds a root span tree structure from an array of span objects.
4+
5+
## Signature
6+
7+
```sql
8+
gecko_trace.build_root_span(spans ARRAY<JSON>) RETURNS JSON
9+
```
10+
11+
## Arguments
12+
13+
- `spans`: Array of JSON objects representing individual spans. Each span should contain at minimum:
14+
- `span_id`: Unique identifier for the span
15+
- `parent_span_id`: ID of the parent span (null for root spans)
16+
- Other span properties like `name`, `start_time_unix_nano`, `end_time_unix_nano`, etc.
17+
18+
## Description
19+
20+
Takes an array of JSON span objects and constructs a hierarchical tree structure by linking spans with their parent-child relationships. The function:
21+
22+
1. Maps spans by their IDs
23+
2. Links child spans to their parents via a `childSpans` array property
24+
3. Identifies and returns the root span (span with no parent)
25+
4. Handles "missing" parent spans by creating placeholder objects
26+
27+
If no explicit root span is found, the function will attempt to find a single "missing" root span. If there are multiple or no missing roots, an error is thrown.
28+
29+
## Returns
30+
31+
Returns a JSON object representing the root span with all child spans nested in `childSpans` arrays throughout the tree structure.
32+
33+
## Example
34+
35+
```sql
36+
SELECT gecko_trace.build_root_span([
37+
JSON '{"span_id": "root", "parent_span_id": null, "name": "main_process"}',
38+
JSON '{"span_id": "child1", "parent_span_id": "root", "name": "network_request"}',
39+
JSON '{"span_id": "child2", "parent_span_id": "root", "name": "dom_parse"}',
40+
JSON '{"span_id": "grandchild", "parent_span_id": "child1", "name": "dns_lookup"}'
41+
])
42+
```
43+
44+
This would return a tree structure where the root span contains two child spans in its `childSpans` array, and one of those children has its own child span.
45+
46+
## Notes
47+
48+
- Used primarily for processing Gecko trace data to reconstruct span hierarchies
49+
- Throws an error if the span relationships cannot form a valid tree structure
50+
- Missing parent spans are handled gracefully by creating placeholder objects
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
2+
friendly_name: Gecko Trace Build Root Span
3+
description: |-
4+
Builds a root span tree structure from an array of span objects.
5+
6+
Takes an array of JSON span objects and constructs a hierarchical tree structure
7+
by linking spans with their parent-child relationships. Returns the root span
8+
with all child spans nested in a `childSpans` array property.
9+
10+
If no root span is found, the function will attempt to find a single "missing"
11+
root span. If there are multiple or no missing roots, an error is thrown.
12+
13+
This function is used for processing Gecko trace data to reconstruct the
14+
hierarchical structure of spans within a trace.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
Builds a span tree from a flat array of span objects and returns its root.

Every span gets a `childSpans` array holding the spans whose `parent_span_id`
points at it. The root is the span without a `parent_span_id`. When no such
span exists, the single placeholder created for an absent parent
(`type: "missing"`) is used as the root instead.

Returns NULL for a NULL or empty `spans` array.
Raises an error when no root span exists and the number of placeholder
("missing") parents is not exactly one.
*/
CREATE OR REPLACE FUNCTION gecko_trace.build_root_span(spans ARRAY<JSON>)
RETURNS JSON
LANGUAGE js AS r"""
  // Nothing to build: return NULL instead of falling through to the
  // "expected exactly one missing root span" error below.
  if (spans == null || spans.length === 0) {
    return null;
  }

  const spansById = new Map();
  let rootSpanId;

  for (const span of spans) {
    const spanId = span.span_id;
    // If children arrived before this span, a placeholder was registered under
    // its id; take over the children collected on that placeholder.
    span.childSpans = spansById.get(spanId)?.childSpans ?? [];
    spansById.set(spanId, span);

    if (!span.parent_span_id) {
      rootSpanId = spanId; // a span without a parent is the root
      continue;
    }

    let parent = spansById.get(span.parent_span_id);
    if (!parent) {
      // Parent not seen (yet): register a placeholder so children can attach.
      parent = {
        span_id: span.parent_span_id,
        childSpans: [],
        type: "missing",
      };
      spansById.set(span.parent_span_id, parent);
    }
    parent.childSpans.push(span);
  }

  // Compare against undefined so that a falsy root id (e.g. "") still counts
  // as a found root.
  if (rootSpanId === undefined) {
    // No explicit root: fall back to the single placeholder parent, if any.
    const missingRoots = Array.from(spansById.values()).filter(
      (span) => span.type === "missing",
    );
    if (missingRoots.length !== 1) {
      throw new Error(
        `Unable to construct span tree: expected exactly one missing root span, but found ${missingRoots.length}`,
      );
    }

    rootSpanId = missingRoots[0].span_id;
  }

  return spansById.get(rootSpanId);
""";
44+
45+
-- Tests
SELECT
  -- Test with simple parent-child relationship
  assert.not_null(
    gecko_trace.build_root_span(
      [
        JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}',
        JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}'
      ]
    )
  ),
  -- Test that the child is nested under the root's childSpans
  assert.equals(
    "child1",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [
          JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}',
          JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}'
        ]
      ),
      "$.childSpans[0].span_id"
    )
  ),
  -- Test that a child listed before its parent is re-attached to the real parent
  assert.equals(
    "child1",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [
          JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}',
          JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}'
        ]
      ),
      "$.childSpans[0].span_id"
    )
  ),
  -- Test that a single absent parent becomes a placeholder root
  assert.equals(
    "missing",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [JSON '{"span_id": "child1", "parent_span_id": "gone", "name": "orphan_span"}']
      ),
      "$.type"
    )
  ),
  -- Test with empty array
  assert.null(gecko_trace.build_root_span([])),
  -- Test single span (should be root)
  assert.equals(
    "root",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}']
      ),
      "$.span_id"
    )
  );
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# gecko_trace.calculate_signature
2+
3+
Calculates a signature hash for a trace based on its root span structure.
4+
5+
## Signature
6+
7+
```sql
8+
gecko_trace.calculate_signature(rootSpan JSON) RETURNS STRING
9+
```
10+
11+
## Arguments
12+
13+
- `rootSpan`: JSON object representing the root span of a trace tree, typically generated by `gecko_trace.build_root_span()`. Should contain:
14+
- `name`: Span name
15+
- `scope`: Object with `name` property
16+
- `resource`: Object with `attributes` property
17+
- `events`: Optional array of event objects with `name` and `attributes`
18+
- `childSpans`: Array of child span objects with the same structure
19+
20+
## Description
21+
22+
Uses a fast hash function (cyrb64) to generate a deterministic signature based on the hierarchical structure and attributes of spans in a trace. The signature is calculated by traversing the span tree depth-first and hashing:
23+
24+
- Resource attributes (excluding certain internal IDs like `gecko_process_internal_id`)
25+
- Scope names
26+
- Span names
27+
- Event names and attributes
28+
29+
This allows grouping traces that have similar structure and content, which is useful for:
30+
- Performance analysis and benchmarking
31+
- Anomaly detection in trace patterns
32+
- Identifying common execution paths
33+
- Aggregating metrics across similar traces
34+
35+
## Returns
36+
37+
Returns a string hash that serves as a deterministic signature for the trace structure. Traces with identical signatures have the same execution pattern and can be grouped together for analysis.
38+
39+
## Example
40+
41+
```sql
42+
WITH root_span AS (
43+
SELECT gecko_trace.build_root_span(spans_array) as root
44+
FROM traces_table
45+
WHERE trace_id = 'some_trace_id'
46+
)
47+
SELECT gecko_trace.calculate_signature(root) as signature
48+
FROM root_span
49+
```
50+
51+
## Hash Algorithm
52+
53+
Uses the cyrb64 hash function, which is:
54+
- Fast and efficient for string hashing
55+
- Provides good collision resistance for practical use cases
56+
- Deterministic across different executions
57+
- Based on MurmurHash principles
58+
59+
## Notes
60+
61+
- Signatures are deterministic - identical trace structures will always produce the same signature
62+
- Internal process IDs and other volatile attributes are excluded from hashing to focus on logical execution patterns
63+
- Used in conjunction with `gecko_trace.build_root_span()` for complete trace analysis workflows
64+
- Returns empty string for NULL input
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
---
2+
friendly_name: Gecko Trace Calculate Signature
3+
description: |-
4+
Calculates a signature hash for a trace based on its root span structure.
5+
6+
Uses a fast hash function (cyrb64) to generate a deterministic signature
7+
based on the hierarchical structure and attributes of spans in a trace.
8+
The signature is calculated by traversing the span tree and hashing:
9+
- Resource attributes (excluding certain internal IDs like gecko_process_internal_id)
10+
- Scope names
11+
- Span names
12+
- Event names and attributes
13+
14+
This allows grouping traces that have similar structure and content,
15+
which is useful for performance analysis and anomaly detection of Gecko traces.
16+
17+
The function returns a string hash that can be used to identify traces with
18+
similar execution patterns.
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
Calculates a signature string for a trace from its root span tree.

Walks the tree starting at `rootSpan` and folds, for every span, its resource
attributes, scope name, span name and events (name plus attributes) into a
running cyrb64 digest. Attributes listed in ATTRS_TO_SKIP are ignored.

Returns "" for NULL input; otherwise a 14-character base-36 digest.
*/
CREATE OR REPLACE FUNCTION gecko_trace.calculate_signature(rootSpan JSON)
RETURNS STRING
LANGUAGE js AS r"""
  // NULL input has nothing to hash; return the empty signature rather than
  // failing on a property access below.
  if (rootSpan == null) {
    return "";
  }

  // cyrb53 (c) 2018 bryc (github.com/bryc). License: Public domain. Attribution appreciated.
  // A fast and simple 64-bit (or 53-bit) string hash function with decent collision resistance.
  // Largely inspired by MurmurHash2/3, but with a focus on speed/simplicity.
  // See https://stackoverflow.com/questions/7616461/generate-a-hash-from-string-in-javascript/52171480#52171480
  // https://github.com/bryc/code/blob/master/jshash/experimental/cyrb53.js
  const cyrb64 = (str, seed = 0) => {
    let h1 = 0xdeadbeef ^ seed,
      h2 = 0x41c6ce57 ^ seed;
    for (let i = 0, ch; i < str.length; i++) {
      ch = str.charCodeAt(i);
      h1 = Math.imul(h1 ^ ch, 2654435761);
      h2 = Math.imul(h2 ^ ch, 1597334677);
    }
    h1 = Math.imul(h1 ^ (h1 >>> 16), 2246822507);
    h1 ^= Math.imul(h2 ^ (h2 >>> 13), 3266489909);
    h2 = Math.imul(h2 ^ (h2 >>> 16), 2246822507);
    h2 ^= Math.imul(h1 ^ (h1 >>> 13), 3266489909);
    // For a single 53-bit numeric return value we could return
    // 4294967296 * (2097151 & h2) + (h1 >>> 0);
    // but we instead return the full 64-bit value:
    return [h2 >>> 0, h1 >>> 0];
  };

  const seed = 0;
  let digest = "";
  // Chains the hash: each call hashes the previous digest concatenated with
  // the new value, so the final digest depends on every value and its order.
  // Non-string values are coerced to strings by the concatenation.
  const hash = (str) => {
    const [h2, h1] = cyrb64(digest + str, seed);
    digest =
      h2.toString(36).padStart(7, "0") + h1.toString(36).padStart(7, "0");
  };

  // A Set (not an object with `in`) so that attribute names which also exist
  // on Object.prototype, e.g. "constructor" or "toString", are not skipped.
  const ATTRS_TO_SKIP = new Set(["gecko_process_internal_id"]);
  const hashAttrs = (attrs) => {
    // Absent attributes are treated as an empty attribute set.
    for (const [key, value] of Object.entries(attrs ?? {})) {
      if (ATTRS_TO_SKIP.has(key)) continue;
      hash(key);
      hash(value);
    }
  };

  const hashEvents = (events) => {
    for (const event of events) {
      hash(event.name);
      hashAttrs(event.attributes);
    }
  };

  // Iterative traversal with an explicit stack. Children are pushed in array
  // order and popped last-first; this order is part of the signature.
  const stack = [rootSpan];
  while (stack.length > 0) {
    const span = stack.pop();
    // Placeholder ("missing") spans produced by gecko_trace.build_root_span
    // carry no resource, scope or name. Optional chaining keeps them hashable;
    // an absent scope name or span name is hashed as the string "undefined".
    hashAttrs(span.resource?.attributes);
    hash(span.scope?.name);
    hash(span.name);
    if (span.events) {
      hashEvents(span.events);
    }
    for (const child of span.childSpans ?? []) {
      stack.push(child);
    }
  }

  return digest;
""";
65+
66+
-- Tests
SELECT
  -- Test with simple root span
  assert.not_null(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that same input produces same signature
  assert.equals(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    ),
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that gecko_process_internal_id does not influence the signature
  assert.equals(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"gecko_process_internal_id": "1"}}, "childSpans": []}'
    ),
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"gecko_process_internal_id": "2"}}, "childSpans": []}'
    )
  ),
  -- Test that a different span name produces a different signature
  assert.true(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test_a", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    ) != gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test_b", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that null input returns empty string
  assert.equals("", gecko_trace.calculate_signature(NULL));

0 commit comments

Comments
 (0)