From c408b8e327328c58990ef727e003c1c034a196d7 Mon Sep 17 00:00:00 2001 From: Federico Torres Date: Wed, 2 Oct 2024 12:52:00 -0300 Subject: [PATCH] Add UTF-8 support in metric and label names (#689) Allow UTF-8 escaped characters in label and metric names This changes the validation in general and also adds a new escaping scheme for URL parameters (the `U__` encoding used to escape names for legacy systems, which unfortunately is different from the base64 encoding we already use for label _values). The behavior is opt-in via a flag because there are valid legacy names that could also be seen as encoded versions of new names with special UTF-8 characters. Signed-off-by: Federico Torres --------- Signed-off-by: Federico Torres --- README.md | 33 +++++ handler/handler_test.go | 316 ++++++++++++++++++++++++++++++++++++++++ handler/push.go | 13 +- main.go | 10 ++ 4 files changed, 369 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3d2945a6..c52d8614 100644 --- a/README.md +++ b/README.md @@ -320,6 +320,39 @@ Examples: /metrics/job/titan/name@base64/zqDPgc6_zrzOt864zrXPjc-C +### UTF-8 support for metric and label names + +Newer versions of the Prometheus exposition formats (text and protobuf) +support the full UTF-8 character set in metric and label names. The +Pushgateway only accepts special characters in names if the command line +flag `--push.enable-utf8-names` is set. +To allow special characters in label names that are part of the URL path, the flag also enables a +[specific encoding mechanism](https://github.com/prometheus/proposals/blob/main/proposals/2023-08-21-utf8.md#text-escaping). This is similar to the base64 encoding for label _values_ described above, +but works differently in detail for technical and historical reasons. As before, client libraries +should usually take care of the encoding. It works as follows: + +* A label name containing encoded characters starts with `U__`. +* All non-standard characters (i.e. 
characters other than letters, numbers, and underscores) are encoded as underscores surrounding their Unicode code point in hexadecimal, like `_1F60A_`. +* All pre-existing underscores are encoded as a double-underscore: `__`. +* If a label name starts with `U__` already, these characters have to be encoded as well, resulting in `U___55_____`. (That's `U__` + `_55_` (for `U`) + `__` + `__`). +* A label name starting with `U__` in its encoded form, but containing invalid sequences (e.g. `U__in_xxx_valid`) is left unchanged. + +For example, the label `"foo.bar"="baz"` would be encoded like: + + /metrics/job/example/U__foo_2e_bar/baz + +This encoding is compatible with the base64 encoding for label values: + + /metrics/job/example/U__foo_2e_bar@base64/YmF6 + +Note that this method has an unlikely edge case that is not handled properly: +A pusher unaware of the encoding mechanism might use a label name that is +also a valid encoded version of another label name. +E.g. if a pusher intends to use the label name `U__foo_2e_bar`, but doesn't +encode it as `U___55_____foo__2e__bar`, the Pushgateway will decode +`U__foo_2e_bar` to `foo.bar`. This is the main reason why the decoding is +opt-in via the `--push.enable-utf8-names` flag. + ### `PUT` method `PUT` is used to push a group of metrics. 
All metrics with the diff --git a/handler/handler_test.go b/handler/handler_test.go index e1a2a5f6..1f9d388b 100644 --- a/handler/handler_test.go +++ b/handler/handler_test.go @@ -25,6 +25,7 @@ import ( "github.com/go-kit/log" "github.com/matttproud/golang_protobuf_extensions/pbutil" + "github.com/prometheus/common/model" "github.com/prometheus/common/route" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/proto" @@ -436,6 +437,195 @@ func TestPush(t *testing.T) { verifyMetricFamily(t, `name:"histogram_metric" type:HISTOGRAM metric:{histogram:{sample_count_float:20 sample_sum:99.23 schema:1 negative_span:{offset:0 length:2} negative_span:{offset:0 length:2} negative_count:2 negative_count:2 negative_count:-2 negative_count:0 positive_span:{offset:0 length:2} positive_span:{offset:0 length:2} positive_count:2 positive_count:2 positive_count:-2 positive_count:0}}`, mms.lastWriteRequest.MetricFamilies["histogram_metric"]) } +func TestPushUTF8(t *testing.T) { + model.NameValidationScheme = model.UTF8Validation + EscapingScheme = model.ValueEncodingEscaping + mms := MockMetricStore{} + handler := Push(&mms, false, true, false, logger) + handlerBase64 := Push(&mms, false, true, true, logger) + + // With job name, instance name, UTF-8 escaped label name in params, UTF-8 metric name and text content. 
+ mms.lastWriteRequest = storage.WriteRequest{} + req, err := http.NewRequest( + "POST", "http://example.org/", + bytes.NewBufferString("some_metric 3.14\n{\"another.metric\",instance=\"testinstance\",job=\"testjob\",\"dotted.label.name\"=\"mylabelvalue\"} 42\n"), + ) + if err != nil { + t.Fatal(err) + } + w := httptest.NewRecorder() + + params := map[string]string{ + "job": "testjob", + "labels": "/instance/testinstance/U__dotted_2e_label_2e_name/mylabelvalue", + } + + handler(w, req.WithContext(ctxWithParams(params, req))) + if expected, got := http.StatusOK, w.Code; expected != got { + t.Errorf("Wanted status code %v, got %v.", expected, got) + } + if mms.lastWriteRequest.Timestamp.IsZero() { + t.Errorf("Write request timestamp not set: %#v", mms.lastWriteRequest) + } + if expected, got := "testjob", mms.lastWriteRequest.Labels["job"]; expected != got { + t.Errorf("Wanted job %v, got %v.", expected, got) + } + if expected, got := "testinstance", mms.lastWriteRequest.Labels["instance"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + if expected, got := "mylabelvalue", mms.lastWriteRequest.Labels["dotted.label.name"]; expected != got { + t.Errorf("Wanted dotted.label.name %v, got %v.", expected, got) + } + // Note that sanitation hasn't happened yet, grouping labels not in request. + verifyMetricFamily(t, `name:"some_metric" type:UNTYPED metric:{untyped:{value:3.14}}`, mms.lastWriteRequest.MetricFamilies["some_metric"]) + verifyMetricFamily(t, `name:"another.metric" type:UNTYPED metric:{label:{name:"instance" value:"testinstance"} label:{name:"job" value:"testjob"} label:{name:"dotted.label.name" value:"mylabelvalue"} untyped:{value:42}}`, mms.lastWriteRequest.MetricFamilies["another.metric"]) + + // With base64-encoded label values, UTF-8 escaped label name in params, UTF-8 metric name and text content. 
+ mms.lastWriteRequest = storage.WriteRequest{} + req, err = http.NewRequest( + "POST", "http://example.org/", + bytes.NewBufferString("some_metric 3.14\n{\"another.metric\",instance=\"testinstance\",job=\"testjob\",\"dotted.label.name\"=\"mylabelvalue\"} 42\n"), + ) + if err != nil { + t.Fatal(err) + } + w = httptest.NewRecorder() + params = map[string]string{ + "job": "dGVzdC9qb2I=", // job="test/job" + "labels": "/instance@base64/dGVzdGluc3RhbmNl/U__dotted_2e_label_2e_name@base64/bXlsYWJlbHZhbHVl", // instance="testinstance", dotted.label.name="mylabelvalue" + } + handlerBase64(w, req.WithContext(ctxWithParams(params, req))) + if expected, got := http.StatusOK, w.Code; expected != got { + t.Errorf("Wanted status code %v, got %v.", expected, got) + } + if mms.lastWriteRequest.Timestamp.IsZero() { + t.Errorf("Write request timestamp not set: %#v", mms.lastWriteRequest) + } + if expected, got := "test/job", mms.lastWriteRequest.Labels["job"]; expected != got { + t.Errorf("Wanted job %v, got %v.", expected, got) + } + if expected, got := "testinstance", mms.lastWriteRequest.Labels["instance"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + if expected, got := "mylabelvalue", mms.lastWriteRequest.Labels["dotted.label.name"]; expected != got { + t.Errorf("Wanted dotted.label.name %v, got %v.", expected, got) + } + // Note that sanitation hasn't happened yet, grouping labels not in request. + verifyMetricFamily(t, `name:"some_metric" type:UNTYPED metric:{untyped:{value:3.14}}`, mms.lastWriteRequest.MetricFamilies["some_metric"]) + // Note that sanitation hasn't happened yet, job label as still as in the push, not aligned to grouping labels. 
+ verifyMetricFamily(t, `name:"another.metric" type:UNTYPED metric:{label:{name:"instance" value:"testinstance"} label:{name:"job" value:"testjob"} label:{name:"dotted.label.name" value:"mylabelvalue"} untyped:{value:42}}`, mms.lastWriteRequest.MetricFamilies["another.metric"]) + + // With job name, instance name, UTF-8 escaped label name in params, UTF-8 metric names and protobuf content. + mms.lastWriteRequest = storage.WriteRequest{} + buf := &bytes.Buffer{} + _, err = pbutil.WriteDelimited(buf, &dto.MetricFamily{ + Name: proto.String("some.metric"), + Type: dto.MetricType_UNTYPED.Enum(), + Metric: []*dto.Metric{ + { + Untyped: &dto.Untyped{ + Value: proto.Float64(1.234), + }, + }, + }, + }) + if err != nil { + t.Fatal(err) + } + + _, err = pbutil.WriteDelimited(buf, &dto.MetricFamily{ + Name: proto.String("another.metric"), + Type: dto.MetricType_UNTYPED.Enum(), + Metric: []*dto.Metric{ + { + Untyped: &dto.Untyped{ + Value: proto.Float64(3.14), + }, + }, + }, + }) + if err != nil { + t.Fatal(err) + } + + _, err = pbutil.WriteDelimited(buf, &dto.MetricFamily{ + Name: proto.String("histogram.metric"), + Type: dto.MetricType_HISTOGRAM.Enum(), + Metric: []*dto.Metric{ + { + Histogram: &dto.Histogram{ + SampleCountFloat: proto.Float64(20), + SampleSum: proto.Float64(99.23), + Schema: proto.Int32(1), + NegativeCount: []float64{2, 2, -2, 0}, + PositiveCount: []float64{2, 2, -2, 0}, + PositiveSpan: []*dto.BucketSpan{ + { + Offset: proto.Int32(0), + Length: proto.Uint32(2), + }, + { + Offset: proto.Int32(0), + Length: proto.Uint32(2), + }, + }, + NegativeSpan: []*dto.BucketSpan{ + { + Offset: proto.Int32(0), + Length: proto.Uint32(2), + }, + { + Offset: proto.Int32(0), + Length: proto.Uint32(2), + }, + }, + }, + }, + }, + }) + if err != nil { + t.Fatal(err) + } + + req, err = http.NewRequest( + "POST", "http://example.org/", buf, + ) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Content-Type", "application/vnd.google.protobuf; encoding=delimited; 
proto=io.prometheus.client.MetricFamily") + w = httptest.NewRecorder() + params = map[string]string{ + "job": "testjob", + "labels": "/instance/testinstance/U__dotted_2e_label_2e_name/mylabelvalue", + } + handler(w, req.WithContext(ctxWithParams(params, req))) + if expected, got := http.StatusOK, w.Code; expected != got { + t.Errorf("Wanted status code %v, got %v.", expected, got) + } + if mms.lastWriteRequest.Timestamp.IsZero() { + t.Errorf("Write request timestamp not set: %#v", mms.lastWriteRequest) + } + if expected, got := "testjob", mms.lastWriteRequest.Labels["job"]; expected != got { + t.Errorf("Wanted job %v, got %v.", expected, got) + } + if expected, got := "testinstance", mms.lastWriteRequest.Labels["instance"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + if expected, got := "mylabelvalue", mms.lastWriteRequest.Labels["dotted.label.name"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + // Note that sanitation hasn't happened yet, grouping labels not in request. + verifyMetricFamily(t, `name:"some.metric" type:UNTYPED metric:{untyped:{value:1.234}}`, mms.lastWriteRequest.MetricFamilies["some.metric"]) + // Note that sanitation hasn't happened yet, grouping labels not in request. + verifyMetricFamily(t, `name:"another.metric" type:UNTYPED metric:{untyped:{value:3.14}}`, mms.lastWriteRequest.MetricFamilies["another.metric"]) + // Note that sanitation hasn't happened yet, grouping labels not in request. 
+ verifyMetricFamily(t, `name:"histogram.metric" type:HISTOGRAM metric:{histogram:{sample_count_float:20 sample_sum:99.23 schema:1 negative_span:{offset:0 length:2} negative_span:{offset:0 length:2} negative_count:2 negative_count:2 negative_count:-2 negative_count:0 positive_span:{offset:0 length:2} positive_span:{offset:0 length:2} positive_count:2 positive_count:2 positive_count:-2 positive_count:0}}`, mms.lastWriteRequest.MetricFamilies["histogram.metric"]) + + model.NameValidationScheme = model.LegacyValidation + EscapingScheme = model.NoEscaping +} + func TestDelete(t *testing.T) { mms := MockMetricStore{} handler := Delete(&mms, false, logger) @@ -525,6 +715,71 @@ func TestDelete(t *testing.T) { } +func TestDeleteUTF8(t *testing.T) { + model.NameValidationScheme = model.UTF8Validation + EscapingScheme = model.ValueEncodingEscaping + mms := MockMetricStore{} + handler := Delete(&mms, false, logger) + handlerBase64 := Delete(&mms, true, logger) + req := &http.Request{} + var params map[string]string + + // With job name, instance name and UTF-8 escaped label name. 
+ mms.lastWriteRequest = storage.WriteRequest{} + w := httptest.NewRecorder() + + params = map[string]string{ + "job": "testjob", + "labels": "/instance/testinstance/U__dotted_2e_label_2e_name/mylabelvalue", + } + + handler(w, req.WithContext(ctxWithParams(params, req))) + if expected, got := http.StatusAccepted, w.Code; expected != got { + t.Errorf("Wanted status code %v, got %v.", expected, got) + } + if mms.lastWriteRequest.Timestamp.IsZero() { + t.Errorf("Write request timestamp not set: %#v", mms.lastWriteRequest) + } + if expected, got := "testjob", mms.lastWriteRequest.Labels["job"]; expected != got { + t.Errorf("Wanted job %v, got %v.", expected, got) + } + if expected, got := "testinstance", mms.lastWriteRequest.Labels["instance"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + if expected, got := "mylabelvalue", mms.lastWriteRequest.Labels["dotted.label.name"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + + // With base64-encoded label values and UTF-8 escaped label name. 
+ mms.lastWriteRequest = storage.WriteRequest{} + w = httptest.NewRecorder() + + params = map[string]string{ + "job": "dGVzdC9qb2I=", + "labels": "/instance@base64/dGVzdGluc3RhbmNl/U__dotted_2e_label_2e_name@base64/bXlsYWJlbHZhbHVl", + } + + handlerBase64(w, req.WithContext(ctxWithParams(params, req))) + if expected, got := http.StatusAccepted, w.Code; expected != got { + t.Errorf("Wanted status code %v, got %v.", expected, got) + } + if mms.lastWriteRequest.Timestamp.IsZero() { + t.Errorf("Write request timestamp not set: %#v", mms.lastWriteRequest) + } + if expected, got := "test/job", mms.lastWriteRequest.Labels["job"]; expected != got { + t.Errorf("Wanted job %v, got %v.", expected, got) + } + if expected, got := "testinstance", mms.lastWriteRequest.Labels["instance"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + if expected, got := "mylabelvalue", mms.lastWriteRequest.Labels["dotted.label.name"]; expected != got { + t.Errorf("Wanted instance %v, got %v.", expected, got) + } + + model.NameValidationScheme = model.LegacyValidation + EscapingScheme = model.NoEscaping +} + func TestSplitLabels(t *testing.T) { scenarios := map[string]struct { input string @@ -575,6 +830,13 @@ func TestSplitLabels(t *testing.T) { input: "/label_name1@base64/foo.bar/label_name2/label_value2", expectError: true, }, + "regular label and UTF-8 escaped label name with legacy validation": { + input: "/label_name1/label_value1/U__label_2e_name2/label_value2", + expectedOutput: map[string]string{ + "label_name1": "label_value1", + "U__label_2e_name2": "label_value2", + }, + }, } for name, scenario := range scenarios { @@ -603,6 +865,60 @@ func TestSplitLabels(t *testing.T) { } } +func TestSplitLabelsUTF8(t *testing.T) { + scenarios := map[string]struct { + input string + expectError bool + expectedOutput map[string]string + }{ + "regular label and UTF-8 escaped label name": { + input: "/label_name1/label_value1/U__label_2e_name2/label_value2", + 
expectedOutput: map[string]string{ + "label_name1": "label_value1", + "label.name2": "label_value2", + }, + }, + "encoded slash in both label values and UTF-8 escaped label name": { + input: "/label_name1@base64/bGFiZWwvdmFsdWUx/U__label_2e_name2@base64/bGFiZWwvdmFsdWUy", + expectedOutput: map[string]string{ + "label_name1": "label/value1", + "label.name2": "label/value2", + }, + }, + } + + model.NameValidationScheme = model.UTF8Validation + EscapingScheme = model.ValueEncodingEscaping + + for name, scenario := range scenarios { + t.Run(name, func(t *testing.T) { + parsed, err := splitLabels(scenario.input) + if err != nil { + if scenario.expectError { + return // All good. + } + t.Fatalf("Got unexpected error: %s.", err) + } + for k, v := range scenario.expectedOutput { + got, ok := parsed[k] + if !ok { + t.Errorf("Expected to find %s=%q.", k, v) + } + if got != v { + t.Errorf("Expected %s=%q but got %s=%q.", k, v, k, got) + } + delete(parsed, k) + } + for k, v := range parsed { + t.Errorf("Found unexpected label %s=%q.", k, v) + } + }) + } + + model.NameValidationScheme = model.LegacyValidation + EscapingScheme = model.NoEscaping +} + func TestWipeMetricStore(t *testing.T) { // Create MockMetricStore with a few GroupingKeyToMetricGroup metrics // so they can be returned by GetMetricFamiliesMap() to later send write diff --git a/handler/push.go b/handler/push.go index 77f6c8ea..f6eef1aa 100644 --- a/handler/push.go +++ b/handler/push.go @@ -41,6 +41,12 @@ const ( Base64Suffix = "@base64" ) +var ( + // EscapingScheme is provided when unescaping label names in the + // request URL path to define the escaping scheme that will be used. + EscapingScheme = model.NoEscaping +) + // Push returns an http.Handler which accepts samples over HTTP and stores them // in the MetricStore. If replace is true, all metrics for the job and instance // given by the request are deleted before new ones are stored. 
If check is @@ -180,19 +186,20 @@ func splitLabels(labels string) (map[string]string, error) { for i := 0; i < len(components)-1; i += 2 { name, value := components[i], components[i+1] trimmedName := strings.TrimSuffix(name, Base64Suffix) - if !model.LabelNameRE.MatchString(trimmedName) || + unescapedName := model.UnescapeName(trimmedName, EscapingScheme) + if !model.LabelName(unescapedName).IsValid() || strings.HasPrefix(trimmedName, model.ReservedLabelPrefix) { return nil, fmt.Errorf("improper label name %q", trimmedName) } if name == trimmedName { - result[name] = value + result[unescapedName] = value continue } decodedValue, err := decodeBase64(value) if err != nil { return nil, fmt.Errorf("invalid base64 encoding for label %s=%q: %v", trimmedName, value, err) } - result[trimmedName] = decodedValue + result[unescapedName] = decodedValue } return result, nil } diff --git a/main.go b/main.go index 1af46cf3..934f9893 100644 --- a/main.go +++ b/main.go @@ -35,6 +35,7 @@ import ( "github.com/prometheus/client_golang/prometheus" versioncollector "github.com/prometheus/client_golang/prometheus/collectors/version" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/common/model" "github.com/prometheus/common/promlog" "github.com/prometheus/common/route" "github.com/prometheus/common/version" @@ -73,6 +74,7 @@ func main() { persistenceFile = app.Flag("persistence.file", "File to persist metrics. If empty, metrics are only kept in memory.").Default("").String() persistenceInterval = app.Flag("persistence.interval", "The minimum interval at which to write out the persistence file.").Default("5m").Duration() pushUnchecked = app.Flag("push.disable-consistency-check", "Do not check consistency of pushed metrics. 
DANGEROUS.").Default("false").Bool() + pushUTF8Names = app.Flag("push.enable-utf8-names", "Allow UTF-8 characters in metric and label names.").Default("false").Bool() promlogConfig = promlog.Config{} ) promlogflag.AddFlags(app, &promlogConfig) @@ -102,6 +104,14 @@ func main() { ms := storage.NewDiskMetricStore(*persistenceFile, *persistenceInterval, prometheus.DefaultGatherer, logger) + if *pushUTF8Names { + model.NameValidationScheme = model.UTF8Validation + handler.EscapingScheme = model.ValueEncodingEscaping + } else { + model.NameValidationScheme = model.LegacyValidation + handler.EscapingScheme = model.NoEscaping + } + // Create a Gatherer combining the DefaultGatherer and the metrics from the metric store. g := prometheus.Gatherers{ prometheus.DefaultGatherer,