diff --git a/analysis/benchmark_test.go b/analysis/benchmark_test.go index c5b6647aa..6c61c5231 100644 --- a/analysis/benchmark_test.go +++ b/analysis/benchmark_test.go @@ -32,7 +32,10 @@ func BenchmarkAnalysis(b *testing.B) { b.Fatal(err) } - ts := analyzer.Analyze(bleveWikiArticle) + ts, err := analysis.AnalyzeForTokens(analyzer, bleveWikiArticle) + if err != nil { + b.Fatalf("error analyzing text: %v", err) + } freqs := analysis.TokenFrequency(ts, nil, index.IncludeTermVectors) if len(freqs) != 511 { b.Errorf("expected %d freqs, got %d", 511, len(freqs)) diff --git a/analysis/lang/ar/analyzer_ar_test.go b/analysis/lang/ar/analyzer_ar_test.go index 437d69fd9..29590d20a 100644 --- a/analysis/lang/ar/analyzer_ar_test.go +++ b/analysis/lang/ar/analyzer_ar_test.go @@ -175,7 +175,10 @@ func TestArabicAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term) diff --git a/analysis/lang/cjk/analyzer_cjk_test.go b/analysis/lang/cjk/analyzer_cjk_test.go index afd895788..8b8f7d232 100644 --- a/analysis/lang/cjk/analyzer_cjk_test.go +++ b/analysis/lang/cjk/analyzer_cjk_test.go @@ -617,7 +617,10 @@ func TestCJKAnalyzer(t *testing.T) { if err != nil { t.Fatal(err) } - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/ckb/analyzer_ckb_test.go b/analysis/lang/ckb/analyzer_ckb_test.go index 9e6adab8e..ec368ff14 100644 --- a/analysis/lang/ckb/analyzer_ckb_test.go +++ b/analysis/lang/ckb/analyzer_ckb_test.go @@ -69,7 +69,10 @@ func TestSoraniAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/da/analyzer_da_test.go b/analysis/lang/da/analyzer_da_test.go index e22f32567..de862fd74 100644 --- a/analysis/lang/da/analyzer_da_test.go +++ b/analysis/lang/da/analyzer_da_test.go @@ -63,7 +63,10 @@ func TestDanishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/de/analyzer_de_test.go b/analysis/lang/de/analyzer_de_test.go index f404ded94..ce1dc6383 100644 --- a/analysis/lang/de/analyzer_de_test.go +++ b/analysis/lang/de/analyzer_de_test.go @@ -147,7 +147,10 @@ func TestGermanAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/en/analyzer_en_test.go b/analysis/lang/en/analyzer_en_test.go index 6db7c3000..47e5688bb 100644 --- a/analysis/lang/en/analyzer_en_test.go +++ b/analysis/lang/en/analyzer_en_test.go @@ -97,7 +97,10 @@ func TestEnglishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/es/analyzer_es_test.go b/analysis/lang/es/analyzer_es_test.go index ad3b1f650..6c209de2f 100644 --- a/analysis/lang/es/analyzer_es_test.go +++ b/analysis/lang/es/analyzer_es_test.go @@ -114,7 +114,10 @@ func TestSpanishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/fa/analyzer_fa_test.go b/analysis/lang/fa/analyzer_fa_test.go index f648261f9..f0f416125 100644 --- a/analysis/lang/fa/analyzer_fa_test.go +++ b/analysis/lang/fa/analyzer_fa_test.go @@ -305,7 +305,10 @@ func TestPersianAnalyzerVerbs(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } @@ -600,7 +603,10 @@ func TestPersianAnalyzerVerbsDefective(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } @@ -671,7 +677,10 @@ func TestPersianAnalyzerOthers(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/fi/analyzer_fi_test.go b/analysis/lang/fi/analyzer_fi_test.go index 45aa242de..c08537848 100644 --- a/analysis/lang/fi/analyzer_fi_test.go +++ b/analysis/lang/fi/analyzer_fi_test.go @@ -57,7 +57,10 @@ func TestFinishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/fr/analyzer_fr_test.go b/analysis/lang/fr/analyzer_fr_test.go index 38f89e079..ed742e4d4 100644 --- a/analysis/lang/fr/analyzer_fr_test.go +++ b/analysis/lang/fr/analyzer_fr_test.go @@ -196,7 +196,10 @@ func TestFrenchAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/hi/analyzer_hi_test.go b/analysis/lang/hi/analyzer_hi_test.go index a86aeefd8..568398850 100644 --- a/analysis/lang/hi/analyzer_hi_test.go +++ b/analysis/lang/hi/analyzer_hi_test.go @@ -58,7 +58,10 @@ func TestHindiAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %v, got %v", test.output, actual) } diff --git a/analysis/lang/hr/analyzer_hr_test.go b/analysis/lang/hr/analyzer_hr_test.go index e1ab35afc..6adbe7e92 100644 --- a/analysis/lang/hr/analyzer_hr_test.go +++ b/analysis/lang/hr/analyzer_hr_test.go @@ -84,7 +84,10 @@ func TestCroatianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/hu/analyzer_hu_test.go b/analysis/lang/hu/analyzer_hu_test.go index 8745668c2..c4e03abe8 100644 --- a/analysis/lang/hu/analyzer_hu_test.go +++ b/analysis/lang/hu/analyzer_hu_test.go @@ -57,7 +57,10 @@ func TestHungarianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/it/analyzer_it_test.go b/analysis/lang/it/analyzer_it_test.go index 19b9d4dfe..65a3a0222 100644 --- a/analysis/lang/it/analyzer_it_test.go +++ b/analysis/lang/it/analyzer_it_test.go @@ -83,7 +83,10 @@ func TestItalianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/nl/analyzer_nl_test.go b/analysis/lang/nl/analyzer_nl_test.go index 707655f01..fe5fcc48d 100644 --- a/analysis/lang/nl/analyzer_nl_test.go +++ b/analysis/lang/nl/analyzer_nl_test.go @@ -57,7 +57,10 @@ func TestDutchAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/no/analyzer_no_test.go b/analysis/lang/no/analyzer_no_test.go index b37cb4d1c..5d7fab141 100644 --- a/analysis/lang/no/analyzer_no_test.go +++ b/analysis/lang/no/analyzer_no_test.go @@ -57,7 +57,10 @@ func TestNorwegianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/pl/analyzer_pl_test.go b/analysis/lang/pl/analyzer_pl_test.go index 073a28f79..8f0e8ba91 100644 --- a/analysis/lang/pl/analyzer_pl_test.go +++ b/analysis/lang/pl/analyzer_pl_test.go @@ -136,7 +136,10 @@ func TestPolishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/pt/analyzer_pt_test.go b/analysis/lang/pt/analyzer_pt_test.go index 417e64066..ea5ae3cfe 100644 --- a/analysis/lang/pt/analyzer_pt_test.go +++ b/analysis/lang/pt/analyzer_pt_test.go @@ -57,7 +57,10 @@ func TestPortugueseAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/ro/analyzer_ro_test.go b/analysis/lang/ro/analyzer_ro_test.go index 0fe4645bd..b8ec2bcb4 100644 --- a/analysis/lang/ro/analyzer_ro_test.go +++ b/analysis/lang/ro/analyzer_ro_test.go @@ -57,7 +57,10 @@ func TestRomanianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/ru/analyzer_ru_test.go b/analysis/lang/ru/analyzer_ru_test.go index 38534aff2..ef231853d 100644 --- a/analysis/lang/ru/analyzer_ru_test.go +++ b/analysis/lang/ru/analyzer_ru_test.go @@ -109,7 +109,10 @@ func TestRussianAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/sv/analyzer_sv_test.go b/analysis/lang/sv/analyzer_sv_test.go index a3bd5f161..b7b4bc40b 100644 --- a/analysis/lang/sv/analyzer_sv_test.go +++ b/analysis/lang/sv/analyzer_sv_test.go @@ -57,7 +57,10 @@ func TestSwedishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/lang/tr/analyzer_tr_test.go b/analysis/lang/tr/analyzer_tr_test.go index 3c4592b6e..720996bda 100644 --- a/analysis/lang/tr/analyzer_tr_test.go +++ b/analysis/lang/tr/analyzer_tr_test.go @@ -77,7 +77,10 @@ func TestTurkishAnalyzer(t *testing.T) { t.Fatal(err) } for _, test := range tests { - actual := analyzer.Analyze(test.input) + actual, err := analysis.AnalyzeForTokens(analyzer, test.input) + if err != nil { + t.Fatalf("error analyzing input: %v", err) + } if len(actual) != len(test.output) { t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) } diff --git a/analysis/type.go b/analysis/type.go index e3a7f201b..470759a49 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -76,6 +76,75 @@ type Analyzer interface { Analyze([]byte) TokenStream } +// ----------------------------------------------------------------------------- + +type AnalyzerType int +const ( + TokensAnalyzer AnalyzerType = iota + VectorAnalyzer +) + +type AnalyzerV2 interface { + Type() AnalyzerType + Analyze([]byte) (any, error) +} + +func AnalyzeForTokens(analyzer AnalyzerV2, input []byte) (TokenStream, error) { + if analyzer.Type() != TokensAnalyzer { + return nil, fmt.Errorf("incompatible analyzer type") + } + + output, err := analyzer.Analyze(input) + if err != nil { + return nil, err + } + + rv, ok := output.(TokenStream) + if !ok { + return nil, fmt.Errorf("unexpected output type, expected TokenStream") + } + + return rv, nil +} + +func AnalyzeForVectors(analyzer AnalyzerV2, input []byte) ([]float32, error) { + if analyzer.Type() != VectorAnalyzer { + return nil, fmt.Errorf("incompatible analyzer type") + } + + output, err := analyzer.Analyze(input) + if err != nil { + return nil, err + } + + rv, ok := output.([]float32) + if !ok { + return nil, fmt.Errorf("unexpected output type, expected []float32") + } + + return rv, nil +} + +// ----------------------------------------------------------------------------- + +// Helper type to allow Analyzer to be used as an AnalyzerV2 +// +// An Analyer can simply be wrapped in an AnalyzerAdapter to be used as an +// AnalyzerV2 +type AnalyzerAdapter struct { + Analyzer +} + +func (a *AnalyzerAdapter) Type() AnalyzerType { + return TokensAnalyzer +} + +func (a *AnalyzerAdapter) Analyze(input []byte) (any, error) { + return a.Analyzer.Analyze(input), nil +} + +// ----------------------------------------------------------------------------- + type DefaultAnalyzer struct { CharFilters []CharFilter Tokenizer Tokenizer diff --git a/document/field.go b/document/field.go index eb104e2df..0b590a91a 100644 --- a/document/field.go +++ b/document/field.go @@ -29,7 +29,7 @@ type Field interface { // "doc1", then "field" in "doc2". ArrayPositions() []uint64 Options() index.FieldIndexingOptions - Analyze() + Analyze() error Value() []byte // NumPlainTextBytes should return the number of plain text bytes diff --git a/document/field_boolean.go b/document/field_boolean.go index 8c2987a7f..72379b453 100644 --- a/document/field_boolean.go +++ b/document/field_boolean.go @@ -66,7 +66,7 @@ func (b *BooleanField) Options() index.FieldIndexingOptions { return b.options } -func (b *BooleanField) Analyze() { +func (b *BooleanField) Analyze() error { tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ Start: 0, @@ -78,6 +78,8 @@ func (b *BooleanField) Analyze() { b.length = len(tokens) b.frequencies = analysis.TokenFrequency(tokens, b.arrayPositions, b.options) + + return nil } func (b *BooleanField) Value() []byte { diff --git a/document/field_composite.go b/document/field_composite.go index e0ba8af7a..57da02b3c 100644 --- a/document/field_composite.go +++ b/document/field_composite.go @@ -94,7 +94,8 @@ func (c *CompositeField) Options() index.FieldIndexingOptions { return c.options } -func (c *CompositeField) Analyze() { +func (c *CompositeField) Analyze() error { + return nil } func (c *CompositeField) Value() []byte { diff --git a/document/field_datetime.go b/document/field_datetime.go index f3b859c43..0f3a5141c 100644 --- a/document/field_datetime.go +++ b/document/field_datetime.go @@ -98,7 +98,7 @@ func (n *DateTimeField) splitValue() (numeric.PrefixCoded, string) { return numeric.PrefixCoded(parts[0]), string(parts[1]) } -func (n *DateTimeField) Analyze() { +func (n *DateTimeField) Analyze() error { valueWithoutLayout, _ := n.splitValue() tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ @@ -132,6 +132,8 @@ func (n *DateTimeField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *DateTimeField) Value() []byte { diff --git a/document/field_geopoint.go b/document/field_geopoint.go index 5795043f2..6d007afb8 100644 --- a/document/field_geopoint.go +++ b/document/field_geopoint.go @@ -82,7 +82,7 @@ func (n *GeoPointField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (n *GeoPointField) Analyze() { +func (n *GeoPointField) Analyze() error { tokens := make(analysis.TokenStream, 0, 8) tokens = append(tokens, &analysis.Token{ Start: 0, @@ -133,6 +133,8 @@ func (n *GeoPointField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *GeoPointField) Value() []byte { diff --git a/document/field_geoshape.go b/document/field_geoshape.go index 6bf7b010a..24a21fd6c 100644 --- a/document/field_geoshape.go +++ b/document/field_geoshape.go @@ -84,7 +84,7 @@ func (n *GeoShapeField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (n *GeoShapeField) Analyze() { +func (n *GeoShapeField) Analyze() error { // compute the bytes representation for the coordinates tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ @@ -111,6 +111,8 @@ func (n *GeoShapeField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *GeoShapeField) Value() []byte { diff --git a/document/field_ip.go b/document/field_ip.go index 80a353a01..0a0dbabc6 100644 --- a/document/field_ip.go +++ b/document/field_ip.go @@ -79,7 +79,7 @@ func (n *IPField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (b *IPField) Analyze() { +func (b *IPField) Analyze() error { tokens := analysis.TokenStream{ &analysis.Token{ @@ -92,6 +92,8 @@ func (b *IPField) Analyze() { } b.length = 1 b.frequencies = analysis.TokenFrequency(tokens, b.arrayPositions, b.options) + + return nil } func (b *IPField) Value() []byte { diff --git a/document/field_numeric.go b/document/field_numeric.go index 1ee7b75ee..2de8ebe83 100644 --- a/document/field_numeric.go +++ b/document/field_numeric.go @@ -81,7 +81,7 @@ func (n *NumericField) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.frequencies } -func (n *NumericField) Analyze() { +func (n *NumericField) Analyze() error { tokens := make(analysis.TokenStream, 0) tokens = append(tokens, &analysis.Token{ Start: 0, @@ -114,6 +114,8 @@ func (n *NumericField) Analyze() { n.length = len(tokens) n.frequencies = analysis.TokenFrequency(tokens, n.arrayPositions, n.options) + + return nil } func (n *NumericField) Value() []byte { diff --git a/document/field_text.go b/document/field_text.go index d35e74732..efede672a 100644 --- a/document/field_text.go +++ b/document/field_text.go @@ -36,7 +36,7 @@ type TextField struct { name string arrayPositions []uint64 options index.FieldIndexingOptions - analyzer analysis.Analyzer + analyzer analysis.AnalyzerV2 value []byte numPlainTextBytes uint64 length int @@ -79,8 +79,9 @@ func (t *TextField) AnalyzedTokenFrequencies() index.TokenFrequencies { return t.frequencies } -func (t *TextField) Analyze() { +func (t *TextField) Analyze() error { var tokens analysis.TokenStream + var err error if t.analyzer != nil { bytesToAnalyze := t.Value() if t.options.IsStored() { @@ -89,7 +90,7 @@ func (t *TextField) Analyze() { copy(bytesCopied, bytesToAnalyze) bytesToAnalyze = bytesCopied } - tokens = t.analyzer.Analyze(bytesToAnalyze) + tokens, err = analysis.AnalyzeForTokens(t.analyzer, bytesToAnalyze) } else { tokens = analysis.TokenStream{ &analysis.Token{ @@ -103,9 +104,11 @@ func (t *TextField) Analyze() { } t.length = len(tokens) // number of tokens in this doc field t.frequencies = analysis.TokenFrequency(tokens, t.arrayPositions, t.options) + + return err } -func (t *TextField) Analyzer() analysis.Analyzer { +func (t *TextField) Analyzer() analysis.AnalyzerV2 { return t.analyzer } @@ -139,7 +142,7 @@ func NewTextFieldWithIndexingOptions(name string, arrayPositions []uint64, value } } -func NewTextFieldWithAnalyzer(name string, arrayPositions []uint64, value []byte, analyzer analysis.Analyzer) *TextField { +func NewTextFieldWithAnalyzer(name string, arrayPositions []uint64, value []byte, analyzer analysis.AnalyzerV2) *TextField { return &TextField{ name: name, arrayPositions: arrayPositions, @@ -150,7 +153,7 @@ func NewTextFieldWithAnalyzer(name string, arrayPositions []uint64, value []byte } } -func NewTextFieldCustom(name string, arrayPositions []uint64, value []byte, options index.FieldIndexingOptions, analyzer analysis.Analyzer) *TextField { +func NewTextFieldCustom(name string, arrayPositions []uint64, value []byte, options index.FieldIndexingOptions, analyzer analysis.AnalyzerV2) *TextField { return &TextField{ name: name, arrayPositions: arrayPositions, diff --git a/document/field_vector.go b/document/field_vector.go index 53334d202..63f8968c5 100644 --- a/document/field_vector.go +++ b/document/field_vector.go @@ -82,8 +82,9 @@ func (n *VectorField) AnalyzedTokenFrequencies() index.TokenFrequencies { return nil } -func (n *VectorField) Analyze() { +func (n *VectorField) Analyze() error { // vectors aren't analyzed + return nil } func (n *VectorField) Value() []byte { diff --git a/document/field_vector_base64.go b/document/field_vector_base64.go index 31d6cbffd..0deb6f79d 100644 --- a/document/field_vector_base64.go +++ b/document/field_vector_base64.go @@ -79,7 +79,8 @@ func (n *VectorBase64Field) AnalyzedTokenFrequencies() index.TokenFrequencies { return n.vectorField.AnalyzedTokenFrequencies() } -func (n *VectorBase64Field) Analyze() { +func (n *VectorBase64Field) Analyze() error { + return nil } func (n *VectorBase64Field) Value() []byte { diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 429d1daa9..b86bdac89 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -409,6 +409,11 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { if doc != nil { // put the work on the queue s.analysisQueue.Queue(func() { + // ignoring the analysis errors here. + // Data for all the fields with analysis errors will be ignored. + // + // todo: collect stats on analysis errors and also maintain + // records of such unique errors. analyze(doc, s.setSpatialAnalyzerPlugin) resultChan <- doc }) @@ -674,8 +679,8 @@ func (s *Scorch) StatsMap() map[string]interface{} { return m } -func (s *Scorch) Analyze(d index.Document) { - analyze(d, s.setSpatialAnalyzerPlugin) +func (s *Scorch) Analyze(d index.Document) map[string]error { + return analyze(d, s.setSpatialAnalyzerPlugin) } type customAnalyzerPluginInitFunc func(field index.Field) @@ -691,14 +696,19 @@ func (s *Scorch) setSpatialAnalyzerPlugin(f index.Field) { } } -func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { +func analyze(d index.Document, fn customAnalyzerPluginInitFunc) map[string]error { + rv := make(map[string]error) d.VisitFields(func(field index.Field) { if field.Options().IsIndexed() { if fn != nil { fn(field) } - field.Analyze() + err := field.Analyze() + if err != nil { + rv[field.Name()] = err + return + } if d.HasComposite() && field.Name() != "_id" { // see if any of the composite fields need this @@ -708,6 +718,8 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { } } }) + + return rv } func (s *Scorch) AddEligibleForRemoval(epoch uint64) { diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 8165774e7..aeec62e30 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -59,8 +59,10 @@ func CreateConfig(name string) map[string]interface{} { return rv } -var testAnalyzer = &analysis.DefaultAnalyzer{ - Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), +var testAnalyzer = &analysis.AnalyzerAdapter{ + Analyzer: &analysis.DefaultAnalyzer{ + Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), + }, } func TestIndexOpenReopen(t *testing.T) { diff --git a/index/upsidedown/upsidedown_test.go b/index/upsidedown/upsidedown_test.go index beeea6579..b43c42a16 100644 --- a/index/upsidedown/upsidedown_test.go +++ b/index/upsidedown/upsidedown_test.go @@ -34,8 +34,10 @@ import ( index "github.com/blevesearch/bleve_index_api" ) -var testAnalyzer = &analysis.DefaultAnalyzer{ - Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), +var testAnalyzer = &analysis.AnalyzerAdapter{ + Analyzer: &analysis.DefaultAnalyzer{ + Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), + }, } func TestIndexOpenReopen(t *testing.T) { diff --git a/mapping/field.go b/mapping/field.go index 5c064fddd..65099767c 100644 --- a/mapping/field.go +++ b/mapping/field.go @@ -374,7 +374,7 @@ func (fm *FieldMapping) processGeoShape(propertyMightBeGeoShape interface{}, } } -func (fm *FieldMapping) analyzerForField(path []string, context *walkContext) analysis.Analyzer { +func (fm *FieldMapping) analyzerForField(path []string, context *walkContext) analysis.AnalyzerV2 { analyzerName := fm.Analyzer if analyzerName == "" { analyzerName = context.dm.defaultAnalyzerName(path) diff --git a/mapping/index.go b/mapping/index.go index fe8c96713..9a6761f40 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -399,7 +399,7 @@ func (im *IndexMappingImpl) AnalyzerNameForPath(path string) string { return im.DefaultAnalyzer } -func (im *IndexMappingImpl) AnalyzerNamed(name string) analysis.Analyzer { +func (im *IndexMappingImpl) AnalyzerNamed(name string) analysis.AnalyzerV2 { analyzer, err := im.cache.AnalyzerNamed(name) if err != nil { logger.Printf("error using analyzer named: %s", name) @@ -425,7 +425,7 @@ func (im *IndexMappingImpl) AnalyzeText(analyzerName string, text []byte) (analy if err != nil { return nil, err } - return analyzer.Analyze(text), nil + return analysis.AnalyzeForTokens(analyzer, text) } // FieldAnalyzer returns the name of the analyzer used on a field. diff --git a/mapping/mapping.go b/mapping/mapping.go index cbfc98faa..805b78201 100644 --- a/mapping/mapping.go +++ b/mapping/mapping.go @@ -54,7 +54,7 @@ type IndexMapping interface { DefaultSearchField() string AnalyzerNameForPath(path string) string - AnalyzerNamed(name string) analysis.Analyzer + AnalyzerNamed(name string) analysis.AnalyzerV2 FieldMappingForPath(path string) FieldMapping } diff --git a/registry/analyzer.go b/registry/analyzer.go index f4753bc1c..09845e56c 100644 --- a/registry/analyzer.go +++ b/registry/analyzer.go @@ -16,6 +16,7 @@ package registry import ( "fmt" + "sync" "github.com/blevesearch/bleve/v2/analysis" ) @@ -28,9 +29,50 @@ func RegisterAnalyzer(name string, constructor AnalyzerConstructor) { analyzers[name] = constructor } +// Concurrent safe way to register analyzers at runtime. +// Useful for embedders to register custom analyzers at runtime. +func RegisterAnalyzerV2(name string, constructor AnalyzerConstructorV2) error { + analyzersV2.m.Lock() + defer analyzersV2.m.Unlock() + + _, exists := analyzersV2.cons[name] + if exists { + return fmt.Errorf("attempted to register duplicate analyzer named '%s'", name) + } + + analyzersV2.cons[name] = constructor + + return nil +} + +// create/replace an analyzer in the registry. +func ReplaceAnalyzerV2(name string, constructor AnalyzerConstructorV2) { + analyzersV2.m.Lock() + analyzersV2.cons[name] = constructor + analyzersV2.m.Unlock() +} + +func DeregisterAnalyzerV2(name string) { + analyzersV2.m.Lock() + delete(analyzersV2.cons, name) + analyzersV2.m.Unlock() +} + type AnalyzerConstructor func(config map[string]interface{}, cache *Cache) (analysis.Analyzer, error) +type AnalyzerConstructorV2 func(config map[string]interface{}, cache *Cache) (analysis.AnalyzerV2, error) type AnalyzerRegistry map[string]AnalyzerConstructor +type AnalyzerRegistryV2 struct { + m sync.RWMutex + cons map[string]AnalyzerConstructorV2 +} + +func NewAnalyzerRegistryV2() *AnalyzerRegistryV2 { + return &AnalyzerRegistryV2{ + cons: make(map[string]AnalyzerConstructorV2), + } +} + type AnalyzerCache struct { *ConcurrentCache } @@ -42,26 +84,36 @@ func NewAnalyzerCache() *AnalyzerCache { } func AnalyzerBuild(name string, config map[string]interface{}, cache *Cache) (interface{}, error) { - cons, registered := analyzers[name] - if !registered { - return nil, fmt.Errorf("no analyzer with name or type '%s' registered", name) + if cons, registered := analyzers[name]; registered { + analyzer, err := cons(config, cache) + if err != nil { + return nil, fmt.Errorf("error building analyzer: %v", err) + } + return &analysis.AnalyzerAdapter{Analyzer: analyzer}, nil } - analyzer, err := cons(config, cache) - if err != nil { - return nil, fmt.Errorf("error building analyzer: %v", err) + + analyzersV2.m.RLock() + defer analyzersV2.m.RUnlock() + if cons, registered := analyzersV2.cons[name]; registered { + analyzer, err := cons(config, cache) + if err != nil { + return nil, fmt.Errorf("error building analyzer: %v", err) + } + return analyzer, nil } - return analyzer, nil + + return nil, fmt.Errorf("no analyzer with name or type '%s' registered", name) } -func (c *AnalyzerCache) AnalyzerNamed(name string, cache *Cache) (analysis.Analyzer, error) { +func (c *AnalyzerCache) AnalyzerNamed(name string, cache *Cache) (analysis.AnalyzerV2, error) { item, err := c.ItemNamed(name, cache, AnalyzerBuild) if err != nil { return nil, err } - return item.(analysis.Analyzer), nil + return item.(analysis.AnalyzerV2), nil } -func (c *AnalyzerCache) DefineAnalyzer(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.Analyzer, error) { +func (c *AnalyzerCache) DefineAnalyzer(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.AnalyzerV2, error) { item, err := c.DefineItem(name, typ, config, cache, AnalyzerBuild) if err != nil { if err == ErrAlreadyDefined { @@ -69,7 +121,7 @@ func (c *AnalyzerCache) DefineAnalyzer(name string, typ string, config map[strin } return nil, err } - return item.(analysis.Analyzer), nil + return item.(analysis.AnalyzerV2), nil } func AnalyzerTypesAndInstances() ([]string, []string) { diff --git a/registry/registry.go b/registry/registry.go index 1954d0896..19e050e68 100644 --- a/registry/registry.go +++ b/registry/registry.go @@ -37,6 +37,8 @@ var tokenFilters = make(TokenFilterRegistry, 0) var analyzers = make(AnalyzerRegistry, 0) var dateTimeParsers = make(DateTimeParserRegistry, 0) +var analyzersV2 = NewAnalyzerRegistryV2() + type Cache struct { CharFilters *CharFilterCache Tokenizers *TokenizerCache @@ -123,11 +125,11 @@ func (c *Cache) DefineTokenFilter(name string, config map[string]interface{}) (a return c.TokenFilters.DefineTokenFilter(name, typ, config, c) } -func (c *Cache) AnalyzerNamed(name string) (analysis.Analyzer, error) { +func (c *Cache) AnalyzerNamed(name string) (analysis.AnalyzerV2, error) { return c.Analyzers.AnalyzerNamed(name, c) } -func (c *Cache) DefineAnalyzer(name string, config map[string]interface{}) (analysis.Analyzer, error) { +func (c *Cache) DefineAnalyzer(name string, config map[string]interface{}) (analysis.AnalyzerV2, error) { typ, err := typeFromConfig(config) if err != nil { return nil, err diff --git a/search/query/match.go b/search/query/match.go index 074d11d34..be4abb298 100644 --- a/search/query/match.go +++ b/search/query/match.go @@ -18,6 +18,7 @@ import ( "context" "fmt" + "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/util" @@ -134,7 +135,10 @@ func (q *MatchQuery) Searcher(ctx context.Context, i index.IndexReader, m mappin return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } - tokens := analyzer.Analyze([]byte(q.Match)) + tokens, err := analysis.AnalyzeForTokens(analyzer, []byte(q.Match)) + if err != nil { + return nil, fmt.Errorf("error analyzing input: %v", err) + } if len(tokens) > 0 { tqs := make([]Query, len(tokens)) diff --git a/search/query/match_phrase.go b/search/query/match_phrase.go index 63a16a534..813bf6bd8 100644 --- a/search/query/match_phrase.go +++ b/search/query/match_phrase.go @@ -84,7 +84,10 @@ func (q *MatchPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } - tokens := analyzer.Analyze([]byte(q.MatchPhrase)) + tokens, err := analysis.AnalyzeForTokens(analyzer, []byte(q.MatchPhrase)) + if err != nil { + return nil, fmt.Errorf("error analyzing input: %v", err) + } if len(tokens) > 0 { phrase := tokenStreamToPhrase(tokens) phraseQuery := NewMultiPhraseQuery(phrase, field) diff --git a/search/searcher/base_test.go b/search/searcher/base_test.go index 6f80bf653..adafbc680 100644 --- a/search/searcher/base_test.go +++ b/search/searcher/base_test.go @@ -77,8 +77,10 @@ func initTwoDocs(twoDocIndex index.Index) { } // create a simpler analyzer which will support these tests -var testAnalyzer = &analysis.DefaultAnalyzer{ - Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), +var testAnalyzer = &analysis.AnalyzerAdapter{ + Analyzer: &analysis.DefaultAnalyzer{ + Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), + }, } // sets up some mock data used in many tests in this package diff --git a/search_test.go b/search_test.go index 3d14c9254..a723328c7 100644 --- a/search_test.go +++ b/search_test.go @@ -627,10 +627,12 @@ func TestNestedBooleanSearchers(t *testing.T) { doc.Fields = []document.Field{ document.NewTextFieldCustom("hostname", []uint64{}, []byte(hostname), index.IndexField, - &analysis.DefaultAnalyzer{ - Tokenizer: single.NewSingleTokenTokenizer(), - TokenFilters: []analysis.TokenFilter{ - lowercase.NewLowerCaseFilter(), + &analysis.AnalyzerAdapter{ + Analyzer: &analysis.DefaultAnalyzer{ + Tokenizer: single.NewSingleTokenTokenizer(), + TokenFilters: []analysis.TokenFilter{ + lowercase.NewLowerCaseFilter(), + }, }, }, ),