diff --git a/cmd/flags.go b/cmd/flags.go index e256e8f..3aa51c8 100644 --- a/cmd/flags.go +++ b/cmd/flags.go @@ -95,6 +95,13 @@ func allMatchesFlag(cmd *cobra.Command) { } } +func findByAnnotFlag(cmd *cobra.Command) { + b, _ := cmd.Flags().GetBool("find-by-annotation") + if b { + opts = append(opts, config.OptWithFindByAnnotation(b)) + } +} + func oddsDetailsFlag(cmd *cobra.Command) { b, _ := cmd.Flags().GetBool("details-odds") if b { diff --git a/cmd/gnfinder.yml b/cmd/gnfinder.yml index c2025aa..95efaeb 100644 --- a/cmd/gnfinder.yml +++ b/cmd/gnfinder.yml @@ -71,6 +71,12 @@ # # WithBayesOddsDetails: false +# WithFindByAnnotation allows to detect names by existence of a +# nomenclatural annotation. If it is true, dictionaries do not prevent +# detection of a name. +# +# WithFindByAnnotation: false + # WithOddsAdjustment can be set to true to adjust calculated odds using the # ratio of scientific names found in text to the number of capitalized # words. diff --git a/cmd/root.go b/cmd/root.go index ddd40ae..4749fce 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -65,6 +65,7 @@ type cfgData struct { WithAllMatches bool WithAmbiguousNames bool WithBayesOddsDetails bool + WithFindByAnnotation bool WithOddsAdjustment bool WithPlainInput bool WithPositionInBytes bool @@ -120,6 +121,7 @@ verification results. inputOnlyFlag(cmd) langFlag(cmd) allMatchesFlag(cmd) + findByAnnotFlag(cmd) oddsDetailsFlag(cmd) plainInputFlag(cmd) sourcesFlag(cmd) @@ -194,6 +196,9 @@ func init() { "show details of odds calculation.") rootCmd.Flags().StringP("verifier-url", "e", "", "custom URL for name-verification service.") + rootCmd.Flags().BoolP("find-by-annotation", "F", false, + `if there is a nomenclatural annotation ('sp. nov.' etc), +a name will be detected.`) rootCmd.Flags().StringP("format", "f", "", `Format of the output: "compact", "pretty", "csv". compact: compact JSON, @@ -272,6 +277,7 @@ func initConfig() { _ = viper.BindEnv("WithAmbiguousNames", "GNF_WITH_AMBIGUOUS_NAMES") _ = viper.BindEnv("WithAllMatches", "GNF_WITH_ALL_MATCHES") _ = viper.BindEnv("WithBayesOddsDetails", "GNF_WITH_BAYES_ODDS_DETAILS") + _ = viper.BindEnv("WithFindByAnnotation", "GNF_WITH_FIND_BY_ANNOTATION") _ = viper.BindEnv("WithOddsAdjustment", "GNF_WITH_ODDS_ADJUSTMENT") _ = viper.BindEnv("WithPlainInput", "GNF_WITH_PLAIN_INPUT") _ = viper.BindEnv("WithPositionInBytes", "GNF_WITH_POSITION_IN_BYTES") @@ -362,6 +368,14 @@ func getOpts() { opts = append(opts, config.OptWithBayesOddsDetails(true)) } + if cfgCli.WithFindByAnnotation { + opts = append(opts, config.OptWithFindByAnnotation(true)) + } + + if cfgCli.WithOddsAdjustment { + opts = append(opts, config.OptWithOddsAdjustment(true)) + } + if cfgCli.WithPlainInput { opts = append(opts, config.OptWithPlainInput(true)) } @@ -370,10 +384,6 @@ func getOpts() { opts = append(opts, config.OptWithPositonInBytes(true)) } - if cfgCli.WithOddsAdjustment { - opts = append(opts, config.OptWithOddsAdjustment(true)) - } - if cfgCli.WithUniqueNames { opts = append(opts, config.OptWithUniqueNames(true)) } diff --git a/pkg/config/config.go b/pkg/config/config.go index da42208..5113d75 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -75,6 +75,11 @@ type Config struct { // WithBayesOddsDetails show in detail how odds are calculated. WithBayesOddsDetails bool + // WithFindByAnnotation allows to detect names by existence of a + // nomenclatural annotation. If it is true, dictionaries do not prevent + // detection of a name. + WithFindByAnnotation bool + // WithOddsAdjustment can be set to true to adjust calculated odds using the // ratio of scientific names found in text to the number of capitalized // words. @@ -208,6 +213,14 @@ func OptWithBayesOddsDetails(b bool) Option { } } +// OptWithFindByAnnotation option to allow detect names solely by their +// nomenclatural annotation. +func OptWithFindByAnnotation(b bool) Option { + return func(cfg *Config) { + cfg.WithFindByAnnotation = b + } +} + // OptWithOddsAdjustment is an option that triggers recalculation of prior odds // using number of found names divided by number of all name candidates. func OptWithOddsAdjustment(b bool) Option { diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 03a0050..0aa9796 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -16,17 +16,17 @@ func TestConfig(t *testing.T) { t.Run("returns new Config object", func(t *testing.T) { cfg := config.New() - assert.Equal(t, cfg.Language, lang.English) - assert.Equal(t, cfg.LanguageDetected, "") - assert.Equal(t, cfg.TokensAround, 0) + assert.Equal(t, lang.English, cfg.Language) + assert.Equal(t, "", cfg.LanguageDetected) + assert.Equal(t, 0, cfg.TokensAround) assert.True(t, cfg.WithBayes) assert.False(t, cfg.WithPositionInBytes) }) t.Run("takes language", func(t *testing.T) { cfg := config.New(config.OptLanguage(lang.English)) - assert.Equal(t, cfg.Language, lang.English) - assert.Equal(t, cfg.LanguageDetected, "") + assert.Equal(t, lang.English, cfg.Language) + assert.Equal(t, "", cfg.LanguageDetected) }) t.Run("sets bayes", func(t *testing.T) { @@ -41,19 +41,24 @@ func TestConfig(t *testing.T) { t.Run("sets tokens number", func(t *testing.T) { cfg := config.New(config.OptTokensAround(4)) - assert.Equal(t, cfg.TokensAround, 4) + assert.Equal(t, 4, cfg.TokensAround) + }) + + t.Run("sets find by annotation", func(t *testing.T) { + cfg := config.New(config.OptWithFindByAnnotation(true)) + assert.Equal(t, true, cfg.WithFindByAnnotation) }) t.Run("does not set 'bad' tokens number", func(t *testing.T) { cfg := config.New(config.OptTokensAround(-1)) - assert.Equal(t, cfg.TokensAround, 0) + assert.Equal(t, 0, cfg.TokensAround) cfg = config.New(config.OptTokensAround(10)) - assert.Equal(t, cfg.TokensAround, 5) + assert.Equal(t, 5, cfg.TokensAround) }) t.Run("sets bayes' threshold", func(t *testing.T) { cfg := config.New(config.OptBayesOddsThreshold(200)) - assert.Equal(t, cfg.BayesOddsThreshold, 200.0) + assert.Equal(t, 200.0, cfg.BayesOddsThreshold) }) t.Run("sets several options", func(t *testing.T) { @@ -62,7 +67,7 @@ func TestConfig(t *testing.T) { config.OptLanguage(lang.German), } cfg := config.New(opts...) - assert.Equal(t, cfg.Language, lang.German) + assert.Equal(t, lang.German, cfg.Language) assert.True(t, cfg.WithBayes) }) @@ -81,11 +86,11 @@ func TestConfig(t *testing.T) { for _, v := range tests { l, err := lang.New(v.lang) - assert.Equal(t, err != nil, v.hasErr, v.msg) + assert.Equal(t, v.hasErr, err != nil, v.msg) langOpt := config.OptLanguage(l) opts := []config.Option{langOpt} cfg := config.New(opts...) - assert.Equal(t, cfg.Language, v.langCfg, v.msg) + assert.Equal(t, v.langCfg, cfg.Language, v.msg) } }) } diff --git a/pkg/ent/heuristic/heuristic.go b/pkg/ent/heuristic/heuristic.go index 8c723c5..edc5733 100644 --- a/pkg/ent/heuristic/heuristic.go +++ b/pkg/ent/heuristic/heuristic.go @@ -11,7 +11,7 @@ import ( // tokens and sets up token's indices. Indices determine if a token is a // potential unimonial, binomial or trinomial. Then if fills out signfificant // number of features pertained to the token. -func TagTokens(ts []token.TokenSN, d *dict.Dictionary) { +func TagTokens(ts []token.TokenSN, d *dict.Dictionary, withAnnot bool) { l := len(ts) for i := range ts { diff --git a/pkg/ent/heuristic/heuristic_test.go b/pkg/ent/heuristic/heuristic_test.go index fdb4ba0..a75d25a 100644 --- a/pkg/ent/heuristic/heuristic_test.go +++ b/pkg/ent/heuristic/heuristic_test.go @@ -17,7 +17,7 @@ func TestHeuristic(t *testing.T) { randomly... Pardosa is a very nice when it is not sad. Drosophila (Sophophora) melanogaster disagrees!`) ts := token.Tokenize(txt) - heuristic.TagTokens(ts, dictionary) + heuristic.TagTokens(ts, dictionary, false) tests := map[int]struct { name string decision token.Decision diff --git a/pkg/ent/nlp/bayes_test.go b/pkg/ent/nlp/bayes_test.go index 0a8284c..14022e5 100644 --- a/pkg/ent/nlp/bayes_test.go +++ b/pkg/ent/nlp/bayes_test.go @@ -26,7 +26,7 @@ cheilum, 1 5s. per doz. Conostylis americana, 2i. 6d. `) tokens := token.Tokenize(txt) - heuristic.TagTokens(tokens, dictionary) + heuristic.TagTokens(tokens, dictionary, false) nb := weights[lang.English] tkn := tokens[10] diff --git a/pkg/gnfinder.go b/pkg/gnfinder.go index 750ca82..bee57d1 100644 --- a/pkg/gnfinder.go +++ b/pkg/gnfinder.go @@ -64,7 +64,7 @@ func (gnf gnfinder) Find(file, txt string) output.Output { gnf.Language, gnf.LanguageDetected = lang.DetectLanguage(text) } - heuristic.TagTokens(tokens, gnf.Dictionary) + heuristic.TagTokens(tokens, gnf.Dictionary, gnf.WithFindByAnnotation) if gnf.WithBayes { nb := gnf.bayesWeights[gnf.Language] nlp.TagTokens(tokens, gnf.Dictionary, nb, gnf.BayesOddsThreshold) diff --git a/tools/training/trainer.go b/tools/training/trainer.go index 9120c5d..4a6b011 100644 --- a/tools/training/trainer.go +++ b/tools/training/trainer.go @@ -113,7 +113,7 @@ func processText(t *TextData, d *dict.Dictionary) []feature.ClassFeatures { var lfs, lfsText []feature.ClassFeatures var nd NameData ts := token.Tokenize(t.Text) - heuristic.TagTokens(ts, d) + heuristic.TagTokens(ts, d, false) l := len(t.NamesPositions) var nameIdx, i int for {