From 1760ac6bd0db5ec0cf6cbf76e30734229d258968 Mon Sep 17 00:00:00 2001 From: KMY Date: Fri, 8 Mar 2024 10:02:01 +0900 Subject: [PATCH] =?UTF-8?q?Change:=20#532=20ElasticSearch=E8=A8=AD?= =?UTF-8?q?=E5=AE=9A=E3=81=AE=E5=A4=96=E5=87=BA=E3=81=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + app/chewy/accounts_index.rb | 93 ++------ app/chewy/public_statuses_index.rb | 82 +------ app/chewy/statuses_index.rb | 86 +------- app/chewy/tags_index.rb | 37 +--- app/lib/chewy_config.rb | 55 +++++ config/elasticsearch.default-ja-sudachi.yml | 232 ++++++++++++++++++++ config/elasticsearch.default.yml | 175 +++++++++++++++ 8 files changed, 494 insertions(+), 269 deletions(-) create mode 100644 app/lib/chewy_config.rb create mode 100644 config/elasticsearch.default-ja-sudachi.yml create mode 100644 config/elasticsearch.default.yml diff --git a/.gitignore b/.gitignore index c5af8eb67f8304..df824eb2a60437 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,9 @@ /node_modules/ /build/ +# Ignore elasticsearch config +/.elasticsearch.yml + # Ignore Vagrant files .vagrant/ diff --git a/app/chewy/accounts_index.rb b/app/chewy/accounts_index.rb index a7a0df40dab759..d70a96c444de14 100644 --- a/app/chewy/accounts_index.rb +++ b/app/chewy/accounts_index.rb @@ -3,83 +3,9 @@ class AccountsIndex < Chewy::Index include DatetimeClampingConcern - settings index: index_preset(refresh_interval: '30s'), analysis: { - filter: { - english_stop: { - type: 'stop', - stopwords: '_english_', - }, - - english_stemmer: { - type: 'stemmer', - language: 'english', - }, - - english_possessive_stemmer: { - type: 'stemmer', - language: 'possessive_english', - }, - - my_posfilter: { - type: 'sudachi_part_of_speech', - stoptags: [ - '助詞', - '助動詞', - '補助記号,句点', - '補助記号,読点', - ], - }, - }, - - analyzer: { - natural: { - tokenizer: 'standard', - filter: %w( - lowercase - asciifolding - cjk_width - elision - english_possessive_stemmer - english_stop 
- english_stemmer - ), - }, - - sudachi_analyzer: { - filter: %w( - my_posfilter - sudachi_normalizedform - ), - type: 'custom', - tokenizer: 'sudachi_tokenizer', - }, - - verbatim: { - tokenizer: 'standard', - filter: %w(lowercase asciifolding cjk_width), - }, - - edge_ngram: { - tokenizer: 'edge_ngram', - filter: %w(lowercase asciifolding cjk_width), - }, - }, - - tokenizer: { - edge_ngram: { - type: 'edge_ngram', - min_gram: 1, - max_gram: 15, - }, - - sudachi_tokenizer: { - resources_path: '/etc/elasticsearch/sudachi', - split_mode: 'A', - type: 'sudachi_tokenizer', - discard_punctuation: 'true', - }, - }, - } + # ElasticSearch config is moved to "/config/elasticsearch.default.yml". + # Edit it when original Mastodon changed ElasticSearch config. + settings index: index_preset(refresh_interval: '30s'), analysis: ChewyConfig.instance.accounts index_scope ::Account.searchable.includes(:account_stat) @@ -90,8 +16,15 @@ class AccountsIndex < Chewy::Index field(:properties, type: 'keyword', value: ->(account) { account.searchable_properties }) field(:last_status_at, type: 'date', value: ->(account) { clamp_date(account.last_status_at || account.created_at) }) field(:domain, type: 'keyword', value: ->(account) { account.domain || '' }) - field(:display_name, type: 'text', analyzer: 'verbatim') { field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'verbatim' } - field(:username, type: 'text', analyzer: 'verbatim', value: ->(account) { [account.username, account.domain].compact.join('@') }) { field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'verbatim' } - field(:text, type: 'text', analyzer: 'sudachi_analyzer', value: ->(account) { account.searchable_text }) { field(:stemmed, type: 'text', analyzer: 'natural') } + field(:display_name, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('display_name', 'analyzer')) do + field :edge_ngram, type: 'text', analyzer: 
ChewyConfig.instance.accounts_analyzers.dig('display_name', 'edge_ngram', 'analyzer'), search_analyzer: ChewyConfig.instance.accounts_analyzers.dig('display_name', 'edge_ngram', 'search_analyzer') + end + field(:username, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('username', 'analyzer'), value: lambda { |account| + [account.username, account.domain].compact.join('@') + }) do + field :edge_ngram, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('username', 'edge_ngram', 'analyzer'), + search_analyzer: ChewyConfig.instance.accounts_analyzers.dig('username', 'edge_ngram', 'search_analyzer') + end + field(:text, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('text', 'analyzer'), value: ->(account) { account.searchable_text }) { field(:stemmed, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('text', 'stemmed', 'analyzer')) } end end diff --git a/app/chewy/public_statuses_index.rb b/app/chewy/public_statuses_index.rb index 6ae13418f43fa0..b71406d3e345dd 100644 --- a/app/chewy/public_statuses_index.rb +++ b/app/chewy/public_statuses_index.rb @@ -3,81 +3,9 @@ class PublicStatusesIndex < Chewy::Index include DatetimeClampingConcern - settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: { - filter: { - english_stop: { - type: 'stop', - stopwords: '_english_', - }, - - english_stemmer: { - type: 'stemmer', - language: 'english', - }, - - english_possessive_stemmer: { - type: 'stemmer', - language: 'possessive_english', - }, - - my_posfilter: { - type: 'sudachi_part_of_speech', - stoptags: [ - '助詞', - '助動詞', - '補助記号,句点', - '補助記号,読点', - ], - }, - }, - - analyzer: { - content: { - tokenizer: 'uax_url_email', - filter: %w( - english_possessive_stemmer - lowercase - asciifolding - cjk_width - english_stop - english_stemmer - ), - }, - - hashtag: { - tokenizer: 'keyword', - filter: %w( - word_delimiter_graph - lowercase - asciifolding - cjk_width - ), - }, - - 
sudachi_analyzer: { - tokenizer: 'sudachi_tokenizer', - type: 'custom', - filter: %w( - english_possessive_stemmer - lowercase - asciifolding - cjk_width - english_stop - english_stemmer - my_posfilter - sudachi_normalizedform - ), - }, - }, - tokenizer: { - sudachi_tokenizer: { - resources_path: '/etc/elasticsearch/sudachi', - split_mode: 'A', - type: 'sudachi_tokenizer', - discard_punctuation: 'true', - }, - }, - } + # ElasticSearch config is moved to "/config/elasticsearch.default.yml". + # Edit it when original Mastodon changed ElasticSearch config. + settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: ChewyConfig.instance.public_statuses index_scope ::Status.unscoped .kept @@ -87,8 +15,8 @@ class PublicStatusesIndex < Chewy::Index root date_detection: false do field(:id, type: 'long') field(:account_id, type: 'long') - field(:text, type: 'text', analyzer: 'sudachi_analyzer', value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: 'content') } - field(:tags, type: 'text', analyzer: 'hashtag', value: ->(status) { status.tags.map(&:display_name) }) + field(:text, type: 'text', analyzer: ChewyConfig.instance.public_statuses_analyzers.dig('text', 'analyzer'), value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: ChewyConfig.instance.public_statuses_analyzers.dig('text', 'stemmed', 'analyzer')) } + field(:tags, type: 'text', analyzer: ChewyConfig.instance.public_statuses_analyzers.dig('tags', 'analyzer'), value: ->(status) { status.tags.map(&:display_name) }) field(:language, type: 'keyword') field(:domain, type: 'keyword', value: ->(status) { status.account.domain || '' }) field(:properties, type: 'keyword', value: ->(status) { status.searchable_properties }) diff --git a/app/chewy/statuses_index.rb b/app/chewy/statuses_index.rb index ff6e0e76721d0f..44cb86d7558a83 100644 --- a/app/chewy/statuses_index.rb +++ b/app/chewy/statuses_index.rb @@ -3,85 +3,9 @@ class 
StatusesIndex < Chewy::Index include DatetimeClampingConcern - settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: { - filter: { - english_stop: { - type: 'stop', - stopwords: '_english_', - }, - - english_stemmer: { - type: 'stemmer', - language: 'english', - }, - - english_possessive_stemmer: { - type: 'stemmer', - language: 'possessive_english', - }, - - my_posfilter: { - type: 'sudachi_part_of_speech', - stoptags: [ - '助詞', - '助動詞', - '補助記号,句点', - '補助記号,読点', - ], - }, - }, - analyzer: { - verbatim: { - tokenizer: 'uax_url_email', - filter: %w(lowercase), - }, - - content: { - tokenizer: 'uax_url_email', - filter: %w( - english_possessive_stemmer - lowercase - asciifolding - cjk_width - english_stop - english_stemmer - ), - }, - - hashtag: { - tokenizer: 'keyword', - filter: %w( - word_delimiter_graph - lowercase - asciifolding - cjk_width - ), - }, - - sudachi_analyzer: { - tokenizer: 'sudachi_tokenizer', - type: 'custom', - filter: %w( - english_possessive_stemmer - lowercase - asciifolding - cjk_width - english_stop - english_stemmer - my_posfilter - sudachi_normalizedform - ), - }, - }, - tokenizer: { - sudachi_tokenizer: { - resources_path: '/etc/elasticsearch/sudachi', - split_mode: 'A', - type: 'sudachi_tokenizer', - discard_punctuation: 'true', - }, - }, - } + # ElasticSearch config is moved to "/config/elasticsearch.default.yml". + # Edit it when original Mastodon changed ElasticSearch config. 
+  settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: ChewyConfig.instance.statuses
 
   index_scope ::Status.unscoped.kept.without_reblogs.includes(
     :account,
@@ -107,8 +31,8 @@ class StatusesIndex < Chewy::Index
   root date_detection: false do
     field(:id, type: 'long')
     field(:account_id, type: 'long')
-    field(:text, type: 'text', analyzer: 'sudachi_analyzer', value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: 'content') }
-    field(:tags, type: 'text', analyzer: 'hashtag', value: ->(status) { status.tags.map(&:display_name) })
+    field(:text, type: 'text', analyzer: ChewyConfig.instance.statuses_analyzers.dig('text', 'analyzer'), value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: ChewyConfig.instance.statuses_analyzers.dig('text', 'stemmed', 'analyzer')) }
+    field(:tags, type: 'text', analyzer: ChewyConfig.instance.statuses_analyzers.dig('tags', 'analyzer'), value: ->(status) { status.tags.map(&:display_name) })
     field(:searchable_by, type: 'long', value: ->(status) { status.searchable_by })
     field(:mentioned_by, type: 'long', value: ->(status) { status.mentioned_by })
     field(:favourited_by, type: 'long', value: ->(status) { status.favourited_by })
diff --git a/app/chewy/tags_index.rb b/app/chewy/tags_index.rb
index c99218a47fcdcb..965718e83ee6c8 100644
--- a/app/chewy/tags_index.rb
+++ b/app/chewy/tags_index.rb
@@ -3,36 +3,9 @@
 class TagsIndex < Chewy::Index
   include DatetimeClampingConcern
 
-  settings index: index_preset(refresh_interval: '30s'), analysis: {
-    analyzer: {
-      content: {
-        tokenizer: 'keyword',
-        filter: %w(
-          word_delimiter_graph
-          lowercase
-          asciifolding
-          cjk_width
-        ),
-      },
-
-      edge_ngram: {
-        tokenizer: 'edge_ngram',
-        filter: %w(
-          lowercase
-          asciifolding
-          cjk_width
-        ),
-      },
-    },
-
-    tokenizer: {
-      edge_ngram: {
-        type: 'edge_ngram',
-        min_gram: 2,
-        max_gram: 15,
-      },
-    },
-  }
+  # ElasticSearch config is moved to "/config/elasticsearch.default.yml".
+  # Edit it when original Mastodon changed ElasticSearch config.
+  settings index: index_preset(refresh_interval: '30s'), analysis: ChewyConfig.instance.tags
 
   index_scope ::Tag.listable
 
@@ -41,7 +14,9 @@ class TagsIndex < Chewy::Index
   end
 
   root date_detection: false do
-    field(:name, type: 'text', analyzer: 'content', value: :display_name) { field(:edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content') }
+    field(:name, type: 'text', analyzer: ChewyConfig.instance.tags_analyzers.dig('name', 'analyzer'), value: :display_name) do
+      field(:edge_ngram, type: 'text', analyzer: ChewyConfig.instance.tags_analyzers.dig('name', 'edge_ngram', 'analyzer'), search_analyzer: ChewyConfig.instance.tags_analyzers.dig('name', 'edge_ngram', 'search_analyzer'))
+    end
     field(:reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? })
     field(:usage, type: 'long', value: ->(tag, crutches) { tag.history.aggregate(crutches.time_period).accounts })
     field(:last_status_at, type: 'date', value: ->(tag) { clamp_date(tag.last_status_at || tag.created_at) })
diff --git a/app/lib/chewy_config.rb b/app/lib/chewy_config.rb
new file mode 100644
index 00000000000000..f34fc578c922fd
--- /dev/null
+++ b/app/lib/chewy_config.rb
@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+
+require 'singleton'
+require 'yaml'
+
+# Loads the Elasticsearch analysis settings and per-field analyzer names that
+# the Chewy index classes read through ChewyConfig.instance.
+#
+# "config/elasticsearch.default.yml" supplies the defaults. When a
+# ".elasticsearch.yml" file exists in the Rails root, each top-level key it
+# defines replaces the default value for that key (Hash#merge, not a deep merge).
+class ChewyConfig
+  include Singleton
+
+  def initialize
+    custom_config_file = Rails.root.join('.elasticsearch.yml')
+    default_config_file = Rails.root.join('config', 'elasticsearch.default.yml')
+
+    custom_config = nil
+    custom_config = YAML.load_file(custom_config_file) if File.exist?(custom_config_file)
+    default_config = YAML.load_file(default_config_file)
+
+    # `custom_config || {}` keeps the defaults untouched when no custom file was loaded.
+    @config = default_config.merge(custom_config || {})
+
+    # The test environment always layers the Sudachi preset on top of the result above.
+    @config = @config.merge(YAML.load_file(Rails.root.join('config', 'elasticsearch.default-ja-sudachi.yml'))) if Rails.env.test?
+  end
+
+  # The merged configuration hash, keyed by strings such as 'accounts'.
+  attr_reader :config
+
+  # Readers named after an index return the section passed to `settings analysis:`;
+  # the `*_analyzers` readers return the section the index fields `dig` analyzer names from.
+  def accounts
+    config['accounts']
+  end
+
+  def accounts_analyzers
+    config['accounts_analyzers']
+  end
+
+  def public_statuses
+    config['public_statuses']
+  end
+
+  def public_statuses_analyzers
+    config['public_statuses_analyzers']
+  end
+
+  def statuses
+    config['statuses']
+  end
+
+  def statuses_analyzers
+    config['statuses_analyzers']
+  end
+
+  def tags
+    config['tags']
+  end
+
+  def tags_analyzers
+    config['tags_analyzers']
+  end
+end
diff --git a/config/elasticsearch.default-ja-sudachi.yml b/config/elasticsearch.default-ja-sudachi.yml
new file mode 100644
index 00000000000000..cfe4cbeb6d1489
--- /dev/null
+++ b/config/elasticsearch.default-ja-sudachi.yml
@@ -0,0 +1,239 @@
+# This is a configuration file for environments that use Japanese and Sudachi plug-ins.
+# To use this file, copy it to the Mastodon root directory and rename the file to ".elasticsearch.yml".
+
+accounts:
+  filter:
+    english_stop:
+      type: stop
+      stopwords: _english_
+    english_stemmer:
+      type: stemmer
+      language: english
+    english_possessive_stemmer:
+      type: stemmer
+      language: possessive_english
+    my_posfilter:
+      type: sudachi_part_of_speech
+      stoptags:
+        - 助詞
+        - 助動詞
+        - 補助記号,句点
+        - 補助記号,読点
+
+  analyzer:
+    natural:
+      tokenizer: standard
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - elision
+        - english_possessive_stemmer
+        - english_stop
+        - english_stemmer
+    sudachi_analyzer:
+      type: custom
+      tokenizer: sudachi_tokenizer
+      filter:
+        - my_posfilter
+        - sudachi_normalizedform
+    verbatim:
+      tokenizer: standard
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+    edge_ngram:
+      tokenizer: edge_ngram
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+
+  tokenizer:
+    edge_ngram:
+      type: edge_ngram
+      min_gram: 1
+      max_gram: 15
+    sudachi_tokenizer:
+      resources_path: '/etc/elasticsearch/sudachi'
+      split_mode: A
+      type: sudachi_tokenizer
+      discard_punctuation: 'true'
+
+public_statuses:
+  filter:
+    english_stop:
+      type: stop
+      stopwords: _english_
+    english_stemmer:
+      type: stemmer
+      language: english
+    english_possessive_stemmer:
+      type: stemmer
+      language: possessive_english
+    my_posfilter:
+      type: sudachi_part_of_speech
+      stoptags:
+        - 助詞
+        - 助動詞
+        - 補助記号,句点
+        - 補助記号,読点
+
+  analyzer:
+    content:
+      tokenizer: uax_url_email
+      filter:
+        - english_possessive_stemmer
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - english_stop
+        - english_stemmer
+    hashtag:
+      tokenizer: keyword
+      filter:
+        - word_delimiter_graph
+        - lowercase
+        - asciifolding
+        - cjk_width
+    sudachi_analyzer:
+      tokenizer: sudachi_tokenizer
+      type: custom
+      filter:
+        - english_possessive_stemmer
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - english_stop
+        - english_stemmer
+        - my_posfilter
+        - sudachi_normalizedform
+
+  tokenizer:
+    sudachi_tokenizer:
+      resources_path: '/etc/elasticsearch/sudachi'
+      split_mode: A
+      type: sudachi_tokenizer
+      discard_punctuation: 'true'
+
+statuses:
+  filter:
+    english_stop:
+      type: stop
+      stopwords: _english_
+    english_stemmer:
+      type: stemmer
+      language: english
+    english_possessive_stemmer:
+      type: stemmer
+      language: possessive_english
+    my_posfilter:
+      type: sudachi_part_of_speech
+      stoptags:
+        - 助詞
+        - 助動詞
+        - 補助記号,句点
+        - 補助記号,読点
+
+  analyzer:
+    verbatim:
+      tokenizer: uax_url_email
+      filter:
+        - lowercase
+    content:
+      tokenizer: uax_url_email
+      filter:
+        - english_possessive_stemmer
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - english_stop
+        - english_stemmer
+    hashtag:
+      tokenizer: keyword
+      filter:
+        - word_delimiter_graph
+        - lowercase
+        - asciifolding
+        - cjk_width
+    sudachi_analyzer:
+      tokenizer: sudachi_tokenizer
+      type: custom
+      filter:
+        - english_possessive_stemmer
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - english_stop
+        - english_stemmer
+        - my_posfilter
+        - sudachi_normalizedform
+
+  tokenizer:
+    sudachi_tokenizer:
+      resources_path: '/etc/elasticsearch/sudachi'
+      split_mode: A
+      type: sudachi_tokenizer
+      discard_punctuation: 'true'
+
+tags:
+  analyzer:
+    content:
+      tokenizer: keyword
+      filter:
+        - word_delimiter_graph
+        - lowercase
+        - asciifolding
+        - cjk_width
+    edge_ngram:
+      tokenizer: edge_ngram
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+
+  tokenizer:
+    edge_ngram:
+      type: edge_ngram
+      min_gram: 2
+      max_gram: 15
+
+accounts_analyzers:
+  display_name:
+    analyzer: verbatim
+    edge_ngram:
+      analyzer: edge_ngram
+      search_analyzer: verbatim
+  username:
+    analyzer: verbatim
+    edge_ngram:
+      analyzer: edge_ngram
+      search_analyzer: verbatim
+  text:
+    analyzer: sudachi_analyzer
+    stemmed:
+      analyzer: natural
+
+public_statuses_analyzers:
+  text:
+    analyzer: sudachi_analyzer
+    stemmed:
+      analyzer: content
+  tags:
+    analyzer: hashtag
+
+statuses_analyzers:
+  text:
+    analyzer: sudachi_analyzer
+    stemmed:
+      analyzer: content
+  tags:
+    analyzer: hashtag
+
+tags_analyzers:
+  name:
+    analyzer: content
+    edge_ngram:
+      analyzer: edge_ngram
+      search_analyzer: content
diff --git a/config/elasticsearch.default.yml b/config/elasticsearch.default.yml
new file mode 100644
index 00000000000000..974e78f085f49f
--- /dev/null
+++ b/config/elasticsearch.default.yml
@@ -0,0 +1,175 @@
+# The standard ElasticSearch settings described in the original Mastodon code are stored.
+# This configuration file is overridden by creating a ".elasticsearch.yml" file in the Mastodon root directory.
+
+accounts:
+  filter:
+    english_stop:
+      type: stop
+      stopwords: _english_
+    english_stemmer:
+      type: stemmer
+      language: english
+    english_possessive_stemmer:
+      type: stemmer
+      language: possessive_english
+
+  analyzer:
+    natural:
+      tokenizer: standard
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - elision
+        - english_possessive_stemmer
+        - english_stop
+        - english_stemmer
+    verbatim:
+      tokenizer: standard
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+    edge_ngram:
+      tokenizer: edge_ngram
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+
+  tokenizer:
+    edge_ngram:
+      type: edge_ngram
+      min_gram: 1
+      max_gram: 15
+
+public_statuses:
+  filter:
+    english_stop:
+      type: stop
+      stopwords: _english_
+    english_stemmer:
+      type: stemmer
+      language: english
+    english_possessive_stemmer:
+      type: stemmer
+      language: possessive_english
+
+  analyzer:
+    verbatim:
+      tokenizer: uax_url_email
+      filter:
+        - lowercase
+    content:
+      tokenizer: standard
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - elision
+        - english_possessive_stemmer
+        - english_stop
+        - english_stemmer
+    hashtag:
+      tokenizer: keyword
+      filter:
+        - word_delimiter_graph
+        - lowercase
+        - asciifolding
+        - cjk_width
+
+statuses:
+  filter:
+    english_stop:
+      type: stop
+      stopwords: _english_
+    english_stemmer:
+      type: stemmer
+      language: english
+    english_possessive_stemmer:
+      type: stemmer
+      language: possessive_english
+
+  analyzer:
+    verbatim:
+      tokenizer: uax_url_email
+      filter:
+        - lowercase
+    content:
+      tokenizer: standard
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+        - elision
+        - english_possessive_stemmer
+        - english_stop
+        - english_stemmer
+    hashtag:
+      tokenizer: keyword # same tokenizer as the public_statuses hashtag analyzer
+      filter:
+        - word_delimiter_graph
+        - lowercase
+        - asciifolding
+        - cjk_width
+
+tags:
+  analyzer:
+    content:
+      tokenizer: keyword
+      filter:
+        - word_delimiter_graph
+        - lowercase
+        - asciifolding
+        - cjk_width
+    edge_ngram:
+      tokenizer: edge_ngram
+      filter:
+        - lowercase
+        - asciifolding
+        - cjk_width
+
+  tokenizer:
+    edge_ngram:
+      type: edge_ngram
+      min_gram: 2
+      max_gram: 15
+
+accounts_analyzers:
+  display_name:
+    analyzer: verbatim
+    edge_ngram:
+      analyzer: edge_ngram
+      search_analyzer: verbatim
+  username:
+    analyzer: verbatim
+    edge_ngram:
+      analyzer: edge_ngram
+      search_analyzer: verbatim
+  text:
+    analyzer: verbatim
+    stemmed:
+      analyzer: natural
+
+public_statuses_analyzers:
+  text:
+    analyzer: verbatim
+    stemmed:
+      analyzer: content
+  tags:
+    analyzer: hashtag
+
+statuses_analyzers:
+  text:
+    analyzer: verbatim
+    stemmed:
+      analyzer: content
+  tags:
+    analyzer: hashtag
+
+tags_analyzers:
+  name:
+    analyzer: content
+    edge_ngram:
+      analyzer: edge_ngram
+      search_analyzer: content