Skip to content

Commit

Permalink
Resolve a issue #60 troca do ElasticSearch pelo OpenSearch (#63)
Browse files Browse the repository at this point in the history
Implementa a substituição do motor de busca ElasticSearch pelo
Opensearch, com a troca de bibliotecas Python, resolvendo a #60
  • Loading branch information
Giulio Carvalho authored Nov 30, 2023
2 parents 2faf34f + 096589b commit 2272001
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 153 deletions.
37 changes: 19 additions & 18 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ POSTGRES_HOST ?= localhost
POSTGRES_PORT ?= 5432
POSTGRES_IMAGE ?= docker.io/postgres:10
DATABASE_RESTORE_FILE ?= contrib/data/queridodiariodb.tar
# Elasticsearch info to run the tests
ELASTICSEARCH_PORT1 ?= 9200
ELASTICSEARCH_PORT2 ?= 9300
ELASTICSEARCH_CONTAINER_NAME ?= queridodiario-elasticsearch
# OpenSearch port info
OPENSEARCH_PORT1 ?= 9200
OPENSEARCH_PORT2 ?= 9300
OPENSEARCH_CONTAINER_NAME ?= queridodiario-opensearch
APACHE_TIKA_CONTAINER_NAME ?= queridodiario-apache-tika-server

run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \
Expand Down Expand Up @@ -86,11 +86,11 @@ destroy-pod:

create-pod: destroy-pod
podman pod create -p $(POSTGRES_PORT):$(POSTGRES_PORT) \
-p $(ELASTICSEARCH_PORT1):$(ELASTICSEARCH_PORT1) \
-p $(STORAGE_PORT):$(STORAGE_PORT) \
--name $(POD_NAME)
-p $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \
-p $(STORAGE_PORT):$(STORAGE_PORT) \
--name $(POD_NAME)

prepare-test-env: create-pod storage apache-tika-server elasticsearch database
prepare-test-env: create-pod storage apache-tika-server opensearch database

.PHONY: test
test: prepare-test-env retest
Expand All @@ -117,7 +117,7 @@ retest-main:

.PHONY: retest-index
retest-index:
$(call run-command, python -m unittest -f tests/elasticsearch.py)
$(call run-command, python -m unittest -f tests/opensearch.py)

.PHONY: retest-tika
retest-tika:
Expand Down Expand Up @@ -200,7 +200,7 @@ set-run-variable-values:
cp --no-clobber contrib/sample.env envvars || true
$(eval POD_NAME=run-$(POD_NAME))
$(eval DATABASE_CONTAINER_NAME=run-$(DATABASE_CONTAINER_NAME))
$(eval ELASTICSEARCH_CONTAINER_NAME=run-$(ELASTICSEARCH_CONTAINER_NAME))
$(eval OPENSEARCH_CONTAINER_NAME=run-$(OPENSEARCH_CONTAINER_NAME))

.PHONY: sql
sql: set-run-variable-values
Expand All @@ -209,7 +209,7 @@ sql: set-run-variable-values
$(POSTGRES_IMAGE) psql -h localhost -U $(POSTGRES_USER) $(POSTGRES_DB)

.PHONY: setup
setup: set-run-variable-values create-pod storage apache-tika-server elasticsearch database
setup: set-run-variable-values create-pod storage apache-tika-server opensearch database

.PHONY: re-run
re-run: set-run-variable-values
Expand All @@ -235,19 +235,20 @@ shell-database: set-run-variable-values
podman exec -it $(DATABASE_CONTAINER_NAME) \
psql -h localhost -d $(POSTGRES_DB) -U $(POSTGRES_USER)

elasticsearch: stop-elasticsearch start-elasticsearch wait-elasticsearch
opensearch: stop-opensearch start-opensearch wait-opensearch

start-elasticsearch:
start-opensearch:
podman run -d --rm -ti \
--name $(ELASTICSEARCH_CONTAINER_NAME) \
--name $(OPENSEARCH_CONTAINER_NAME) \
--pod $(POD_NAME) \
--env discovery.type=single-node \
docker.io/elasticsearch:7.9.1
--env plugins.security.ssl.http.enabled=false \
docker.io/opensearchproject/opensearch:2.9.0

stop-elasticsearch:
podman rm --force --ignore $(ELASTICSEARCH_CONTAINER_NAME)
stop-opensearch:
podman rm --force --ignore $(OPENSEARCH_CONTAINER_NAME)

wait-elasticsearch:
wait-opensearch:
$(call wait-for, localhost:9200)

.PHONY: publish-tag
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ select processed, count(1) from gazettes g group by processed;
make re-run
```
- and see gazettes processed running the query above
- you can search using ElasticSearch
- you can search using OpenSearch on port 9200
```console
curl 'http://localhost:9200/querido-diario/_search' \
-H 'Content-Type: application/json' \
Expand Down
7 changes: 4 additions & 3 deletions contrib/sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ POSTGRES_HOST=127.0.0.1
POSTGRES_PORT=5432
DATABASE_RESTORE_FILE=contrib/data/queridodiariodb.tar

ELASTICSEARCH_HOST=http://localhost:9200
ELASTICSEARCH_INDEX=querido-diario

OPENSEARCH_HOST=http://localhost:9200
OPENSEARCH_INDEX=querido-diario
OPENSEARCH_USER=admin
OPENSEARCH_PASSWORD=admin
DEBUG=1

APACHE_TIKA_SERVER=http://localhost:9998
Expand Down
2 changes: 1 addition & 1 deletion index/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .elasticsearch import create_index_interface
from .opensearch import create_index_interface
45 changes: 25 additions & 20 deletions index/elasticsearch.py → index/opensearch.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from typing import Dict, Iterable, List, Union
import os

import elasticsearch
import opensearchpy

from tasks import IndexInterface


class ElasticSearchInterface(IndexInterface):
def __init__(self, hosts: List, timeout: str = "30s", default_index: str = ""):
self._es = elasticsearch.Elasticsearch(hosts=hosts)
class OpenSearchInterface(IndexInterface):
def __init__(self, hosts: List, user: str, password: str, timeout: int = 30, default_index: str = ""):
self._search_engine = opensearchpy.OpenSearch(hosts=hosts, http_auth=(user, password))
self._timeout = timeout
self._default_index = default_index

def index_exists(self, index_name: str) -> bool:
return self._es.indices.exists(index=index_name)
return self._search_engine.indices.exists(index=index_name)

def is_valid_index_name(self, index_name: str) -> bool:
return isinstance(index_name, str) and len(index_name) > 0
Expand All @@ -29,7 +29,7 @@ def create_index(self, index_name: str = "", body: Dict = {}) -> None:
index_name = self.get_index_name(index_name)
if self.index_exists(index_name):
return
self._es.indices.create(
self._search_engine.indices.create(
index=index_name,
body=body,
timeout=self._timeout,
Expand All @@ -39,7 +39,7 @@ def refresh_index(self, index_name: str = "") -> None:
index_name = self.get_index_name(index_name)
if self.index_exists(index_name):
return
self._es.indices.refresh(
self._search_engine.indices.refresh(
index=index_name,
)

Expand All @@ -51,23 +51,23 @@ def index_document(
refresh: bool = False,
) -> None:
index = self.get_index_name(index)
self._es.index(index=index, body=document, id=document_id, refresh=refresh)
self._search_engine.index(index=index, body=document, id=document_id, refresh=refresh)

def search(self, query: Dict, index: str = "") -> Dict:
index = self.get_index_name(index)
result = self._es.search(index=index, body=query, request_timeout=60)
result = self._search_engine.search(index=index, body=query, request_timeout=60)
return result

def analyze(self, text: str, field: str, index: str = "") -> Dict:
index = self.get_index_name(index)
result = self._es.indices.analyze(body={"text": text, "field":field}, index=index)
result = self._search_engine.indices.analyze(body={"text": text, "field":field}, index=index)
return result

def paginated_search(
self, query: Dict, index: str = "", keep_alive: str = "5m"
) -> Iterable[Dict]:
index = self.get_index_name(index)
result = self._es.search(
result = self._search_engine.search(
index=index, body=query, scroll=keep_alive, request_timeout=120
)

Expand All @@ -77,26 +77,31 @@ def paginated_search(
while len(result["hits"]["hits"]) > 0:
yield result
scroll_id = result["_scroll_id"]
result = self._es.scroll(
result = self._search_engine.scroll(
scroll_id=scroll_id, scroll=keep_alive, request_timeout=120
)

self._es.clear_scroll(scroll_id=scroll_id)
self._search_engine.clear_scroll(scroll_id=scroll_id)


def get_elasticsearch_host():
return os.environ["ELASTICSEARCH_HOST"]
def get_opensearch_host():
return os.environ["OPENSEARCH_HOST"]


def get_elasticsearch_index():
return os.environ["ELASTICSEARCH_INDEX"]
def get_opensearch_index():
return os.environ["OPENSEARCH_INDEX"]

def get_opensearch_user():
return os.environ["OPENSEARCH_USER"]

def get_opensearch_password():
return os.environ["OPENSEARCH_PASSWORD"]

def create_index_interface() -> IndexInterface:
hosts = get_elasticsearch_host()
hosts = get_opensearch_host()
if not isinstance(hosts, str) or len(hosts) == 0:
raise Exception("Missing index hosts")
default_index_name = get_elasticsearch_index()
default_index_name = get_opensearch_index()
if not isinstance(default_index_name, str) or len(default_index_name) == 0:
raise Exception("Invalid index name")
return ElasticSearchInterface([hosts], default_index=default_index_name)
return OpenSearchInterface([hosts], get_opensearch_user(), get_opensearch_password(), default_index=default_index_name)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ python-magic==0.4.18
boto3==1.22.6
psycopg2==2.8.6
botocore==1.25.6
elasticsearch==7.17.3
opensearch-py==2.3.2
requests==2.25.0
scikit-learn==1.0.2
sentence-transformers==2.2.0
Expand Down
6 changes: 3 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

from .main_tests import MainModuleTests

from .elasticsearch import (
ElasticsearchBasicTests,
from .opensearch import (
OpensearchBasicTests,
IndexInterfaceFactoryFunctionTests,
ElasticsearchIntegrationTests,
OpensearchIntegrationTests,
)
Loading

0 comments on commit 2272001

Please sign in to comment.