From e4a3f715733ab631e9138dc69b105999874c9f6c Mon Sep 17 00:00:00 2001 From: Luisa-Coelho Date: Wed, 13 Sep 2023 16:54:56 -0300 Subject: [PATCH 01/19] new Makefile Windows --- Makefile | 12 +----------- scripts/Dockerfile | 2 +- scripts/requirements.txt | 11 +++++++++++ 3 files changed, 13 insertions(+), 12 deletions(-) create mode 100644 scripts/requirements.txt diff --git a/Makefile b/Makefile index d5fc959..7f1f649 100644 --- a/Makefile +++ b/Makefile @@ -37,16 +37,6 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ --env POSTGRES_PORT=$(POSTGRES_PORT) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) -wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ - --env POSTGRES_USER=$(POSTGRES_USER) \ - --env POSTGRES_DB=$(POSTGRES_DB) \ - --env POSTGRES_HOST=$(POSTGRES_HOST) \ - --env POSTGRES_PORT=$(POSTGRES_PORT) \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) wait-for-it --timeout=60 $1) - .PHONY: black black: podman run --rm -ti --volume $(PWD):/mnt/code:rw \ @@ -197,7 +187,7 @@ else endif set-run-variable-values: - cp --no-clobber contrib/sample.env envvars || true + copy /y contrib\sample.env envvars $(eval POD_NAME=run-$(POD_NAME)) $(eval DATABASE_CONTAINER_NAME=run-$(DATABASE_CONTAINER_NAME)) $(eval ELASTICSEARCH_CONTAINER_NAME=run-$(ELASTICSEARCH_CONTAINER_NAME)) diff --git a/scripts/Dockerfile b/scripts/Dockerfile index 139d337..129d662 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -1,4 +1,4 @@ -FROM docker.io/python:3.8 +FROM docker.io/python:3.10 ENV USER gazette ENV USER_HOME /home/$USER diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..92894c1 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,11 @@ +black==19.10b0 +coverage==5.2.1 +python-magic==0.4.18 +boto3==1.22.6 +psycopg2==2.8.6 +botocore==1.25.6 +elasticsearch==7.17.3 +requests==2.25.0 +scikit-learn==1.0.2 +sentence-transformers==2.2.0 +huggingface-hub==0.10.1 # fix: https://github.com/UKPLab/sentence-transformers/issues/1762 From 94d100fe8668e455148df748c3a5baec7c9a46e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Thu, 26 Oct 2023 16:48:42 -0300 Subject: [PATCH 02/19] windows_settings --- Makefile | 14 +- contrib/import-env.py | 7 + scripts/Dockerfile | 4 +- scripts/Dockerfile_original | 23 ++ scripts/tasks/__init__.py | 12 + .../gazette_excerpts_embedding_reranking.py | 39 ++++ .../gazette_excerpts_entities_tagging.py | 105 +++++++++ scripts/tasks/gazette_text_extraction.py | 219 ++++++++++++++++++ .../gazette_themed_excerpts_extraction.py | 191 +++++++++++++++ scripts/tasks/gazette_themes_listing.py | 13 ++ scripts/tasks/interfaces.py | 105 +++++++++ .../tasks/list_gazettes_to_be_processed.py | 143 ++++++++++++ scripts/tasks/utils/__init__.py | 5 + scripts/tasks/utils/index.py | 38 +++ scripts/tasks/utils/text.py | 5 + 15 files changed, 919 insertions(+), 4 deletions(-) create mode 100644 contrib/import-env.py create mode 100644 scripts/Dockerfile_original create mode 100644 scripts/tasks/__init__.py create mode 100644 scripts/tasks/gazette_excerpts_embedding_reranking.py create mode 100644 scripts/tasks/gazette_excerpts_entities_tagging.py create mode 100644 scripts/tasks/gazette_text_extraction.py create mode 100644 scripts/tasks/gazette_themed_excerpts_extraction.py create mode 100644 scripts/tasks/gazette_themes_listing.py create mode 100644 scripts/tasks/interfaces.py create mode 100644 
scripts/tasks/list_gazettes_to_be_processed.py create mode 100644 scripts/tasks/utils/__init__.py create mode 100644 scripts/tasks/utils/index.py create mode 100644 scripts/tasks/utils/text.py diff --git a/Makefile b/Makefile index 7f1f649..5a17846 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,16 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ --env POSTGRES_PORT=$(POSTGRES_PORT) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) +wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ + --pod $(POD_NAME) \ + --env PYTHONPATH=/mnt/code \ + --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ + --env POSTGRES_USER=$(POSTGRES_USER) \ + --env POSTGRES_DB=$(POSTGRES_DB) \ + --env POSTGRES_HOST=$(POSTGRES_HOST) \ + --env POSTGRES_PORT=$(POSTGRES_PORT) \ + $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) wait-for-it --timeout=60 $1) + .PHONY: black black: podman run --rm -ti --volume $(PWD):/mnt/code:rw \ @@ -187,7 +197,7 @@ else endif set-run-variable-values: - copy /y contrib\sample.env envvars + cp --no-clobber contrib/sample.env envvars || true $(eval POD_NAME=run-$(POD_NAME)) $(eval DATABASE_CONTAINER_NAME=run-$(DATABASE_CONTAINER_NAME)) $(eval ELASTICSEARCH_CONTAINER_NAME=run-$(ELASTICSEARCH_CONTAINER_NAME)) @@ -243,4 +253,4 @@ wait-elasticsearch: .PHONY: publish-tag publish-tag: podman tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):${IMAGE_TAG} $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) - podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) + podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) \ No newline at end of file diff --git a/contrib/import-env.py b/contrib/import-env.py new file mode 100644 index 0000000..4cfbfd3 --- /dev/null +++ b/contrib/import-env.py @@ -0,0 +1,7 @@ +import os +from dotenv import load_dotenv + +try: + load_dotenv() # Load environment variables from a .env file +except Exception as e: + print(f"Error loading .env file: {e}") \ No newline at end of file diff --git a/scripts/Dockerfile b/scripts/Dockerfile index 129d662..df8b6e9 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -2,9 +2,9 @@ FROM docker.io/python:3.10 ENV USER gazette ENV USER_HOME /home/$USER -ENV WORKDIR /mnt/code +ENV WORKDIR /tasks -RUN adduser --system $USER --home $USER_HOME && \ +RUN net user --system $USER --home $USER_HOME && \ apt-get update -y && \ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ apt-get -y install git-lfs wait-for-it && \ diff --git a/scripts/Dockerfile_original b/scripts/Dockerfile_original new file mode 100644 index 0000000..1fff372 --- /dev/null +++ b/scripts/Dockerfile_original @@ -0,0 +1,23 @@ +FROM docker.io/python:3.8 + +ENV USER gazette +ENV USER_HOME /home/$USER +ENV WORKDIR /mnt/code + +RUN adduser --system $USER --home $USER_HOME && \ + apt-get update -y && \ + curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ + apt-get -y install git-lfs wait-for-it && \ + apt-get clean && \ + git lfs install && \ + mkdir $WORKDIR + +ENV PYTHONPATH $WORKDIR +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . 
$WORKDIR +WORKDIR $WORKDIR +USER $USER + +RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" \ No newline at end of file diff --git a/scripts/tasks/__init__.py b/scripts/tasks/__init__.py new file mode 100644 index 0000000..bb16ccd --- /dev/null +++ b/scripts/tasks/__init__.py @@ -0,0 +1,12 @@ +from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts +from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts +from .gazette_text_extraction import extract_text_from_gazettes +from .gazette_themed_excerpts_extraction import extract_themed_excerpts_from_gazettes +from .gazette_themes_listing import get_themes +from .interfaces import ( + DatabaseInterface, + StorageInterface, + IndexInterface, + TextExtractorInterface, +) +from .list_gazettes_to_be_processed import get_gazettes_to_be_processed diff --git a/scripts/tasks/gazette_excerpts_embedding_reranking.py b/scripts/tasks/gazette_excerpts_embedding_reranking.py new file mode 100644 index 0000000..3919056 --- /dev/null +++ b/scripts/tasks/gazette_excerpts_embedding_reranking.py @@ -0,0 +1,39 @@ +import os +from typing import Dict, List + +import sentence_transformers + +from .interfaces import IndexInterface +from .utils import get_documents_with_ids + + +def embedding_rerank_excerpts( + theme: Dict, excerpt_ids: List[str], index: IndexInterface +) -> None: + user_folder = os.environ["HOME"] + model = sentence_transformers.SentenceTransformer( + f"{user_folder}/models/bert-base-portuguese-cased" + ) + queries = get_natural_language_queries(theme) + queries_vectors = model.encode(queries, convert_to_tensor=True) + + excerpts = ( + excerpt["_source"] + for excerpt in get_documents_with_ids(excerpt_ids, index, theme["index"]) + ) + for excerpt in excerpts: + excerpt_vector = model.encode(excerpt["excerpt"], convert_to_tensor=True) + excerpt_max_score = sentence_transformers.util.semantic_search( + excerpt_vector, queries_vectors, top_k=1 + ) + excerpt["excerpt_embedding_score"] = excerpt_max_score[0][0]["score"] + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) + + +def get_natural_language_queries(theme: Dict) -> List[str]: + return [query["title"] for query in theme["queries"]] diff --git a/scripts/tasks/gazette_excerpts_entities_tagging.py b/scripts/tasks/gazette_excerpts_entities_tagging.py new file mode 100644 index 0000000..8c67303 --- /dev/null +++ b/scripts/tasks/gazette_excerpts_entities_tagging.py @@ -0,0 +1,105 @@ +import re +from typing import Dict, List + +from .interfaces import IndexInterface +from .utils import ( + get_documents_from_query_with_highlights, + get_documents_with_ids, +) + + +def tag_entities_in_excerpts( + theme: Dict, excerpt_ids: List[str], index: IndexInterface +) -> None: + tag_theme_cases(theme, excerpt_ids, index) + tag_cnpjs(theme, excerpt_ids, index) + + +def tag_theme_cases(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: + cases = theme["entities"]["cases"] + es_queries = [get_es_query_from_entity_case(case, excerpt_ids) for case in cases] + for case, es_query in zip(cases, es_queries): + documents = get_documents_from_query_with_highlights( + es_query, index, theme["index"] + ) + for document in documents: + excerpt = document["_source"] + highlight = document["highlight"][ + "excerpt.with_stopwords" + ][0] + excerpt.update( + { + "excerpt_entities": list( 
+ set(excerpt.get("excerpt_entities", [])) | {case["title"]} + ), + "excerpt": highlight, + } + ) + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) + + +def get_es_query_from_entity_case( + case: Dict, + excerpt_ids: List[str], +) -> Dict: + es_query = { + "query": {"bool": {"should": [], "filter": {"ids": {"values": excerpt_ids}}}}, + "size": 100, + "highlight": { + "fields": { + "excerpt.with_stopwords": { # Allows tagging phrases containing stopwords correctly + "type": "fvh", # Only highlighter to tag phrases correctly and not the tokens individually + "matched_fields": ["excerpt", "excerpt.with_stopwords"], + "fragment_size": 10000, + "number_of_fragments": 1, + "pre_tags": [f"<{case['category']}>"], + "post_tags": [f"</{case['category']}>"], + } + }, + }, + } + for value in case["values"]: + es_query["query"]["bool"]["should"].append( + {"match_phrase": {"excerpt.with_stopwords": value}} + ) + + return es_query + + +def tag_cnpjs(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: + excerpts = ( + document["_source"] + for document in get_documents_with_ids(excerpt_ids, index, theme["index"]) + ) + cnpj_regex = re.compile( + r""" + (^|[^\d]) # left boundary: start of string or not-a-digit + (\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d/?\d{4}-?\d{2}) # cnpj + ($|[^\d]) # right boundary: end of string or not-a-digit + """, + re.VERBOSE, + ) + for excerpt in excerpts: + found_cnpjs = re.findall(cnpj_regex, excerpt["excerpt"]) + if not found_cnpjs: + continue + + for _, cnpj, _ in set(found_cnpjs): + excerpt["excerpt"] = excerpt["excerpt"].replace( + cnpj, f"<CNPJ>{cnpj}</CNPJ>" + ) + + excerpt["excerpt_entities"] = list( + set(excerpt.get("excerpt_entities", [])) | {"CNPJ"} + ) + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) diff --git a/scripts/tasks/gazette_text_extraction.py b/scripts/tasks/gazette_text_extraction.py new file mode 100644 index 0000000..846b8a5 --- /dev/null +++ b/scripts/tasks/gazette_text_extraction.py @@ -0,0 +1,219 @@ +import logging +import tempfile +import os +from pathlib import Path +from typing import Dict, Iterable, List + +from .interfaces import ( + DatabaseInterface, + IndexInterface, + StorageInterface, + TextExtractorInterface, +) + + +def extract_text_from_gazettes( + gazettes: Iterable[Dict], + database: DatabaseInterface, + storage: StorageInterface, + index: IndexInterface, + text_extractor: TextExtractorInterface, +) -> List[str]: + """ + Extracts the text from a list of gazettes + """ + logging.info("Starting text extraction from gazettes") + create_index(index) + + ids = [] + for gazette in gazettes: + try: + processed_gazette = try_process_gazette_file( + gazette, database, storage, index, text_extractor + ) + except Exception as e: + logging.warning( + f"Could not process gazette: {gazette['file_path']}. 
Cause: {e}" + ) + else: + ids.append(processed_gazette["file_checksum"]) + + return ids + + +def try_process_gazette_file( + gazette: Dict, + database: DatabaseInterface, + storage: StorageInterface, + index: IndexInterface, + text_extractor: TextExtractorInterface, +) -> Dict: + """ + Do all the work to extract the content from the gazette files + """ + logging.debug(f"Processing gazette {gazette['file_path']}") + gazette_file = download_gazette_file(gazette, storage) + get_gazette_text_and_define_url(gazette, gazette_file, text_extractor) + upload_gazette_raw_text(gazette, storage) + index.index_document(gazette, document_id=gazette["file_checksum"]) + delete_gazette_files(gazette_file) + set_gazette_as_processed(gazette, database) + return gazette + + +def create_index(index: IndexInterface) -> None: + body = { + "mappings": { + "properties": { + "created_at": {"type": "date"}, + "date": {"type": "date"}, + "edition_number": { + "type": "text", + "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, + }, + "file_checksum": {"type": "keyword"}, + "file_path": {"type": "keyword"}, + "file_url": {"type": "keyword"}, + "id": {"type": "keyword"}, + "is_extra_edition": {"type": "boolean"}, + "power": {"type": "keyword"}, + "processed": {"type": "boolean"}, + "scraped_at": {"type": "date"}, + "source_text": { + "type": "text", + "analyzer": "brazilian", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + "fields": { + "with_stopwords": { + "type": "text", + "analyzer": "brazilian_with_stopwords", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + }, + "exact": { + "type": "text", + "analyzer": "exact", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + } + }, + }, + "state_code": {"type": "keyword"}, + "territory_id": {"type": "keyword"}, + "territory_name": { + "type": "text", + "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, + }, + "url": {"type": "keyword"}, + } + }, + "settings": { + "index": { + "sort.field": ["territory_id", "date"], + "sort.order": ["asc", "desc"] + }, + "analysis": { + "filter": { + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian", + } + }, + "analyzer": { + "brazilian_with_stopwords": { + "tokenizer": "standard", + "filter": ["lowercase", "brazilian_stemmer"], + }, + "exact": { + "tokenizer": "standard", + "filter": ["lowercase"], + }, + }, + } + }, + } + index.create_index(body=body) + + +def upload_gazette_raw_text(gazette: Dict, storage): + """ + Define gazette raw text + """ + file_raw_txt = Path(gazette["file_path"]).with_suffix(".txt").as_posix() + storage.upload_content(file_raw_txt, gazette["source_text"]) + logging.debug(f"file_raw_txt uploaded {file_raw_txt}") + file_endpoint = get_file_endpoint() + gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}" + + +def get_gazette_text_and_define_url( + gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface +): + """ + Extract file content and define the url to access the file in the storage + """ + gazette["source_text"] = try_to_extract_content(gazette_file, text_extractor) + file_endpoint = get_file_endpoint() + gazette["url"] = f"{file_endpoint}/{gazette['file_path']}" + + +def get_file_endpoint() -> str: + """ + Get the endpoint where the gazette files can be downloaded. 
+ """ + return os.environ["QUERIDO_DIARIO_FILES_ENDPOINT"] + + +def try_to_extract_content( + gazette_file: str, text_extractor: TextExtractorInterface +) -> str: + """ + Calls the function to extract the content from the gazette file. If it fails + remove the gazette file and raise an exception + """ + try: + return text_extractor.extract_text(gazette_file) + except Exception as e: + os.remove(gazette_file) + raise e + + +def delete_gazette_files(gazette_file: str) -> None: + """ + Removes the files used to process the gazette content. + """ + os.remove(gazette_file) + + +def download_gazette_file(gazette: Dict, storage: StorageInterface) -> str: + """ + Download the file from the object storage and write it down in the local + disk to allow the text extraction + """ + with tempfile.NamedTemporaryFile(delete=False) as tmpfile: + gazette_file_key = get_gazette_file_key_used_in_storage(gazette) + storage.get_file(gazette_file_key, tmpfile) + return tmpfile.name + + +def get_gazette_file_key_used_in_storage(gazette: Dict) -> str: + """ + Get the file key used to store the gazette in the object storage + """ + return gazette["file_path"] + + +def set_gazette_as_processed(gazette: Dict, database: DatabaseInterface) -> None: + command = """ + UPDATE gazettes + SET processed = True + WHERE id = %(id)s + AND file_checksum = %(file_checksum)s + ; + """ + id = gazette["id"] + checksum = gazette["file_checksum"] + data = {"id": id, "file_checksum": checksum} + logging.debug(f"Marking {id}({checksum}) as processed") + database.update(command, data) diff --git a/scripts/tasks/gazette_themed_excerpts_extraction.py b/scripts/tasks/gazette_themed_excerpts_extraction.py new file mode 100644 index 0000000..1e87c89 --- /dev/null +++ b/scripts/tasks/gazette_themed_excerpts_extraction.py @@ -0,0 +1,191 @@ +import hashlib +from typing import Dict, Iterable, List + +from .interfaces import IndexInterface +from .utils import clean_extra_whitespaces, get_documents_from_query_with_highlights + + +def extract_themed_excerpts_from_gazettes( + theme: Dict, gazette_ids: List[str], index: IndexInterface +) -> List[str]: + create_index(theme, index) + + ids = [] + for theme_query in theme["queries"]: + for excerpt in get_excerpts_from_gazettes_with_themed_query( + theme_query, gazette_ids, index + ): + # excerpts with less than 10% of the expected size of excerpt account for + # fewer than 1% of excerpts yet their score is usually high + if len(excerpt["excerpt"]) < 200: + continue + + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) + ids.append(excerpt["excerpt_id"]) + + return ids + + +def create_index(theme: Dict, index: IndexInterface) -> None: + body = { + "mappings": { + "properties": { + "excerpt_embedding_score": {"type": "rank_feature"}, + "excerpt_subthemes": {"type": "keyword"}, + "excerpt_entities": {"type": "keyword"}, + "excerpt": { + "type": "text", + "analyzer": "brazilian", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + "fields": { + "with_stopwords": { + "type": "text", + "analyzer": "brazilian_with_stopwords", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + }, + "exact": { + "type": "text", + "analyzer": "exact", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + }, + }, + }, + "excerpt_id": {"type": "keyword"}, + "source_database_id": {"type": "long"}, + "source_index_id": {"type": "keyword"}, + "source_created_at": {"type": "date"}, + "source_date": 
{"type": "date"}, + "source_edition_number": {"type": "keyword"}, + "source_file_checksum": {"type": "keyword"}, + "source_file_path": {"type": "keyword"}, + "source_file_raw_txt": {"type": "keyword"}, + "source_file_url": {"type": "keyword"}, + "source_is_extra_edition": {"type": "boolean"}, + "source_power": {"type": "keyword"}, + "source_processed": {"type": "boolean"}, + "source_scraped_at": {"type": "date"}, + "source_state_code": {"type": "keyword"}, + "source_territory_id": {"type": "keyword"}, + "source_territory_name": {"type": "keyword"}, + "source_url": {"type": "keyword"}, + } + }, + "settings": { + "index": { + "sort.field": ["source_territory_id", "source_date"], + "sort.order": ["asc", "desc"] + }, + "analysis": { + "filter": { + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian", + } + }, + "analyzer": { + "brazilian_with_stopwords": { + "tokenizer": "standard", + "filter": ["lowercase", "brazilian_stemmer"], + }, + "exact": { + "tokenizer": "standard", + "filter": ["lowercase"], + }, + }, + } + }, + } + index.create_index(index_name=theme["index"], body=body) + + +def get_excerpts_from_gazettes_with_themed_query( + query: Dict, gazette_ids: List[str], index: IndexInterface +) -> Iterable[Dict]: + es_query = get_es_query_from_themed_query(query, gazette_ids, index) + documents = get_documents_from_query_with_highlights(es_query, index) + for document in documents: + gazette = document["_source"] + excerpts = document["highlight"]["source_text.with_stopwords"] + for excerpt in excerpts: + yield { + "excerpt": preprocess_excerpt(excerpt), + "excerpt_subthemes": [query["title"]], + "excerpt_id": generate_excerpt_id(excerpt, gazette), + "source_index_id": gazette["file_checksum"], + "source_created_at": gazette["created_at"], + "source_database_id": gazette["id"], + "source_date": gazette["date"], + "source_edition_number": gazette["edition_number"], + "source_file_raw_txt": gazette["file_raw_txt"], + "source_is_extra_edition": gazette["is_extra_edition"], + "source_file_checksum": gazette["file_checksum"], + "source_file_path": gazette["file_path"], + "source_file_url": gazette["file_url"], + "source_power": gazette["power"], + "source_processed": gazette["processed"], + "source_scraped_at": gazette["scraped_at"], + "source_state_code": gazette["state_code"], + "source_territory_id": gazette["territory_id"], + "source_territory_name": gazette["territory_name"], + "source_url": gazette["url"], + } + + +def generate_excerpt_id(excerpt: str, gazette: Dict) -> str: + hash = hashlib.md5() + hash.update(excerpt.encode()) + return f"{gazette['file_checksum']}_{hash.hexdigest()}" + + +def get_es_query_from_themed_query( + query: Dict, + gazette_ids: List[str], + index: IndexInterface, +) -> Dict: + es_query = { + "query": {"bool": {"must": [], "filter": {"ids": {"values": gazette_ids}}}}, + "size": 100, + "highlight": { + "fields": { + "source_text.with_stopwords": { + "type": "unified", + "fragment_size": 2000, + "number_of_fragments": 10, + "pre_tags": [""], + "post_tags": [""], + } + }, + }, + } + + macro_synonym_block = {"span_or": {"clauses": []}} + for macro_set in query["term_sets"]: + proximity_block = {"span_near": {"clauses": [], "slop": 20, "in_order": False}} + for term_set in macro_set: + synonym_block = {"span_or": {"clauses": []}} + for term in term_set: + phrase_block = { + "span_near": {"clauses": [], "slop": 0, "in_order": True} + } + tokenized_term = index.analyze(text=term, field="source_text.with_stopwords") + for token in 
tokenized_term["tokens"]: + word_block = {"span_term": {"source_text.with_stopwords": token["token"]}} + phrase_block["span_near"]["clauses"].append(word_block) + synonym_block["span_or"]["clauses"].append(phrase_block) + proximity_block["span_near"]["clauses"].append(synonym_block) + macro_synonym_block["span_or"]["clauses"].append(proximity_block) + + es_query["query"]["bool"]["must"].append(macro_synonym_block) + return es_query + + +def preprocess_excerpt(excerpt: str) -> str: + return clean_extra_whitespaces(excerpt) diff --git a/scripts/tasks/gazette_themes_listing.py b/scripts/tasks/gazette_themes_listing.py new file mode 100644 index 0000000..1dbb60c --- /dev/null +++ b/scripts/tasks/gazette_themes_listing.py @@ -0,0 +1,13 @@ +import json +import pathlib +from typing import Dict, List + + +def get_themes() -> List[Dict]: + ROOT = pathlib.Path(__file__).parent.parent + themes_config = ROOT / "config" / "themes_config.json" + + with themes_config.open() as f: + themes = json.load(f)["themes"] + + return themes diff --git a/scripts/tasks/interfaces.py b/scripts/tasks/interfaces.py new file mode 100644 index 0000000..06b81cb --- /dev/null +++ b/scripts/tasks/interfaces.py @@ -0,0 +1,105 @@ +from typing import Dict, Iterable, Tuple +import abc + + +class DatabaseInterface(abc.ABC): + """ + Interface to abstract the iteraction with the database storing data used by the + tasks + """ + + @abc.abstractmethod + def _commit_changes(self, command: str, data: Dict) -> None: + """ + Make a change in the database and commit it + """ + + @abc.abstractmethod + def select(self, command: str) -> Iterable[Tuple]: + """ + Select entries from the database + """ + + @abc.abstractmethod + def insert(self, command: str, data: Dict) -> None: + """ + Insert entries into the database + """ + + @abc.abstractmethod + def update(self, command: str, data: Dict) -> None: + """ + Update entries from the database + """ + + @abc.abstractmethod + def delete(self, command: str, data: Dict) -> None: + """ + Delete entries from the database + """ + + +class StorageInterface(abc.ABC): + """ + Interface to abstract the interaction with the object store system. 
+ """ + + @abc.abstractmethod + def get_file(self, file_to_be_downloaded: str, destination) -> None: + """ + Download the given file key in the destination on the host + """ + + @abc.abstractmethod + def upload_content(self, file_key: str, content_to_be_uploaded: str) -> None: + """ + Upload the given content to the destination on the host + """ + + +class IndexInterface(abc.ABC): + """ + Interface to abstract the interaction with the index system + """ + + @abc.abstractmethod + def create_index(self, index_name: str, body: Dict) -> None: + """ + Create the index used by the application + """ + + @abc.abstractmethod + def refresh_index(self, index_name: str) -> None: + """ + Refreshes the index to make it up-to-date for future searches + """ + + @abc.abstractmethod + def index_document( + self, document: Dict, document_id: str, index: str, refresh: bool + ) -> None: + """ + Upload document to the index + """ + + @abc.abstractmethod + def search(self, query: Dict, index: str) -> Dict: + """ + Searches the index with the provided query + """ + + @abc.abstractmethod + def paginated_search( + self, query: Dict, index: str, keep_alive: str + ) -> Iterable[Dict]: + """ + Searches the index with the provided query, with pagination + """ + + +class TextExtractorInterface(abc.ABC): + @abc.abstractmethod + def extract_text(self, filepath: str) -> str: + """ + Extract the text from the given file + """ diff --git a/scripts/tasks/list_gazettes_to_be_processed.py b/scripts/tasks/list_gazettes_to_be_processed.py new file mode 100644 index 0000000..1547e7b --- /dev/null +++ b/scripts/tasks/list_gazettes_to_be_processed.py @@ -0,0 +1,143 @@ +import logging +from typing import Dict, Iterable + +from .interfaces import DatabaseInterface + + +def get_gazettes_to_be_processed( + execution_mode: str, database: DatabaseInterface +) -> Iterable[Dict]: + if execution_mode == "DAILY": + yield from get_gazettes_extracted_since_yesterday(database) + elif execution_mode == "ALL": + yield from get_all_gazettes_extracted(database) + elif execution_mode == "UNPROCESSED": + yield from get_unprocessed_gazettes(database) + else: + raise Exception(f'Execution mode "{execution_mode}" is invalid.') + + +def get_gazettes_extracted_since_yesterday( + database: DatabaseInterface, +) -> Iterable[Dict]: + """ + List the gazettes which were extracted since yesterday + """ + logging.info("Listing gazettes extracted since yesterday") + + command = """ + SELECT + gazettes.id, + gazettes.source_text, + gazettes.date, + gazettes.edition_number, + gazettes.is_extra_edition, + gazettes.power, + gazettes.file_checksum, + gazettes.file_path, + gazettes.file_url, + gazettes.scraped_at, + gazettes.created_at, + gazettes.territory_id, + gazettes.processed, + territories.name as territory_name, + territories.state_code + FROM + gazettes + INNER JOIN territories ON territories.id = gazettes.territory_id + WHERE + scraped_at > current_timestamp - interval '1 day' + ; + """ + for gazette in database.select(command): + yield format_gazette_data(gazette) + + +def get_all_gazettes_extracted( + database: DatabaseInterface, +) -> Iterable[Dict]: + """ + List all the gazettes which were extracted + """ + logging.info("Listing all gazettes extracted") + + command = """ + SELECT + gazettes.id, + gazettes.source_text, + gazettes.date, + gazettes.edition_number, + gazettes.is_extra_edition, + gazettes.power, + gazettes.file_checksum, + gazettes.file_path, + gazettes.file_url, + gazettes.scraped_at, + gazettes.created_at, + gazettes.territory_id, + 
gazettes.processed, + territories.name as territory_name, + territories.state_code + FROM + gazettes + INNER JOIN territories ON territories.id = gazettes.territory_id + ; + """ + for gazette in database.select(command): + yield format_gazette_data(gazette) + + +def get_unprocessed_gazettes( + database: DatabaseInterface, +) -> Iterable[Dict]: + """ + List all the gazettes which were extracted + """ + logging.info("Listing all gazettes extracted") + + command = """ + SELECT + gazettes.id, + gazettes.source_text, + gazettes.date, + gazettes.edition_number, + gazettes.is_extra_edition, + gazettes.power, + gazettes.file_checksum, + gazettes.file_path, + gazettes.file_url, + gazettes.scraped_at, + gazettes.created_at, + gazettes.territory_id, + gazettes.processed, + territories.name as territory_name, + territories.state_code + FROM + gazettes + INNER JOIN territories ON territories.id = gazettes.territory_id + WHERE + processed is False + ; + """ + for gazette in database.select(command): + yield format_gazette_data(gazette) + + +def format_gazette_data(data): + return { + "id": data[0], + "source_text": data[1], + "date": data[2], + "edition_number": data[3], + "is_extra_edition": data[4], + "power": data[5], + "file_checksum": data[6], + "file_path": data[7], + "file_url": data[8], + "scraped_at": data[9], + "created_at": data[10], + "territory_id": data[11], + "processed": data[12], + "territory_name": data[13], + "state_code": data[14], + } diff --git a/scripts/tasks/utils/__init__.py b/scripts/tasks/utils/__init__.py new file mode 100644 index 0000000..1bd9cf3 --- /dev/null +++ b/scripts/tasks/utils/__init__.py @@ -0,0 +1,5 @@ +from .index import ( + get_documents_from_query_with_highlights, + get_documents_with_ids, +) +from .text import clean_extra_whitespaces diff --git a/scripts/tasks/utils/index.py b/scripts/tasks/utils/index.py new file mode 100644 index 0000000..83d769c --- /dev/null +++ b/scripts/tasks/utils/index.py @@ -0,0 +1,38 @@ +from typing import Dict, Iterable, List + +from ..interfaces import IndexInterface + + +def get_documents_with_ids( + ids: List[str], index: IndexInterface, index_name: str = "" +) -> Iterable[Dict]: + query_filter_by_ids = { + "query": {"bool": {"filter": {"ids": {"values": ids}}}}, + "size": 100, + } + yield from get_documents_from_query(query_filter_by_ids, index, index_name) + + +def get_documents_from_query( + query: Dict, index: IndexInterface, index_name: str = "" +) -> Iterable[Dict]: + index.refresh_index(index_name) + documents = ( + hit + for result in index.paginated_search(query, index=index_name) + for hit in result["hits"]["hits"] + ) + yield from documents + + +def get_documents_from_query_with_highlights( + query: Dict, index: IndexInterface, index_name: str = "" +) -> Iterable[Dict]: + index.refresh_index(index_name) + documents = ( + hit + for result in index.paginated_search(query, index=index_name) + for hit in result["hits"]["hits"] + if hit.get("highlight") + ) + yield from documents diff --git a/scripts/tasks/utils/text.py b/scripts/tasks/utils/text.py new file mode 100644 index 0000000..1cc7c39 --- /dev/null +++ b/scripts/tasks/utils/text.py @@ -0,0 +1,5 @@ +import re + + +def clean_extra_whitespaces(text: str) -> str: + return re.sub(r"\s+", " ", text) From a8d710f97437bafd1ffbd6c3b500663f1f532b34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Wed, 4 Oct 2023 19:07:42 -0300 Subject: [PATCH 03/19] env readme_update --- scripts/Dockerfile | 8 +- ...Dockerfile_original => Dockerfile_windows} | 8 +- 
scripts/tasks/__init__.py | 12 - .../gazette_excerpts_embedding_reranking.py | 39 ---- .../gazette_excerpts_entities_tagging.py | 105 --------- scripts/tasks/gazette_text_extraction.py | 219 ------------------ .../gazette_themed_excerpts_extraction.py | 191 --------------- scripts/tasks/gazette_themes_listing.py | 13 -- scripts/tasks/interfaces.py | 105 --------- .../tasks/list_gazettes_to_be_processed.py | 143 ------------ scripts/tasks/utils/__init__.py | 5 - scripts/tasks/utils/index.py | 38 --- scripts/tasks/utils/text.py | 5 - 13 files changed, 8 insertions(+), 883 deletions(-) rename scripts/{Dockerfile_original => Dockerfile_windows} (78%) delete mode 100644 scripts/tasks/__init__.py delete mode 100644 scripts/tasks/gazette_excerpts_embedding_reranking.py delete mode 100644 scripts/tasks/gazette_excerpts_entities_tagging.py delete mode 100644 scripts/tasks/gazette_text_extraction.py delete mode 100644 scripts/tasks/gazette_themed_excerpts_extraction.py delete mode 100644 scripts/tasks/gazette_themes_listing.py delete mode 100644 scripts/tasks/interfaces.py delete mode 100644 scripts/tasks/list_gazettes_to_be_processed.py delete mode 100644 scripts/tasks/utils/__init__.py delete mode 100644 scripts/tasks/utils/index.py delete mode 100644 scripts/tasks/utils/text.py diff --git a/scripts/Dockerfile b/scripts/Dockerfile index df8b6e9..1fff372 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -1,10 +1,10 @@ -FROM docker.io/python:3.10 +FROM docker.io/python:3.8 ENV USER gazette ENV USER_HOME /home/$USER -ENV WORKDIR /tasks +ENV WORKDIR /mnt/code -RUN net user --system $USER --home $USER_HOME && \ +RUN adduser --system $USER --home $USER_HOME && \ apt-get update -y && \ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ apt-get -y install git-lfs wait-for-it && \ @@ -20,4 +20,4 @@ COPY . $WORKDIR WORKDIR $WORKDIR USER $USER -RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" +RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" \ No newline at end of file diff --git a/scripts/Dockerfile_original b/scripts/Dockerfile_windows similarity index 78% rename from scripts/Dockerfile_original rename to scripts/Dockerfile_windows index 1fff372..df8b6e9 100644 --- a/scripts/Dockerfile_original +++ b/scripts/Dockerfile_windows @@ -1,10 +1,10 @@ -FROM docker.io/python:3.8 +FROM docker.io/python:3.10 ENV USER gazette ENV USER_HOME /home/$USER -ENV WORKDIR /mnt/code +ENV WORKDIR /tasks -RUN adduser --system $USER --home $USER_HOME && \ +RUN net user --system $USER --home $USER_HOME && \ apt-get update -y && \ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ apt-get -y install git-lfs wait-for-it && \ @@ -20,4 +20,4 @@ COPY . 
$WORKDIR WORKDIR $WORKDIR USER $USER -RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" \ No newline at end of file +RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" diff --git a/scripts/tasks/__init__.py b/scripts/tasks/__init__.py deleted file mode 100644 index bb16ccd..0000000 --- a/scripts/tasks/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts -from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts -from .gazette_text_extraction import extract_text_from_gazettes -from .gazette_themed_excerpts_extraction import extract_themed_excerpts_from_gazettes -from .gazette_themes_listing import get_themes -from .interfaces import ( - DatabaseInterface, - StorageInterface, - IndexInterface, - TextExtractorInterface, -) -from .list_gazettes_to_be_processed import get_gazettes_to_be_processed diff --git a/scripts/tasks/gazette_excerpts_embedding_reranking.py b/scripts/tasks/gazette_excerpts_embedding_reranking.py deleted file mode 100644 index 3919056..0000000 --- a/scripts/tasks/gazette_excerpts_embedding_reranking.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -from typing import Dict, List - -import sentence_transformers - -from .interfaces import IndexInterface -from .utils import get_documents_with_ids - - -def embedding_rerank_excerpts( - theme: Dict, excerpt_ids: List[str], index: IndexInterface -) -> None: - user_folder = os.environ["HOME"] - model = sentence_transformers.SentenceTransformer( - f"{user_folder}/models/bert-base-portuguese-cased" - ) - queries = get_natural_language_queries(theme) - queries_vectors = model.encode(queries, convert_to_tensor=True) - - excerpts = ( - excerpt["_source"] - for excerpt in get_documents_with_ids(excerpt_ids, index, theme["index"]) - ) - for excerpt in excerpts: - excerpt_vector = model.encode(excerpt["excerpt"], convert_to_tensor=True) - excerpt_max_score = sentence_transformers.util.semantic_search( - excerpt_vector, queries_vectors, top_k=1 - ) - excerpt["excerpt_embedding_score"] = excerpt_max_score[0][0]["score"] - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) - - -def get_natural_language_queries(theme: Dict) -> List[str]: - return [query["title"] for query in theme["queries"]] diff --git a/scripts/tasks/gazette_excerpts_entities_tagging.py b/scripts/tasks/gazette_excerpts_entities_tagging.py deleted file mode 100644 index 8c67303..0000000 --- a/scripts/tasks/gazette_excerpts_entities_tagging.py +++ /dev/null @@ -1,105 +0,0 @@ -import re -from typing import Dict, List - -from .interfaces import IndexInterface -from .utils import ( - get_documents_from_query_with_highlights, - get_documents_with_ids, -) - - -def tag_entities_in_excerpts( - theme: Dict, excerpt_ids: List[str], index: IndexInterface -) -> None: - tag_theme_cases(theme, excerpt_ids, index) - tag_cnpjs(theme, excerpt_ids, index) - - -def tag_theme_cases(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: - cases = theme["entities"]["cases"] - es_queries = [get_es_query_from_entity_case(case, excerpt_ids) for case in cases] - for case, es_query in zip(cases, es_queries): - documents = get_documents_from_query_with_highlights( - es_query, index, 
theme["index"] - ) - for document in documents: - excerpt = document["_source"] - highlight = document["highlight"][ - "excerpt.with_stopwords" - ][0] - excerpt.update( - { - "excerpt_entities": list( - set(excerpt.get("excerpt_entities", [])) | {case["title"]} - ), - "excerpt": highlight, - } - ) - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) - - -def get_es_query_from_entity_case( - case: Dict, - excerpt_ids: List[str], -) -> Dict: - es_query = { - "query": {"bool": {"should": [], "filter": {"ids": {"values": excerpt_ids}}}}, - "size": 100, - "highlight": { - "fields": { - "excerpt.with_stopwords": { # Allows tagging phrases containing stopwords correctly - "type": "fvh", # Only highlighter to tag phrases correctly and not the tokens individually - "matched_fields": ["excerpt", "excerpt.with_stopwords"], - "fragment_size": 10000, - "number_of_fragments": 1, - "pre_tags": [f"<{case['category']}>"], - "post_tags": [f""], - } - }, - }, - } - for value in case["values"]: - es_query["query"]["bool"]["should"].append( - {"match_phrase": {"excerpt.with_stopwords": value}} - ) - - return es_query - - -def tag_cnpjs(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: - excerpts = ( - document["_source"] - for document in get_documents_with_ids(excerpt_ids, index, theme["index"]) - ) - cnpj_regex = re.compile( - r""" - (^|[^\d]) # left boundary: start of string or not-a-digit - (\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d/?\d{4}-?\d{2}) # cnpj - ($|[^\d]) # right boundary: end of string or not-a-digit - """, - re.VERBOSE, - ) - for excerpt in excerpts: - found_cnpjs = re.findall(cnpj_regex, excerpt["excerpt"]) - if not found_cnpjs: - continue - - for _, cnpj, _ in set(found_cnpjs): - excerpt["excerpt"] = excerpt["excerpt"].replace( - cnpj, f"{cnpj}" - ) - - excerpt["excerpt_entities"] = list( - set(excerpt.get("excerpt_entities", [])) | {"CNPJ"} - ) - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) diff --git a/scripts/tasks/gazette_text_extraction.py b/scripts/tasks/gazette_text_extraction.py deleted file mode 100644 index 846b8a5..0000000 --- a/scripts/tasks/gazette_text_extraction.py +++ /dev/null @@ -1,219 +0,0 @@ -import logging -import tempfile -import os -from pathlib import Path -from typing import Dict, Iterable, List - -from .interfaces import ( - DatabaseInterface, - IndexInterface, - StorageInterface, - TextExtractorInterface, -) - - -def extract_text_from_gazettes( - gazettes: Iterable[Dict], - database: DatabaseInterface, - storage: StorageInterface, - index: IndexInterface, - text_extractor: TextExtractorInterface, -) -> List[str]: - """ - Extracts the text from a list of gazettes - """ - logging.info("Starting text extraction from gazettes") - create_index(index) - - ids = [] - for gazette in gazettes: - try: - processed_gazette = try_process_gazette_file( - gazette, database, storage, index, text_extractor - ) - except Exception as e: - logging.warning( - f"Could not process gazette: {gazette['file_path']}. 
Cause: {e}" - ) - else: - ids.append(processed_gazette["file_checksum"]) - - return ids - - -def try_process_gazette_file( - gazette: Dict, - database: DatabaseInterface, - storage: StorageInterface, - index: IndexInterface, - text_extractor: TextExtractorInterface, -) -> Dict: - """ - Do all the work to extract the content from the gazette files - """ - logging.debug(f"Processing gazette {gazette['file_path']}") - gazette_file = download_gazette_file(gazette, storage) - get_gazette_text_and_define_url(gazette, gazette_file, text_extractor) - upload_gazette_raw_text(gazette, storage) - index.index_document(gazette, document_id=gazette["file_checksum"]) - delete_gazette_files(gazette_file) - set_gazette_as_processed(gazette, database) - return gazette - - -def create_index(index: IndexInterface) -> None: - body = { - "mappings": { - "properties": { - "created_at": {"type": "date"}, - "date": {"type": "date"}, - "edition_number": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "file_checksum": {"type": "keyword"}, - "file_path": {"type": "keyword"}, - "file_url": {"type": "keyword"}, - "id": {"type": "keyword"}, - "is_extra_edition": {"type": "boolean"}, - "power": {"type": "keyword"}, - "processed": {"type": "boolean"}, - "scraped_at": {"type": "date"}, - "source_text": { - "type": "text", - "analyzer": "brazilian", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - "fields": { - "with_stopwords": { - "type": "text", - "analyzer": "brazilian_with_stopwords", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - }, - "exact": { - "type": "text", - "analyzer": "exact", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - } - }, - }, - "state_code": {"type": "keyword"}, - "territory_id": {"type": "keyword"}, - "territory_name": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "url": {"type": "keyword"}, - } - }, - "settings": { - "index": { - "sort.field": ["territory_id", "date"], - "sort.order": ["asc", "desc"] - }, - "analysis": { - "filter": { - "brazilian_stemmer": { - "type": "stemmer", - "language": "brazilian", - } - }, - "analyzer": { - "brazilian_with_stopwords": { - "tokenizer": "standard", - "filter": ["lowercase", "brazilian_stemmer"], - }, - "exact": { - "tokenizer": "standard", - "filter": ["lowercase"], - }, - }, - } - }, - } - index.create_index(body=body) - - -def upload_gazette_raw_text(gazette: Dict, storage): - """ - Define gazette raw text - """ - file_raw_txt = Path(gazette["file_path"]).with_suffix(".txt").as_posix() - storage.upload_content(file_raw_txt, gazette["source_text"]) - logging.debug(f"file_raw_txt uploaded {file_raw_txt}") - file_endpoint = get_file_endpoint() - gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}" - - -def get_gazette_text_and_define_url( - gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface -): - """ - Extract file content and define the url to access the file in the storage - """ - gazette["source_text"] = try_to_extract_content(gazette_file, text_extractor) - file_endpoint = get_file_endpoint() - gazette["url"] = f"{file_endpoint}/{gazette['file_path']}" - - -def get_file_endpoint() -> str: - """ - Get the endpoint where the gazette files can be downloaded. 
- """ - return os.environ["QUERIDO_DIARIO_FILES_ENDPOINT"] - - -def try_to_extract_content( - gazette_file: str, text_extractor: TextExtractorInterface -) -> str: - """ - Calls the function to extract the content from the gazette file. If it fails - remove the gazette file and raise an exception - """ - try: - return text_extractor.extract_text(gazette_file) - except Exception as e: - os.remove(gazette_file) - raise e - - -def delete_gazette_files(gazette_file: str) -> None: - """ - Removes the files used to process the gazette content. - """ - os.remove(gazette_file) - - -def download_gazette_file(gazette: Dict, storage: StorageInterface) -> str: - """ - Download the file from the object storage and write it down in the local - disk to allow the text extraction - """ - with tempfile.NamedTemporaryFile(delete=False) as tmpfile: - gazette_file_key = get_gazette_file_key_used_in_storage(gazette) - storage.get_file(gazette_file_key, tmpfile) - return tmpfile.name - - -def get_gazette_file_key_used_in_storage(gazette: Dict) -> str: - """ - Get the file key used to store the gazette in the object storage - """ - return gazette["file_path"] - - -def set_gazette_as_processed(gazette: Dict, database: DatabaseInterface) -> None: - command = """ - UPDATE gazettes - SET processed = True - WHERE id = %(id)s - AND file_checksum = %(file_checksum)s - ; - """ - id = gazette["id"] - checksum = gazette["file_checksum"] - data = {"id": id, "file_checksum": checksum} - logging.debug(f"Marking {id}({checksum}) as processed") - database.update(command, data) diff --git a/scripts/tasks/gazette_themed_excerpts_extraction.py b/scripts/tasks/gazette_themed_excerpts_extraction.py deleted file mode 100644 index 1e87c89..0000000 --- a/scripts/tasks/gazette_themed_excerpts_extraction.py +++ /dev/null @@ -1,191 +0,0 @@ -import hashlib -from typing import Dict, Iterable, List - -from .interfaces import IndexInterface -from .utils import clean_extra_whitespaces, get_documents_from_query_with_highlights - - -def extract_themed_excerpts_from_gazettes( - theme: Dict, gazette_ids: List[str], index: IndexInterface -) -> List[str]: - create_index(theme, index) - - ids = [] - for theme_query in theme["queries"]: - for excerpt in get_excerpts_from_gazettes_with_themed_query( - theme_query, gazette_ids, index - ): - # excerpts with less than 10% of the expected size of excerpt account for - # fewer than 1% of excerpts yet their score is usually high - if len(excerpt["excerpt"]) < 200: - continue - - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) - ids.append(excerpt["excerpt_id"]) - - return ids - - -def create_index(theme: Dict, index: IndexInterface) -> None: - body = { - "mappings": { - "properties": { - "excerpt_embedding_score": {"type": "rank_feature"}, - "excerpt_subthemes": {"type": "keyword"}, - "excerpt_entities": {"type": "keyword"}, - "excerpt": { - "type": "text", - "analyzer": "brazilian", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - "fields": { - "with_stopwords": { - "type": "text", - "analyzer": "brazilian_with_stopwords", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - }, - "exact": { - "type": "text", - "analyzer": "exact", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - }, - }, - }, - "excerpt_id": {"type": "keyword"}, - "source_database_id": {"type": "long"}, - "source_index_id": {"type": "keyword"}, - "source_created_at": {"type": "date"}, - "source_date": 
{"type": "date"}, - "source_edition_number": {"type": "keyword"}, - "source_file_checksum": {"type": "keyword"}, - "source_file_path": {"type": "keyword"}, - "source_file_raw_txt": {"type": "keyword"}, - "source_file_url": {"type": "keyword"}, - "source_is_extra_edition": {"type": "boolean"}, - "source_power": {"type": "keyword"}, - "source_processed": {"type": "boolean"}, - "source_scraped_at": {"type": "date"}, - "source_state_code": {"type": "keyword"}, - "source_territory_id": {"type": "keyword"}, - "source_territory_name": {"type": "keyword"}, - "source_url": {"type": "keyword"}, - } - }, - "settings": { - "index": { - "sort.field": ["source_territory_id", "source_date"], - "sort.order": ["asc", "desc"] - }, - "analysis": { - "filter": { - "brazilian_stemmer": { - "type": "stemmer", - "language": "brazilian", - } - }, - "analyzer": { - "brazilian_with_stopwords": { - "tokenizer": "standard", - "filter": ["lowercase", "brazilian_stemmer"], - }, - "exact": { - "tokenizer": "standard", - "filter": ["lowercase"], - }, - }, - } - }, - } - index.create_index(index_name=theme["index"], body=body) - - -def get_excerpts_from_gazettes_with_themed_query( - query: Dict, gazette_ids: List[str], index: IndexInterface -) -> Iterable[Dict]: - es_query = get_es_query_from_themed_query(query, gazette_ids, index) - documents = get_documents_from_query_with_highlights(es_query, index) - for document in documents: - gazette = document["_source"] - excerpts = document["highlight"]["source_text.with_stopwords"] - for excerpt in excerpts: - yield { - "excerpt": preprocess_excerpt(excerpt), - "excerpt_subthemes": [query["title"]], - "excerpt_id": generate_excerpt_id(excerpt, gazette), - "source_index_id": gazette["file_checksum"], - "source_created_at": gazette["created_at"], - "source_database_id": gazette["id"], - "source_date": gazette["date"], - "source_edition_number": gazette["edition_number"], - "source_file_raw_txt": gazette["file_raw_txt"], - "source_is_extra_edition": gazette["is_extra_edition"], - "source_file_checksum": gazette["file_checksum"], - "source_file_path": gazette["file_path"], - "source_file_url": gazette["file_url"], - "source_power": gazette["power"], - "source_processed": gazette["processed"], - "source_scraped_at": gazette["scraped_at"], - "source_state_code": gazette["state_code"], - "source_territory_id": gazette["territory_id"], - "source_territory_name": gazette["territory_name"], - "source_url": gazette["url"], - } - - -def generate_excerpt_id(excerpt: str, gazette: Dict) -> str: - hash = hashlib.md5() - hash.update(excerpt.encode()) - return f"{gazette['file_checksum']}_{hash.hexdigest()}" - - -def get_es_query_from_themed_query( - query: Dict, - gazette_ids: List[str], - index: IndexInterface, -) -> Dict: - es_query = { - "query": {"bool": {"must": [], "filter": {"ids": {"values": gazette_ids}}}}, - "size": 100, - "highlight": { - "fields": { - "source_text.with_stopwords": { - "type": "unified", - "fragment_size": 2000, - "number_of_fragments": 10, - "pre_tags": [""], - "post_tags": [""], - } - }, - }, - } - - macro_synonym_block = {"span_or": {"clauses": []}} - for macro_set in query["term_sets"]: - proximity_block = {"span_near": {"clauses": [], "slop": 20, "in_order": False}} - for term_set in macro_set: - synonym_block = {"span_or": {"clauses": []}} - for term in term_set: - phrase_block = { - "span_near": {"clauses": [], "slop": 0, "in_order": True} - } - tokenized_term = index.analyze(text=term, field="source_text.with_stopwords") - for token in 
tokenized_term["tokens"]: - word_block = {"span_term": {"source_text.with_stopwords": token["token"]}} - phrase_block["span_near"]["clauses"].append(word_block) - synonym_block["span_or"]["clauses"].append(phrase_block) - proximity_block["span_near"]["clauses"].append(synonym_block) - macro_synonym_block["span_or"]["clauses"].append(proximity_block) - - es_query["query"]["bool"]["must"].append(macro_synonym_block) - return es_query - - -def preprocess_excerpt(excerpt: str) -> str: - return clean_extra_whitespaces(excerpt) diff --git a/scripts/tasks/gazette_themes_listing.py b/scripts/tasks/gazette_themes_listing.py deleted file mode 100644 index 1dbb60c..0000000 --- a/scripts/tasks/gazette_themes_listing.py +++ /dev/null @@ -1,13 +0,0 @@ -import json -import pathlib -from typing import Dict, List - - -def get_themes() -> List[Dict]: - ROOT = pathlib.Path(__file__).parent.parent - themes_config = ROOT / "config" / "themes_config.json" - - with themes_config.open() as f: - themes = json.load(f)["themes"] - - return themes diff --git a/scripts/tasks/interfaces.py b/scripts/tasks/interfaces.py deleted file mode 100644 index 06b81cb..0000000 --- a/scripts/tasks/interfaces.py +++ /dev/null @@ -1,105 +0,0 @@ -from typing import Dict, Iterable, Tuple -import abc - - -class DatabaseInterface(abc.ABC): - """ - Interface to abstract the iteraction with the database storing data used by the - tasks - """ - - @abc.abstractmethod - def _commit_changes(self, command: str, data: Dict) -> None: - """ - Make a change in the database and commit it - """ - - @abc.abstractmethod - def select(self, command: str) -> Iterable[Tuple]: - """ - Select entries from the database - """ - - @abc.abstractmethod - def insert(self, command: str, data: Dict) -> None: - """ - Insert entries into the database - """ - - @abc.abstractmethod - def update(self, command: str, data: Dict) -> None: - """ - Update entries from the database - """ - - @abc.abstractmethod - def delete(self, command: str, data: Dict) -> None: - """ - Delete entries from the database - """ - - -class StorageInterface(abc.ABC): - """ - Interface to abstract the interaction with the object store system. 
- """ - - @abc.abstractmethod - def get_file(self, file_to_be_downloaded: str, destination) -> None: - """ - Download the given file key in the destination on the host - """ - - @abc.abstractmethod - def upload_content(self, file_key: str, content_to_be_uploaded: str) -> None: - """ - Upload the given content to the destination on the host - """ - - -class IndexInterface(abc.ABC): - """ - Interface to abstract the interaction with the index system - """ - - @abc.abstractmethod - def create_index(self, index_name: str, body: Dict) -> None: - """ - Create the index used by the application - """ - - @abc.abstractmethod - def refresh_index(self, index_name: str) -> None: - """ - Refreshes the index to make it up-to-date for future searches - """ - - @abc.abstractmethod - def index_document( - self, document: Dict, document_id: str, index: str, refresh: bool - ) -> None: - """ - Upload document to the index - """ - - @abc.abstractmethod - def search(self, query: Dict, index: str) -> Dict: - """ - Searches the index with the provided query - """ - - @abc.abstractmethod - def paginated_search( - self, query: Dict, index: str, keep_alive: str - ) -> Iterable[Dict]: - """ - Searches the index with the provided query, with pagination - """ - - -class TextExtractorInterface(abc.ABC): - @abc.abstractmethod - def extract_text(self, filepath: str) -> str: - """ - Extract the text from the given file - """ diff --git a/scripts/tasks/list_gazettes_to_be_processed.py b/scripts/tasks/list_gazettes_to_be_processed.py deleted file mode 100644 index 1547e7b..0000000 --- a/scripts/tasks/list_gazettes_to_be_processed.py +++ /dev/null @@ -1,143 +0,0 @@ -import logging -from typing import Dict, Iterable - -from .interfaces import DatabaseInterface - - -def get_gazettes_to_be_processed( - execution_mode: str, database: DatabaseInterface -) -> Iterable[Dict]: - if execution_mode == "DAILY": - yield from get_gazettes_extracted_since_yesterday(database) - elif execution_mode == "ALL": - yield from get_all_gazettes_extracted(database) - elif execution_mode == "UNPROCESSED": - yield from get_unprocessed_gazettes(database) - else: - raise Exception(f'Execution mode "{execution_mode}" is invalid.') - - -def get_gazettes_extracted_since_yesterday( - database: DatabaseInterface, -) -> Iterable[Dict]: - """ - List the gazettes which were extracted since yesterday - """ - logging.info("Listing gazettes extracted since yesterday") - - command = """ - SELECT - gazettes.id, - gazettes.source_text, - gazettes.date, - gazettes.edition_number, - gazettes.is_extra_edition, - gazettes.power, - gazettes.file_checksum, - gazettes.file_path, - gazettes.file_url, - gazettes.scraped_at, - gazettes.created_at, - gazettes.territory_id, - gazettes.processed, - territories.name as territory_name, - territories.state_code - FROM - gazettes - INNER JOIN territories ON territories.id = gazettes.territory_id - WHERE - scraped_at > current_timestamp - interval '1 day' - ; - """ - for gazette in database.select(command): - yield format_gazette_data(gazette) - - -def get_all_gazettes_extracted( - database: DatabaseInterface, -) -> Iterable[Dict]: - """ - List all the gazettes which were extracted - """ - logging.info("Listing all gazettes extracted") - - command = """ - SELECT - gazettes.id, - gazettes.source_text, - gazettes.date, - gazettes.edition_number, - gazettes.is_extra_edition, - gazettes.power, - gazettes.file_checksum, - gazettes.file_path, - gazettes.file_url, - gazettes.scraped_at, - gazettes.created_at, - gazettes.territory_id, - 
gazettes.processed, - territories.name as territory_name, - territories.state_code - FROM - gazettes - INNER JOIN territories ON territories.id = gazettes.territory_id - ; - """ - for gazette in database.select(command): - yield format_gazette_data(gazette) - - -def get_unprocessed_gazettes( - database: DatabaseInterface, -) -> Iterable[Dict]: - """ - List all the gazettes which were extracted - """ - logging.info("Listing all gazettes extracted") - - command = """ - SELECT - gazettes.id, - gazettes.source_text, - gazettes.date, - gazettes.edition_number, - gazettes.is_extra_edition, - gazettes.power, - gazettes.file_checksum, - gazettes.file_path, - gazettes.file_url, - gazettes.scraped_at, - gazettes.created_at, - gazettes.territory_id, - gazettes.processed, - territories.name as territory_name, - territories.state_code - FROM - gazettes - INNER JOIN territories ON territories.id = gazettes.territory_id - WHERE - processed is False - ; - """ - for gazette in database.select(command): - yield format_gazette_data(gazette) - - -def format_gazette_data(data): - return { - "id": data[0], - "source_text": data[1], - "date": data[2], - "edition_number": data[3], - "is_extra_edition": data[4], - "power": data[5], - "file_checksum": data[6], - "file_path": data[7], - "file_url": data[8], - "scraped_at": data[9], - "created_at": data[10], - "territory_id": data[11], - "processed": data[12], - "territory_name": data[13], - "state_code": data[14], - } diff --git a/scripts/tasks/utils/__init__.py b/scripts/tasks/utils/__init__.py deleted file mode 100644 index 1bd9cf3..0000000 --- a/scripts/tasks/utils/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .index import ( - get_documents_from_query_with_highlights, - get_documents_with_ids, -) -from .text import clean_extra_whitespaces diff --git a/scripts/tasks/utils/index.py b/scripts/tasks/utils/index.py deleted file mode 100644 index 83d769c..0000000 --- a/scripts/tasks/utils/index.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Dict, Iterable, List - -from ..interfaces import IndexInterface - - -def get_documents_with_ids( - ids: List[str], index: IndexInterface, index_name: str = "" -) -> Iterable[Dict]: - query_filter_by_ids = { - "query": {"bool": {"filter": {"ids": {"values": ids}}}}, - "size": 100, - } - yield from get_documents_from_query(query_filter_by_ids, index, index_name) - - -def get_documents_from_query( - query: Dict, index: IndexInterface, index_name: str = "" -) -> Iterable[Dict]: - index.refresh_index(index_name) - documents = ( - hit - for result in index.paginated_search(query, index=index_name) - for hit in result["hits"]["hits"] - ) - yield from documents - - -def get_documents_from_query_with_highlights( - query: Dict, index: IndexInterface, index_name: str = "" -) -> Iterable[Dict]: - index.refresh_index(index_name) - documents = ( - hit - for result in index.paginated_search(query, index=index_name) - for hit in result["hits"]["hits"] - if hit.get("highlight") - ) - yield from documents diff --git a/scripts/tasks/utils/text.py b/scripts/tasks/utils/text.py deleted file mode 100644 index 1cc7c39..0000000 --- a/scripts/tasks/utils/text.py +++ /dev/null @@ -1,5 +0,0 @@ -import re - - -def clean_extra_whitespaces(text: str) -> str: - return re.sub(r"\s+", " ", text) From 05d357c2109aa0c122d57cac46cf5f2e44f83abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 14:57:20 -0300 Subject: [PATCH 04/19] Update README.md MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/README.md b/README.md index 31e4c9a..2dd4ff6 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,88 @@ make build make setup ``` +----------------------------- +- [x]  como configurar as credenciais em ambos os projetos para que eles se comuniquem +- [ ]  como realizar um seed no data-processing usando um spider do querido-diario + +Para configurar as credenciais é necessário mudar alguns parâmetros em settings.py. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. + +Mude os seguintes parâmetros: + +~~~Python +###linha xxx +FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") + +### linha xx +QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) + +### linhas 52 a 56 +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") + +# Substitua por +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") +~~~ + + +- **Linux** + + +- **Windows** + +1. **Usando WSL** +Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). + +Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. + +Caso haja um erro com cython_sources, assim como na imagem: + +![[Pasted image 20231005102449.png]] + +Faça esse procedimento e instale os requirements-dev novamente: + +~~~Linux +pip3 install wheel -v +pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v +~~~ + +Caso haja um erro com legacy-install +![[Pasted image 20231005104343.png]] +![[Pasted image 20231005103545.png]] + +Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: + +~~~Linux +python3 -m pip install --upgrade pip +sudo apt-get install build-essential libssl-dev libffi-dev python3-dev +~~~ + +2. **Usando o terminal do Windows** + +Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... + +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: + +~~~Linux +pip install -r data_collection/requirements-dev.txt --no-deps +~~~ + +Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . 
Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … + +Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) + +Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. + +- **Mac** + +... ## Populate data Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). From 504ae96a6f3a19d12523ca83055ae3ab8c3cedb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:01:10 -0300 Subject: [PATCH 05/19] Create configurando_ambientes.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- configurando_ambientes.md | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 configurando_ambientes.md diff --git a/configurando_ambientes.md b/configurando_ambientes.md new file mode 100644 index 0000000..3531b14 --- /dev/null +++ b/configurando_ambientes.md @@ -0,0 +1,51 @@ +## Como os Projetos se relacionam + +O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário no [[fluxograma_1.png]]. As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). + +Esse é o objetivo principal, mas não é o único, já que além da possibilidade da colaboração por meio do desenvolvimento, é também possível aplicar as técnicas de PLN em um _dataset_ específico. + +## Configurando seu ambiente de Desenvolvimento + +Sempre fique ligado(a) ao documento de [Contribuição](https://github.com/okfn-brasil/querido-diario-comunidade/blob/main/.github/CONTRIBUTING.md#ecossistema); nele é possível verificar as exigências básicas como formatação _black_, configuração de ambiente seguro, detalhamento nas _[[issues e pull requests]]_. Lembre-se também que as **issues e pull requests são uma parte da documentação do projeto**! + +Sabendo desses pontos, é necessário configurar o ambiente de trabalho. Existem três diferentes sistemas operacionais que são compatíveis com o ambiente desenvolvido: Linux (o padrão e raiz), Windows e Mac. Vamos explorar cada um deles. + +### Linux + +Se você já trabalha com Linux, seguir as orientações de instalação contidas no [repositório](https://github.com/okfn-brasil/querido-diario-data-processing) será suficiente para instalar o ambiente.
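Depois de rodar o `make setup`, uma forma rápida de conferir se os serviços locais subiram é um pequeno teste de conexão. O esboço abaixo é apenas ilustrativo: ele supõe que as credenciais de exemplo citadas neste guia estão em uso (banco `queridodiariodb` em `127.0.0.1:5432` com usuário e senha `queridodiario`, e MinIO em `localhost:9000` com `minio-access-key`/`minio-secret-key`) e que as bibliotecas `psycopg2` e `boto3` estão disponíveis no seu ambiente.

~~~Python
# Esboço de verificação do ambiente local (valores de exemplo deste guia;
# ajuste host, porta e credenciais se o seu ambiente for diferente).
import boto3
import psycopg2


def checar_postgres() -> None:
    conexao = psycopg2.connect(
        host="127.0.0.1",
        port=5432,
        dbname="queridodiariodb",
        user="queridodiario",
        password="queridodiario",
    )
    with conexao, conexao.cursor() as cursor:
        # Conta os diários por status de processamento.
        cursor.execute("SELECT processed, count(1) FROM gazettes GROUP BY processed;")
        for processado, total in cursor.fetchall():
            print(f"processed={processado}: {total} diários")
    conexao.close()


def checar_minio() -> None:
    s3 = boto3.client(
        "s3",
        endpoint_url="http://localhost:9000",
        aws_access_key_id="minio-access-key",
        aws_secret_access_key="minio-secret-key",
        region_name="us-east-1",
    )
    resposta = s3.list_objects_v2(Bucket="queridodiariobucket", MaxKeys=5)
    chaves = [objeto["Key"] for objeto in resposta.get("Contents", [])]
    print(f"bucket acessível; exemplos de chaves: {chaves}")


if __name__ == "__main__":
    checar_postgres()
    checar_minio()
~~~

Se alguma das duas verificações falhar, confira se os contêineres criados pelo `make setup` ainda estão em execução antes de seguir adiante.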
+ +Alguns possíveis problemas que talvez precisem de um cuidado são os relacionados à conexão do ecossistema com o [[querido-diario]]. Veja em [[conectar ao querido-diario]]. + +### Windows +#### Utilizando WSL + +Para realizar essa etapa é necessário instalar o WSL na sua máquina Windows e instalar um sistema operacional. Veja esse tutorial de [[Instalando WSL]] caso tenha dúvidas. + +Dentro da sua máquina Linux já é possível seguir as instruções de instalação do ambiente contidas no repositório em [Setup](). Instale o Podman e inicie o ambiente virtual. Um comando de cada vez. + +~~~Linux + sudo apt-get update + ## sudo apt update && sudo apt upgrade ##testar + sudo apt-get -y install podman + + sudo apt install python3.10-venv + python3 -m venv .venv + source .venv/bin/activate ### Ativando o ambiente virtual + + sudo apt install make ### Caso apresente erro de instalação + make build ### Somente a 1ª vez + make setup + ~~~ + +Teste para ver se o seu ambiente funciona: +~~~Linux +make shell-database +~~~ + +![[Pasted image 20231005100446.png]] +![[Pasted image 20231005100534.png]] + +Após essa etapa é necessário **[[conectar ao querido-diario]]**, ou seja, conectar-se ao banco de dados gerado pelo repositório [[querido-diario]], o qual é responsável por extrair os diários oficiais. Se a conexão não for feita, esse repositório não possui documentos para processar. Faça o fork dos repositórios [[querido-diario]] e [[querido-diario-data-processing]] na sua conta do GitHub e a partir daí faça o clone desses repositórios para a sua máquina Linux. +### Mac + +... From 497d9dfcc478b47af8ea1c8db00bdb5b43c65592 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:02:07 -0300 Subject: [PATCH 06/19] Create conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 conectando_qd.md diff --git a/conectando_qd.md b/conectando_qd.md new file mode 100644 index 0000000..5fc2905 --- /dev/null +++ b/conectando_qd.md @@ -0,0 +1,91 @@ +## Configurando as credenciais para a comunicação dos dois projetos + +Para configurar as credenciais é necessário mudar alguns parâmetros em **settings.py**. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py.
+ +Mude os seguintes parâmetros: + +~~~Python +###linha 21 +FILES_STORE = config("FILES_STORE", default="data") + +### Substitua por: +FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") + +### linhas 44 a 46 +QUERIDODIARIO_DATABASE_URL = config( + "QUERIDODIARIO_DATABASE_URL", default="sqlite:///querido-diario.db" +) + +### Substitua por: +QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) + +### linhas 52 a 56 +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") + +# Substitua por +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") +~~~ + +Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível +## Configurando o ambiente do querido-diario + +### Linux + + +### Windows + +#### Usando WSL + +Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). + +Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. + +Caso haja um erro com cython_sources, assim como na imagem: + +![[Pasted image 20231005102449.png]] + +Faça esse procedimento e instale os requirements-dev novamente: + +~~~Linux +pip3 install wheel -v +pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v +~~~ + +Caso haja um erro com legacy-install +![[Pasted image 20231005104343.png]] +![[Pasted image 20231005103545.png]] + +Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: + +~~~Linux +python3 -m pip install --upgrade pip +sudo apt-get install build-essential libssl-dev libffi-dev python3-dev +~~~ + +#### Usando o terminal do Windows + +Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... + +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: + +~~~Linux +pip install -r data_collection/requirements-dev.txt --no-deps +~~~ + +Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … + +Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. 
Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) + +Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. + +- **Mac** + +... From 70d63e7cea81bccd9fbefd501ddeeec8f5211a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:08:15 -0300 Subject: [PATCH 07/19] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 84 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 2dd4ff6..d57ef57 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) + # querido-diario-data-processing ## Setup @@ -12,88 +14,6 @@ make build make setup ``` ------------------------------ -- [x]  como configurar as credenciais em ambos os projetos para que eles se comuniquem -- [ ]  como realizar um seed no data-processing usando um spider do querido-diario - -Para configurar as credenciais é necessário mudar alguns parâmetros em settings.py. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. - -Mude os seguintes parâmetros: - -~~~Python -###linha xxx -FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") - -### linha xx -QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) - -### linhas 52 a 56 -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") - -# Substitua por -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") -~~~ - - -- **Linux** - - -- **Windows** - -1. **Usando WSL** -Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). - -Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. 
- -Caso haja um erro com cython_sources, assim como na imagem: - -![[Pasted image 20231005102449.png]] - -Faça esse procedimento e instale os requirements-dev novamente: - -~~~Linux -pip3 install wheel -v -pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v -~~~ - -Caso haja um erro com legacy-install -![[Pasted image 20231005104343.png]] -![[Pasted image 20231005103545.png]] - -Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: - -~~~Linux -python3 -m pip install --upgrade pip -sudo apt-get install build-essential libssl-dev libffi-dev python3-dev -~~~ - -2. **Usando o terminal do Windows** - -Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... - -Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: - -~~~Linux -pip install -r data_collection/requirements-dev.txt --no-deps -~~~ - -Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … - -Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) - -Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. - -- **Mac** - -... ## Populate data Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). From 841231227f2dbb7c3f9b405bf765377965c3f465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:10:16 -0300 Subject: [PATCH 08/19] Create tutorial.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- tutorial.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tutorial.md diff --git a/tutorial.md b/tutorial.md new file mode 100644 index 0000000..31e4c9a --- /dev/null +++ b/tutorial.md @@ -0,0 +1,40 @@ +# querido-diario-data-processing + +## Setup + +- [Install podman](https://podman.io/getting-started/installation) +- execute build stage (only the first time): +```console +make build +``` +- execute setup stage: +```console +make setup +``` + +## Populate data +Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). 
+ +- you can see created data inside [storage](http://localhost:9000/minio/queridodiariobucket) using [local credentials](contrib/sample.env#L3) +- you can see gazettes not processed yet connecting on database +- open database console in a new terminal +```console +make shell-database +``` +- and run a query to see gazettes not processed +```sql +select processed, count(1) from gazettes g group by processed; +``` + +## Run +- execute processing stage: +```console +make re-run +``` +- and see gazettes processed running the query above +- you can search using ElasticSearch +```console +curl 'http://localhost:9200/querido-diario/_search' \ + -H 'Content-Type: application/json' \ + --data-raw '{"query":{"query_string":{"query":"*"}},"size":2}' +``` From 09e40e19e037ebab47b76b422e14cc1b0b14f956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:17:52 -0300 Subject: [PATCH 09/19] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 70 ++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index d57ef57..f6d1c9b 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,28 @@ -[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) - -# querido-diario-data-processing - -## Setup - -- [Install podman](https://podman.io/getting-started/installation) -- execute build stage (only the first time): -```console -make build -``` -- execute setup stage: -```console -make setup -``` - -## Populate data -Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). - -- you can see created data inside [storage](http://localhost:9000/minio/queridodiariobucket) using [local credentials](contrib/sample.env#L3) -- you can see gazettes not processed yet connecting on database -- open database console in a new terminal -```console -make shell-database -``` -- and run a query to see gazettes not processed -```sql -select processed, count(1) from gazettes g group by processed; -``` - -## Run -- execute processing stage: -```console -make re-run -``` -- and see gazettes processed running the query above -- you can search using ElasticSearch -```console -curl 'http://localhost:9200/querido-diario/_search' \ - -H 'Content-Type: application/json' \ - --data-raw '{"query":{"query_string":{"query":"*"}},"size":2}' -``` +[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) + +## O processamento de dados + +É responsável pelo [repositório](https://github.com/okfn-brasil/querido-diario-data-processing). 
O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário na Figura abaixo. +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/cd6b5589-f4e7-45a0-86a9-5cbb0bf14cb7) + +As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). + +Veja a estrutura completa do projeto [aqui](https://docs.queridodiario.ok.org.br/pt/latest/). + +### Entendendo a estrutura do querido-diario-data-processing + +1. Montando o ambiente de trabalho + +A pasta "scripts" é responsável pelo ambiente de trabalho. + +2. Extração do texto + +"data_extraction" + +3. Processamento do texto + +A pasta "tasks" + +4. Armazenamento + +"database" e "storage" From 275e5a62df9b0764377a11e42d01b9159628f8f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:20:59 -0300 Subject: [PATCH 10/19] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f6d1c9b..5296bd6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ -[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US ## O processamento de dados -É responsável pelo [repositório](https://github.com/okfn-brasil/querido-diario-data-processing). O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário na Figura abaixo. +O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário na Figura abaixo.
![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/cd6b5589-f4e7-45a0-86a9-5cbb0bf14cb7) As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). From e40e4e5d957c49d925e5f0d1139b58857195f8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:30:47 -0300 Subject: [PATCH 11/19] Create wsl_windows.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- wsl_windows.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 wsl_windows.md diff --git a/wsl_windows.md b/wsl_windows.md new file mode 100644 index 0000000..4102379 --- /dev/null +++ b/wsl_windows.md @@ -0,0 +1,48 @@ +O WSL é uma sigla para Subsistema de Windows para Linux, tradução de _Windows Subsystem for Linux_, + +O sistema do Querido Diário foi totalmente desenvolvido para Linux e por isso algumas configurações não funcionam para Windows, sabendo disso uma das maneiras menos trabalhosas é configurar um subsistema para Linux, através do WSL. + +Primeiramente é necessário executar o **Windows Power Shell** como administrador. No terminal digite: + +~~~ Windows PowerShell (admin) +wsl --install ### Instalando o WSL +~~~ + +Atenção: Recursos mais atuais do WSL exigem um sistema operacional Windows mais recentes (a partir do Windows 10). + +Após isso, será possível configurar um nome de usuário e senha para que você possa logar na sua nova máquina. Feito isso é necessário configurar o ambiente para o Querido Diário. É necessário ter Python, Git, Podman e o próprio repositório na sua nova máquina. +Nas máquinas Linux normalmente já está instalado o Python, verifique a partir desse comando: +~~~Linux +python --version +~~~ + +A partir disso é possível atualizar ou dar continuidade com a instalação do ambiente de trabalho. Para instalar o Podman (para trabalhar com dockers) siga o [tutorial](https://podman.io/docs/installation) de instalação e vá até "Installing on Linux". Somente com a instalação já é possível iniciar o ambiente (utilizando o Makefile). + + +~~~Linux +sudo apt install python3-venv -y +sudo apt install python3.10-venv +python3 -m venv .venv +source .venv/bin/activate +~~~ + +Ao iniciar uma nova máquina, já é possível acessá-la no menu iniciar do Windows. Por exemplo, caso tenha instalado o Ubuntu, pesquise assim: +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/233e1427-2557-4c7e-ae35-40a2b7fccbf9) + +Caso ao iniciar seu terminal Linux apareça o erro **"Error: 0x80370114 Não foi possível iniciar a operação porque um recurso necessário não foi instalado"** ,tente habilitar os recursos Hyper-V. Para isso, digite "hyper-V" em Pesquisar e aparecerá uma opção de "Ativar ou desativar recursos do Windows". + +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/c82bba62-5225-40bb-8e55-ad5b39b3b5c4) + +Selecione Plataforma do Hipervisor do Windows e clique em Ok. Após esse procedimento, reinicie a sua máquina. 
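Antes de clonar os repositórios, pode ser útil confirmar que as ferramentas citadas nesta página estão visíveis dentro da máquina WSL. O esboço abaixo é apenas ilustrativo e supõe somente que o Python 3 já está instalado:

~~~Python
# Esboço: verifica se as ferramentas usadas neste guia estão no PATH da máquina WSL.
import shutil

for ferramenta in ("python3", "git", "podman", "make"):
    caminho = shutil.which(ferramenta)
    print(f"{ferramenta}: {caminho or 'NÃO ENCONTRADA - instale antes de continuar'}")
~~~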
+ + A partir daí já será possível realizar o git clone de um repositório forked do [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) e então criar e iniciar um ambiente virtual: +~~~Linux +git clone repositorio_forked_querido-diario +git clone repositorio_forked_querido-diario-data-processing +~~~ + +Caso nesta etapa tenha dado algum erro de conexão ao host do github, tente reiniciar o terminal Linux pelo comando: + +~~~Linux +sudo shutdown -h now +~~~ From a7ccb35caa0ff7c3d6e25449906acc7c981def62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:35:38 -0300 Subject: [PATCH 12/19] Update conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/conectando_qd.md b/conectando_qd.md index 5fc2905..5befd7d 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -44,27 +44,23 @@ Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e o #### Usando WSL -Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). +Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](https://github.com/okfn-brasil/querido-diario). Se tiver dúvidas, acesse o [tutorial de instalação do WSL no Windows](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/wsl_windows.md). -Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. +Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario](https://github.com/okfn-brasil/querido-diario) em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. Caso haja um erro com cython_sources, assim como na imagem: - -![[Pasted image 20231005102449.png]] +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/57afdb93-26cd-4ddc-be43-53cd4fd60365) Faça esse procedimento e instale os requirements-dev novamente: - ~~~Linux pip3 install wheel -v pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v ~~~ Caso haja um erro com legacy-install -![[Pasted image 20231005104343.png]] -![[Pasted image 20231005103545.png]] +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/2040db6a-0d47-404f-aa98-2d2204a6ff4c) Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: - ~~~Linux python3 -m pip install --upgrade pip sudo apt-get install build-essential libssl-dev libffi-dev python3-dev @@ -74,10 +70,10 @@ sudo apt-get install build-essential libssl-dev libffi-dev python3-dev Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... 
-Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install junto com o comando --no-deps, dessa forma: ~~~Linux -pip install -r data_collection/requirements-dev.txt --no-deps +pip3 install -r data_collection/requirements-dev.txt --no-deps ~~~ Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … From 27dc8ba481bd3e48cf4abf08cf32ada1db5ca817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:03:34 -0300 Subject: [PATCH 13/19] Update conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conectando_qd.md b/conectando_qd.md index 5befd7d..6d9d392 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -34,7 +34,10 @@ AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") ~~~ -Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível +Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... + +Acesse os diários baixados através desse link: http://localhost:9000/minio/queridodiariobucket + ## Configurando o ambiente do querido-diario ### Linux From cf918ab8b0a5c155511fff348b10f9ad2e7fb41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:09:23 -0300 Subject: [PATCH 14/19] Update tutorial.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. 
Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- tutorial.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tutorial.md b/tutorial.md index 31e4c9a..1d3ebfe 100644 --- a/tutorial.md +++ b/tutorial.md @@ -1,3 +1,7 @@ +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US + # querido-diario-data-processing ## Setup From 23eaffda78c25179a6c4e583f4412d872124bad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:09:38 -0300 Subject: [PATCH 15/19] Update configurando_ambientes.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- configurando_ambientes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configurando_ambientes.md b/configurando_ambientes.md index 3531b14..747161d 100644 --- a/configurando_ambientes.md +++ b/configurando_ambientes.md @@ -1,3 +1,7 @@ +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US + ## Como os Projetos se relacionam O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário no [[fluxograma_1.png]]. As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). From 164ddf47cb7a3fe70672cbdd92d6c02f6276dc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:09:54 -0300 Subject: [PATCH 16/19] Update conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. 
Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conectando_qd.md b/conectando_qd.md index 6d9d392..d8ad964 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -1,3 +1,7 @@ +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US + ## Configurando as credenciais para a comunicação dos dois projetos Para configurar as credenciais é necessário mudar alguns parâmetros em **settings.py**. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. From 7e9f0ae0a90c8694a81d7bc96f94103918c6b7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Sat, 14 Oct 2023 08:37:35 -0300 Subject: [PATCH 17/19] .env --- conectando_qd.md | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/conectando_qd.md b/conectando_qd.md index d8ad964..04c3bb1 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -4,43 +4,21 @@ EN/US ## Configurando as credenciais para a comunicação dos dois projetos -Para configurar as credenciais é necessário mudar alguns parâmetros em **settings.py**. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. - -Mude os seguintes parâmetros: - -~~~Python -###linha 21 -FILES_STORE = config("FILES_STORE", default="data") - -### Substitua por: -FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") - -### linhas 44 a 46 -QUERIDODIARIO_DATABASE_URL = config( - "QUERIDODIARIO_DATABASE_URL", default="sqlite:///querido-diario.db" -) - -### Substitua por: -QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) - -### linhas 52 a 56 -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") - -# Substitua por -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") +Para configurar as credenciais é necessário vincular os dois projetos como um só. Para isso é necessário **criar um arquivo .env** na raiz do repositório [querido-diario]() e inserir parâmetros coincidentes com os do repositório [querido-diario-data-processing](). Depois de ter realizado o fork do querido-diario, abra este repositório na sua máquina e insira um arquivo .env com as seguintes informações.
+ +~~~.env +AWS_ACCESS_KEY_ID=minio-access-key +AWS_SECRET_ACCESS_KEY=minio-secret-key +AWS_ENDPOINT_URL=http://127.0.0.1:9000/ +AWS_REGION_NAME=us-east-1 +FILES_STORE=s3://queridodiariobucket/ +FILES_STORE_S3_ACL=public-read +QUERIDODIARIO_DATABASE_URL=postgresql+psycopg2://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb ~~~ -Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... +A variável .env já está como ignorada no projeto do querido-diario, portanto não é necessário mudar mais nada. Para executar a requisição abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... -Acesse os diários baixados através desse link: http://localhost:9000/minio/queridodiariobucket +Acesse os diários baixados através desse link: **http://localhost:9000/minio/queridodiariobucket** ## Configurando o ambiente do querido-diario From 54daf39111be1f1500ac9fae9397bfb2fa8a454c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Sat, 14 Oct 2023 09:20:24 -0300 Subject: [PATCH 18/19] arrumando arquivos --- conectando_qd.md | 40 ++++++++++++++++++++++++-------------- configurando_ambientes.md | 2 +- scripts/Dockerfile_windows | 23 ---------------------- tutorial.md | 2 +- 4 files changed, 27 insertions(+), 40 deletions(-) delete mode 100644 scripts/Dockerfile_windows diff --git a/conectando_qd.md b/conectando_qd.md index 04c3bb1..97a13a7 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -1,4 +1,4 @@ -PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) +PT/BR [Tutorial geral](tutorial.md) | [Configurando os diferentes ambientes](configurando_ambientes.md) | [Conectando ao querido-diario](conectando_qd.md) EN/US @@ -18,22 +18,31 @@ QUERIDODIARIO_DATABASE_URL=postgresql+psycopg2://queridodiario:queridodiario@127 A variável .env já está como ignorada no projeto do querido-diario, portanto não é necessário mudar mais nada. Para executar a requisição abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... -Acesse os diários baixados através desse link: **http://localhost:9000/minio/queridodiariobucket** +Acesse os diários baixados através desse link: **http://localhost:9000/minio/queridodiariobucket**. + +Essa etapa tem que dar certo para qualquer tipo de sistema operacional, utilizando Linux ou WSL no Windows. ## Configurando o ambiente do querido-diario -### Linux +É importante seguir as instruções gerais no repositório no repositório [querido-diario](). Iniciar um ambiente virtual, instalar o arquivo de _requirements_ bem como o pre-commit. 
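O arquivo .env criado acima é lido pelas configurações do raspador por meio das chamadas `config(...)` mostradas nas versões anteriores deste documento. Supondo que esse utilitário seja o `config` da biblioteca python-decouple (uma suposição com base na assinatura das chamadas), a resolução dos valores funciona mais ou menos como neste esboço, em que variáveis de ambiente têm prioridade sobre o .env na raiz do repositório:

~~~Python
# Esboço ilustrativo (supõe python-decouple): cada variável é buscada primeiro no
# ambiente e depois no arquivo .env; o valor de "default" só é usado se nada for encontrado.
from decouple import config

FILES_STORE = config("FILES_STORE", default="data")
QUERIDODIARIO_DATABASE_URL = config(
    "QUERIDODIARIO_DATABASE_URL", default="sqlite:///querido-diario.db"
)

# Com o .env mostrado acima presente, os prints mostram o bucket do MinIO e a URL
# do Postgres em vez dos valores padrão.
print(FILES_STORE)
print(QUERIDODIARIO_DATABASE_URL)
~~~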
Caso sinta dificuldades ao configurar o ambiente, é possível consultar o material de "lidando com erros" preparado. + +### Lidando com erros na Configuração +#### Linux +Os erros em Linux são menos comuns mas podem ocorrer devido a novas atualizações. É provável que você encontre essas soluções nas seções de WSL, Windows e Mac. -### Windows +Não se preocupe, como as atualizações são constantes logo o erro será resolvido e para isso você pode informá-lo em uma nova issue, pull request ou entrando em contato com os mantenedores pelo [Discord da Open Knowledge Brasil](). -#### Usando WSL +#### Windows + +##### Usando WSL Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](https://github.com/okfn-brasil/querido-diario). Se tiver dúvidas, acesse o [tutorial de instalação do WSL no Windows](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/wsl_windows.md). Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario](https://github.com/okfn-brasil/querido-diario) em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. Caso haja um erro com cython_sources, assim como na imagem: + ![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/57afdb93-26cd-4ddc-be43-53cd4fd60365) Faça esse procedimento e instale os requirements-dev novamente: @@ -43,6 +52,7 @@ pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v ~~~ Caso haja um erro com legacy-install + ![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/2040db6a-0d47-404f-aa98-2d2204a6ff4c) Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: @@ -51,22 +61,22 @@ python3 -m pip install --upgrade pip sudo apt-get install build-essential libssl-dev libffi-dev python3-dev ~~~ -#### Usando o terminal do Windows +##### Usando o terminal do Windows -Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... +Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira baixar os diários diretamente na sua máquina utilizando o Windows, é possível seguir as configurações no tutorial geral do [querido-diario]() levando em conta os possível erros que podem aparecer. -Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install junto com o comando --no-deps, dessa forma: +É necessário que as configurações C++ estejam instaladas. Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … -~~~Linux -pip3 install -r data_collection/requirements-dev.txt --no-deps -~~~ +Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) -Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . 
Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … +Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. Instale e siga o resto do tutorial de configuração. -Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install junto com o comando --no-deps, dessa forma: -Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. +~~~Windows +pip3 install -r data_collection/requirements-dev.txt --no-deps +~~~ -- **Mac** +#### Mac ... diff --git a/configurando_ambientes.md b/configurando_ambientes.md index 747161d..fedfcbc 100644 --- a/configurando_ambientes.md +++ b/configurando_ambientes.md @@ -1,4 +1,4 @@ -PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) +PT/BR [Tutorial geral](tutorial.md) | [Configurando os diferentes ambientes](configurando_ambientes.md) | [Conectando ao querido-diario](conectando_qd.md) EN/US diff --git a/scripts/Dockerfile_windows b/scripts/Dockerfile_windows deleted file mode 100644 index df8b6e9..0000000 --- a/scripts/Dockerfile_windows +++ /dev/null @@ -1,23 +0,0 @@ -FROM docker.io/python:3.10 - -ENV USER gazette -ENV USER_HOME /home/$USER -ENV WORKDIR /tasks - -RUN net user --system $USER --home $USER_HOME && \ - apt-get update -y && \ - curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ - apt-get -y install git-lfs wait-for-it && \ - apt-get clean && \ - git lfs install && \ - mkdir $WORKDIR - -ENV PYTHONPATH $WORKDIR -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY . 
$WORKDIR -WORKDIR $WORKDIR -USER $USER - -RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" diff --git a/tutorial.md b/tutorial.md index 1d3ebfe..039dc60 100644 --- a/tutorial.md +++ b/tutorial.md @@ -1,4 +1,4 @@ -PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) +PT/BR [Tutorial geral](tutorial.md) | [Configurando os diferentes ambientes](configurando_ambientes.md) | [Conectando ao querido-diario](conectando_qd.md) EN/US From 1b20217b7db3fefb6aa1d72c6924064a613113b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Thu, 26 Oct 2023 16:22:12 -0300 Subject: [PATCH 19/19] add .venv gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3762ce4..9333dc2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ __pycache__ .coverage envvars contrib/data +.venv