From e4a3f715733ab631e9138dc69b105999874c9f6c Mon Sep 17 00:00:00 2001 From: Luisa-Coelho Date: Wed, 13 Sep 2023 16:54:56 -0300 Subject: [PATCH 01/19] new Makefile Windows --- Makefile | 12 +----------- scripts/Dockerfile | 2 +- scripts/requirements.txt | 11 +++++++++++ 3 files changed, 13 insertions(+), 12 deletions(-) create mode 100644 scripts/requirements.txt diff --git a/Makefile b/Makefile index d5fc959..7f1f649 100644 --- a/Makefile +++ b/Makefile @@ -37,16 +37,6 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ --env POSTGRES_PORT=$(POSTGRES_PORT) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) -wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ - --env POSTGRES_USER=$(POSTGRES_USER) \ - --env POSTGRES_DB=$(POSTGRES_DB) \ - --env POSTGRES_HOST=$(POSTGRES_HOST) \ - --env POSTGRES_PORT=$(POSTGRES_PORT) \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) wait-for-it --timeout=60 $1) - .PHONY: black black: podman run --rm -ti --volume $(PWD):/mnt/code:rw \ @@ -197,7 +187,7 @@ else endif set-run-variable-values: - cp --no-clobber contrib/sample.env envvars || true + copy /y contrib\sample.env envvars $(eval POD_NAME=run-$(POD_NAME)) $(eval DATABASE_CONTAINER_NAME=run-$(DATABASE_CONTAINER_NAME)) $(eval ELASTICSEARCH_CONTAINER_NAME=run-$(ELASTICSEARCH_CONTAINER_NAME)) diff --git a/scripts/Dockerfile b/scripts/Dockerfile index 139d337..129d662 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -1,4 +1,4 @@ -FROM docker.io/python:3.8 +FROM docker.io/python:3.10 ENV USER gazette ENV USER_HOME /home/$USER diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..92894c1 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,11 @@ +black==19.10b0 +coverage==5.2.1 +python-magic==0.4.18 +boto3==1.22.6 +psycopg2==2.8.6 +botocore==1.25.6 +elasticsearch==7.17.3 +requests==2.25.0 +scikit-learn==1.0.2 +sentence-transformers==2.2.0 +huggingface-hub==0.10.1 # fix: https://github.com/UKPLab/sentence-transformers/issues/1762 From 94d100fe8668e455148df748c3a5baec7c9a46e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Thu, 26 Oct 2023 16:48:42 -0300 Subject: [PATCH 02/19] windows_settings --- Makefile | 14 +- contrib/import-env.py | 7 + scripts/Dockerfile | 4 +- scripts/Dockerfile_original | 23 ++ scripts/tasks/__init__.py | 12 + .../gazette_excerpts_embedding_reranking.py | 39 ++++ .../gazette_excerpts_entities_tagging.py | 105 +++++++++ scripts/tasks/gazette_text_extraction.py | 219 ++++++++++++++++++ .../gazette_themed_excerpts_extraction.py | 191 +++++++++++++++ scripts/tasks/gazette_themes_listing.py | 13 ++ scripts/tasks/interfaces.py | 105 +++++++++ .../tasks/list_gazettes_to_be_processed.py | 143 ++++++++++++ scripts/tasks/utils/__init__.py | 5 + scripts/tasks/utils/index.py | 38 +++ scripts/tasks/utils/text.py | 5 + 15 files changed, 919 insertions(+), 4 deletions(-) create mode 100644 contrib/import-env.py create mode 100644 scripts/Dockerfile_original create mode 100644 scripts/tasks/__init__.py create mode 100644 scripts/tasks/gazette_excerpts_embedding_reranking.py create mode 100644 scripts/tasks/gazette_excerpts_entities_tagging.py create mode 100644 scripts/tasks/gazette_text_extraction.py create mode 100644 scripts/tasks/gazette_themed_excerpts_extraction.py create mode 100644 scripts/tasks/gazette_themes_listing.py create mode 100644 scripts/tasks/interfaces.py create mode 100644 
scripts/tasks/list_gazettes_to_be_processed.py create mode 100644 scripts/tasks/utils/__init__.py create mode 100644 scripts/tasks/utils/index.py create mode 100644 scripts/tasks/utils/text.py diff --git a/Makefile b/Makefile index 7f1f649..5a17846 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,16 @@ run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ --env POSTGRES_PORT=$(POSTGRES_PORT) \ $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) +wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ + --pod $(POD_NAME) \ + --env PYTHONPATH=/mnt/code \ + --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ + --env POSTGRES_USER=$(POSTGRES_USER) \ + --env POSTGRES_DB=$(POSTGRES_DB) \ + --env POSTGRES_HOST=$(POSTGRES_HOST) \ + --env POSTGRES_PORT=$(POSTGRES_PORT) \ + $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) wait-for-it --timeout=60 $1) + .PHONY: black black: podman run --rm -ti --volume $(PWD):/mnt/code:rw \ @@ -187,7 +197,7 @@ else endif set-run-variable-values: - copy /y contrib\sample.env envvars + cp --no-clobber contrib/sample.env envvars || true $(eval POD_NAME=run-$(POD_NAME)) $(eval DATABASE_CONTAINER_NAME=run-$(DATABASE_CONTAINER_NAME)) $(eval ELASTICSEARCH_CONTAINER_NAME=run-$(ELASTICSEARCH_CONTAINER_NAME)) @@ -243,4 +253,4 @@ wait-elasticsearch: .PHONY: publish-tag publish-tag: podman tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):${IMAGE_TAG} $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) - podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) + podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) \ No newline at end of file diff --git a/contrib/import-env.py b/contrib/import-env.py new file mode 100644 index 0000000..4cfbfd3 --- /dev/null +++ b/contrib/import-env.py @@ -0,0 +1,7 @@ +import os +from dotenv import load_dotenv + +try: + load_dotenv() # Load environment variables from a .env file +except Exception as e: + print(f"Error loading .env file: {e}") \ No newline at end of file diff --git a/scripts/Dockerfile b/scripts/Dockerfile index 129d662..df8b6e9 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -2,9 +2,9 @@ FROM docker.io/python:3.10 ENV USER gazette ENV USER_HOME /home/$USER -ENV WORKDIR /mnt/code +ENV WORKDIR /tasks -RUN adduser --system $USER --home $USER_HOME && \ +RUN net user --system $USER --home $USER_HOME && \ apt-get update -y && \ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ apt-get -y install git-lfs wait-for-it && \ diff --git a/scripts/Dockerfile_original b/scripts/Dockerfile_original new file mode 100644 index 0000000..1fff372 --- /dev/null +++ b/scripts/Dockerfile_original @@ -0,0 +1,23 @@ +FROM docker.io/python:3.8 + +ENV USER gazette +ENV USER_HOME /home/$USER +ENV WORKDIR /mnt/code + +RUN adduser --system $USER --home $USER_HOME && \ + apt-get update -y && \ + curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ + apt-get -y install git-lfs wait-for-it && \ + apt-get clean && \ + git lfs install && \ + mkdir $WORKDIR + +ENV PYTHONPATH $WORKDIR +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . 
$WORKDIR +WORKDIR $WORKDIR +USER $USER + +RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" \ No newline at end of file diff --git a/scripts/tasks/__init__.py b/scripts/tasks/__init__.py new file mode 100644 index 0000000..bb16ccd --- /dev/null +++ b/scripts/tasks/__init__.py @@ -0,0 +1,12 @@ +from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts +from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts +from .gazette_text_extraction import extract_text_from_gazettes +from .gazette_themed_excerpts_extraction import extract_themed_excerpts_from_gazettes +from .gazette_themes_listing import get_themes +from .interfaces import ( + DatabaseInterface, + StorageInterface, + IndexInterface, + TextExtractorInterface, +) +from .list_gazettes_to_be_processed import get_gazettes_to_be_processed diff --git a/scripts/tasks/gazette_excerpts_embedding_reranking.py b/scripts/tasks/gazette_excerpts_embedding_reranking.py new file mode 100644 index 0000000..3919056 --- /dev/null +++ b/scripts/tasks/gazette_excerpts_embedding_reranking.py @@ -0,0 +1,39 @@ +import os +from typing import Dict, List + +import sentence_transformers + +from .interfaces import IndexInterface +from .utils import get_documents_with_ids + + +def embedding_rerank_excerpts( + theme: Dict, excerpt_ids: List[str], index: IndexInterface +) -> None: + user_folder = os.environ["HOME"] + model = sentence_transformers.SentenceTransformer( + f"{user_folder}/models/bert-base-portuguese-cased" + ) + queries = get_natural_language_queries(theme) + queries_vectors = model.encode(queries, convert_to_tensor=True) + + excerpts = ( + excerpt["_source"] + for excerpt in get_documents_with_ids(excerpt_ids, index, theme["index"]) + ) + for excerpt in excerpts: + excerpt_vector = model.encode(excerpt["excerpt"], convert_to_tensor=True) + excerpt_max_score = sentence_transformers.util.semantic_search( + excerpt_vector, queries_vectors, top_k=1 + ) + excerpt["excerpt_embedding_score"] = excerpt_max_score[0][0]["score"] + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) + + +def get_natural_language_queries(theme: Dict) -> List[str]: + return [query["title"] for query in theme["queries"]] diff --git a/scripts/tasks/gazette_excerpts_entities_tagging.py b/scripts/tasks/gazette_excerpts_entities_tagging.py new file mode 100644 index 0000000..8c67303 --- /dev/null +++ b/scripts/tasks/gazette_excerpts_entities_tagging.py @@ -0,0 +1,105 @@ +import re +from typing import Dict, List + +from .interfaces import IndexInterface +from .utils import ( + get_documents_from_query_with_highlights, + get_documents_with_ids, +) + + +def tag_entities_in_excerpts( + theme: Dict, excerpt_ids: List[str], index: IndexInterface +) -> None: + tag_theme_cases(theme, excerpt_ids, index) + tag_cnpjs(theme, excerpt_ids, index) + + +def tag_theme_cases(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: + cases = theme["entities"]["cases"] + es_queries = [get_es_query_from_entity_case(case, excerpt_ids) for case in cases] + for case, es_query in zip(cases, es_queries): + documents = get_documents_from_query_with_highlights( + es_query, index, theme["index"] + ) + for document in documents: + excerpt = document["_source"] + highlight = document["highlight"][ + "excerpt.with_stopwords" + ][0] + excerpt.update( + { + "excerpt_entities": list( 
+ set(excerpt.get("excerpt_entities", [])) | {case["title"]} + ), + "excerpt": highlight, + } + ) + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) + + +def get_es_query_from_entity_case( + case: Dict, + excerpt_ids: List[str], +) -> Dict: + es_query = { + "query": {"bool": {"should": [], "filter": {"ids": {"values": excerpt_ids}}}}, + "size": 100, + "highlight": { + "fields": { + "excerpt.with_stopwords": { # Allows tagging phrases containing stopwords correctly + "type": "fvh", # Only highlighter to tag phrases correctly and not the tokens individually + "matched_fields": ["excerpt", "excerpt.with_stopwords"], + "fragment_size": 10000, + "number_of_fragments": 1, + "pre_tags": [f"<{case['category']}>"], + "post_tags": [f"</{case['category']}>"], + } + }, + }, + } + for value in case["values"]: + es_query["query"]["bool"]["should"].append( + {"match_phrase": {"excerpt.with_stopwords": value}} + ) + + return es_query + + +def tag_cnpjs(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: + excerpts = ( + document["_source"] + for document in get_documents_with_ids(excerpt_ids, index, theme["index"]) + ) + cnpj_regex = re.compile( + r""" + (^|[^\d]) # left boundary: start of string or not-a-digit + (\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d/?\d{4}-?\d{2}) # cnpj + ($|[^\d]) # right boundary: end of string or not-a-digit + """, + re.VERBOSE, + ) + for excerpt in excerpts: + found_cnpjs = re.findall(cnpj_regex, excerpt["excerpt"]) + if not found_cnpjs: + continue + + for _, cnpj, _ in set(found_cnpjs): + excerpt["excerpt"] = excerpt["excerpt"].replace( + cnpj, f"<CNPJ>{cnpj}</CNPJ>" + ) + + excerpt["excerpt_entities"] = list( + set(excerpt.get("excerpt_entities", [])) | {"CNPJ"} + ) + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) diff --git a/scripts/tasks/gazette_text_extraction.py b/scripts/tasks/gazette_text_extraction.py new file mode 100644 index 0000000..846b8a5 --- /dev/null +++ b/scripts/tasks/gazette_text_extraction.py @@ -0,0 +1,219 @@ +import logging +import tempfile +import os +from pathlib import Path +from typing import Dict, Iterable, List + +from .interfaces import ( + DatabaseInterface, + IndexInterface, + StorageInterface, + TextExtractorInterface, +) + + +def extract_text_from_gazettes( + gazettes: Iterable[Dict], + database: DatabaseInterface, + storage: StorageInterface, + index: IndexInterface, + text_extractor: TextExtractorInterface, +) -> List[str]: + """ + Extracts the text from a list of gazettes + """ + logging.info("Starting text extraction from gazettes") + create_index(index) + + ids = [] + for gazette in gazettes: + try: + processed_gazette = try_process_gazette_file( + gazette, database, storage, index, text_extractor + ) + except Exception as e: + logging.warning( + f"Could not process gazette: {gazette['file_path']}. 
Cause: {e}" + ) + else: + ids.append(processed_gazette["file_checksum"]) + + return ids + + +def try_process_gazette_file( + gazette: Dict, + database: DatabaseInterface, + storage: StorageInterface, + index: IndexInterface, + text_extractor: TextExtractorInterface, +) -> Dict: + """ + Do all the work to extract the content from the gazette files + """ + logging.debug(f"Processing gazette {gazette['file_path']}") + gazette_file = download_gazette_file(gazette, storage) + get_gazette_text_and_define_url(gazette, gazette_file, text_extractor) + upload_gazette_raw_text(gazette, storage) + index.index_document(gazette, document_id=gazette["file_checksum"]) + delete_gazette_files(gazette_file) + set_gazette_as_processed(gazette, database) + return gazette + + +def create_index(index: IndexInterface) -> None: + body = { + "mappings": { + "properties": { + "created_at": {"type": "date"}, + "date": {"type": "date"}, + "edition_number": { + "type": "text", + "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, + }, + "file_checksum": {"type": "keyword"}, + "file_path": {"type": "keyword"}, + "file_url": {"type": "keyword"}, + "id": {"type": "keyword"}, + "is_extra_edition": {"type": "boolean"}, + "power": {"type": "keyword"}, + "processed": {"type": "boolean"}, + "scraped_at": {"type": "date"}, + "source_text": { + "type": "text", + "analyzer": "brazilian", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + "fields": { + "with_stopwords": { + "type": "text", + "analyzer": "brazilian_with_stopwords", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + }, + "exact": { + "type": "text", + "analyzer": "exact", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + } + }, + }, + "state_code": {"type": "keyword"}, + "territory_id": {"type": "keyword"}, + "territory_name": { + "type": "text", + "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, + }, + "url": {"type": "keyword"}, + } + }, + "settings": { + "index": { + "sort.field": ["territory_id", "date"], + "sort.order": ["asc", "desc"] + }, + "analysis": { + "filter": { + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian", + } + }, + "analyzer": { + "brazilian_with_stopwords": { + "tokenizer": "standard", + "filter": ["lowercase", "brazilian_stemmer"], + }, + "exact": { + "tokenizer": "standard", + "filter": ["lowercase"], + }, + }, + } + }, + } + index.create_index(body=body) + + +def upload_gazette_raw_text(gazette: Dict, storage): + """ + Define gazette raw text + """ + file_raw_txt = Path(gazette["file_path"]).with_suffix(".txt").as_posix() + storage.upload_content(file_raw_txt, gazette["source_text"]) + logging.debug(f"file_raw_txt uploaded {file_raw_txt}") + file_endpoint = get_file_endpoint() + gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}" + + +def get_gazette_text_and_define_url( + gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface +): + """ + Extract file content and define the url to access the file in the storage + """ + gazette["source_text"] = try_to_extract_content(gazette_file, text_extractor) + file_endpoint = get_file_endpoint() + gazette["url"] = f"{file_endpoint}/{gazette['file_path']}" + + +def get_file_endpoint() -> str: + """ + Get the endpoint where the gazette files can be downloaded. 
+ """ + return os.environ["QUERIDO_DIARIO_FILES_ENDPOINT"] + + +def try_to_extract_content( + gazette_file: str, text_extractor: TextExtractorInterface +) -> str: + """ + Calls the function to extract the content from the gazette file. If it fails + remove the gazette file and raise an exception + """ + try: + return text_extractor.extract_text(gazette_file) + except Exception as e: + os.remove(gazette_file) + raise e + + +def delete_gazette_files(gazette_file: str) -> None: + """ + Removes the files used to process the gazette content. + """ + os.remove(gazette_file) + + +def download_gazette_file(gazette: Dict, storage: StorageInterface) -> str: + """ + Download the file from the object storage and write it down in the local + disk to allow the text extraction + """ + with tempfile.NamedTemporaryFile(delete=False) as tmpfile: + gazette_file_key = get_gazette_file_key_used_in_storage(gazette) + storage.get_file(gazette_file_key, tmpfile) + return tmpfile.name + + +def get_gazette_file_key_used_in_storage(gazette: Dict) -> str: + """ + Get the file key used to store the gazette in the object storage + """ + return gazette["file_path"] + + +def set_gazette_as_processed(gazette: Dict, database: DatabaseInterface) -> None: + command = """ + UPDATE gazettes + SET processed = True + WHERE id = %(id)s + AND file_checksum = %(file_checksum)s + ; + """ + id = gazette["id"] + checksum = gazette["file_checksum"] + data = {"id": id, "file_checksum": checksum} + logging.debug(f"Marking {id}({checksum}) as processed") + database.update(command, data) diff --git a/scripts/tasks/gazette_themed_excerpts_extraction.py b/scripts/tasks/gazette_themed_excerpts_extraction.py new file mode 100644 index 0000000..1e87c89 --- /dev/null +++ b/scripts/tasks/gazette_themed_excerpts_extraction.py @@ -0,0 +1,191 @@ +import hashlib +from typing import Dict, Iterable, List + +from .interfaces import IndexInterface +from .utils import clean_extra_whitespaces, get_documents_from_query_with_highlights + + +def extract_themed_excerpts_from_gazettes( + theme: Dict, gazette_ids: List[str], index: IndexInterface +) -> List[str]: + create_index(theme, index) + + ids = [] + for theme_query in theme["queries"]: + for excerpt in get_excerpts_from_gazettes_with_themed_query( + theme_query, gazette_ids, index + ): + # excerpts with less than 10% of the expected size of excerpt account for + # fewer than 1% of excerpts yet their score is usually high + if len(excerpt["excerpt"]) < 200: + continue + + index.index_document( + excerpt, + document_id=excerpt["excerpt_id"], + index=theme["index"], + refresh=True, + ) + ids.append(excerpt["excerpt_id"]) + + return ids + + +def create_index(theme: Dict, index: IndexInterface) -> None: + body = { + "mappings": { + "properties": { + "excerpt_embedding_score": {"type": "rank_feature"}, + "excerpt_subthemes": {"type": "keyword"}, + "excerpt_entities": {"type": "keyword"}, + "excerpt": { + "type": "text", + "analyzer": "brazilian", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + "fields": { + "with_stopwords": { + "type": "text", + "analyzer": "brazilian_with_stopwords", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + }, + "exact": { + "type": "text", + "analyzer": "exact", + "index_options": "offsets", + "term_vector": "with_positions_offsets", + }, + }, + }, + "excerpt_id": {"type": "keyword"}, + "source_database_id": {"type": "long"}, + "source_index_id": {"type": "keyword"}, + "source_created_at": {"type": "date"}, + "source_date": 
{"type": "date"}, + "source_edition_number": {"type": "keyword"}, + "source_file_checksum": {"type": "keyword"}, + "source_file_path": {"type": "keyword"}, + "source_file_raw_txt": {"type": "keyword"}, + "source_file_url": {"type": "keyword"}, + "source_is_extra_edition": {"type": "boolean"}, + "source_power": {"type": "keyword"}, + "source_processed": {"type": "boolean"}, + "source_scraped_at": {"type": "date"}, + "source_state_code": {"type": "keyword"}, + "source_territory_id": {"type": "keyword"}, + "source_territory_name": {"type": "keyword"}, + "source_url": {"type": "keyword"}, + } + }, + "settings": { + "index": { + "sort.field": ["source_territory_id", "source_date"], + "sort.order": ["asc", "desc"] + }, + "analysis": { + "filter": { + "brazilian_stemmer": { + "type": "stemmer", + "language": "brazilian", + } + }, + "analyzer": { + "brazilian_with_stopwords": { + "tokenizer": "standard", + "filter": ["lowercase", "brazilian_stemmer"], + }, + "exact": { + "tokenizer": "standard", + "filter": ["lowercase"], + }, + }, + } + }, + } + index.create_index(index_name=theme["index"], body=body) + + +def get_excerpts_from_gazettes_with_themed_query( + query: Dict, gazette_ids: List[str], index: IndexInterface +) -> Iterable[Dict]: + es_query = get_es_query_from_themed_query(query, gazette_ids, index) + documents = get_documents_from_query_with_highlights(es_query, index) + for document in documents: + gazette = document["_source"] + excerpts = document["highlight"]["source_text.with_stopwords"] + for excerpt in excerpts: + yield { + "excerpt": preprocess_excerpt(excerpt), + "excerpt_subthemes": [query["title"]], + "excerpt_id": generate_excerpt_id(excerpt, gazette), + "source_index_id": gazette["file_checksum"], + "source_created_at": gazette["created_at"], + "source_database_id": gazette["id"], + "source_date": gazette["date"], + "source_edition_number": gazette["edition_number"], + "source_file_raw_txt": gazette["file_raw_txt"], + "source_is_extra_edition": gazette["is_extra_edition"], + "source_file_checksum": gazette["file_checksum"], + "source_file_path": gazette["file_path"], + "source_file_url": gazette["file_url"], + "source_power": gazette["power"], + "source_processed": gazette["processed"], + "source_scraped_at": gazette["scraped_at"], + "source_state_code": gazette["state_code"], + "source_territory_id": gazette["territory_id"], + "source_territory_name": gazette["territory_name"], + "source_url": gazette["url"], + } + + +def generate_excerpt_id(excerpt: str, gazette: Dict) -> str: + hash = hashlib.md5() + hash.update(excerpt.encode()) + return f"{gazette['file_checksum']}_{hash.hexdigest()}" + + +def get_es_query_from_themed_query( + query: Dict, + gazette_ids: List[str], + index: IndexInterface, +) -> Dict: + es_query = { + "query": {"bool": {"must": [], "filter": {"ids": {"values": gazette_ids}}}}, + "size": 100, + "highlight": { + "fields": { + "source_text.with_stopwords": { + "type": "unified", + "fragment_size": 2000, + "number_of_fragments": 10, + "pre_tags": [""], + "post_tags": [""], + } + }, + }, + } + + macro_synonym_block = {"span_or": {"clauses": []}} + for macro_set in query["term_sets"]: + proximity_block = {"span_near": {"clauses": [], "slop": 20, "in_order": False}} + for term_set in macro_set: + synonym_block = {"span_or": {"clauses": []}} + for term in term_set: + phrase_block = { + "span_near": {"clauses": [], "slop": 0, "in_order": True} + } + tokenized_term = index.analyze(text=term, field="source_text.with_stopwords") + for token in 
tokenized_term["tokens"]: + word_block = {"span_term": {"source_text.with_stopwords": token["token"]}} + phrase_block["span_near"]["clauses"].append(word_block) + synonym_block["span_or"]["clauses"].append(phrase_block) + proximity_block["span_near"]["clauses"].append(synonym_block) + macro_synonym_block["span_or"]["clauses"].append(proximity_block) + + es_query["query"]["bool"]["must"].append(macro_synonym_block) + return es_query + + +def preprocess_excerpt(excerpt: str) -> str: + return clean_extra_whitespaces(excerpt) diff --git a/scripts/tasks/gazette_themes_listing.py b/scripts/tasks/gazette_themes_listing.py new file mode 100644 index 0000000..1dbb60c --- /dev/null +++ b/scripts/tasks/gazette_themes_listing.py @@ -0,0 +1,13 @@ +import json +import pathlib +from typing import Dict, List + + +def get_themes() -> List[Dict]: + ROOT = pathlib.Path(__file__).parent.parent + themes_config = ROOT / "config" / "themes_config.json" + + with themes_config.open() as f: + themes = json.load(f)["themes"] + + return themes diff --git a/scripts/tasks/interfaces.py b/scripts/tasks/interfaces.py new file mode 100644 index 0000000..06b81cb --- /dev/null +++ b/scripts/tasks/interfaces.py @@ -0,0 +1,105 @@ +from typing import Dict, Iterable, Tuple +import abc + + +class DatabaseInterface(abc.ABC): + """ + Interface to abstract the iteraction with the database storing data used by the + tasks + """ + + @abc.abstractmethod + def _commit_changes(self, command: str, data: Dict) -> None: + """ + Make a change in the database and commit it + """ + + @abc.abstractmethod + def select(self, command: str) -> Iterable[Tuple]: + """ + Select entries from the database + """ + + @abc.abstractmethod + def insert(self, command: str, data: Dict) -> None: + """ + Insert entries into the database + """ + + @abc.abstractmethod + def update(self, command: str, data: Dict) -> None: + """ + Update entries from the database + """ + + @abc.abstractmethod + def delete(self, command: str, data: Dict) -> None: + """ + Delete entries from the database + """ + + +class StorageInterface(abc.ABC): + """ + Interface to abstract the interaction with the object store system. 
+ """ + + @abc.abstractmethod + def get_file(self, file_to_be_downloaded: str, destination) -> None: + """ + Download the given file key in the destination on the host + """ + + @abc.abstractmethod + def upload_content(self, file_key: str, content_to_be_uploaded: str) -> None: + """ + Upload the given content to the destination on the host + """ + + +class IndexInterface(abc.ABC): + """ + Interface to abstract the interaction with the index system + """ + + @abc.abstractmethod + def create_index(self, index_name: str, body: Dict) -> None: + """ + Create the index used by the application + """ + + @abc.abstractmethod + def refresh_index(self, index_name: str) -> None: + """ + Refreshes the index to make it up-to-date for future searches + """ + + @abc.abstractmethod + def index_document( + self, document: Dict, document_id: str, index: str, refresh: bool + ) -> None: + """ + Upload document to the index + """ + + @abc.abstractmethod + def search(self, query: Dict, index: str) -> Dict: + """ + Searches the index with the provided query + """ + + @abc.abstractmethod + def paginated_search( + self, query: Dict, index: str, keep_alive: str + ) -> Iterable[Dict]: + """ + Searches the index with the provided query, with pagination + """ + + +class TextExtractorInterface(abc.ABC): + @abc.abstractmethod + def extract_text(self, filepath: str) -> str: + """ + Extract the text from the given file + """ diff --git a/scripts/tasks/list_gazettes_to_be_processed.py b/scripts/tasks/list_gazettes_to_be_processed.py new file mode 100644 index 0000000..1547e7b --- /dev/null +++ b/scripts/tasks/list_gazettes_to_be_processed.py @@ -0,0 +1,143 @@ +import logging +from typing import Dict, Iterable + +from .interfaces import DatabaseInterface + + +def get_gazettes_to_be_processed( + execution_mode: str, database: DatabaseInterface +) -> Iterable[Dict]: + if execution_mode == "DAILY": + yield from get_gazettes_extracted_since_yesterday(database) + elif execution_mode == "ALL": + yield from get_all_gazettes_extracted(database) + elif execution_mode == "UNPROCESSED": + yield from get_unprocessed_gazettes(database) + else: + raise Exception(f'Execution mode "{execution_mode}" is invalid.') + + +def get_gazettes_extracted_since_yesterday( + database: DatabaseInterface, +) -> Iterable[Dict]: + """ + List the gazettes which were extracted since yesterday + """ + logging.info("Listing gazettes extracted since yesterday") + + command = """ + SELECT + gazettes.id, + gazettes.source_text, + gazettes.date, + gazettes.edition_number, + gazettes.is_extra_edition, + gazettes.power, + gazettes.file_checksum, + gazettes.file_path, + gazettes.file_url, + gazettes.scraped_at, + gazettes.created_at, + gazettes.territory_id, + gazettes.processed, + territories.name as territory_name, + territories.state_code + FROM + gazettes + INNER JOIN territories ON territories.id = gazettes.territory_id + WHERE + scraped_at > current_timestamp - interval '1 day' + ; + """ + for gazette in database.select(command): + yield format_gazette_data(gazette) + + +def get_all_gazettes_extracted( + database: DatabaseInterface, +) -> Iterable[Dict]: + """ + List all the gazettes which were extracted + """ + logging.info("Listing all gazettes extracted") + + command = """ + SELECT + gazettes.id, + gazettes.source_text, + gazettes.date, + gazettes.edition_number, + gazettes.is_extra_edition, + gazettes.power, + gazettes.file_checksum, + gazettes.file_path, + gazettes.file_url, + gazettes.scraped_at, + gazettes.created_at, + gazettes.territory_id, + 
gazettes.processed, + territories.name as territory_name, + territories.state_code + FROM + gazettes + INNER JOIN territories ON territories.id = gazettes.territory_id + ; + """ + for gazette in database.select(command): + yield format_gazette_data(gazette) + + +def get_unprocessed_gazettes( + database: DatabaseInterface, +) -> Iterable[Dict]: + """ + List all the gazettes which were extracted + """ + logging.info("Listing all gazettes extracted") + + command = """ + SELECT + gazettes.id, + gazettes.source_text, + gazettes.date, + gazettes.edition_number, + gazettes.is_extra_edition, + gazettes.power, + gazettes.file_checksum, + gazettes.file_path, + gazettes.file_url, + gazettes.scraped_at, + gazettes.created_at, + gazettes.territory_id, + gazettes.processed, + territories.name as territory_name, + territories.state_code + FROM + gazettes + INNER JOIN territories ON territories.id = gazettes.territory_id + WHERE + processed is False + ; + """ + for gazette in database.select(command): + yield format_gazette_data(gazette) + + +def format_gazette_data(data): + return { + "id": data[0], + "source_text": data[1], + "date": data[2], + "edition_number": data[3], + "is_extra_edition": data[4], + "power": data[5], + "file_checksum": data[6], + "file_path": data[7], + "file_url": data[8], + "scraped_at": data[9], + "created_at": data[10], + "territory_id": data[11], + "processed": data[12], + "territory_name": data[13], + "state_code": data[14], + } diff --git a/scripts/tasks/utils/__init__.py b/scripts/tasks/utils/__init__.py new file mode 100644 index 0000000..1bd9cf3 --- /dev/null +++ b/scripts/tasks/utils/__init__.py @@ -0,0 +1,5 @@ +from .index import ( + get_documents_from_query_with_highlights, + get_documents_with_ids, +) +from .text import clean_extra_whitespaces diff --git a/scripts/tasks/utils/index.py b/scripts/tasks/utils/index.py new file mode 100644 index 0000000..83d769c --- /dev/null +++ b/scripts/tasks/utils/index.py @@ -0,0 +1,38 @@ +from typing import Dict, Iterable, List + +from ..interfaces import IndexInterface + + +def get_documents_with_ids( + ids: List[str], index: IndexInterface, index_name: str = "" +) -> Iterable[Dict]: + query_filter_by_ids = { + "query": {"bool": {"filter": {"ids": {"values": ids}}}}, + "size": 100, + } + yield from get_documents_from_query(query_filter_by_ids, index, index_name) + + +def get_documents_from_query( + query: Dict, index: IndexInterface, index_name: str = "" +) -> Iterable[Dict]: + index.refresh_index(index_name) + documents = ( + hit + for result in index.paginated_search(query, index=index_name) + for hit in result["hits"]["hits"] + ) + yield from documents + + +def get_documents_from_query_with_highlights( + query: Dict, index: IndexInterface, index_name: str = "" +) -> Iterable[Dict]: + index.refresh_index(index_name) + documents = ( + hit + for result in index.paginated_search(query, index=index_name) + for hit in result["hits"]["hits"] + if hit.get("highlight") + ) + yield from documents diff --git a/scripts/tasks/utils/text.py b/scripts/tasks/utils/text.py new file mode 100644 index 0000000..1cc7c39 --- /dev/null +++ b/scripts/tasks/utils/text.py @@ -0,0 +1,5 @@ +import re + + +def clean_extra_whitespaces(text: str) -> str: + return re.sub(r"\s+", " ", text) From a8d710f97437bafd1ffbd6c3b500663f1f532b34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Wed, 4 Oct 2023 19:07:42 -0300 Subject: [PATCH 03/19] env readme_update --- scripts/Dockerfile | 8 +- ...Dockerfile_original => Dockerfile_windows} | 8 +- 
scripts/tasks/__init__.py | 12 - .../gazette_excerpts_embedding_reranking.py | 39 ---- .../gazette_excerpts_entities_tagging.py | 105 --------- scripts/tasks/gazette_text_extraction.py | 219 ------------------ .../gazette_themed_excerpts_extraction.py | 191 --------------- scripts/tasks/gazette_themes_listing.py | 13 -- scripts/tasks/interfaces.py | 105 --------- .../tasks/list_gazettes_to_be_processed.py | 143 ------------ scripts/tasks/utils/__init__.py | 5 - scripts/tasks/utils/index.py | 38 --- scripts/tasks/utils/text.py | 5 - 13 files changed, 8 insertions(+), 883 deletions(-) rename scripts/{Dockerfile_original => Dockerfile_windows} (78%) delete mode 100644 scripts/tasks/__init__.py delete mode 100644 scripts/tasks/gazette_excerpts_embedding_reranking.py delete mode 100644 scripts/tasks/gazette_excerpts_entities_tagging.py delete mode 100644 scripts/tasks/gazette_text_extraction.py delete mode 100644 scripts/tasks/gazette_themed_excerpts_extraction.py delete mode 100644 scripts/tasks/gazette_themes_listing.py delete mode 100644 scripts/tasks/interfaces.py delete mode 100644 scripts/tasks/list_gazettes_to_be_processed.py delete mode 100644 scripts/tasks/utils/__init__.py delete mode 100644 scripts/tasks/utils/index.py delete mode 100644 scripts/tasks/utils/text.py diff --git a/scripts/Dockerfile b/scripts/Dockerfile index df8b6e9..1fff372 100644 --- a/scripts/Dockerfile +++ b/scripts/Dockerfile @@ -1,10 +1,10 @@ -FROM docker.io/python:3.10 +FROM docker.io/python:3.8 ENV USER gazette ENV USER_HOME /home/$USER -ENV WORKDIR /tasks +ENV WORKDIR /mnt/code -RUN net user --system $USER --home $USER_HOME && \ +RUN adduser --system $USER --home $USER_HOME && \ apt-get update -y && \ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ apt-get -y install git-lfs wait-for-it && \ @@ -20,4 +20,4 @@ COPY . $WORKDIR WORKDIR $WORKDIR USER $USER -RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" +RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" \ No newline at end of file diff --git a/scripts/Dockerfile_original b/scripts/Dockerfile_windows similarity index 78% rename from scripts/Dockerfile_original rename to scripts/Dockerfile_windows index 1fff372..df8b6e9 100644 --- a/scripts/Dockerfile_original +++ b/scripts/Dockerfile_windows @@ -1,10 +1,10 @@ -FROM docker.io/python:3.8 +FROM docker.io/python:3.10 ENV USER gazette ENV USER_HOME /home/$USER -ENV WORKDIR /mnt/code +ENV WORKDIR /tasks -RUN adduser --system $USER --home $USER_HOME && \ +RUN net user --system $USER --home $USER_HOME && \ apt-get update -y && \ curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ apt-get -y install git-lfs wait-for-it && \ @@ -20,4 +20,4 @@ COPY . 
$WORKDIR WORKDIR $WORKDIR USER $USER -RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" \ No newline at end of file +RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" diff --git a/scripts/tasks/__init__.py b/scripts/tasks/__init__.py deleted file mode 100644 index bb16ccd..0000000 --- a/scripts/tasks/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts -from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts -from .gazette_text_extraction import extract_text_from_gazettes -from .gazette_themed_excerpts_extraction import extract_themed_excerpts_from_gazettes -from .gazette_themes_listing import get_themes -from .interfaces import ( - DatabaseInterface, - StorageInterface, - IndexInterface, - TextExtractorInterface, -) -from .list_gazettes_to_be_processed import get_gazettes_to_be_processed diff --git a/scripts/tasks/gazette_excerpts_embedding_reranking.py b/scripts/tasks/gazette_excerpts_embedding_reranking.py deleted file mode 100644 index 3919056..0000000 --- a/scripts/tasks/gazette_excerpts_embedding_reranking.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -from typing import Dict, List - -import sentence_transformers - -from .interfaces import IndexInterface -from .utils import get_documents_with_ids - - -def embedding_rerank_excerpts( - theme: Dict, excerpt_ids: List[str], index: IndexInterface -) -> None: - user_folder = os.environ["HOME"] - model = sentence_transformers.SentenceTransformer( - f"{user_folder}/models/bert-base-portuguese-cased" - ) - queries = get_natural_language_queries(theme) - queries_vectors = model.encode(queries, convert_to_tensor=True) - - excerpts = ( - excerpt["_source"] - for excerpt in get_documents_with_ids(excerpt_ids, index, theme["index"]) - ) - for excerpt in excerpts: - excerpt_vector = model.encode(excerpt["excerpt"], convert_to_tensor=True) - excerpt_max_score = sentence_transformers.util.semantic_search( - excerpt_vector, queries_vectors, top_k=1 - ) - excerpt["excerpt_embedding_score"] = excerpt_max_score[0][0]["score"] - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) - - -def get_natural_language_queries(theme: Dict) -> List[str]: - return [query["title"] for query in theme["queries"]] diff --git a/scripts/tasks/gazette_excerpts_entities_tagging.py b/scripts/tasks/gazette_excerpts_entities_tagging.py deleted file mode 100644 index 8c67303..0000000 --- a/scripts/tasks/gazette_excerpts_entities_tagging.py +++ /dev/null @@ -1,105 +0,0 @@ -import re -from typing import Dict, List - -from .interfaces import IndexInterface -from .utils import ( - get_documents_from_query_with_highlights, - get_documents_with_ids, -) - - -def tag_entities_in_excerpts( - theme: Dict, excerpt_ids: List[str], index: IndexInterface -) -> None: - tag_theme_cases(theme, excerpt_ids, index) - tag_cnpjs(theme, excerpt_ids, index) - - -def tag_theme_cases(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: - cases = theme["entities"]["cases"] - es_queries = [get_es_query_from_entity_case(case, excerpt_ids) for case in cases] - for case, es_query in zip(cases, es_queries): - documents = get_documents_from_query_with_highlights( - es_query, index, 
theme["index"] - ) - for document in documents: - excerpt = document["_source"] - highlight = document["highlight"][ - "excerpt.with_stopwords" - ][0] - excerpt.update( - { - "excerpt_entities": list( - set(excerpt.get("excerpt_entities", [])) | {case["title"]} - ), - "excerpt": highlight, - } - ) - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) - - -def get_es_query_from_entity_case( - case: Dict, - excerpt_ids: List[str], -) -> Dict: - es_query = { - "query": {"bool": {"should": [], "filter": {"ids": {"values": excerpt_ids}}}}, - "size": 100, - "highlight": { - "fields": { - "excerpt.with_stopwords": { # Allows tagging phrases containing stopwords correctly - "type": "fvh", # Only highlighter to tag phrases correctly and not the tokens individually - "matched_fields": ["excerpt", "excerpt.with_stopwords"], - "fragment_size": 10000, - "number_of_fragments": 1, - "pre_tags": [f"<{case['category']}>"], - "post_tags": [f""], - } - }, - }, - } - for value in case["values"]: - es_query["query"]["bool"]["should"].append( - {"match_phrase": {"excerpt.with_stopwords": value}} - ) - - return es_query - - -def tag_cnpjs(theme: Dict, excerpt_ids: List[str], index: IndexInterface) -> None: - excerpts = ( - document["_source"] - for document in get_documents_with_ids(excerpt_ids, index, theme["index"]) - ) - cnpj_regex = re.compile( - r""" - (^|[^\d]) # left boundary: start of string or not-a-digit - (\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d\.?\d/?\d{4}-?\d{2}) # cnpj - ($|[^\d]) # right boundary: end of string or not-a-digit - """, - re.VERBOSE, - ) - for excerpt in excerpts: - found_cnpjs = re.findall(cnpj_regex, excerpt["excerpt"]) - if not found_cnpjs: - continue - - for _, cnpj, _ in set(found_cnpjs): - excerpt["excerpt"] = excerpt["excerpt"].replace( - cnpj, f"{cnpj}" - ) - - excerpt["excerpt_entities"] = list( - set(excerpt.get("excerpt_entities", [])) | {"CNPJ"} - ) - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) diff --git a/scripts/tasks/gazette_text_extraction.py b/scripts/tasks/gazette_text_extraction.py deleted file mode 100644 index 846b8a5..0000000 --- a/scripts/tasks/gazette_text_extraction.py +++ /dev/null @@ -1,219 +0,0 @@ -import logging -import tempfile -import os -from pathlib import Path -from typing import Dict, Iterable, List - -from .interfaces import ( - DatabaseInterface, - IndexInterface, - StorageInterface, - TextExtractorInterface, -) - - -def extract_text_from_gazettes( - gazettes: Iterable[Dict], - database: DatabaseInterface, - storage: StorageInterface, - index: IndexInterface, - text_extractor: TextExtractorInterface, -) -> List[str]: - """ - Extracts the text from a list of gazettes - """ - logging.info("Starting text extraction from gazettes") - create_index(index) - - ids = [] - for gazette in gazettes: - try: - processed_gazette = try_process_gazette_file( - gazette, database, storage, index, text_extractor - ) - except Exception as e: - logging.warning( - f"Could not process gazette: {gazette['file_path']}. 
Cause: {e}" - ) - else: - ids.append(processed_gazette["file_checksum"]) - - return ids - - -def try_process_gazette_file( - gazette: Dict, - database: DatabaseInterface, - storage: StorageInterface, - index: IndexInterface, - text_extractor: TextExtractorInterface, -) -> Dict: - """ - Do all the work to extract the content from the gazette files - """ - logging.debug(f"Processing gazette {gazette['file_path']}") - gazette_file = download_gazette_file(gazette, storage) - get_gazette_text_and_define_url(gazette, gazette_file, text_extractor) - upload_gazette_raw_text(gazette, storage) - index.index_document(gazette, document_id=gazette["file_checksum"]) - delete_gazette_files(gazette_file) - set_gazette_as_processed(gazette, database) - return gazette - - -def create_index(index: IndexInterface) -> None: - body = { - "mappings": { - "properties": { - "created_at": {"type": "date"}, - "date": {"type": "date"}, - "edition_number": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "file_checksum": {"type": "keyword"}, - "file_path": {"type": "keyword"}, - "file_url": {"type": "keyword"}, - "id": {"type": "keyword"}, - "is_extra_edition": {"type": "boolean"}, - "power": {"type": "keyword"}, - "processed": {"type": "boolean"}, - "scraped_at": {"type": "date"}, - "source_text": { - "type": "text", - "analyzer": "brazilian", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - "fields": { - "with_stopwords": { - "type": "text", - "analyzer": "brazilian_with_stopwords", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - }, - "exact": { - "type": "text", - "analyzer": "exact", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - } - }, - }, - "state_code": {"type": "keyword"}, - "territory_id": {"type": "keyword"}, - "territory_name": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "url": {"type": "keyword"}, - } - }, - "settings": { - "index": { - "sort.field": ["territory_id", "date"], - "sort.order": ["asc", "desc"] - }, - "analysis": { - "filter": { - "brazilian_stemmer": { - "type": "stemmer", - "language": "brazilian", - } - }, - "analyzer": { - "brazilian_with_stopwords": { - "tokenizer": "standard", - "filter": ["lowercase", "brazilian_stemmer"], - }, - "exact": { - "tokenizer": "standard", - "filter": ["lowercase"], - }, - }, - } - }, - } - index.create_index(body=body) - - -def upload_gazette_raw_text(gazette: Dict, storage): - """ - Define gazette raw text - """ - file_raw_txt = Path(gazette["file_path"]).with_suffix(".txt").as_posix() - storage.upload_content(file_raw_txt, gazette["source_text"]) - logging.debug(f"file_raw_txt uploaded {file_raw_txt}") - file_endpoint = get_file_endpoint() - gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}" - - -def get_gazette_text_and_define_url( - gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface -): - """ - Extract file content and define the url to access the file in the storage - """ - gazette["source_text"] = try_to_extract_content(gazette_file, text_extractor) - file_endpoint = get_file_endpoint() - gazette["url"] = f"{file_endpoint}/{gazette['file_path']}" - - -def get_file_endpoint() -> str: - """ - Get the endpoint where the gazette files can be downloaded. 
- """ - return os.environ["QUERIDO_DIARIO_FILES_ENDPOINT"] - - -def try_to_extract_content( - gazette_file: str, text_extractor: TextExtractorInterface -) -> str: - """ - Calls the function to extract the content from the gazette file. If it fails - remove the gazette file and raise an exception - """ - try: - return text_extractor.extract_text(gazette_file) - except Exception as e: - os.remove(gazette_file) - raise e - - -def delete_gazette_files(gazette_file: str) -> None: - """ - Removes the files used to process the gazette content. - """ - os.remove(gazette_file) - - -def download_gazette_file(gazette: Dict, storage: StorageInterface) -> str: - """ - Download the file from the object storage and write it down in the local - disk to allow the text extraction - """ - with tempfile.NamedTemporaryFile(delete=False) as tmpfile: - gazette_file_key = get_gazette_file_key_used_in_storage(gazette) - storage.get_file(gazette_file_key, tmpfile) - return tmpfile.name - - -def get_gazette_file_key_used_in_storage(gazette: Dict) -> str: - """ - Get the file key used to store the gazette in the object storage - """ - return gazette["file_path"] - - -def set_gazette_as_processed(gazette: Dict, database: DatabaseInterface) -> None: - command = """ - UPDATE gazettes - SET processed = True - WHERE id = %(id)s - AND file_checksum = %(file_checksum)s - ; - """ - id = gazette["id"] - checksum = gazette["file_checksum"] - data = {"id": id, "file_checksum": checksum} - logging.debug(f"Marking {id}({checksum}) as processed") - database.update(command, data) diff --git a/scripts/tasks/gazette_themed_excerpts_extraction.py b/scripts/tasks/gazette_themed_excerpts_extraction.py deleted file mode 100644 index 1e87c89..0000000 --- a/scripts/tasks/gazette_themed_excerpts_extraction.py +++ /dev/null @@ -1,191 +0,0 @@ -import hashlib -from typing import Dict, Iterable, List - -from .interfaces import IndexInterface -from .utils import clean_extra_whitespaces, get_documents_from_query_with_highlights - - -def extract_themed_excerpts_from_gazettes( - theme: Dict, gazette_ids: List[str], index: IndexInterface -) -> List[str]: - create_index(theme, index) - - ids = [] - for theme_query in theme["queries"]: - for excerpt in get_excerpts_from_gazettes_with_themed_query( - theme_query, gazette_ids, index - ): - # excerpts with less than 10% of the expected size of excerpt account for - # fewer than 1% of excerpts yet their score is usually high - if len(excerpt["excerpt"]) < 200: - continue - - index.index_document( - excerpt, - document_id=excerpt["excerpt_id"], - index=theme["index"], - refresh=True, - ) - ids.append(excerpt["excerpt_id"]) - - return ids - - -def create_index(theme: Dict, index: IndexInterface) -> None: - body = { - "mappings": { - "properties": { - "excerpt_embedding_score": {"type": "rank_feature"}, - "excerpt_subthemes": {"type": "keyword"}, - "excerpt_entities": {"type": "keyword"}, - "excerpt": { - "type": "text", - "analyzer": "brazilian", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - "fields": { - "with_stopwords": { - "type": "text", - "analyzer": "brazilian_with_stopwords", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - }, - "exact": { - "type": "text", - "analyzer": "exact", - "index_options": "offsets", - "term_vector": "with_positions_offsets", - }, - }, - }, - "excerpt_id": {"type": "keyword"}, - "source_database_id": {"type": "long"}, - "source_index_id": {"type": "keyword"}, - "source_created_at": {"type": "date"}, - "source_date": 
{"type": "date"}, - "source_edition_number": {"type": "keyword"}, - "source_file_checksum": {"type": "keyword"}, - "source_file_path": {"type": "keyword"}, - "source_file_raw_txt": {"type": "keyword"}, - "source_file_url": {"type": "keyword"}, - "source_is_extra_edition": {"type": "boolean"}, - "source_power": {"type": "keyword"}, - "source_processed": {"type": "boolean"}, - "source_scraped_at": {"type": "date"}, - "source_state_code": {"type": "keyword"}, - "source_territory_id": {"type": "keyword"}, - "source_territory_name": {"type": "keyword"}, - "source_url": {"type": "keyword"}, - } - }, - "settings": { - "index": { - "sort.field": ["source_territory_id", "source_date"], - "sort.order": ["asc", "desc"] - }, - "analysis": { - "filter": { - "brazilian_stemmer": { - "type": "stemmer", - "language": "brazilian", - } - }, - "analyzer": { - "brazilian_with_stopwords": { - "tokenizer": "standard", - "filter": ["lowercase", "brazilian_stemmer"], - }, - "exact": { - "tokenizer": "standard", - "filter": ["lowercase"], - }, - }, - } - }, - } - index.create_index(index_name=theme["index"], body=body) - - -def get_excerpts_from_gazettes_with_themed_query( - query: Dict, gazette_ids: List[str], index: IndexInterface -) -> Iterable[Dict]: - es_query = get_es_query_from_themed_query(query, gazette_ids, index) - documents = get_documents_from_query_with_highlights(es_query, index) - for document in documents: - gazette = document["_source"] - excerpts = document["highlight"]["source_text.with_stopwords"] - for excerpt in excerpts: - yield { - "excerpt": preprocess_excerpt(excerpt), - "excerpt_subthemes": [query["title"]], - "excerpt_id": generate_excerpt_id(excerpt, gazette), - "source_index_id": gazette["file_checksum"], - "source_created_at": gazette["created_at"], - "source_database_id": gazette["id"], - "source_date": gazette["date"], - "source_edition_number": gazette["edition_number"], - "source_file_raw_txt": gazette["file_raw_txt"], - "source_is_extra_edition": gazette["is_extra_edition"], - "source_file_checksum": gazette["file_checksum"], - "source_file_path": gazette["file_path"], - "source_file_url": gazette["file_url"], - "source_power": gazette["power"], - "source_processed": gazette["processed"], - "source_scraped_at": gazette["scraped_at"], - "source_state_code": gazette["state_code"], - "source_territory_id": gazette["territory_id"], - "source_territory_name": gazette["territory_name"], - "source_url": gazette["url"], - } - - -def generate_excerpt_id(excerpt: str, gazette: Dict) -> str: - hash = hashlib.md5() - hash.update(excerpt.encode()) - return f"{gazette['file_checksum']}_{hash.hexdigest()}" - - -def get_es_query_from_themed_query( - query: Dict, - gazette_ids: List[str], - index: IndexInterface, -) -> Dict: - es_query = { - "query": {"bool": {"must": [], "filter": {"ids": {"values": gazette_ids}}}}, - "size": 100, - "highlight": { - "fields": { - "source_text.with_stopwords": { - "type": "unified", - "fragment_size": 2000, - "number_of_fragments": 10, - "pre_tags": [""], - "post_tags": [""], - } - }, - }, - } - - macro_synonym_block = {"span_or": {"clauses": []}} - for macro_set in query["term_sets"]: - proximity_block = {"span_near": {"clauses": [], "slop": 20, "in_order": False}} - for term_set in macro_set: - synonym_block = {"span_or": {"clauses": []}} - for term in term_set: - phrase_block = { - "span_near": {"clauses": [], "slop": 0, "in_order": True} - } - tokenized_term = index.analyze(text=term, field="source_text.with_stopwords") - for token in 
tokenized_term["tokens"]: - word_block = {"span_term": {"source_text.with_stopwords": token["token"]}} - phrase_block["span_near"]["clauses"].append(word_block) - synonym_block["span_or"]["clauses"].append(phrase_block) - proximity_block["span_near"]["clauses"].append(synonym_block) - macro_synonym_block["span_or"]["clauses"].append(proximity_block) - - es_query["query"]["bool"]["must"].append(macro_synonym_block) - return es_query - - -def preprocess_excerpt(excerpt: str) -> str: - return clean_extra_whitespaces(excerpt) diff --git a/scripts/tasks/gazette_themes_listing.py b/scripts/tasks/gazette_themes_listing.py deleted file mode 100644 index 1dbb60c..0000000 --- a/scripts/tasks/gazette_themes_listing.py +++ /dev/null @@ -1,13 +0,0 @@ -import json -import pathlib -from typing import Dict, List - - -def get_themes() -> List[Dict]: - ROOT = pathlib.Path(__file__).parent.parent - themes_config = ROOT / "config" / "themes_config.json" - - with themes_config.open() as f: - themes = json.load(f)["themes"] - - return themes diff --git a/scripts/tasks/interfaces.py b/scripts/tasks/interfaces.py deleted file mode 100644 index 06b81cb..0000000 --- a/scripts/tasks/interfaces.py +++ /dev/null @@ -1,105 +0,0 @@ -from typing import Dict, Iterable, Tuple -import abc - - -class DatabaseInterface(abc.ABC): - """ - Interface to abstract the iteraction with the database storing data used by the - tasks - """ - - @abc.abstractmethod - def _commit_changes(self, command: str, data: Dict) -> None: - """ - Make a change in the database and commit it - """ - - @abc.abstractmethod - def select(self, command: str) -> Iterable[Tuple]: - """ - Select entries from the database - """ - - @abc.abstractmethod - def insert(self, command: str, data: Dict) -> None: - """ - Insert entries into the database - """ - - @abc.abstractmethod - def update(self, command: str, data: Dict) -> None: - """ - Update entries from the database - """ - - @abc.abstractmethod - def delete(self, command: str, data: Dict) -> None: - """ - Delete entries from the database - """ - - -class StorageInterface(abc.ABC): - """ - Interface to abstract the interaction with the object store system. 
- """ - - @abc.abstractmethod - def get_file(self, file_to_be_downloaded: str, destination) -> None: - """ - Download the given file key in the destination on the host - """ - - @abc.abstractmethod - def upload_content(self, file_key: str, content_to_be_uploaded: str) -> None: - """ - Upload the given content to the destination on the host - """ - - -class IndexInterface(abc.ABC): - """ - Interface to abstract the interaction with the index system - """ - - @abc.abstractmethod - def create_index(self, index_name: str, body: Dict) -> None: - """ - Create the index used by the application - """ - - @abc.abstractmethod - def refresh_index(self, index_name: str) -> None: - """ - Refreshes the index to make it up-to-date for future searches - """ - - @abc.abstractmethod - def index_document( - self, document: Dict, document_id: str, index: str, refresh: bool - ) -> None: - """ - Upload document to the index - """ - - @abc.abstractmethod - def search(self, query: Dict, index: str) -> Dict: - """ - Searches the index with the provided query - """ - - @abc.abstractmethod - def paginated_search( - self, query: Dict, index: str, keep_alive: str - ) -> Iterable[Dict]: - """ - Searches the index with the provided query, with pagination - """ - - -class TextExtractorInterface(abc.ABC): - @abc.abstractmethod - def extract_text(self, filepath: str) -> str: - """ - Extract the text from the given file - """ diff --git a/scripts/tasks/list_gazettes_to_be_processed.py b/scripts/tasks/list_gazettes_to_be_processed.py deleted file mode 100644 index 1547e7b..0000000 --- a/scripts/tasks/list_gazettes_to_be_processed.py +++ /dev/null @@ -1,143 +0,0 @@ -import logging -from typing import Dict, Iterable - -from .interfaces import DatabaseInterface - - -def get_gazettes_to_be_processed( - execution_mode: str, database: DatabaseInterface -) -> Iterable[Dict]: - if execution_mode == "DAILY": - yield from get_gazettes_extracted_since_yesterday(database) - elif execution_mode == "ALL": - yield from get_all_gazettes_extracted(database) - elif execution_mode == "UNPROCESSED": - yield from get_unprocessed_gazettes(database) - else: - raise Exception(f'Execution mode "{execution_mode}" is invalid.') - - -def get_gazettes_extracted_since_yesterday( - database: DatabaseInterface, -) -> Iterable[Dict]: - """ - List the gazettes which were extracted since yesterday - """ - logging.info("Listing gazettes extracted since yesterday") - - command = """ - SELECT - gazettes.id, - gazettes.source_text, - gazettes.date, - gazettes.edition_number, - gazettes.is_extra_edition, - gazettes.power, - gazettes.file_checksum, - gazettes.file_path, - gazettes.file_url, - gazettes.scraped_at, - gazettes.created_at, - gazettes.territory_id, - gazettes.processed, - territories.name as territory_name, - territories.state_code - FROM - gazettes - INNER JOIN territories ON territories.id = gazettes.territory_id - WHERE - scraped_at > current_timestamp - interval '1 day' - ; - """ - for gazette in database.select(command): - yield format_gazette_data(gazette) - - -def get_all_gazettes_extracted( - database: DatabaseInterface, -) -> Iterable[Dict]: - """ - List all the gazettes which were extracted - """ - logging.info("Listing all gazettes extracted") - - command = """ - SELECT - gazettes.id, - gazettes.source_text, - gazettes.date, - gazettes.edition_number, - gazettes.is_extra_edition, - gazettes.power, - gazettes.file_checksum, - gazettes.file_path, - gazettes.file_url, - gazettes.scraped_at, - gazettes.created_at, - gazettes.territory_id, - 
gazettes.processed, - territories.name as territory_name, - territories.state_code - FROM - gazettes - INNER JOIN territories ON territories.id = gazettes.territory_id - ; - """ - for gazette in database.select(command): - yield format_gazette_data(gazette) - - -def get_unprocessed_gazettes( - database: DatabaseInterface, -) -> Iterable[Dict]: - """ - List all the gazettes which were extracted - """ - logging.info("Listing all gazettes extracted") - - command = """ - SELECT - gazettes.id, - gazettes.source_text, - gazettes.date, - gazettes.edition_number, - gazettes.is_extra_edition, - gazettes.power, - gazettes.file_checksum, - gazettes.file_path, - gazettes.file_url, - gazettes.scraped_at, - gazettes.created_at, - gazettes.territory_id, - gazettes.processed, - territories.name as territory_name, - territories.state_code - FROM - gazettes - INNER JOIN territories ON territories.id = gazettes.territory_id - WHERE - processed is False - ; - """ - for gazette in database.select(command): - yield format_gazette_data(gazette) - - -def format_gazette_data(data): - return { - "id": data[0], - "source_text": data[1], - "date": data[2], - "edition_number": data[3], - "is_extra_edition": data[4], - "power": data[5], - "file_checksum": data[6], - "file_path": data[7], - "file_url": data[8], - "scraped_at": data[9], - "created_at": data[10], - "territory_id": data[11], - "processed": data[12], - "territory_name": data[13], - "state_code": data[14], - } diff --git a/scripts/tasks/utils/__init__.py b/scripts/tasks/utils/__init__.py deleted file mode 100644 index 1bd9cf3..0000000 --- a/scripts/tasks/utils/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .index import ( - get_documents_from_query_with_highlights, - get_documents_with_ids, -) -from .text import clean_extra_whitespaces diff --git a/scripts/tasks/utils/index.py b/scripts/tasks/utils/index.py deleted file mode 100644 index 83d769c..0000000 --- a/scripts/tasks/utils/index.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Dict, Iterable, List - -from ..interfaces import IndexInterface - - -def get_documents_with_ids( - ids: List[str], index: IndexInterface, index_name: str = "" -) -> Iterable[Dict]: - query_filter_by_ids = { - "query": {"bool": {"filter": {"ids": {"values": ids}}}}, - "size": 100, - } - yield from get_documents_from_query(query_filter_by_ids, index, index_name) - - -def get_documents_from_query( - query: Dict, index: IndexInterface, index_name: str = "" -) -> Iterable[Dict]: - index.refresh_index(index_name) - documents = ( - hit - for result in index.paginated_search(query, index=index_name) - for hit in result["hits"]["hits"] - ) - yield from documents - - -def get_documents_from_query_with_highlights( - query: Dict, index: IndexInterface, index_name: str = "" -) -> Iterable[Dict]: - index.refresh_index(index_name) - documents = ( - hit - for result in index.paginated_search(query, index=index_name) - for hit in result["hits"]["hits"] - if hit.get("highlight") - ) - yield from documents diff --git a/scripts/tasks/utils/text.py b/scripts/tasks/utils/text.py deleted file mode 100644 index 1cc7c39..0000000 --- a/scripts/tasks/utils/text.py +++ /dev/null @@ -1,5 +0,0 @@ -import re - - -def clean_extra_whitespaces(text: str) -> str: - return re.sub(r"\s+", " ", text) From 05d357c2109aa0c122d57cac46cf5f2e44f83abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 14:57:20 -0300 Subject: [PATCH 04/19] Update README.md MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/README.md b/README.md index 31e4c9a..2dd4ff6 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,88 @@ make build make setup ``` +----------------------------- +- [x]  como configurar as credenciais em ambos os projetos para que eles se comuniquem +- [ ]  como realizar um seed no data-processing usando um spider do querido-diario + +Para configurar as credenciais é necessário mudar alguns parâmetros em settings.py. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. + +Mude os seguintes parâmetros: + +~~~Python +###linha xxx +FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") + +### linha xx +QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) + +### linhas 52 a 56 +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") + +# Substitua por +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") +~~~ + + +- **Linux** + + +- **Windows** + +1. **Usando WSL** +Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). + +Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. + +Caso haja um erro com cython_sources, assim como na imagem: + +![[Pasted image 20231005102449.png]] + +Faça esse procedimento e instale os requirements-dev novamente: + +~~~Linux +pip3 install wheel -v +pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v +~~~ + +Caso haja um erro com legacy-install +![[Pasted image 20231005104343.png]] +![[Pasted image 20231005103545.png]] + +Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: + +~~~Linux +python3 -m pip install --upgrade pip +sudo apt-get install build-essential libssl-dev libffi-dev python3-dev +~~~ + +2. **Usando o terminal do Windows** + +Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... + +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: + +~~~Linux +pip install -r data_collection/requirements-dev.txt --no-deps +~~~ + +Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . 
Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … + +Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) + +Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. + +- **Mac** + +... ## Populate data Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). From 504ae96a6f3a19d12523ca83055ae3ab8c3cedb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:01:10 -0300 Subject: [PATCH 05/19] Create configurando_ambientes.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- configurando_ambientes.md | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 configurando_ambientes.md diff --git a/configurando_ambientes.md b/configurando_ambientes.md new file mode 100644 index 0000000..3531b14 --- /dev/null +++ b/configurando_ambientes.md @@ -0,0 +1,51 @@ +## Como os Projetos se relacionam + +O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário no [[fluxograma_1.png]]. As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). + +Esse é o objetivo principal, mas não é o único, já que além da possibilidade da colaboração por meio do desenvolvimento, é também possível aplicar as técnicas de PLN em um _dataset_ específico. + +## Configurando seu ambiente de Desenvolvimento + +Sempre fique ligado(a) ao documento de [Contribuição](https://github.com/okfn-brasil/querido-diario-comunidade/blob/main/.github/CONTRIBUTING.md#ecossistema); nele é possível verificar as exigências básicas como formatação _black_, configuração de ambiente seguro, detalhamento nas _[[issues e pull requests]]_. Lembre-se também que as **issues e pull requests são uma parte da documentação do projeto**! + +Sabendo desses pontos, é necessário configurar o ambiente de trabalho. Existem três diferentes sistemas operacionais que são compatíveis com o ambiente desenvolvido: Linux (o padrão e raiz), Windows e Mac. Vamos explorar cada um deles. + +### Linux + +Se você já trabalha com Linux, seguir as orientações de instalação contidas no [repositório](https://github.com/okfn-brasil/querido-diario-data-processing) será suficiente para instalar o ambiente.
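Depois de rodar o `make setup`, uma forma rápida de conferir se os serviços locais subiram é um pequeno teste de conexão. O esboço abaixo é apenas ilustrativo: ele supõe que as credenciais de exemplo citadas neste guia estão em uso (banco `queridodiariodb` em `127.0.0.1:5432` com usuário e senha `queridodiario`, e MinIO em `localhost:9000` com `minio-access-key`/`minio-secret-key`) e que as bibliotecas `psycopg2` e `boto3` estão disponíveis no seu ambiente.

~~~Python
# Esboço de verificação do ambiente local (valores de exemplo deste guia;
# ajuste host, porta e credenciais se o seu ambiente for diferente).
import boto3
import psycopg2


def checar_postgres() -> None:
    conexao = psycopg2.connect(
        host="127.0.0.1",
        port=5432,
        dbname="queridodiariodb",
        user="queridodiario",
        password="queridodiario",
    )
    with conexao, conexao.cursor() as cursor:
        # Conta os diários por status de processamento.
        cursor.execute("SELECT processed, count(1) FROM gazettes GROUP BY processed;")
        for processado, total in cursor.fetchall():
            print(f"processed={processado}: {total} diários")
    conexao.close()


def checar_minio() -> None:
    s3 = boto3.client(
        "s3",
        endpoint_url="http://localhost:9000",
        aws_access_key_id="minio-access-key",
        aws_secret_access_key="minio-secret-key",
        region_name="us-east-1",
    )
    resposta = s3.list_objects_v2(Bucket="queridodiariobucket", MaxKeys=5)
    chaves = [objeto["Key"] for objeto in resposta.get("Contents", [])]
    print(f"bucket acessível; exemplos de chaves: {chaves}")


if __name__ == "__main__":
    checar_postgres()
    checar_minio()
~~~

Se alguma das duas verificações falhar, confira se os contêineres criados pelo `make setup` ainda estão em execução antes de seguir adiante.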
+ +Alguns possíveis problemas que talvez precisem de um cuidado são os relacionados à conexão do ecossistema com o [[querido-diario]]. Veja em [[conectar ao querido-diario]]. + +### Windows +#### Utilizando WSL + +Para realizar essa etapa é necessário instalar o WSL na sua máquina Windows e instalar um sistema operacional. Veja esse tutorial de [[Instalando WSL]] caso tenha dúvidas. + +Dentro da sua máquina Linux já é possível seguir as instruções de instalação do ambiente contidas no repositório em [Setup](). Instale o Podman e inicie o ambiente virtual. Um comando de cada vez. + +~~~Linux + sudo apt-get update + ## sudo apt update && sudo apt upgrade ##testar + sudo apt-get -y install podman + + sudo apt install python3.10-venv + python3 -m venv .venv + source .venv/bin/activate ### Ativando o ambiente virtual + + sudo apt install make ### Caso apresente erro de instalação + make build ### Somente a 1ª vez + make setup + ~~~ + +Teste para ver se o seu ambiente funciona: +~~~Linux +make shell-database +~~~ + +![[Pasted image 20231005100446.png]] +![[Pasted image 20231005100534.png]] + +Após essa etapa é necessário **[[conectar ao querido-diario]]**, ou seja, conectar-se ao banco de dados gerado pelo repositório [[querido-diario]], o qual é responsável por extrair os diários oficiais. Se a conexão não for feita, esse repositório não possui documentos para processar. Faça o fork dos repositórios [[querido-diario]] e [[querido-diario-data-processing]] na sua conta do GitHub e a partir daí faça o clone desses repositórios para a sua máquina Linux. +### Mac + +... From 497d9dfcc478b47af8ea1c8db00bdb5b43c65592 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:02:07 -0300 Subject: [PATCH 06/19] Create conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 conectando_qd.md diff --git a/conectando_qd.md b/conectando_qd.md new file mode 100644 index 0000000..5fc2905 --- /dev/null +++ b/conectando_qd.md @@ -0,0 +1,91 @@ +## Configurando as credenciais para a comunicação dos dois projetos + +Para configurar as credenciais é necessário mudar alguns parâmetros em **settings.py**. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py.
+ +Mude os seguintes parâmetros: + +~~~Python +###linha 21 +FILES_STORE = config("FILES_STORE", default="data") + +### Substitua por: +FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") + +### linhas 44 a 46 +QUERIDODIARIO_DATABASE_URL = config( + "QUERIDODIARIO_DATABASE_URL", default="sqlite:///querido-diario.db" +) + +### Substitua por: +QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) + +### linhas 52 a 56 +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") + +# Substitua por +AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") +AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") +AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") +AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") +FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") +~~~ + +Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível +## Configurando o ambiente do querido-diario + +### Linux + + +### Windows + +#### Usando WSL + +Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). + +Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. + +Caso haja um erro com cython_sources, assim como na imagem: + +![[Pasted image 20231005102449.png]] + +Faça esse procedimento e instale os requirements-dev novamente: + +~~~Linux +pip3 install wheel -v +pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v +~~~ + +Caso haja um erro com legacy-install +![[Pasted image 20231005104343.png]] +![[Pasted image 20231005103545.png]] + +Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: + +~~~Linux +python3 -m pip install --upgrade pip +sudo apt-get install build-essential libssl-dev libffi-dev python3-dev +~~~ + +#### Usando o terminal do Windows + +Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... + +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: + +~~~Linux +pip install -r data_collection/requirements-dev.txt --no-deps +~~~ + +Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … + +Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. 
Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) + +Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. + +- **Mac** + +... From 70d63e7cea81bccd9fbefd501ddeeec8f5211a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:08:15 -0300 Subject: [PATCH 07/19] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 84 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 82 deletions(-) diff --git a/README.md b/README.md index 2dd4ff6..d57ef57 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) + # querido-diario-data-processing ## Setup @@ -12,88 +14,6 @@ make build make setup ``` ------------------------------ -- [x]  como configurar as credenciais em ambos os projetos para que eles se comuniquem -- [ ]  como realizar um seed no data-processing usando um spider do querido-diario - -Para configurar as credenciais é necessário mudar alguns parâmetros em settings.py. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. - -Mude os seguintes parâmetros: - -~~~Python -###linha xxx -FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") - -### linha xx -QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) - -### linhas 52 a 56 -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") - -# Substitua por -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") -~~~ - - -- **Linux** - - -- **Windows** - -1. **Usando WSL** -Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). - -Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. 
- -Caso haja um erro com cython_sources, assim como na imagem: - -![[Pasted image 20231005102449.png]] - -Faça esse procedimento e instale os requirements-dev novamente: - -~~~Linux -pip3 install wheel -v -pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v -~~~ - -Caso haja um erro com legacy-install -![[Pasted image 20231005104343.png]] -![[Pasted image 20231005103545.png]] - -Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: - -~~~Linux -python3 -m pip install --upgrade pip -sudo apt-get install build-essential libssl-dev libffi-dev python3-dev -~~~ - -2. **Usando o terminal do Windows** - -Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... - -Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: - -~~~Linux -pip install -r data_collection/requirements-dev.txt --no-deps -~~~ - -Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … - -Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) - -Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. - -- **Mac** - -... ## Populate data Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). From 841231227f2dbb7c3f9b405bf765377965c3f465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:10:16 -0300 Subject: [PATCH 08/19] Create tutorial.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- tutorial.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tutorial.md diff --git a/tutorial.md b/tutorial.md new file mode 100644 index 0000000..31e4c9a --- /dev/null +++ b/tutorial.md @@ -0,0 +1,40 @@ +# querido-diario-data-processing + +## Setup + +- [Install podman](https://podman.io/getting-started/installation) +- execute build stage (only the first time): +```console +make build +``` +- execute setup stage: +```console +make setup +``` + +## Populate data +Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). 
+ +- you can see created data inside [storage](http://localhost:9000/minio/queridodiariobucket) using [local credentials](contrib/sample.env#L3) +- you can see gazettes not processed yet connecting on database +- open database console in a new terminal +```console +make shell-database +``` +- and run a query to see gazettes not processed +```sql +select processed, count(1) from gazettes g group by processed; +``` + +## Run +- execute processing stage: +```console +make re-run +``` +- and see gazettes processed running the query above +- you can search using ElasticSearch +```console +curl 'http://localhost:9200/querido-diario/_search' \ + -H 'Content-Type: application/json' \ + --data-raw '{"query":{"query_string":{"query":"*"}},"size":2}' +``` From 09e40e19e037ebab47b76b422e14cc1b0b14f956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:17:52 -0300 Subject: [PATCH 09/19] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 70 ++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index d57ef57..f6d1c9b 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,28 @@ -[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) - -# querido-diario-data-processing - -## Setup - -- [Install podman](https://podman.io/getting-started/installation) -- execute build stage (only the first time): -```console -make build -``` -- execute setup stage: -```console -make setup -``` - -## Populate data -Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). - -- you can see created data inside [storage](http://localhost:9000/minio/queridodiariobucket) using [local credentials](contrib/sample.env#L3) -- you can see gazettes not processed yet connecting on database -- open database console in a new terminal -```console -make shell-database -``` -- and run a query to see gazettes not processed -```sql -select processed, count(1) from gazettes g group by processed; -``` - -## Run -- execute processing stage: -```console -make re-run -``` -- and see gazettes processed running the query above -- you can search using ElasticSearch -```console -curl 'http://localhost:9200/querido-diario/_search' \ - -H 'Content-Type: application/json' \ - --data-raw '{"query":{"query_string":{"query":"*"}},"size":2}' -``` +[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) + +## O processamento de dados + +É responsável pelo [repositório](https://github.com/okfn-brasil/querido-diario-data-processing). 
O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário na Figura abaixo. +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/cd6b5589-f4e7-45a0-86a9-5cbb0bf14cb7) + +As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). + +Veja a estrutura completa do projeto [aqui](https://docs.queridodiario.ok.org.br/pt/latest/). + +### Entendendo a estrutura do querido-diario-data-processing + +1. Montando o ambiente de trabalho + +A pasta "scripts" é responsável pelo ambiente de trabalho. + +2. Extração do texto + +"data_extraction" + +3. Processamento do texto + +A pasta "tasks" + +4. Armazenamento + +"database" e "storage" From 275e5a62df9b0764377a11e42d01b9159628f8f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:20:59 -0300 Subject: [PATCH 10/19] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f6d1c9b..5296bd6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ -[Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/README.md) | [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US ## O processamento de dados -É responsável pelo [repositório](https://github.com/okfn-brasil/querido-diario-data-processing). O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário na Figura abaixo. +O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário na Figura abaixo.
![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/cd6b5589-f4e7-45a0-86a9-5cbb0bf14cb7) As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). From e40e4e5d957c49d925e5f0d1139b58857195f8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:30:47 -0300 Subject: [PATCH 11/19] Create wsl_windows.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- wsl_windows.md | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 wsl_windows.md diff --git a/wsl_windows.md b/wsl_windows.md new file mode 100644 index 0000000..4102379 --- /dev/null +++ b/wsl_windows.md @@ -0,0 +1,48 @@ +O WSL é uma sigla para Subsistema de Windows para Linux, tradução de _Windows Subsystem for Linux_, + +O sistema do Querido Diário foi totalmente desenvolvido para Linux e por isso algumas configurações não funcionam para Windows, sabendo disso uma das maneiras menos trabalhosas é configurar um subsistema para Linux, através do WSL. + +Primeiramente é necessário executar o **Windows Power Shell** como administrador. No terminal digite: + +~~~ Windows PowerShell (admin) +wsl --install ### Instalando o WSL +~~~ + +Atenção: Recursos mais atuais do WSL exigem um sistema operacional Windows mais recentes (a partir do Windows 10). + +Após isso, será possível configurar um nome de usuário e senha para que você possa logar na sua nova máquina. Feito isso é necessário configurar o ambiente para o Querido Diário. É necessário ter Python, Git, Podman e o próprio repositório na sua nova máquina. +Nas máquinas Linux normalmente já está instalado o Python, verifique a partir desse comando: +~~~Linux +python --version +~~~ + +A partir disso é possível atualizar ou dar continuidade com a instalação do ambiente de trabalho. Para instalar o Podman (para trabalhar com dockers) siga o [tutorial](https://podman.io/docs/installation) de instalação e vá até "Installing on Linux". Somente com a instalação já é possível iniciar o ambiente (utilizando o Makefile). + + +~~~Linux +sudo apt install python3-venv -y +sudo apt install python3.10-venv +python3 -m venv .venv +source .venv/bin/activate +~~~ + +Ao iniciar uma nova máquina, já é possível acessá-la no menu iniciar do Windows. Por exemplo, caso tenha instalado o Ubuntu, pesquise assim: +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/233e1427-2557-4c7e-ae35-40a2b7fccbf9) + +Caso ao iniciar seu terminal Linux apareça o erro **"Error: 0x80370114 Não foi possível iniciar a operação porque um recurso necessário não foi instalado"** ,tente habilitar os recursos Hyper-V. Para isso, digite "hyper-V" em Pesquisar e aparecerá uma opção de "Ativar ou desativar recursos do Windows". + +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/c82bba62-5225-40bb-8e55-ad5b39b3b5c4) + +Selecione Plataforma do Hipervisor do Windows e clique em Ok. Após esse procedimento, reinicie a sua máquina. 
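Antes de clonar os repositórios, pode ser útil confirmar que as ferramentas citadas nesta página estão visíveis dentro da máquina WSL. O esboço abaixo é apenas ilustrativo e supõe somente que o Python 3 já está instalado:

~~~Python
# Esboço: verifica se as ferramentas usadas neste guia estão no PATH da máquina WSL.
import shutil

for ferramenta in ("python3", "git", "podman", "make"):
    caminho = shutil.which(ferramenta)
    print(f"{ferramenta}: {caminho or 'NÃO ENCONTRADA - instale antes de continuar'}")
~~~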
+ + A partir daí já será possível realizar o git clone de um repositório forked do [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) e então criar e iniciar um ambiente virtual: +~~~Linux +git clone repositorio_forked_querido-diario +git clone repositorio_forked_querido-diario-data-processing +~~~ + +Caso nesta etapa tenha dado algum erro de conexão ao host do github, tente reiniciar o terminal Linux pelo comando: + +~~~Linux +sudo shutdown -h now +~~~ From a7ccb35caa0ff7c3d6e25449906acc7c981def62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:35:38 -0300 Subject: [PATCH 12/19] Update conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/conectando_qd.md b/conectando_qd.md index 5fc2905..5befd7d 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -44,27 +44,23 @@ Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e o #### Usando WSL -Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](). +Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](https://github.com/okfn-brasil/querido-diario). Se tiver dúvidas, acesse o [tutorial de instalação do WSL no Windows](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/wsl_windows.md). -Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario]() em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. +Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario](https://github.com/okfn-brasil/querido-diario) em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. Caso haja um erro com cython_sources, assim como na imagem: - -![[Pasted image 20231005102449.png]] +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/57afdb93-26cd-4ddc-be43-53cd4fd60365) Faça esse procedimento e instale os requirements-dev novamente: - ~~~Linux pip3 install wheel -v pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v ~~~ Caso haja um erro com legacy-install -![[Pasted image 20231005104343.png]] -![[Pasted image 20231005103545.png]] +![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/2040db6a-0d47-404f-aa98-2d2204a6ff4c) Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: - ~~~Linux python3 -m pip install --upgrade pip sudo apt-get install build-essential libssl-dev libffi-dev python3-dev @@ -74,10 +70,10 @@ sudo apt-get install build-essential libssl-dev libffi-dev python3-dev Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... 
-Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install e adicione um dos comandos abaixo: +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install junto com o comando --no-deps, dessa forma: ~~~Linux -pip install -r data_collection/requirements-dev.txt --no-deps +pip3 install -r data_collection/requirements-dev.txt --no-deps ~~~ Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … From 27dc8ba481bd3e48cf4abf08cf32ada1db5ca817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:03:34 -0300 Subject: [PATCH 13/19] Update conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conectando_qd.md b/conectando_qd.md index 5befd7d..6d9d392 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -34,7 +34,10 @@ AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") ~~~ -Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível +Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... + +Acesse os diários baixados através desse link: http://localhost:9000/minio/queridodiariobucket + ## Configurando o ambiente do querido-diario ### Linux From cf918ab8b0a5c155511fff348b10f9ad2e7fb41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:09:23 -0300 Subject: [PATCH 14/19] Update tutorial.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. 
Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- tutorial.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tutorial.md b/tutorial.md index 31e4c9a..1d3ebfe 100644 --- a/tutorial.md +++ b/tutorial.md @@ -1,3 +1,7 @@ +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US + # querido-diario-data-processing ## Setup From 23eaffda78c25179a6c4e583f4412d872124bad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:09:38 -0300 Subject: [PATCH 15/19] Update configurando_ambientes.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- configurando_ambientes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/configurando_ambientes.md b/configurando_ambientes.md index 3531b14..747161d 100644 --- a/configurando_ambientes.md +++ b/configurando_ambientes.md @@ -1,3 +1,7 @@ +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US + ## Como os Projetos se relacionam O repositório [querido-diario-data-processing](https://github.com/okfn-brasil/querido-diario-data-processing) tem como objetivo gerar buscas mais assertivas para o usuário por meio do uso de técnicas de processamento de linguagem natural. O processo desse repositório pode ser referenciado a partir da imagem da Infraestrutura do Querido Diário no [[fluxograma_1.png]]. As partes referentes à indexação e extração do texto são responsabilidade desse repositório em específico. Afinal, para ter os documentos em formato de texto (.txt) disponíveis na [plataforma](https://queridodiario.ok.org.br/) é necessário que seja feito um processamento desse conteúdo (os PDFs coletados previamente pelo repositório [querido-diario](https://github.com/okfn-brasil/querido-diario)). From 164ddf47cb7a3fe70672cbdd92d6c02f6276dc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= <87907716+Luisa-Coelho@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:09:54 -0300 Subject: [PATCH 16/19] Update conectando_qd.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luísa F. 
Coelho <87907716+Luisa-Coelho@users.noreply.github.com> --- conectando_qd.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conectando_qd.md b/conectando_qd.md index 6d9d392..d8ad964 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -1,3 +1,7 @@ +PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) + +EN/US + ## Configurando as credenciais para a comunicação dos dois projetos Para configurar as credenciais é necessário mudar alguns parâmetros em **settings.py**. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. From 7e9f0ae0a90c8694a81d7bc96f94103918c6b7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Sat, 14 Oct 2023 08:37:35 -0300 Subject: [PATCH 17/19] .env --- conectando_qd.md | 46 ++++++++++++---------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/conectando_qd.md b/conectando_qd.md index d8ad964..04c3bb1 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -4,43 +4,21 @@ EN/US ## Configurando as credenciais para a comunicação dos dois projetos -Para configurar as credenciais é necessário mudar alguns parâmetros em **settings.py**. No repositório do [querido-diario]() na sua máquina vá até data_collection depois gazette e finalmente abra no seu editor de código o arquivo settings.py. - -Mude os seguintes parâmetros: - -~~~Python -###linha 21 -FILES_STORE = config("FILES_STORE", default="data") - -### Substitua por: -FILES_STORE = config("FILES_STORE", default="s3://queridodiariobucket/") - -### linhas 44 a 46 -QUERIDODIARIO_DATABASE_URL = config( - "QUERIDODIARIO_DATABASE_URL", default="sqlite:///querido-diario.db" -) - -### Substitua por: -QUERIDODIARIO_DATABASE_URL = config( "QUERIDODIARIO_DATABASE_URL", default="postgresql://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb" ) - -### linhas 52 a 56 -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") - -# Substitua por -AWS_ACCESS_KEY_ID = config("AWS_ACCESS_KEY_ID", default="minio-access-key") -AWS_SECRET_ACCESS_KEY = config("AWS_SECRET_ACCESS_KEY", default="minio-secret-key") -AWS_ENDPOINT_URL = config("AWS_ENDPOINT_URL", default="http://localhost:9000/") -AWS_REGION_NAME = config("AWS_REGION_NAME", default="us-east-1") -FILES_STORE_S3_ACL = config("FILES_STORE_S3_ACL", default="public-read") +Para configurar as credenciais é necessário vincular os dois projetos como um só. Para isso é necessário **criar um arquivo .env** na raiz do repositório [querido-diario]() e inserir parâmetros coincidentes com os do repositório [querido-diario-data-processing](). Depois de ter realizado o fork do querido-diario, abra este repositório na sua máquina e insira um arquivo .env com as seguintes informações.
+ +~~~.env +AWS_ACCESS_KEY_ID=minio-access-key +AWS_SECRET_ACCESS_KEY=minio-secret-key +AWS_ENDPOINT_URL=http://127.0.0.1:9000/ +AWS_REGION_NAME=us-east-1 +FILES_STORE=s3://queridodiariobucket/ +FILES_STORE_S3_ACL=public-read +QUERIDODIARIO_DATABASE_URL=postgresql+psycopg2://queridodiario:queridodiario@127.0.0.1:5432/queridodiariodb ~~~ -Abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... +A variável .env já está como ignorada no projeto do querido-diario, portanto não é necessário mudar mais nada. Para executar a requisição abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... -Acesse os diários baixados através desse link: http://localhost:9000/minio/queridodiariobucket +Acesse os diários baixados através desse link: **http://localhost:9000/minio/queridodiariobucket** ## Configurando o ambiente do querido-diario From 54daf39111be1f1500ac9fae9397bfb2fa8a454c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Sat, 14 Oct 2023 09:20:24 -0300 Subject: [PATCH 18/19] arrumando arquivos --- conectando_qd.md | 40 ++++++++++++++++++++++++-------------- configurando_ambientes.md | 2 +- scripts/Dockerfile_windows | 23 ---------------------- tutorial.md | 2 +- 4 files changed, 27 insertions(+), 40 deletions(-) delete mode 100644 scripts/Dockerfile_windows diff --git a/conectando_qd.md b/conectando_qd.md index 04c3bb1..97a13a7 100644 --- a/conectando_qd.md +++ b/conectando_qd.md @@ -1,4 +1,4 @@ -PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) +PT/BR [Tutorial geral](tutorial.md) | [Configurando os diferentes ambientes](configurando_ambientes.md) | [Conectando ao querido-diario](conectando_qd.md) EN/US @@ -18,22 +18,31 @@ QUERIDODIARIO_DATABASE_URL=postgresql+psycopg2://queridodiario:queridodiario@127 A variável .env já está como ignorada no projeto do querido-diario, portanto não é necessário mudar mais nada. Para executar a requisição abra 2 terminais (1 com o repositório do [querido-diario-data-processing]() e outro com o [querido-diario](), ambos forked). Realize o **make setup** no repositório de processamento de dados e faça a busca scrapy crawl no repositório do querido-diario. Após isso, é possível... -Acesse os diários baixados através desse link: **http://localhost:9000/minio/queridodiariobucket** +Acesse os diários baixados através desse link: **http://localhost:9000/minio/queridodiariobucket**. + +Essa etapa tem que dar certo para qualquer tipo de sistema operacional, utilizando Linux ou WSL no Windows. ## Configurando o ambiente do querido-diario -### Linux +É importante seguir as instruções gerais no repositório no repositório [querido-diario](). Iniciar um ambiente virtual, instalar o arquivo de _requirements_ bem como o pre-commit. 
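O arquivo .env criado acima é lido pelas configurações do raspador por meio das chamadas `config(...)` mostradas nas versões anteriores deste documento. Supondo que esse utilitário seja o `config` da biblioteca python-decouple (uma suposição com base na assinatura das chamadas), a resolução dos valores funciona mais ou menos como neste esboço, em que variáveis de ambiente têm prioridade sobre o .env na raiz do repositório:

~~~Python
# Esboço ilustrativo (supõe python-decouple): cada variável é buscada primeiro no
# ambiente e depois no arquivo .env; o valor de "default" só é usado se nada for encontrado.
from decouple import config

FILES_STORE = config("FILES_STORE", default="data")
QUERIDODIARIO_DATABASE_URL = config(
    "QUERIDODIARIO_DATABASE_URL", default="sqlite:///querido-diario.db"
)

# Com o .env mostrado acima presente, os prints mostram o bucket do MinIO e a URL
# do Postgres em vez dos valores padrão.
print(FILES_STORE)
print(QUERIDODIARIO_DATABASE_URL)
~~~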
Caso sinta dificuldades ao configurar o ambiente, é possível consultar o material de "lidando com erros" preparado. + +### Lidando com erros na Configuração +#### Linux +Os erros em Linux são menos comuns mas podem ocorrer devido a novas atualizações. É provável que você encontre essas soluções nas seções de WSL, Windows e Mac. -### Windows +Não se preocupe, como as atualizações são constantes logo o erro será resolvido e para isso você pode informá-lo em uma nova issue, pull request ou entrando em contato com os mantenedores pelo [Discord da Open Knowledge Brasil](). -#### Usando WSL +#### Windows + +##### Usando WSL Abra um novo terminal do Ubuntu e faça o clone do repositório forked do [querido-diario](https://github.com/okfn-brasil/querido-diario). Se tiver dúvidas, acesse o [tutorial de instalação do WSL no Windows](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/wsl_windows.md). Para fazer a conexão você precisará ter baixado e instalado tudo que for necessário no repositório [querido-diario](https://github.com/okfn-brasil/querido-diario) em outro lugar na sua máquina WSL. Deixe as pastas próximas uma da outra para facilitar seu trabalho. Abra uma outra máquina Ubuntu para iniciar o repositório querido-diario. Caso haja um erro com cython_sources, assim como na imagem: + ![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/57afdb93-26cd-4ddc-be43-53cd4fd60365) Faça esse procedimento e instale os requirements-dev novamente: @@ -43,6 +52,7 @@ pip3 install "cython<3.0.0" pyyaml==5.4.1 --no-build-isolation -v ~~~ Caso haja um erro com legacy-install + ![image](https://github.com/Luisa-Coelho/qd-data-processing/assets/87907716/2040db6a-0d47-404f-aa98-2d2204a6ff4c) Então faça o upgrade do pip e instale algumas bibliotecas essenciais do Linux: @@ -51,22 +61,22 @@ python3 -m pip install --upgrade pip sudo apt-get install build-essential libssl-dev libffi-dev python3-dev ~~~ -#### Usando o terminal do Windows +##### Usando o terminal do Windows -Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira conectar é possível apenas fazer essas passos.... +Lembre-se que para conectar o Banco de Dados é necessário vincular o terminal Windows com o Linux. Caso você não queira baixar os diários diretamente na sua máquina utilizando o Windows, é possível seguir as configurações no tutorial geral do [querido-diario]() levando em conta os possível erros que podem aparecer. -Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install junto com o comando --no-deps, dessa forma: +É necessário que as configurações C++ estejam instaladas. Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … -~~~Linux -pip3 install -r data_collection/requirements-dev.txt --no-deps -~~~ +Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) -Baixe o Visual Studio Comunidade [aqui](https://visualstudio.microsoft.com/pt-br/downloads/) . 
Seguindo os passos [aqui](https://github.com/okfn-brasil/querido-diario/blob/main/docs/CONTRIBUTING.md#em-linux), você deverá baixar o Visual Studio e baixar as configurações … +Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. Instale e siga o resto do tutorial de configuração. -Em **Componentes Individuais** selecione "SDK do Windows 10" ou '11 e Ferramentas de build do MSVC v143 - VS 2022 C++ x64/x86 (v14.32-17.4)". Ou conteúdo similares. Note que muitas vezes as versões Windows 10 SDK e MSVC v142 - VS 2019 C++ x64/x86 build tools serão atualizadas, portanto procure por itens similares em Componentes individuais para realizar a instalação (ou seja, mais novos) +Caso haja um erro com "pinned with == " na hora de instalar os requerimentos, utilize o pip3 install junto com o comando --no-deps, dessa forma: -Em **Cargas de Trabalho**, selecione “Desenvolvimento para desktop com C++”. +~~~Windows +pip3 install -r data_collection/requirements-dev.txt --no-deps +~~~ -- **Mac** +#### Mac ... diff --git a/configurando_ambientes.md b/configurando_ambientes.md index 747161d..fedfcbc 100644 --- a/configurando_ambientes.md +++ b/configurando_ambientes.md @@ -1,4 +1,4 @@ -PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) +PT/BR [Tutorial geral](tutorial.md) | [Configurando os diferentes ambientes](configurando_ambientes.md) | [Conectando ao querido-diario](conectando_qd.md) EN/US diff --git a/scripts/Dockerfile_windows b/scripts/Dockerfile_windows deleted file mode 100644 index df8b6e9..0000000 --- a/scripts/Dockerfile_windows +++ /dev/null @@ -1,23 +0,0 @@ -FROM docker.io/python:3.10 - -ENV USER gazette -ENV USER_HOME /home/$USER -ENV WORKDIR /tasks - -RUN net user --system $USER --home $USER_HOME && \ - apt-get update -y && \ - curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ - apt-get -y install git-lfs wait-for-it && \ - apt-get clean && \ - git lfs install && \ - mkdir $WORKDIR - -ENV PYTHONPATH $WORKDIR -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY . 
$WORKDIR -WORKDIR $WORKDIR -USER $USER - -RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')" diff --git a/tutorial.md b/tutorial.md index 1d3ebfe..039dc60 100644 --- a/tutorial.md +++ b/tutorial.md @@ -1,4 +1,4 @@ -PT/BR [Tutorial geral](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/tutorial.md) | [Configurando os diferentes ambientes](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/configurando_ambientes.md) | [Conectando ao querido-diario](https://github.com/Luisa-Coelho/qd-data-processing/blob/readme_update/conectando_qd.md) +PT/BR [Tutorial geral](tutorial.md) | [Configurando os diferentes ambientes](configurando_ambientes.md) | [Conectando ao querido-diario](conectando_qd.md) EN/US From 1b20217b7db3fefb6aa1d72c6924064a613113b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADsa=20F=2E=20Coelho?= Date: Thu, 26 Oct 2023 16:22:12 -0300 Subject: [PATCH 19/19] add .venv gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3762ce4..9333dc2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ __pycache__ .coverage envvars contrib/data +.venv