From fe00dd2d0c34551cdb9ec98ee788625943119d6d Mon Sep 17 00:00:00 2001 From: Dennis Kavlakoglu Date: Tue, 17 Dec 2024 16:51:20 -0500 Subject: [PATCH 1/3] Modified ingest.py to deal with exceptions in PDF processing so that failure to process a PDF doesn't stop WARC ingestion. Now the failure is noted and the PDF skipped. Also created a Dockerfile and compose file to run the app in a Docker container. --- .dockerignore | 34 ++++++++++++++++++++++++++++++++++ .gitignore | 6 ++++++ Dockerfile | 30 ++++++++++++++++++++++++++++++ README.Docker.md | 22 ++++++++++++++++++++++ docker-compose.yaml | 15 +++++++++++++++ warc_gpt/commands/ingest.py | 11 +++++++---- 6 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 README.Docker.md create mode 100644 docker-compose.yaml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..03a268b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,34 @@ +# Include any files or directories that you don't want to be copied to your +# container here (e.g., local build artifacts, temporary files, etc.). 
+# +# For more help, visit the .dockerignore file reference guide at +# https://docs.docker.com/go/build-context-dockerignore/ + +**/.DS_Store +**/__pycache__ +**/.venv +**/.classpath +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/bin +**/charts +**/docker-compose* +**/compose.y*ml +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md diff --git a/.gitignore b/.gitignore index 532ef92..55cffdd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,9 @@ TODO.md runs/ _*/ env/ +/.idea/.gitignore +/.idea/aws.xml +/.idea/misc.xml +/.idea/modules.xml +/.idea/vcs.xml +/.idea/warc-gpt-public.iml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b4786a0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +# syntax=docker/dockerfile:1 + +# Comments are provided throughout this file to help you get started. +# If you need more help, visit the Dockerfile reference guide at +# https://docs.docker.com/go/dockerfile-reference/ + +# Want to help us make this template better? Share your feedback here: https://forms.gle/ybq9Krt8jtBL3iCk7 + +ARG PYTHON_VERSION=3.12.4 +FROM python:${PYTHON_VERSION}-slim AS base + +# Prevents Python from writing pyc files. +ENV PYTHONDONTWRITEBYTECODE=1 + +# Keeps Python from buffering stdout and stderr to avoid situations where +# the application crashes without emitting any logs due to buffering. +ENV PYTHONUNBUFFERED=1 + +# Copy the source code into the container. +COPY . . 
+ +# Install poetry app +RUN pip install poetry +RUN poetry env use 3.12 && poetry install + +# To run the application on localhost:5000 only, use this CMD instead: +#CMD ["poetry", "run", "flask", "run"] + +# Run the application on 0.0.0.0:5000 so it is reachable from outside the container +CMD ["poetry", "run", "flask", "run", "--host", "0.0.0.0"] \ No newline at end of file diff --git a/README.Docker.md b/README.Docker.md new file mode 100644 index 0000000..4be047b --- /dev/null +++ b/README.Docker.md @@ -0,0 +1,22 @@ +### Building and running your application + +When you're ready, start your application by running: +`docker compose up --build`. + +Your application will be available at http://localhost:5000. + +### Deploying your application to the cloud + +First, build your image, e.g.: `docker build -t myapp .`. +If your cloud uses a different CPU architecture than your development +machine (e.g., you are on a Mac M1 and your cloud provider is amd64), +you'll want to build the image for that platform, e.g.: +`docker build --platform=linux/amd64 -t myapp .`. + +Then, push it to your registry, e.g. `docker push myregistry.com/myapp`. + +Consult Docker's [getting started](https://docs.docker.com/go/get-started-sharing/) +docs for more detail on building and pushing. + +### References +* [Docker's Python guide](https://docs.docker.com/language/python/) \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..22fa3b2 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,15 @@ +services: + warc-gpt: + container_name: warc-gpt + env_file: .env + build: .
+ ports: + - "5000:5000" + restart: always + environment: + OLLAMA_API_URL: ${OLLAMA_API_URL} + OPENAI_API_KEY: ${OPENAI_API_KEY} + OPENAI_ORG_ID: ${OPENAI_ORG_ID} + WARC_FOLDER_PATH: ${WARC_FOLDER_PATH} + VISUALIZATIONS_FOLDER_PATH: ${VISUALIZATIONS_FOLDER_PATH} + VECTOR_SEARCH_PATH: ${VECTOR_SEARCH_PATH} \ No newline at end of file diff --git a/warc_gpt/commands/ingest.py b/warc_gpt/commands/ingest.py index d992582..a92b2af 100644 --- a/warc_gpt/commands/ingest.py +++ b/warc_gpt/commands/ingest.py @@ -174,10 +174,13 @@ def ingest(batch_size) -> None: # if record_data["warc_record_content_type"].startswith("application/pdf"): raw = io.BytesIO(record.raw_stream.read()) - pdf = PdfReader(raw) - - for page in pdf.pages: - record_data["warc_record_text"] += page.extract_text() + try: + pdf = PdfReader(raw) + for page in pdf.pages: + record_data["warc_record_text"] += page.extract_text() + except Exception as exc: + print(f"- Could not extract text from {record_data['warc_record_target_uri']}") + continue # # Stop here if we don't have text, or text contains less than 5 words From d74e75290406e73663f75cafd2704a3b582242ed Mon Sep 17 00:00:00 2001 From: Dennis Kavlakoglu Date: Tue, 17 Dec 2024 19:18:39 -0500 Subject: [PATCH 2/3] Added bind mounts for app PATHs. Added instructions for how to execute commands in the docker container to README.Docker.md. --- README.Docker.md | 6 ++++++ docker-compose.yaml | 8 ++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.Docker.md b/README.Docker.md index 4be047b..020c30f 100644 --- a/README.Docker.md +++ b/README.Docker.md @@ -18,5 +18,11 @@ Then, push it to your registry, e.g. `docker push myregistry.com/myapp`. Consult Docker's [getting started](https://docs.docker.com/go/get-started-sharing/) docs for more detail on building and pushing. 
+### Executing Ingestion and Visualization Commands + +ingest: `docker exec -it warc-gpt poetry run flask ingest` + +visualize: `docker exec -it warc-gpt poetry run flask visualize` + ### References * [Docker's Python guide](https://docs.docker.com/language/python/) \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 22fa3b2..07e5764 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,7 +4,7 @@ services: env_file: .env build: . ports: - - "5000:5000" + - "9999:5000" restart: always environment: OLLAMA_API_URL: ${OLLAMA_API_URL} @@ -12,4 +12,8 @@ services: OPENAI_ORG_ID: ${OPENAI_ORG_ID} WARC_FOLDER_PATH: ${WARC_FOLDER_PATH} VISUALIZATIONS_FOLDER_PATH: ${VISUALIZATIONS_FOLDER_PATH} - VECTOR_SEARCH_PATH: ${VECTOR_SEARCH_PATH} \ No newline at end of file + VECTOR_SEARCH_PATH: ${VECTOR_SEARCH_PATH} + volumes: + - "${WARC_FOLDER_PATH}:/warc" + - "${VECTOR_SEARCH_PATH}:/chromadb" + - "${VISUALIZATIONS_FOLDER_PATH}:/visualizations" \ No newline at end of file From 58d3b47ed6df4e7695431192833f06e1492d7902 Mon Sep 17 00:00:00 2001 From: Dennis Kavlakoglu Date: Tue, 17 Dec 2024 19:19:22 -0500 Subject: [PATCH 3/3] Changed port mapping back to 5000:5000 --- docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 07e5764..85025cf 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,7 +4,7 @@ services: env_file: .env build: . ports: - - "9999:5000" + - "5000:5000" restart: always environment: OLLAMA_API_URL: ${OLLAMA_API_URL}