
Dockerfile for llm and service in docker compose #98

Merged
merged 15 commits on Feb 25, 2024
12 changes: 9 additions & 3 deletions README.md
@@ -71,12 +71,18 @@ Upon app startup, OpenAI-compatible embedding API will be available at:

Check the docs here: <http://172.16.3.101:5001/docs>

#### llamacpp
#### Download the LLM model (required for the llm service to work)

Download models (this can take >1h):
Download the model (file size: 3.6 GB):

```sh
wget https://huggingface.co/TheBloke/sheep-duck-llama-2-70B-v1.1-GGUF/resolve/main/sheep-duck-llama-2-70b-v1.1.Q4_K_S.gguf
curl -o ./llm/models/llama-2-7b.Q3_K_L.gguf -L https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf
```

or

```sh
wget -P ./llm/models https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf
```
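Either command should leave the file at the path the compose file mounts into the container. A minimal sanity check (a sketch, assuming you run it from the repository root; the path and expected size come from the commands above):

```sh
# The llm service mounts ./llm/models into the container as /models,
# so the file must land exactly here for the server to find it.
MODEL=./llm/models/llama-2-7b.Q3_K_L.gguf
if [ -f "$MODEL" ]; then
  du -h "$MODEL"   # expect roughly 3.6 GB
else
  echo "model missing: $MODEL" >&2
fi
```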

#### Starting app
13 changes: 11 additions & 2 deletions docker-compose.yml
@@ -1,6 +1,6 @@
services:
discord-bot:
profiles: ["dev","prod"]
profiles: [ "dev", "prod" ]
build: ./discord_bot
env_file: .env

@@ -16,7 +16,7 @@ services:
- .env

api:
profiles: ["dev","prod"]
profiles: [ "dev", "prod" ]
build:
context: ./api/
env_file:
@@ -27,3 +27,12 @@ services:
- "8000:8000"
depends_on:
- db

llm:
profiles: [ "dev", "prod" ]
build:
context: ./llm/
volumes:
- ./llm/models:/models
ports:
- "9000:9000"
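Since the container's entrypoint runs `llama_cpp.server`, the service speaks the OpenAI-compatible API on the mapped port 9000. A hypothetical smoke test from the host once the stack is up (the prompt and `max_tokens` values are illustrative):

```sh
# Query the OpenAI-compatible completions endpoint exposed by the llm service.
LLM_URL=http://localhost:9000
curl -s "$LLM_URL/v1/completions" \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Hello", "max_tokens": 16}' \
  || echo "llm service not reachable at $LLM_URL" >&2
```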
5 changes: 5 additions & 0 deletions llm/.gitignore
@@ -0,0 +1,5 @@
models/
*.ipynb
llama-cpp-python/
.pytest_cache/
__pycache__/
28 changes: 28 additions & 0 deletions llm/Dockerfile
@@ -0,0 +1,28 @@
FROM python:3.11-buster as builder

RUN pip install poetry==1.6.1
RUN apt-get update && apt-get install -y git
RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git

ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_IN_PROJECT=1 \
POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

COPY pyproject.toml poetry.lock ./
RUN touch README.md

RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install

FROM python:3.11-slim-buster as runtime

ENV VIRTUAL_ENV=/app/.venv \
PATH="/app/.venv/bin:$PATH"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install "llama-cpp-python[server]"

ENTRYPOINT ["python3", "-m", "llama_cpp.server", "--host", "0.0.0.0", "--port", "9000", "--model", "models/llama-2-7b.Q3_K_L.gguf", "--n_gpu_layers", "9999999"]
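With this Dockerfile in place, the service can be built and started through the compose profiles declared above (a sketch, assuming it is run from the repository root and that the model file has already been downloaded into `./llm/models`):

```sh
# Build and start only the llm service; "dev" is one of the two
# profiles ("dev", "prod") declared in docker-compose.yml.
SERVICE=llm
if command -v docker >/dev/null 2>&1; then
  docker compose --profile dev up --build -d "$SERVICE" \
    || echo "compose up failed; check that ./llm and the model file exist" >&2
else
  echo "docker not installed" >&2
fi
```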