forked from vectara/vectara-ingest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dockerfile
61 lines (53 loc) · 1.85 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
FROM ubuntu:22.04

# Build-time only: keeps apt/dpkg from prompting during the RUN steps of this
# stage. Declared as ARG (not ENV) so it is not baked into the runtime
# environment of containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment.
#   HOME                 - application home; used below as WORKDIR and as the
#                          destination of every COPY.
#   XDG_RUNTIME_DIR      - points the XDG runtime directory at /tmp.
#   RAY_DEDUP_LOGS       - NOTE(review): presumably disables Ray's log
#                          de-duplication; confirm against the Ray docs.
#   CUDA_VISIBLE_DEVICES - set to the empty string; presumably hides GPUs so
#                          the CPU-only torch wheel installed later is used.
ENV HOME=/home/vectara \
    XDG_RUNTIME_DIR=/tmp \
    RAY_DEDUP_LOGS="0" \
    CUDA_VISIBLE_DEVICES=""
# Install OS-level dependencies in a single layer, so that the package index
# fetched by `apt-get update` is the one `apt-get install` uses and the cleanup
# at the end actually keeps the index and temp files out of the image.
#
# The sed expression appends `universe` to every sources.list line that ends
# in `main`.
#
# A former standalone `RUN apt-get upgrade` was removed: it ran before any
# `apt-get update` and without `-y`, so it could not reliably upgrade anything.
# Bump the base image tag to pick up OS updates instead. The argument-less
# `apt-get purge -y` was removed as well because it named no packages.
RUN sed -i 's/main$/main universe/' /etc/apt/sources.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        fontconfig \
        fonts-indic \
        fonts-noto-color-emoji \
        git \
        libfontconfig \
        libjpeg-turbo8 \
        libmagic1 \
        libssl-dev \
        libtesseract-dev \
        poppler-utils \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        unifont \
        unixodbc \
        vim \
        wget \
        wkhtmltopdf \
        xfonts-75dpi \
        xvfb \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Python dependencies.
# Only the requirements files are copied at this point, so the pip layers
# below stay cached until one of those two files changes.
WORKDIR ${HOME}
COPY requirements.txt requirements-extra.txt ${HOME}/

# torch is installed on its own from the PyTorch CPU wheel index.
RUN pip install --no-cache-dir torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu

# Install the core requirements, then trim /usr/local in the same layer:
# test directories, compiled bytecode, __pycache__ and build directories are
# deleted, followed by the local .cache contents, /tmp and pip's cache.
RUN pip install --no-cache-dir -r requirements.txt \
    && find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \
    && find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \
    && find /usr/local -type d \( -name '__pycache__' \) -exec rm -rf '{}' + \
    && find /usr/local -type d \( -name 'build' \) -exec rm -rf '{}' + \
    && rm -rf .cache/* /tmp/* \
    && pip cache purge
# Install Playwright's Firefox build together with the OS packages it needs
# (`--with-deps`). The system dependencies are installed through the OS package
# manager, so the apt package lists are removed in the same layer to keep them
# out of the image.
RUN playwright install --with-deps firefox \
    && rm -rf /var/lib/apt/lists/*
# Optional heavyweight extras (all-docs unstructured inference and PII
# detection). Skipped by default; enable with
#   docker build --build-arg INSTALL_EXTRA=true ...
# When enabled, requirements-extra.txt is installed and the spaCy
# en_core_web_lg model is downloaded.
ARG INSTALL_EXTRA=false
RUN if [ "${INSTALL_EXTRA}" = "true" ]; then \
        pip3 install --no-cache-dir -r requirements-extra.txt \
        && python3 -m spacy download en_core_web_lg; \
    fi
# Application code is copied last so that source edits do not invalidate the
# dependency layers above.
COPY *.py ${HOME}/
COPY core/*.py ${HOME}/core/
COPY crawlers/ ${HOME}/crawlers/

#SHELL ["/bin/bash", "-c"]

# The command string runs through a bash login shell, so $CONFIG and $PROFILE
# are expanded from the container's environment when it starts.
ENTRYPOINT ["/bin/bash", "-l", "-c"]
CMD ["python3 ingest.py $CONFIG $PROFILE"]