diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..06deb0b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+collections
+
diff --git a/Dockerfile b/Dockerfile
index 2d0be0f..488ebde 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,18 +1,60 @@
-FROM python:3.9
+# as of Jan 2025, python3 on production is on 3.8.10
+FROM ubuntu:20.04
 
-RUN apt-get -qq update && apt-get -qqy install awscli
+# Build-time only: keeps apt non-interactive without persisting into the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    linux-headers-virtual \
+    make \
+    gcc \
+    git \
+    curl \
+    nano \
+    libev-dev \
+    libssl-dev \
+    libc-dev \
+    libffi-dev \
+    libpcre3-dev \
+    python3.9-full \
+    python3.9-dev \
+    python3-venv \
+    python3-dev \
+    python3-pip \
+    python3-setuptools \
+    python3-wheel \
+    python3-cffi \
+    python3-pytest \
+    && rm -rf /var/lib/apt/lists/*
+
+#RUN python3 -m venv /var/venv
+#ENV PATH="/var/venv/bin:$PATH"
+
+# Register python3.8 and python3.9 as alternatives and make 3.9 the default python3
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 && \
+    update-alternatives --set python3 /usr/bin/python3.9
+
+# might come in handy later if we can upgrade gevent further...
+#RUN pip install --upgrade pip setuptools wheel
+#RUN pip install "Cython<3" "setuptools<58"
+#RUN pip download --no-binary :all: --no-deps gevent==20.9.0
+#RUN pip install gevent-20.9.0.tar.gz
 
 # Install dependencies
 COPY ./requirements.txt /tmp/requirements.txt
-RUN pip install -r /tmp/requirements.txt
+RUN pip install --no-cache-dir -Ur /tmp/requirements.txt
 
 # Add the cc-index-server code into the image
 COPY ./ /opt/webapp/
 WORKDIR /opt/webapp
 
-RUN ./install-collections.sh
-# Note: to avoid that collections are fetched anew on every image build,
-# you may install collections locally on the host in the build directory
-# and remove this command
+# Collections are excluded from the build context (.dockerignore); mount them at run time
+VOLUME /opt/webapp/collections
 
-CMD /usr/local/bin/pywb
+# uWSGI reloads on SIGTERM unless die-on-term is configured; SIGINT shuts it down
+STOPSIGNAL SIGINT
+
+# Exec form: uwsgi runs as PID 1 and receives the stop signal directly
+CMD ["uwsgi", "--ini", "uwsgi.ini"]
diff --git a/README.md b/README.md
index a5859cc..1d71039 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 This project is a deployment of the [pywb](https://github.com/webrecorder/pywb) web archive replay and index server to provide
 an index query mechanism for datasets provided by [Common Crawl](https://commoncrawl.org)
 
+We depend on a fork of pywb, [maintained on this branch](https://github.com/commoncrawl/pywb/tree/common-crawl-cdx-index). It is a modified version of PyWB (pywb>=2.5.0), which is API compatible with PyWB 0.33.2.
 ## Usage & Installation
 To run locally, please install with `pip install -r requirements.txt`
 
@@ -28,10 +29,10 @@ If you have docker installed in your system, you can run index server with docke
 git clone https://github.com/commoncrawl/cc-index-server.git
 cd cc-index-server
 docker build . -t cc-index
-docker run --rm --publish 8080:8080 -ti cc-index
+# optional/one time - big download of data to local collections folder (needs the AWS CLI on the host)
+./install-collections.sh
+docker run --rm -v "$PWD/collections/:/opt/webapp/collections/" --publish 8080:8080 -ti cc-index
 ```
-
-You can use `install-collections.sh` to download indexes to your system and mount it on docker.
 
 ## CDX Server API
 
diff --git a/install-collections.sh b/install-collections.sh
index 7a2bd45..12e3b81 100755
--- a/install-collections.sh
+++ b/install-collections.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 if [ ! -d "collections" ]; then
     mkdir collections
diff --git a/requirements.txt b/requirements.txt
index 58528bf..e24f2f7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,13 @@
+# this is the lowest version that seems to build on newer oses on arm.
+# currently, our pywb tries to get 20.9.0, we should update that...
+gevent==20.12.0
+
 # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2
 git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb
+
+cffi
 boto3
-gevent
 uwsgi
-
-# AWS CLI (aws s3 cp ...) is used by install-collections.sh
-# to fetch cluster.idx and metadata.yaml
-#awscli
+greenlet==1.1.2
+werkzeug==2.0.3
+markupsafe==2.0.1