Fix storing results to datastore, plus other little fixes (#378)
* Check for missing credentials
* Change ENV to ARG in Dockerfile
* Set PYTHONPATH in Dockerfile
* Mostly fix storing results in datastore
* Set JOB_TIMEOUT, improve comments
* Upgrade cffi to 1.17.1
marians authored Nov 15, 2024
1 parent 079d967 commit cb2d4ad
Showing 5 changed files with 45 additions and 16 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
@@ -1,7 +1,7 @@
 FROM alpine:3.19@sha256:c5b1261d6d3e43071626931fc004f70149baeba2c8ec672bd4f27761f8e1ad6b
 
 # Find an eligible version at https://dl-cdn.alpinelinux.org/alpine/v3.19/community/x86_64/
-ENV CHROMIUM_VERSION=124.0.6367.78-r0
+ARG CHROMIUM_VERSION=124.0.6367.78-r0
 
 RUN echo "http://dl-cdn.alpinelinux.org/alpine/v3.19/community" >> /etc/apk/repositories && \
     apk --update --no-cache add ca-certificates \
@@ -35,3 +35,5 @@ ADD spider /workdir/spider
 ADD export /workdir/export
 ADD job.py /workdir/
 ADD VERSION /workdir/VERSION
+
+ENV PYTHONPATH="/workdir"
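
Two notes on what these Dockerfile changes do. `ARG`, unlike `ENV`, exists only while the image is being built and can be overridden per build (`docker build --build-arg CHROMIUM_VERSION=...`), so the Chromium version string no longer lingers in the runtime environment of containers. And `ENV PYTHONPATH="/workdir"` puts /workdir on Python's module search path, which is what lets code like cli.py run `from spider import spider` regardless of the current working directory. A minimal sketch of the effect, assuming it is executed inside the built container:

```python
import os
import sys

# PYTHONPATH entries are inserted into sys.path ahead of site-packages,
# so modules under /workdir win the import lookup.
print(os.environ.get("PYTHONPATH"))  # /workdir
print("/workdir" in sys.path)        # True

from spider import spider            # resolves to /workdir/spider/
```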
6 changes: 4 additions & 2 deletions Makefile
@@ -18,7 +18,9 @@ jobs:
     docker compose up manager
     venv/bin/rq info
 
-# Spider a single URL and inspect the result
+# Spider a single URL and inspect the result.
+# Example:
+# make dryrun ARGS="https://gruene-roesrath.de/"
 dryrun:
     docker run --rm -ti \
         -v $(PWD)/volumes/dev-shm:/dev/shm \
@@ -34,7 +36,7 @@ dryrun:
 # Run the spider.
 # OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES is a workaround for mac OS.
 spider:
-    OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq worker --burst high default low
+    OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES JOB_TIMEOUT=100 venv/bin/rq worker --burst high default low
 
 export:
     docker run --rm -ti \
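
The `JOB_TIMEOUT=100` prefix sets an environment variable for the rq worker; as the job.py diff below shows, the value is read via `os.environ.get("JOB_TIMEOUT", "50")`, so it defaults to 50 seconds when unset. To illustrate what a per-job timeout against a detached container can look like, here is a hypothetical helper (not part of this commit) using only documented docker-py calls:

```python
import time

def wait_or_kill(container, timeout):
    # Hypothetical sketch: poll the detached spider container until it
    # exits; kill it once the timeout budget is used up.
    deadline = time.time() + timeout
    while time.time() < deadline:
        container.reload()  # refresh container.status from the daemon
        if container.status == "exited":
            return True
        time.sleep(1)
    container.kill()
    return False
```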
5 changes: 4 additions & 1 deletion cli.py
@@ -4,6 +4,7 @@

 import argparse
 import logging
+import os
 import signal
 import sys
 import json
@@ -87,9 +88,11 @@ def handle_sigint(signum, frame):
         print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
 
     elif args.command == 'spider':
-        from spider import spider
+        if not os.path.exists(args.credentials_path):
+            raise Exception("Credentials file not found at %s" % args.credentials_path)
         datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
         job = json.loads(args.job)
+        from spider import spider
         spider.execute_single_job(datastore_client, job, "spider-results")
 
     else:
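
The reordering in the 'spider' branch matters in two ways: the existence check surfaces a missing credentials file as a clear, early error instead of an exception from deep inside the client constructor, and the `from spider import spider` import is deferred until the cheap checks have passed. A condensed sketch of the resulting pattern (the wrapper function name is invented for illustration; the calls inside mirror the diff):

```python
import json
import os

from google.cloud import datastore


def run_spider_command(args):
    # Fail fast with an actionable message before touching Google APIs.
    if not os.path.exists(args.credentials_path):
        raise Exception("Credentials file not found at %s" % args.credentials_path)
    datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
    job = json.loads(args.job)
    # Deferred import: the spider package is heavy, so load it last.
    from spider import spider
    spider.execute_single_job(datastore_client, job, "spider-results")
```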
44 changes: 33 additions & 11 deletions job.py
@@ -6,6 +6,7 @@
 import json
 import os
 from datetime import datetime
+from datetime import timezone
 import time
 import logging
 
@@ -16,24 +17,44 @@
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
 
+# Container image to use for the spider
 DOCKER_IMAGE = 'ghcr.io/netzbegruenung/green-spider:latest'
 
-CREDENTIALS_PATH = '/secrets/datastore-writer.json'
+# Path to the Google Cloud Datastore credentials file,
+# as passed to the spider container for writing
+# spider results and screenshots
+CREDENTIALS_PATH_CONTAINER = '/secrets/datastore-writer.json'
+
+# Path to the Google Cloud Datastore credentials file,
+# as used right here in this script for logging the spider run
+CREDENTIALS_PATH_LOCAL = './secrets/datastore-writer.json'
+
+if not os.path.exists(CREDENTIALS_PATH_LOCAL):
+    raise Exception("Credentials file not found at %s" % CREDENTIALS_PATH_LOCAL)
 
 client = docker.from_env()
 low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
 
-datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)
+datastore_client = datastore.Client.from_service_account_json(CREDENTIALS_PATH_LOCAL)
 
 pwd = os.path.abspath(".")
-secrets_path = pwd + "/secrets"
-chromedir_path = pwd + "/volumes/chrome-userdir"
-screenshots_path = pwd + "/screenshots"
 
-volumes = {}
-volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
-volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
-volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}
+# Volumes to be mounted in the spider container.
+# Key is the host path. "bind" value is the container path.
+volumes = {
+    pwd + "/secrets": {
+        "bind": "/secrets",
+        "mode": "ro",
+    },
+    pwd + "/volumes/chrome-userdir": {
+        "bind": "/opt/chrome-userdir",
+        "mode": "rw",
+    },
+    pwd + "/screenshots": {
+        "bind": "/screenshots",
+        "mode": "rw",
+    },
+}
 
 logger = logging.getLogger('rq.worker')
 logger.setLevel(logging.DEBUG)
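
The dict literal above replaces four separate assignments without changing behavior: the Docker SDK for Python expects exactly this shape for `volumes`, with the host path as key and a dict carrying `bind` (container path) and `mode` (`ro`/`rw`) as value, equivalent to `-v host:container:mode` on the CLI. A minimal usage sketch, assuming a local Docker daemon and an existing ./secrets directory:

```python
import os

import docker

client = docker.from_env()
secrets = os.path.abspath("secrets")  # placeholder host directory

# Equivalent to: docker run --rm -v "$PWD/secrets:/secrets:ro" alpine:3.19 ls /secrets
output = client.containers.run(
    "alpine:3.19",
    "ls /secrets",
    volumes={secrets: {"bind": "/secrets", "mode": "ro"}},
    remove=True,
)
print(output.decode())
```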
@@ -51,9 +72,10 @@ def run(job):
" spider "
" --job='{job_json}'")

cmd = cmd_template.format(path=CREDENTIALS_PATH,
cmd = cmd_template.format(path=CREDENTIALS_PATH_CONTAINER,
job_json=json.dumps(job))

# Run spider container
container = client.containers.run(image=DOCKER_IMAGE,
command=cmd,
detach=True,
@@ -70,7 +92,7 @@
     key = datastore_client.key('spider-runs')
     entity = datastore.Entity(key=key)
     results = {
-        'datetime': datetime.utcnow(),
+        'datetime': datetime.now(tz=timezone.utc),
         'url': job['url'],
         'success': True,
         'error': '',
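
Replacing `datetime.utcnow()` with `datetime.now(tz=timezone.utc)` swaps a naive timestamp for a timezone-aware one; `utcnow()` is deprecated since Python 3.12 precisely because its result carries no tzinfo. The difference in a REPL-sized example:

```python
from datetime import datetime, timezone

naive = datetime.utcnow()              # same wall-clock time, but tzinfo is None
aware = datetime.now(tz=timezone.utc)  # carries tzinfo=timezone.utc

print(naive.tzinfo)  # None
print(aware.tzinfo)  # UTC
```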
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
 beautifulsoup4==4.12.3
 cachetools==5.3.3
 certifi==2024.2.2
-cffi==1.16.0
+cffi==1.17.1
 chardet==5.2.0
 click>=7,<8
 cssselect==1.2.0