added option for simple containerized execution
Umar-Azam committed Nov 20, 2023
1 parent 2e41390 commit b51bcf9
Showing 6 changed files with 110 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -114,6 +114,8 @@ Use this option for API access to your generated knowledge that you can integrat

![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49)

## (Alternate method) Running in a container with Docker
To obtain `output.json` from a containerized run, go into the `containerapp` directory, modify `data/config.ts` the same way as described above, and then run the `run.sh` script. The `output.json` file will be generated in the `data` folder.
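
A minimal end-to-end sketch of that flow, starting from the repository root (the editor command is only a placeholder for however you prefer to edit the config):

```sh
# from the root of the gpt-crawler checkout
cd containerapp

# adjust url, match, selector, etc. before starting the crawl
$EDITOR data/config.ts

# build the 'crawler' image on first use and run the crawl
. ./run.sh

# per the note above, the result ends up in the mounted data directory
ls data/output.json
```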


## Contributing
35 changes: 35 additions & 0 deletions containerapp/Dockerfile
@@ -0,0 +1,35 @@
FROM ubuntu:jammy

# Install Git
RUN apt-get update && \
    apt-get install -y sudo && \
    apt-get install -y git

# Install Docker
RUN apt-get install -y ca-certificates curl gnupg && \
    # ensure the keyrings directory exists before importing Docker's GPG key
    install -m 0755 -d /etc/apt/keyrings && \
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \
    apt-get update && \
    apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

# Install Node.js v20 and npm
RUN sudo apt-get update && \
    sudo apt-get install -y ca-certificates curl gnupg && \
    sudo mkdir -p /etc/apt/keyrings && \
    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg

RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list && \
    sudo apt-get update && \
    sudo apt-get install -y nodejs

# Install gpt-crawler
RUN cd /home && git clone https://github.com/builderio/gpt-crawler && \
    cd gpt-crawler && \
    npm i && \
    npx playwright install && \
    npx playwright install-deps

# Directory to mount in the docker container to get the output.json data
RUN cd /home && mkdir data

WORKDIR /home
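
For reference, a manual build-and-run roughly equivalent to what `run.sh` (further below) automates; this sketch uses an absolute path for the data mount instead of the relative one in the script:

```sh
cd containerapp
sudo docker build -t crawler .
sudo docker run --rm -it \
  -v /var/run/docker.sock:/var/run/docker.sock \
  -v "$(pwd)/data:/home/data" \
  crawler bash -c "/home/data/init.sh"
```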
15 changes: 15 additions & 0 deletions containerapp/README.md
@@ -0,0 +1,15 @@
# Containerized crawler
## Docker image with the packaged crawler, plus a script for building and running it


All dependencies are set up and configured in the Dockerfile. Docker must be installed on the host.


## Get started

### Prerequisites

Be sure you have Docker installed, then run:

1. ``` cd gpt-crawler/containerapp ```
2. ``` . ./run.sh ```
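
When the container exits, the crawl result should sit in the mounted `data` directory next to `config.ts` (assuming the default `outputFileName` of `output.json`):

```sh
ls -l data/output.json
```

`init.sh` copies `data/config.ts` into the cloned crawler before each run, so config edits take effect on the next invocation of `run.sh`.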
31 changes: 31 additions & 0 deletions containerapp/data/config.ts
@@ -0,0 +1,31 @@
import { Page } from "playwright";

type Config = {
  /** URL to start the crawl */
  url: string;
  /** Pattern to match against for links on a page to subsequently crawl */
  match: string;
  /** Selector to grab the inner text from */
  selector: string;
  /** Don't crawl more than this many pages */
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional cookie to be set. E.g. for Cookie Consent */
  cookie?: { name: string; value: string };
  /** Optional function to run for each page found */
  onVisitPage?: (options: {
    page: Page;
    pushData: (data: any) => Promise<void>;
  }) => Promise<void>;
  /** Optional timeout for waiting for a selector to appear */
  waitForSelectorTimeout?: number;
};

export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
11 changes: 11 additions & 0 deletions containerapp/data/init.sh
@@ -0,0 +1,11 @@
#!/bin/bash

# copy the config when starting the container
cp /home/data/config.ts /home/gpt-crawler/

# start the crawler
cd /home/gpt-crawler && npm start

# Print message after crawling and exit
echo "Crawling complete.."
exit
16 changes: 16 additions & 0 deletions containerapp/run.sh
@@ -0,0 +1,16 @@
#!/bin/bash

# Check if there is a Docker image named "crawler"
if ! sudo docker images | grep -w 'crawler' > /dev/null; then
  echo "Docker repository 'crawler' not found. Building the image..."
  # Build the Docker image with the name 'crawler'
  sudo docker build -t crawler .
else
  echo "Docker image already built."
fi

# Ensure that init.sh script is executable
sudo chmod +x ./data/init.sh

# Start the container: mount docker.sock for docker-in-docker use, and mount the data directory for input/output
sudo docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -v ./data:/home/data crawler bash -c "/home/data/init.sh"
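
Because the image check above skips the build once a `crawler` image exists, repeat crawls only pay the container start-up cost. A sketch of a second run with different settings:

```sh
$EDITOR data/config.ts   # e.g. point url and match at a different docs site
. ./run.sh               # reuses the cached 'crawler' image
```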
