forked from BuilderIO/gpt-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added option for simple containerized execution
- Loading branch information
Showing
6 changed files
with
110 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Image that bundles gpt-crawler with everything it needs at runtime:
# Git, the Docker packages, Node.js 20 and the Playwright browsers.
FROM ubuntu:jammy

# Make a failing download in a `curl | gpg` pipeline fail the build step
# instead of silently producing an empty keyring.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Keep apt from prompting for input during the image build.
ARG DEBIAN_FRONTEND=noninteractive

# Install Git and the tools needed to register the apt repositories below.
# sudo stays installed so scripts that call it keep working in the container.
# `apt-get update` lives in the same RUN as the install so a cached layer
# can never pair stale package lists with a fresh install.
RUN apt-get update && \
    apt-get install -y sudo git ca-certificates curl gnupg && \
    mkdir -p /etc/apt/keyrings && \
    rm -rf /var/lib/apt/lists/*

# Install Docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" > /etc/apt/sources.list.d/docker.list && \
    apt-get update && \
    apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin && \
    rm -rf /var/lib/apt/lists/*

# Install Nodejs v20 npm
RUN curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \
    echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" > /etc/apt/sources.list.d/nodesource.list && \
    apt-get update && \
    apt-get install -y nodejs && \
    rm -rf /var/lib/apt/lists/*

# Install gpt-crawler together with the Playwright browsers and their system libraries
RUN git clone https://github.com/builderio/gpt-crawler /home/gpt-crawler && \
    cd /home/gpt-crawler && \
    npm i && \
    npx playwright install && \
    npx playwright install-deps

# Directory to mount in the docker container to get the output.json data
RUN mkdir -p /home/data

WORKDIR /home
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Containerized crawler | ||
## Docker image with packaged crawler, with script for building and execution. | ||
|
||
|
||
All dependencies are set up and configured in the Dockerfile. Docker must be installed on the host.
|
||
|
||
## Get started | ||
|
||
### Prerequisites | ||
|
||
Make sure you have Docker installed.
|
||
1. `cd gpt-crawler/containerapp`
2. `. ./run.sh`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import { Page } from "playwright"; | ||
|
||
/** Settings that control a single crawl run. */
type Config = {
  /** URL to start the crawl */
  url: string;
  /** Pattern to match against for links on a page to subsequently crawl */
  match: string;
  /** Selector to grab the inner text from */
  selector: string;
  /** Don't crawl more than this many pages */
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional cookie to be set. E.g. for Cookie Consent */
  cookie?: { name: string; value: string };
  /** Optional function to run for each page found */
  onVisitPage?: (options: {
    page: Page;
    // The payload shape is chosen by the caller, so it is left untyped here.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    pushData: (data: any) => Promise<void>;
  }) => Promise<void>;
  /** Optional timeout for waiting for a selector to appear */
  waitForSelectorTimeout?: number;
};

/**
 * Crawl configuration used by the containerized crawler.
 *
 * The container start script copies this file into the crawler checkout and
 * starts the crawler from that directory, while the host's data directory is
 * mounted next to it. The output path therefore points at the mounted data
 * directory so the result is still available once the container is removed.
 * NOTE(review): this assumes the crawler resolves `outputFileName` against
 * its working directory — confirm against the crawler's write step.
 */
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 50,
  outputFileName: "../data/output.json",
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/bin/bash
# Container start script: installs the mounted crawl config into the crawler
# checkout, runs the crawler, and reports the crawler's exit status.

CONFIG_SRC=/home/data/config.ts
CRAWLER_DIR=/home/gpt-crawler

# copy the config when starting the container
# A failed copy is reported but not fatal, so the crawl still runs with
# whatever config is already present in the checkout.
if ! cp "$CONFIG_SRC" "$CRAWLER_DIR/"; then
    echo "Warning: could not copy $CONFIG_SRC; using the config already in $CRAWLER_DIR." >&2
fi

# start the crawler
cd "$CRAWLER_DIR" || { echo "Error: cannot enter $CRAWLER_DIR." >&2; exit 1; }
npm start
status=$?

# Only announce success when the crawler actually succeeded; otherwise pass
# its exit code on so the caller of `docker run` can see the failure.
if [ "$status" -ne 0 ]; then
    echo "Crawling failed (exit code $status)." >&2
    exit "$status"
fi

# Print message after crawling and exit
echo "Crawling complete.."
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
# Builds the "crawler" image when it is missing, then runs one crawl in a
# throw-away container with ./data mounted for input and output.
# Run it from the directory that contains the Dockerfile and the data folder.
# The script may be sourced (`. ./run.sh`), so on failure it tries `return`
# first and only falls back to `exit` when executed directly.

# Check if there is a Docker image named "crawler"
# `docker image inspect` matches the exact image name, unlike grepping the
# image list, which also matches names such as "gpt-crawler".
if ! sudo docker image inspect crawler > /dev/null 2>&1; then
    echo "Docker repository 'crawler' not found. Building the image..."
    # Build the Docker image with the name 'crawler'
    if ! sudo docker build -t crawler .; then
        echo "Docker build failed; not starting the crawler." >&2
        return 1 2> /dev/null || exit 1
    fi
else
    echo "Docker image already built."
fi

# Ensure that init.sh script is executable
sudo chmod +x ./data/init.sh

# Starting docker, mount docker.sock to work with docker-in-docker function, mount data directory for input/output from container
# The data directory is passed as an absolute path because older Docker
# versions reject relative bind-mount sources.
sudo docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -v "$(pwd)/data:/home/data" crawler bash -c "/home/data/init.sh"