tardigrade crawler
vtempest committed Sep 14, 2024
1 parent 4c66fef commit 94ae200
Showing 5 changed files with 276 additions and 1 deletion.
1 change: 0 additions & 1 deletion src/crawler
Submodule crawler deleted from 065a07
20 changes: 20 additions & 0 deletions src/crawler/Dockerfile
@@ -0,0 +1,20 @@
# Docker for Crawler
# with Node, Chromium, Puppeteer, Alpine

FROM node:alpine
RUN apk add --no-cache \
chromium \
nss \
freetype \
freetype-dev \
harfbuzz \
ca-certificates \
ttf-freefont
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
WORKDIR /app
COPY package*.json ./
RUN npm install
COPY . .
EXPOSE 3000
CMD [ "npm", "start" ]
21 changes: 21 additions & 0 deletions src/crawler/compose.yml
@@ -0,0 +1,21 @@
services:
scrapoxy:
image: fabienvauchelles/scrapoxy
ports:
- 8888:8888
- 8890:8890
environment:
- NODE_ENV=production
- AUTH_LOCAL_USERNAME=admin
- AUTH_LOCAL_PASSWORD=password
- BACKEND_JWT_SECRET=secret1
- FRONTEND_JWT_SECRET=secret2
- STORAGE_FILE_FILENAME=/etc/scrapoxy/config.json

crawler:
build: .
ports:
- 3000:3000



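scrapoxy fronts a pool of upstream proxies on port 8888 (its web UI is on 8890), and the crawler can be pointed at it through the PUPPETEER_PROXY variable it already reads. A minimal sketch of that route follows; the project credentials are placeholders (scrapoxy's per-project proxy credentials, not the AUTH_LOCAL_* admin login above), and the script itself is an assumption, not part of the commit:

// proxy-check.js -- hypothetical standalone test of the scrapoxy route;
// the service name "scrapoxy" resolves on the compose network
const puppeteer = require("puppeteer-extra");

(async () => {
  const browser = await puppeteer.launch({
    headless: "new",
    args: ["--no-sandbox", "--proxy-server=scrapoxy:8888"],
  });
  const page = await browser.newPage();
  // Scrapoxy authenticates proxy users per project; these are placeholders
  await page.authenticate({
    username: "project-user",
    password: "project-password",
  });
  await page.goto("https://example.org");
  console.log(await page.title());
  await browser.close();
})();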
212 changes: 212 additions & 0 deletions src/crawler/crawler.js
@@ -0,0 +1,212 @@
/**
* @class Crawler
* A webpage proxy that routes requests through Chromium (Puppeteer):
* 1. The server API takes a URL and renders the DOM with Puppeteer to get the full HTML.
* 2. It waits out the Cloudflare bot check, so it can bypass anti-bot / anti-DDoS
* protection for any client (such as curl).
* Send your request to the server on port 3000 and pass the target in the "url"
* query string, like this: `http://localhost:3000/?url=https://example.org`
*
* <img src="https://i.imgur.com/XXXTprT.png" width="500px" />
*/
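// Example usage (assuming the server is running locally on port 3000;
// the POST body is forwarded to the target as-is):
//   curl "http://localhost:3000/?url=https://example.org"
//   curl -X POST -d "key=value" "http://localhost:3000/?url=https://example.org"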
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin()); // Use stealth plugin to make puppeteer harder to detect

// Import Koa and its middleware
const Koa = require("koa");
const bodyParser = require("koa-bodyparser");
const app = new Koa(); // Create a new Koa application
app.use(bodyParser()); // Use body parser middleware

// Import jsesc for escaping JavaScript strings
const jsesc = require("jsesc");

// Define headers to be removed from requests and responses
const requestHeadersToRemove = [
"host",
"user-agent",
"accept-encoding",
"content-length",
"forwarded",
"x-forwarded-proto",
"x-forwarded-for",
"x-cloud-trace-context",
];
const responseHeadersToRemove = [
"Accept-Ranges",
"Content-Length",
"Keep-Alive",
"Connection",
"content-encoding",
"set-cookie",
];

// Main application logic
// Set up Puppeteer options
let options = {
headless: "new", // Use new headless mode
args: ["--no-sandbox", "--disable-setuid-sandbox"], // Security-related arguments
};

// Configure Puppeteer based on environment variables
if (process.env.PUPPETEER_SKIP_CHROMIUM_DOWNLOAD)
options.executablePath = "/usr/bin/chromium-browser";
if (process.env.PUPPETEER_HEADFUL) options.headless = false;
if (process.env.PUPPETEER_USERDATADIR)
options.userDataDir = process.env.PUPPETEER_USERDATADIR;
if (process.env.PUPPETEER_PROXY)
options.args.push(`--proxy-server=${process.env.PUPPETEER_PROXY}`);
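// For example (an assumed value, not set anywhere in this commit),
// PUPPETEER_PROXY=scrapoxy:8888 would route all page traffic through
// the scrapoxy container defined in compose.yml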

// Launch the browser; top-level await is unavailable in a CommonJS module,
// so keep the promise and await it inside the request handler below
const browserPromise = puppeteer.launch(options);

// Set up Koa middleware
app.use(async (ctx) => {
if (ctx.query.url) {
// Extract and decode the URL from the query string
const url = decodeURIComponent(ctx.url.replace("/?url=", ""));
if (process.env.DEBUG) {
console.log(`[DEBUG] URL: ${url}`);
}

// Initialize variables for response data
let responseBody;
let responseData;
let responseHeaders;

// Resolve the shared browser and create a new page for this request
const browser = await browserPromise;
const page = await browser.newPage();

// Set up request interception (removeAllListeners is synchronous, so no await)
page.removeAllListeners("request");
await page.setRequestInterception(true);
let requestHeaders = ctx.headers;
requestHeadersToRemove.forEach((header) => {
delete requestHeaders[header];
});

// Handle each intercepted request
page.on("request", (request) => {
requestHeaders = Object.assign({}, request.headers(), requestHeaders);
if (process.env.DEBUG) {
console.log(
`[DEBUG] requested headers: \n${JSON.stringify(requestHeaders)}`
);
}
if (ctx.method == "POST") {
request.continue({
headers: requestHeaders,
method: "POST",
postData: ctx.request.rawBody,
});
} else {
request.continue({ headers: requestHeaders });
}
});

// Set up a CDP session to intercept top-level document responses at the
// HeadersReceived stage, so file downloads can be captured directly
const client = await page.target().createCDPSession();
await client.send("Network.setRequestInterception", {
patterns: [
{
urlPattern: "*",
resourceType: "Document",
interceptionStage: "HeadersReceived",
},
],
});

// Handle intercepted responses (client.on registers a listener synchronously)
client.on("Network.requestIntercepted", async (e) => {
let obj = { interceptionId: e.interceptionId };
if (e.isDownload) {
await client
.send("Network.getResponseBodyForInterception", {
interceptionId: e.interceptionId,
})
.then((result) => {
if (result.base64Encoded) {
responseData = Buffer.from(result.body, "base64");
}
});
obj["errorReason"] = "BlockedByClient";
responseHeaders = e.responseHeaders;
}
await client.send("Network.continueInterceptedRequest", obj);
if (e.isDownload) await page.close();
});

try {
// Navigate to the URL and handle potential challenges
let response;
let tryCount = 0;
response = await page.goto(url, {
timeout: 10000,
waitUntil: "domcontentloaded",
});
ctx.status = response.status();
responseBody = await response.text();
responseData = await response.buffer();
while (
responseBody.includes(
process.env.CHALLENGE_MATCH || "challenge-platform"
) &&
tryCount <= 10
) {
const newResponse = await page.waitForNavigation({
timeout: 10000,
waitUntil: "domcontentloaded",
});
if (newResponse) response = newResponse;
responseBody = await response.text();
responseData = await response.buffer();
tryCount++;
}
responseHeaders = await response.headers();

// Handle cookies
const cookies = await page.cookies();
if (cookies)
cookies.forEach((cookie) => {
// Drop fields Koa's cookies.set does not accept, and avoid reusing the
// name "options", which would shadow the Puppeteer launch options above
const { name, value, secure, expires, domain, ...cookieOptions } = cookie;
ctx.cookies.set(name, value, cookieOptions);
});
} catch (error) {
// Report navigation errors; ERR_BLOCKED_BY_CLIENT is the synthetic
// block issued above for downloads, so it is not an error here
if (!error.toString().includes("ERR_BLOCKED_BY_CLIENT")) {
ctx.status = 500;
ctx.body = error.toString(); // a bare Error object would serialize to "{}"
}
}

// Close the page
await page.close();

// Process response headers
if (responseHeaders) {
responseHeadersToRemove.forEach(
(header) => delete responseHeaders[header]
);
Object.keys(responseHeaders).forEach((header) =>
ctx.set(header, jsesc(responseHeaders[header]))
);
}

// Debug logging
if (process.env.DEBUG) {
console.log(
`[DEBUG] response headers: \n${JSON.stringify(responseHeaders)}`
);
}

// Set the response body
ctx.body = responseData;
} else {
// If no URL is provided, return an error message
ctx.body = "Please specify the URL in the 'url' query string.";
}
});

// Start the server
app.listen(process.env.PORT || 3000, process.env.ADDRESS || "::");
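
For programmatic use, any HTTP client works. A minimal sketch with Node's built-in fetch (assuming Node 18+, which the node:alpine base image satisfies; the script is illustrative, not part of the commit):

// client.js -- hypothetical consumer of the crawler API
(async () => {
  const target = encodeURIComponent("https://example.org");
  const res = await fetch(`http://localhost:3000/?url=${target}`);
  console.log(res.status);
  console.log((await res.text()).slice(0, 200)); // first 200 chars of rendered HTML
})();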
23 changes: 23 additions & 0 deletions src/crawler/package.json
@@ -0,0 +1,23 @@
{
"name": "tardigrade-crawler",
"version": "1.0.0",
"description": "Server API renders DOM with puppeteer to get HTML & Bypass Cloudflare bot check.",
"main": "crawler.js",
"license": "MPL-2.0",
"scripts": {
"start": "node crawler.js"
},
"dependencies": {
"jsesc": "^3.0.2",
"koa": "^2.15.0",
"koa-bodyparser": "^4.4.1",
"koa-router": "^12.0.0",
"koa-static": "^5.0.0",
"puppeteer": "^21.7.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-koa": "^0.0.1"
}

}
