diff --git a/apps/api/src/__tests__/e2e_noAuth/index.test.ts b/apps/api/src/__tests__/e2e_noAuth/index.test.ts index e0aca36ff..b2b293834 100644 --- a/apps/api/src/__tests__/e2e_noAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_noAuth/index.test.ts @@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL without requiring authorization", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => { expect(response.statusCode).not.toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + it("should return a successful response", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index ba01a7ca4..a165ae223 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002"; .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); + + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://facebook.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + it("should return a successful response with a valid preview token", async () => { const response = await request(TEST_URL) .post("/v0/scrape") @@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.statusCode).toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://twitter.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawl") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawl") @@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002"; ); }); + // Additional tests for insufficient credits? }); @@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002"; expect(response.statusCode).toBe(401); }); + it("should return an error for a blocklisted URL", async () => { + const blocklistedUrl = "https://instagram.com/fake-test"; + const response = await request(TEST_URL) + .post("/v0/crawlWebsitePreview") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send({ url: blocklistedUrl }); + expect(response.statusCode).toBe(403); + expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); + }); + it("should return a successful response with a valid API key", async () => { const response = await request(TEST_URL) .post("/v0/crawlWebsitePreview") diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index bd3fecaf4..3d64f7f4d 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; export async function crawlController(req: Request, res: Response) { try { @@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + + if (isUrlBlocked(url)) { + return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); + } + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 3f28ef605..569be333c 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -2,6 +2,7 @@ import { Request, Response } from "express"; import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; +import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) { if (!url) { return res.status(400).json({ error: "Url is required" }); } + + if (isUrlBlocked(url)) { + return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); + } + const mode = req.body.mode ?? "crawl"; const crawlerOptions = req.body.crawlerOptions ?? {}; const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index be7080086..cfe35b5bc 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -5,6 +5,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../types"; import { logJob } from "../services/logging/log_job"; import { Document } from "../lib/entities"; +import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function export async function scrapeHelper( req: Request, @@ -22,6 +23,10 @@ export async function scrapeHelper( return { success: false, error: "Url is required", returnCode: 400 }; } + if (isUrlBlocked(url)) { + return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; + } + const a = new WebScraperDataProvider(); await a.setOptions({ mode: "single_urls", diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts new file mode 100644 index 000000000..0eef33209 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -0,0 +1,19 @@ +const socialMediaBlocklist = [ + 'facebook.com', + 'twitter.com', + 'instagram.com', + 'linkedin.com', + 'pinterest.com', + 'snapchat.com', + 'tiktok.com', + 'reddit.com', + 'tumblr.com', + 'flickr.com', + 'whatsapp.com', + 'wechat.com', + 'telegram.org', +]; + +export function isUrlBlocked(url: string): boolean { + return socialMediaBlocklist.some(domain => url.includes(domain)); +}