Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat] Added blocklist for social media urls #55

Merged
merged 2 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions apps/api/src/__tests__/e2e_noAuth/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
expect(response.statusCode).not.toBe(401);
});

// Posts a blocklisted social-media URL to /v0/scrape with no Authorization
// header and expects a 403 carrying the policy-restriction message.
it("should return an error for a blocklisted URL without requiring authorization", async () => {
  const expectedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
  const payload = { url: "https://facebook.com/fake-test" };

  const res = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(expectedMessage);
});

it("should return a successful response", async () => {
const response = await request(TEST_URL)
.post("/v0/scrape")
Expand All @@ -70,6 +80,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
expect(response.statusCode).not.toBe(401);
});

// Posts a blocklisted social-media URL to /v0/crawl and expects a 403
// carrying the policy-restriction message.
it("should return an error for a blocklisted URL", async () => {
  const expectedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
  const payload = { url: "https://twitter.com/fake-test" };

  const res = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(expectedMessage);
});

it("should return a successful response", async () => {
const response = await request(TEST_URL)
.post("/v0/crawl")
Expand All @@ -89,6 +109,16 @@ describe("E2E Tests for API Routes with No Authentication", () => {
expect(response.statusCode).not.toBe(401);
});

// Posts a blocklisted social-media URL to /v0/crawlWebsitePreview and expects
// a 403 carrying the policy-restriction message.
it("should return an error for a blocklisted URL", async () => {
  const expectedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
  const payload = { url: "https://instagram.com/fake-test" };

  const res = await request(TEST_URL)
    .post("/v0/crawlWebsitePreview")
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(expectedMessage);
});

it("should return a successful response", async () => {
const response = await request(TEST_URL)
.post("/v0/crawlWebsitePreview")
Expand Down
35 changes: 35 additions & 0 deletions apps/api/src/__tests__/e2e_withAuth/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ const TEST_URL = "http://127.0.0.1:3002";
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});

// Posts a blocklisted social-media URL to /v0/scrape using the test API key
// as a bearer token and expects a 403 carrying the policy-restriction message.
it("should return an error for a blocklisted URL", async () => {
  const expectedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
  const payload = { url: "https://facebook.com/fake-test" };

  const res = await request(TEST_URL)
    .post("/v0/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(expectedMessage);
});

it("should return a successful response with a valid preview token", async () => {
const response = await request(TEST_URL)
.post("/v0/scrape")
Expand Down Expand Up @@ -86,6 +98,17 @@ const TEST_URL = "http://127.0.0.1:3002";
expect(response.statusCode).toBe(401);
});

// Posts a blocklisted social-media URL to /v0/crawl using the test API key
// as a bearer token and expects a 403 carrying the policy-restriction message.
it("should return an error for a blocklisted URL", async () => {
  const expectedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
  const payload = { url: "https://twitter.com/fake-test" };

  const res = await request(TEST_URL)
    .post("/v0/crawl")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(expectedMessage);
});

it("should return a successful response with a valid API key", async () => {
const response = await request(TEST_URL)
.post("/v0/crawl")
Expand All @@ -99,6 +122,7 @@ const TEST_URL = "http://127.0.0.1:3002";
);
});


// Additional tests for insufficient credits?
});

Expand All @@ -119,6 +143,17 @@ const TEST_URL = "http://127.0.0.1:3002";
expect(response.statusCode).toBe(401);
});

// Posts a blocklisted social-media URL to /v0/crawlWebsitePreview using the
// test API key as a bearer token and expects a 403 carrying the
// policy-restriction message.
it("should return an error for a blocklisted URL", async () => {
  const expectedMessage =
    "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.";
  const payload = { url: "https://instagram.com/fake-test" };

  const res = await request(TEST_URL)
    .post("/v0/crawlWebsitePreview")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    .send(payload);

  expect(res.statusCode).toBe(403);
  expect(res.body.error).toContain(expectedMessage);
});

it("should return a successful response with a valid API key", async () => {
const response = await request(TEST_URL)
.post("/v0/crawlWebsitePreview")
Expand Down
6 changes: 6 additions & 0 deletions apps/api/src/controllers/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { checkTeamCredits } from "../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";

export async function crawlController(req: Request, res: Response) {
try {
Expand All @@ -27,6 +28,11 @@ export async function crawlController(req: Request, res: Response) {
if (!url) {
return res.status(400).json({ error: "Url is required" });
}

if (isUrlBlocked(url)) {
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}

const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
Expand Down
6 changes: 6 additions & 0 deletions apps/api/src/controllers/crawlPreview.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addWebScraperJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";

export async function crawlPreviewController(req: Request, res: Response) {
try {
Expand All @@ -18,6 +19,11 @@ export async function crawlPreviewController(req: Request, res: Response) {
if (!url) {
return res.status(400).json({ error: "Url is required" });
}

if (isUrlBlocked(url)) {
return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." });
}

const mode = req.body.mode ?? "crawl";
const crawlerOptions = req.body.crawlerOptions ?? {};
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false };
Expand Down
5 changes: 5 additions & 0 deletions apps/api/src/controllers/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../types";
import { logJob } from "../services/logging/log_job";
import { Document } from "../lib/entities";
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function

export async function scrapeHelper(
req: Request,
Expand All @@ -22,6 +23,10 @@ export async function scrapeHelper(
return { success: false, error: "Url is required", returnCode: 400 };
}

if (isUrlBlocked(url)) {
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
}

const a = new WebScraperDataProvider();
await a.setOptions({
mode: "single_urls",
Expand Down
19 changes: 19 additions & 0 deletions apps/api/src/scraper/WebScraper/utils/blocklist.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
 * Domains of social media sites that must not be scraped.
 * A URL is considered blocked when its host is exactly one of these domains
 * or a subdomain of one of them (e.g. `www.facebook.com`, `m.twitter.com`).
 */
const socialMediaBlocklist = [
  'facebook.com',
  'twitter.com',
  'instagram.com',
  'linkedin.com',
  'pinterest.com',
  'snapchat.com',
  'tiktok.com',
  'reddit.com',
  'tumblr.com',
  'flickr.com',
  'whatsapp.com',
  'wechat.com',
  'telegram.org',
];

/**
 * Extracts the lower-cased host name from a user-supplied URL string.
 *
 * @param url - Raw URL text; may omit the scheme (e.g. `facebook.com/page`).
 * @returns The host name without a trailing dot, or `null` when the input
 *          cannot be parsed into a URL with a non-empty host.
 */
function extractHostname(url: string): string | null {
  const trimmed = url.trim();
  // Without an explicit "scheme://", the URL parser would read
  // "facebook.com:443/x" as scheme "facebook.com:" with an empty host,
  // so scheme-less input is parsed as if it were an http URL.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : `http://${trimmed}`;
  try {
    // Strip trailing dots so the fully-qualified form "facebook.com." still matches.
    const hostname = new URL(candidate).hostname.toLowerCase().replace(/\.+$/, '');
    return hostname.length > 0 ? hostname : null;
  } catch {
    return null;
  }
}

/**
 * Reports whether a URL points at a blocklisted social media site.
 *
 * Matching is done on the parsed host name rather than on the raw string, so
 * a blocklisted domain that merely appears in the path, query string, or as a
 * fragment of an unrelated host (`notfacebook.com`, `facebook.com.evil.example`)
 * does not trigger a block.
 *
 * @param url - The URL requested by the caller.
 * @returns `true` when the URL's host is a blocklisted domain or one of its
 *          subdomains; `false` otherwise, including for non-string or
 *          unparseable input.
 */
export function isUrlBlocked(url: string): boolean {
  // Request bodies are untyped at runtime; treat anything that is not a string as "not blocked".
  if (typeof url !== 'string') {
    return false;
  }

  const hostname = extractHostname(url);
  if (hostname === null) {
    return false;
  }

  return socialMediaBlocklist.some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
  );
}
Loading