Skip to content

Commit

Permalink
Avoid browser fingerprinting
Browse files Browse the repository at this point in the history
  • Loading branch information
dylanholmes committed Aug 23, 2024
1 parent 489453e commit 54bc7bb
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 6 deletions.
78 changes: 77 additions & 1 deletion griptape/drivers/web_scraper/markdownify_web_scraper_driver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import re
from random import randint
from typing import Any, Optional

from attrs import Factory, define, field
Expand Down Expand Up @@ -40,6 +41,7 @@ class MarkdownifyWebScraperDriver(BaseWebScraperDriver):

def scrape_url(self, url: str) -> TextArtifact:
sync_playwright = import_optional_dependency("playwright.sync_api").sync_playwright
fake_useragent = import_optional_dependency("fake_useragent")
bs4 = import_optional_dependency("bs4")
markdownify = import_optional_dependency("markdownify")

Expand All @@ -54,7 +56,43 @@ def convert_a(self, el: Any, text: str, convert_as_inline: Any) -> str:
return text

with sync_playwright() as p, p.chromium.launch(headless=True) as browser:
page = browser.new_page()
# Randomize user agent to help prevent fingerprinting
user_agent = fake_useragent.UserAgent().random

# Randomize viewport size to help prevent fingerprinting
viewport = {"width": randint(1024, 1920), "height": randint(768, 1080)}

context = browser.new_context(user_agent=user_agent, viewport=viewport)

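# Stub navigator.mediaDevices so getUserMedia requests are rejected.
# NOTE(review): this does not touch RTCPeerConnection, so it does not by itself disable WebRTC.
# NOTE(review): `configurable: True` in the script below is not valid JavaScript (the literal is `true`);
# as written the script raises a ReferenceError and the override is not applied.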
context.add_init_script("""
Object.defineProperty(navigator, 'mediaDevices', {
value: {
getUserMedia: () => Promise.reject(new Error('Not allowed')),
},
configurable: True,
});
""")

# Prevent canvas fingerprinting
context.add_init_script("""
HTMLCanvasElement.prototype.toDataURL = () => "";
HTMLCanvasElement.prototype.getImageData = function(sx, sy, sw, sh) {
const data = CanvasRenderingContext2D.prototype.getImageData.call(this, sx, sy, sw, sh);
for (let i = 0; i < data.data.length; i += 4) data.data[i] ^= 0xFF; // Invert colors
return data;
};
""")

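# Replace navigator.plugins with a random plugin list to help prevent fingerprinting.
# NOTE(review): `configurable: True` in the script below is not valid JavaScript (the literal is `true`);
# as written the script raises a ReferenceError and the override is not applied.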
context.add_init_script(f"""
Object.defineProperty(navigator, 'plugins', {{
get: () => {self._random_js_plugin_array(user_agent)},
configurable: True,
}});
""")

page = context.new_page()

def skip_loading_images(route: Any) -> Any:
if route.request.resource_type == "image":
Expand Down Expand Up @@ -101,3 +139,41 @@ def skip_loading_images(route: Any) -> Any:
text = re.sub(r"\n\n+", "\n\n", text)

return TextArtifact(text)

def _random_js_plugin_array(self, user_agent: str) -> str:
    """Build JavaScript source for an array literal of random fake plugin objects.

    Between 0 and 5 objects are generated. Each has ``name``, ``description``
    and ``filename`` properties filled with Faker data; the filename extension
    is chosen by ``_get_os_extension`` from the user agent.

    Args:
        user_agent: User agent string used to pick the plugin file extension.

    Returns:
        JavaScript source text of the form ``[{name: "...", description: "...",
        filename: "..."}, ...]``, or ``[]`` when no plugins are generated.
    """
    # Local import: only needed here, to emit correctly escaped string literals.
    import json

    faker = import_optional_dependency("faker")
    fake = faker.Faker()
    num_plugins = randint(0, 5)
    extension = self._get_os_extension(user_agent)

    def js_object(properties: dict[str, str]) -> str:
        # json.dumps yields a double-quoted literal with quotes, backslashes and
        # control characters escaped, so a generated value can never terminate
        # the string early and corrupt the script this text is embedded in.
        members = ", ".join(f"{key}: {json.dumps(value)}" for key, value in properties.items())
        return "{" + members + "}"

    plugins = [
        js_object(
            {
                "name": f"{fake.word().capitalize()} Plugin",
                "description": f"{fake.catch_phrase()} Description",
                "filename": f"{fake.file_name(extension=extension)}",
            }
        )
        for _ in range(num_plugins)
    ]

    return f"[{', '.join(plugins)}]"

def _get_os_extension(self, user_agent: str) -> str:
if "Windows" in user_agent:
return "dll"
elif "Macintosh" in user_agent:
return "dylib"

Check warning on line 175 in griptape/drivers/web_scraper/markdownify_web_scraper_driver.py

View check run for this annotation

Codecov / codecov/patch

griptape/drivers/web_scraper/markdownify_web_scraper_driver.py#L175

Added line #L175 was not covered by tests
elif "Linux" in user_agent:
return "so"
else:
return "plugin" # Default fallback

Check warning on line 179 in griptape/drivers/web_scraper/markdownify_web_scraper_driver.py

View check run for this annotation

Codecov / codecov/patch

griptape/drivers/web_scraper/markdownify_web_scraper_driver.py#L179

Added line #L179 was not covered by tests
56 changes: 53 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ psycopg2-binary = { version = "^2.9.9", optional = true }
google-generativeai = { version = "^0.7.2", optional = true }
trafilatura = {version = "^1.6", optional = true}
playwright = {version = "^1.42", optional = true}
fake-useragent = {version = "^1.5.1", optional = true}
faker = {version = "^27.4.0", optional = true}
beautifulsoup4 = {version = "^4.12.3", optional = true}
markdownify = {version = "^0.11.6", optional = true}
voyageai = {version = "^0.2.1", optional = true}
Expand Down Expand Up @@ -108,7 +110,7 @@ drivers-embedding-cohere = ["cohere"]
drivers-embedding-ollama = ["ollama"]

drivers-web-scraper-trafilatura = ["trafilatura"]
drivers-web-scraper-markdownify = ["playwright", "beautifulsoup4", "markdownify"]
drivers-web-scraper-markdownify = ["playwright", "fake-useragent", "faker", "beautifulsoup4", "markdownify"]

drivers-web-search-duckduckgo = ["duckduckgo-search"]

Expand Down Expand Up @@ -179,6 +181,8 @@ all = [
"google-generativeai",
"trafilatura",
"playwright",
"fake-useragent",
"faker",
"beautifulsoup4",
"markdownify",
"voyageai",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def mock_playwright(self, mocker):

@pytest.fixture(autouse=True)
def mock_content(self, mock_playwright):
    """Stub the HTML returned by ``page.content()`` for a page opened via a browser context.

    Returns the ``content`` mock so individual tests can override its
    ``return_value``.
    """
    # Walk the mock chain: sync_playwright() context -> chromium.launch() context -> browser.
    browser = mock_playwright.__enter__.return_value.chromium.launch.return_value.__enter__.return_value
    # Pages are reached through browser.new_context().new_page(), so only that path is stubbed.
    mock_content = browser.new_context.return_value.new_page.return_value.content
    mock_content.return_value = '<html><a href="foobar.com">foobar</a></html>'
    return mock_content

Expand Down

0 comments on commit 54bc7bb

Please sign in to comment.