diff --git a/griptape/drivers/web_scraper/markdownify_web_scraper_driver.py b/griptape/drivers/web_scraper/markdownify_web_scraper_driver.py index b54ff072f..c23fc542f 100644 --- a/griptape/drivers/web_scraper/markdownify_web_scraper_driver.py +++ b/griptape/drivers/web_scraper/markdownify_web_scraper_driver.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from random import randint from typing import Any, Optional from attrs import Factory, define, field @@ -40,6 +41,7 @@ class MarkdownifyWebScraperDriver(BaseWebScraperDriver): def scrape_url(self, url: str) -> TextArtifact: sync_playwright = import_optional_dependency("playwright.sync_api").sync_playwright + fake_useragent = import_optional_dependency("fake_useragent") bs4 = import_optional_dependency("bs4") markdownify = import_optional_dependency("markdownify") @@ -54,7 +56,43 @@ def convert_a(self, el: Any, text: str, convert_as_inline: Any) -> str: return text with sync_playwright() as p, p.chromium.launch(headless=True) as browser: - page = browser.new_page() + # Randomize user agent to help prevent fingerprinting + user_agent = fake_useragent.UserAgent().random + + # Randomize viewport size to help prevent fingerprinting + viewport = {"width": randint(1024, 1920), "height": randint(768, 1080)} + + context = browser.new_context(user_agent=user_agent, viewport=viewport) + + # Disable WebRTC to prevent IP leaks + context.add_init_script(""" + Object.defineProperty(navigator, 'mediaDevices', { + value: { + getUserMedia: () => Promise.reject(new Error('Not allowed')), + }, + configurable: True, + }); + """) + + # Prevent canvas fingerprinting + context.add_init_script(""" + HTMLCanvasElement.prototype.toDataURL = () => "data:image/png;base64,spoofedData"; + HTMLCanvasElement.prototype.getImageData = function(sx, sy, sw, sh) { + const data = CanvasRenderingContext2D.prototype.getImageData.call(this, sx, sy, sw, sh); + for (let i = 0; i < data.data.length; i += 4) data.data[i] ^= 0xFF; // Invert 
def _random_js_plugin_array(self, user_agent: str) -> str:
    """Build a JavaScript array literal describing a random set of fake plugins.

    The returned text is interpolated into the init script that overrides
    ``navigator.plugins``, so it must be valid JavaScript source.

    Args:
        user_agent: The user agent string in use; it selects the file
            extension given to each fake plugin's filename.

    Returns:
        A string such as ``[{name: "...", description: "...", filename: "..."}]``
        holding between zero and five plugin objects (``[]`` when zero).
    """
    faker = import_optional_dependency("faker")
    fake = faker.Faker()
    num_plugins = randint(0, 5)
    extension = self._get_os_extension(user_agent)
    plugins = [
        self._js_plugin_literal(
            name=f"{fake.word().capitalize()} Plugin",
            description=f"{fake.catch_phrase()} Description",
            filename=fake.file_name(extension=extension),
        )
        for _ in range(num_plugins)
    ]

    return f"[{', '.join(plugins)}]"


def _js_plugin_literal(self, name: str, description: str, filename: str) -> str:
    """Render one plugin as a JavaScript object literal with safely quoted values.

    Args:
        name: Plugin display name.
        description: Plugin description.
        filename: Plugin file name.

    Returns:
        Text of the form ``{name: "...", description: "...", filename: "..."}``.
    """
    import json  # local import: keeps this block self-contained

    fields = {"name": name, "description": description, "filename": filename}
    # json.dumps emits a double-quoted literal that is also valid JavaScript,
    # escaping quotes, backslashes, newlines and non-ASCII characters that
    # would otherwise break the generated script.
    rendered = ", ".join(f"{key}: {json.dumps(value)}" for key, value in fields.items())
    return "{" + rendered + "}"


def _get_os_extension(self, user_agent: str) -> str:
    """Pick a plugin file extension matching the OS named in the user agent.

    Args:
        user_agent: The user agent string to inspect.

    Returns:
        ``"dll"`` for Windows, ``"dylib"`` for Macintosh, ``"so"`` for Linux,
        and ``"plugin"`` when none of those markers is present.
    """
    # Checked in order; the first marker found in the user agent wins.
    os_extensions = (("Windows", "dll"), ("Macintosh", "dylib"), ("Linux", "so"))
    for marker, extension in os_extensions:
        if marker in user_agent:
            return extension
    return "plugin"  # Default fallback
+[package.dependencies] +importlib-resources = {version = ">=5.0", markers = "python_version < \"3.10\""} + +[[package]] +name = "faker" +version = "27.4.0" +description = "Faker is a Python package that generates fake data for you." +optional = true +python-versions = ">=3.8" +files = [ + {file = "Faker-27.4.0-py3-none-any.whl", hash = "sha256:1c44d4bdcad7237516c9a829b6a0bcb031c6a4cb0506207c480c79f74d8922bf"}, + {file = "faker-27.4.0.tar.gz", hash = "sha256:4ce108fc96053bbba3abf848e3a2885f05faa938deb987f97e4420deaec541c4"}, +] + +[package.dependencies] +python-dateutil = ">=2.4" + [[package]] name = "fastavro" version = "1.9.5" @@ -2357,6 +2385,28 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke perf = ["ipython"] test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] +[[package]] +name = "importlib-resources" +version = "6.4.4" +description = "Read resources from Python packages" +optional = true +python-versions = ">=3.8" +files = [ + {file = "importlib_resources-6.4.4-py3-none-any.whl", hash = "sha256:dda242603d1c9cd836c3368b1174ed74cb4049ecd209e7a1a0104620c18c5c11"}, + {file = "importlib_resources-6.4.4.tar.gz", hash = "sha256:20600c8b7361938dc0bb2d5ec0297802e575df486f5a544fa414da65e13721f7"}, +] + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] +type = ["pytest-mypy"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -6960,7 +7010,7 @@ 
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -all = ["accelerate", "anthropic", "astrapy", "beautifulsoup4", "boto3", "cohere", "diffusers", "duckduckgo-search", "elevenlabs", "filetype", "google-generativeai", "mail-parser", "markdownify", "marqo", "ollama", "opensearch-py", "opentelemetry-api", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-instrumentation", "opentelemetry-instrumentation-threading", "opentelemetry-sdk", "pandas", "pgvector", "pillow", "pinecone-client", "playwright", "psycopg2-binary", "pusher", "pymongo", "pypdf", "qdrant-client", "redis", "sentencepiece", "snowflake-sqlalchemy", "sqlalchemy", "torch", "trafilatura", "transformers", "voyageai"] +all = ["accelerate", "anthropic", "astrapy", "beautifulsoup4", "boto3", "cohere", "diffusers", "duckduckgo-search", "elevenlabs", "fake-useragent", "faker", "filetype", "google-generativeai", "mail-parser", "markdownify", "marqo", "ollama", "opensearch-py", "opentelemetry-api", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-instrumentation", "opentelemetry-instrumentation-threading", "opentelemetry-sdk", "pandas", "pgvector", "pillow", "pinecone-client", "playwright", "psycopg2-binary", "pusher", "pymongo", "pypdf", "qdrant-client", "redis", "sentencepiece", "snowflake-sqlalchemy", "sqlalchemy", "torch", "trafilatura", "transformers", "voyageai"] drivers-embedding-amazon-bedrock = ["boto3"] drivers-embedding-amazon-sagemaker = ["boto3"] drivers-embedding-cohere = ["cohere"] @@ -6999,7 +7049,7 @@ drivers-vector-pgvector = ["pgvector", "psycopg2-binary", "sqlalchemy"] drivers-vector-pinecone = ["pinecone-client"] drivers-vector-qdrant = ["qdrant-client"] drivers-vector-redis = 
["redis"] -drivers-web-scraper-markdownify = ["beautifulsoup4", "markdownify", "playwright"] +drivers-web-scraper-markdownify = ["beautifulsoup4", "fake-useragent", "faker", "markdownify", "playwright"] drivers-web-scraper-trafilatura = ["trafilatura"] drivers-web-search-duckduckgo = ["duckduckgo-search"] loaders-audio = ["filetype"] @@ -7012,4 +7062,4 @@ loaders-sql = ["sqlalchemy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d368587717dd8496f0db30403afa59ca6ff9e0b4e2d747f2b4c703e832d904c3" +content-hash = "7eef6d34ad50d04a1ea788146674b3453a4cc5dfc40fff75475d3307fb48a9da" diff --git a/pyproject.toml b/pyproject.toml index 2afdc5910..1066ac21d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,8 @@ psycopg2-binary = { version = "^2.9.9", optional = true } google-generativeai = { version = "^0.7.2", optional = true } trafilatura = {version = "^1.6", optional = true} playwright = {version = "^1.42", optional = true} +fake-useragent = {version = "^1.5.1", optional = true} +faker = {version = "^27.4.0", optional = true} beautifulsoup4 = {version = "^4.12.3", optional = true} markdownify = {version = "^0.11.6", optional = true} voyageai = {version = "^0.2.1", optional = true} @@ -108,7 +110,7 @@ drivers-embedding-cohere = ["cohere"] drivers-embedding-ollama = ["ollama"] drivers-web-scraper-trafilatura = ["trafilatura"] -drivers-web-scraper-markdownify = ["playwright", "beautifulsoup4", "markdownify"] +drivers-web-scraper-markdownify = ["playwright", "fake-useragent", "faker", "beautifulsoup4", "markdownify"] drivers-web-search-duckduckgo = ["duckduckgo-search"] @@ -179,6 +181,8 @@ all = [ "google-generativeai", "trafilatura", "playwright", + "fake-useragent", + "faker", "beautifulsoup4", "markdownify", "voyageai", diff --git a/tests/unit/drivers/web_scraper/test_markdownify_web_scraper_driver.py b/tests/unit/drivers/web_scraper/test_markdownify_web_scraper_driver.py index dbdafa98f..cc234f086 100644 --- 
@pytest.fixture(autouse=True)
def mock_content(self, mock_playwright):
    """Stub the ``content`` call of the page created through a browser context.

    Follows the mocked chain: playwright context manager -> chromium launch
    context manager -> new_context() -> new_page() -> content, and makes
    ``content()`` return ``'foobar'``.
    """
    playwright = mock_playwright.__enter__.return_value
    browser = playwright.chromium.launch.return_value.__enter__.return_value
    page = browser.new_context.return_value.new_page.return_value
    content = page.content
    content.return_value = 'foobar'
    return content