From 3cca3fd56b5476f2d1f76a509be8c20daf384a80 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Sat, 21 Dec 2024 16:17:02 +0200 Subject: [PATCH 1/6] style(parser): more accurate type hints --- scrapling/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapling/parser.py b/scrapling/parser.py index c82a270..1a46f1a 100644 --- a/scrapling/parser.py +++ b/scrapling/parser.py @@ -474,7 +474,7 @@ def xpath_first(self, selector: str, identifier: str = '', def css(self, selector: str, identifier: str = '', auto_match: bool = False, auto_save: bool = False, percentage: int = 0 - ) -> Union['Adaptors[Adaptor]', List]: + ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']: """Search current tree with CSS3 selectors **Important: @@ -517,7 +517,7 @@ def css(self, selector: str, identifier: str = '', def xpath(self, selector: str, identifier: str = '', auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any - ) -> Union['Adaptors[Adaptor]', List]: + ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']: """Search current tree with XPath selectors **Important: From f9b85cfbac0e0465a9e07c82915a3b4e88b2e407 Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Thu, 26 Dec 2024 00:27:13 +0200 Subject: [PATCH 2/6] fix(PlaywrightFetcher): Use more dependable response data --- scrapling/engines/pw.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py index 0a6d30d..2cd93e5 100644 --- a/scrapling/engines/pw.py +++ b/scrapling/engines/pw.py @@ -206,7 +206,7 @@ def fetch(self, url: str) -> Response: def handle_response(finished_response: PlaywrightResponse): nonlocal final_response - if finished_response.request.resource_type == "document": + if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request(): final_response = finished_response with sync_playwright() as p: @@ -252,7 +252,6 @@ def handle_response(finished_response: PlaywrightResponse): if self.network_idle: page.wait_for_load_state('networkidle') - response_bytes = final_response.body() if final_response else page.content().encode('utf-8') # In case we didn't catch a document type somehow final_response = final_response if final_response else first_response # This will be parsed inside `Response` @@ -261,15 +260,15 @@ def handle_response(finished_response: PlaywrightResponse): status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=final_response.url, + url=page.url, text=page.content(), - body=response_bytes, + body=page.content().encode('utf-8'), status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()}, - headers=final_response.all_headers(), - request_headers=final_response.request.all_headers(), + headers=first_response.all_headers(), + request_headers=first_response.request.all_headers(), **self.adaptor_arguments ) page.close() @@ -293,7 +292,7 @@ async def async_fetch(self, url: str) -> Response: async def handle_response(finished_response: PlaywrightResponse): nonlocal final_response - if finished_response.request.resource_type == "document": + if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request(): final_response = finished_response async with async_playwright() as p: @@ -339,7 +338,6 @@ async def 
handle_response(finished_response: PlaywrightResponse): if self.network_idle: await page.wait_for_load_state('networkidle') - response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8') # In case we didn't catch a document type somehow final_response = final_response if final_response else first_response # This will be parsed inside `Response` @@ -348,15 +346,15 @@ async def handle_response(finished_response: PlaywrightResponse): status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=final_response.url, + url=page.url, text=await page.content(), - body=response_bytes, + body=(await page.content()).encode('utf-8'), status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()}, - headers=await final_response.all_headers(), - request_headers=await final_response.request.all_headers(), + headers=await first_response.all_headers(), + request_headers=await first_response.request.all_headers(), **self.adaptor_arguments ) await page.close() From 2006be217364e1eb2306303a8f3ffd247193bc5a Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Thu, 26 Dec 2024 00:29:42 +0200 Subject: [PATCH 3/6] fix(StealthyFetcher): Use more dependable response data --- scrapling/engines/camo.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py index f1d8f95..6d28634 100644 --- a/scrapling/engines/camo.py +++ b/scrapling/engines/camo.py @@ -89,7 +89,7 @@ def fetch(self, url: str) -> Response: def handle_response(finished_response): nonlocal final_response - if finished_response.request.resource_type == "document": + if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request(): final_response = finished_response with Camoufox( @@ -133,7 +133,6 @@ def handle_response(finished_response): if self.network_idle: page.wait_for_load_state('networkidle') - response_bytes = final_response.body() if final_response else page.content().encode('utf-8') # In case we didn't catch a document type somehow final_response = final_response if final_response else first_response # This will be parsed inside `Response` @@ -142,15 +141,15 @@ def handle_response(finished_response): status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=final_response.url, + url=page.url, text=page.content(), - body=response_bytes, + body=page.content().encode('utf-8'), status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()}, - headers=final_response.all_headers(), - request_headers=final_response.request.all_headers(), + headers=first_response.all_headers(), + request_headers=first_response.request.all_headers(), **self.adaptor_arguments ) page.close() @@ -169,7 +168,7 @@ async def async_fetch(self, url: str) -> Response: async def handle_response(finished_response): nonlocal final_response - if finished_response.request.resource_type == "document": + if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request(): final_response = finished_response async with AsyncCamoufox( @@ -213,7 +212,6 @@ async def handle_response(finished_response): if self.network_idle: await page.wait_for_load_state('networkidle') - response_bytes = await 
final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response

             # This will be parsed inside `Response`
@@ -222,15 +220,15 @@ async def handle_response(finished_response):
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=final_response.url,
+                url=page.url,
                 text=await page.content(),
-                body=response_bytes,
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await final_response.all_headers(),
-                request_headers=await final_response.request.all_headers(),
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()

From 03b52aa74cb1f6cd6980ee1ef6a3ee9870ae77ec Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 26 Dec 2024 19:46:22 +0200
Subject: [PATCH 4/6] feat(cli): adding terminal command `scrapling install`

Instead of making users install browsers by themselves

---
 .bandit.yml      |  2 ++
 MANIFEST.in      |  3 +++
 scrapling/cli.py | 37 +++++++++++++++++++++++++++++++++++++
 setup.py         |  6 ++++++
 4 files changed, 48 insertions(+)
 create mode 100644 scrapling/cli.py

diff --git a/.bandit.yml b/.bandit.yml
index 9d7a1f7..525749a 100644
--- a/.bandit.yml
+++ b/.bandit.yml
@@ -5,3 +5,5 @@
 - B410
 - B113 # `Requests call without timeout` these requests are done in the benchmark and examples scripts only
 - B403 # We are using pickle for tests only
+- B404 # Using subprocess library
+- B602 # subprocess call with shell=True identified

diff --git a/MANIFEST.in b/MANIFEST.in
index b8fbc5b..8c168dd 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,7 +4,10 @@ include *.js
 include scrapling/engines/toolbelt/bypasses/*.js
 include scrapling/*.db
 include scrapling/*.db*
+include scrapling/*.db-*
 include scrapling/py.typed
+include scrapling/.scrapling_dependencies_installed
+include .scrapling_dependencies_installed

 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
\ No newline at end of file

diff --git a/scrapling/cli.py b/scrapling/cli.py
new file mode 100644
index 0000000..7d22f0c
--- /dev/null
+++ b/scrapling/cli.py
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(' '.join(command), shell=True)  # shell=True expects a single command string, so join the argument list
+    # Deliberately no try/except here so a failed install stops the command loudly
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # If no errors were raised by the commands above, mark the dependencies as installed
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
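The `install` command above can also be exercised in-process with click's bundled test runner, which checks the command wiring without opening a shell. A minimal sketch (note that invoking `install` genuinely runs the installer steps on the current machine):

```python
# Sketch: drive the new CLI in-process via click.testing.CliRunner.
from click.testing import CliRunner

from scrapling.cli import main

runner = CliRunner()
result = runner.invoke(main, ["install"])  # really runs the install steps
print(result.exit_code)  # 0 on success
print(result.output)     # "The dependencies are already installed" on a second run
```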
diff --git a/setup.py b/setup.py
index 45b88ad..f092a64 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,11 @@
     package_dir={
         "scrapling": "scrapling",
     },
+    entry_points={
+        'console_scripts': [
+            'scrapling=scrapling.cli:main'
+        ],
+    },
     include_package_data=True,
     classifiers=[
         "Operating System :: OS Independent",
@@ -50,6 +55,7 @@
         "requests>=2.3",
         "lxml>=4.5",
         "cssselect>=1.2",
+        'click',
         "w3lib",
         "orjson>=3",
         "tldextract",
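For context on the `console_scripts` entry: the `scrapling` executable that pip generates resolves `scrapling.cli:main` to roughly this stub (a sketch of standard setuptools entry-point behavior, not code from this patch):

```python
# Roughly what pip generates for 'scrapling=scrapling.cli:main'
import sys

from scrapling.cli import main

if __name__ == '__main__':
    sys.exit(main())
```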
From ef284234e6ed6b51307875757b0c668e3b8abcaa Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 26 Dec 2024 19:56:37 +0200
Subject: [PATCH 5/6] docs: update README file to reflect new installation
 instructions

---
 README.md | 44 +++++---------------------------------------
 1 file changed, 5 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 92c76b2..28a6c05 100644
--- a/README.md
+++ b/README.md
@@ -167,52 +167,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+Scrapling is a breeze to get started with; starting from version 0.2.9, it requires at least Python 3.9.
 ```bash
 pip3 install scrapling
 ```
-- For using the `StealthyFetcher`, go to the command line and download the browser with
-<details><summary>Windows OS</summary>
-
-```bash
-camoufox fetch --browserforge
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
-```bash
-python -m camoufox fetch --browserforge
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-    ```bash
-    sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-    ```
-- Arch-based distros
-    ```bash
-    sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-    ```
-</details>
-
-  See the official Camoufox documentation for more info on installation
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
-```
+Then run this command to install the browser dependencies needed by the Fetcher classes:
+```bash
+scrapling install
+```
+If you have any installation issues, please open an issue.

 ## Fetching Websites
-Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+Fetchers are interfaces built on top of other libraries. They add extra features, fetch pages for you in a single request, and return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way

From c5d83a11adc016e85636e0ad393d678bceed9d5a Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Thu, 26 Dec 2024 19:57:25 +0200
Subject: [PATCH 6/6] build: Bumping version up

---
 scrapling/__init__.py | 2 +-
 setup.cfg             | 2 +-
 setup.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapling/__init__.py b/scrapling/__init__.py
index eeba624..a80c50f 100644
--- a/scrapling/__init__.py
+++ b/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.91"
+__version__ = "0.2.92"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

diff --git a/setup.cfg b/setup.cfg
index 700f9d6..4b945f7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.91
+version = 0.2.92
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.

diff --git a/setup.py b/setup.py
index f092a64..3af52b9 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setup(
     name="scrapling",
-    version="0.2.91",
+    version="0.2.92",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It simplifies the process of extracting data from websites, even when they undergo structural changes, and offers impressive speed improvements over many popular scraping tools.""",
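Taken together, the series reduces first-run setup to `pip3 install scrapling` followed by `scrapling install`, after which any fetcher works. A minimal end-to-end sketch, assuming `Fetcher` is importable from the package root as in the project's other examples; the target URL and selector are illustrative only:

```python
# Sketch of the post-install workflow; quotes.toscrape.com and the selector
# below are illustrative assumptions, not taken from the patches above.
from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')  # returns an Adaptor
# With a `::text` pseudo-element, `.css()` yields text handlers -- the case the
# updated type hints in the first patch ('TextHandlers[TextHandler]') cover.
quotes = page.css('.quote .text::text')
print(quotes[0] if quotes else 'no matches')
```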