From b69cc7c983d6bf418e82075f817852d0bf8d838b Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Tue, 19 Nov 2024 14:35:24 +0200
Subject: [PATCH 1/6] Fix for the stealth mode on the playwright fetcher

The JS files weren't being included when installing with pip!
---
 MANIFEST.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MANIFEST.in b/MANIFEST.in
index 736106d..b8fbc5b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
 include LICENSE
 include *.db
 include *.js
+include scrapling/engines/toolbelt/bypasses/*.js
 include scrapling/*.db
 include scrapling/*.db*
 include scrapling/py.typed

From 526c51669300aca3d6b7daea1bc62688d76f42ef Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Wed, 20 Nov 2024 01:28:01 +0200
Subject: [PATCH 2/6] Bump version up to 0.2.3

---
 scrapling/__init__.py | 2 +-
 setup.cfg             | 2 +-
 setup.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapling/__init__.py b/scrapling/__init__.py
index 8875c1f..dea5816 100644
--- a/scrapling/__init__.py
+++ b/scrapling/__init__.py
@@ -4,7 +4,7 @@
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
diff --git a/setup.cfg b/setup.cfg
index 8c62480..2f4ff5a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.2
+version = 0.2.3
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
diff --git a/setup.py b/setup.py
index f2197b8..386e3c5 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 
 
 setup(
     name="scrapling",
-    version="0.2.2",
+    version="0.2.3",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python.
 It simplifies the process of extracting data from websites, even when they undergo structural changes, and offers impressive speed improvements over many popular scraping tools.""",

From 649f4cfae7d264877bb37ebebb15d477535c0628 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Wed, 20 Nov 2024 01:41:46 +0200
Subject: [PATCH 3/6] Bumping up camoufox version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 386e3c5..42e670f 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
         'httpx[brotli,zstd]',
         'playwright',
         'rebrowser-playwright',
-        'camoufox>=0.3.9',
+        'camoufox>=0.3.10',
         'browserforge',
     ],
     python_requires=">=3.8",

From 4c74d9bc972cfce3b5eb46f92593b95b3b4ea095 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Wed, 20 Nov 2024 12:33:54 +0200
Subject: [PATCH 4/6] Fixing a bug with reading response bytes in playwright/camoufox when `network_idle` is used

PlayWright doesn't provide a way to get the response in bytes after all
waits end the way `page.content()` does, so this is more efficient to do anyway
---
 scrapling/engines/camo.py | 2 +-
 scrapling/engines/pw.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py
index 62fdd39..2f8a1f9 100644
--- a/scrapling/engines/camo.py
+++ b/scrapling/engines/camo.py
@@ -114,7 +114,7 @@ def fetch(self, url: str) -> Response:
             response = Response(
                 url=res.url,
                 text=page.content(),
-                body=res.body(),
+                body=page.content().encode('utf-8'),
                 status=res.status,
                 reason=res.status_text,
                 encoding=encoding,
diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py
index 3395ece..c83bfe3 100644
--- a/scrapling/engines/pw.py
+++ b/scrapling/engines/pw.py
@@ -224,7 +224,7 @@ def fetch(self, url: str) -> Response:
             response = Response(
                 url=res.url,
                 text=page.content(),
-                body=res.body(),
+                body=page.content().encode('utf-8'),
                 status=res.status,
                 reason=res.status_text,
                 encoding=encoding,

From 19ad82caef95030478bc8c28f72902b38745feb5 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Wed, 20 Nov 2024 13:27:18 +0200
Subject: [PATCH 5/6] Calculate status text manually if it's not returned by PlayWright API

---
 scrapling/engines/camo.py              |  8 ++-
 scrapling/engines/pw.py                |  8 ++-
 scrapling/engines/toolbelt/__init__.py |  1 +
 scrapling/engines/toolbelt/custom.py   | 80 +++++++++++++++++++++++++-
 4 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py
index 2f8a1f9..e55d951 100644
--- a/scrapling/engines/camo.py
+++ b/scrapling/engines/camo.py
@@ -4,6 +4,7 @@
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     get_os_name,
     intercept_route,
     check_type_validity,
@@ -111,12 +112,17 @@ def fetch(self, url: str) -> Response:
             if 'charset=' in content_type.lower():
                 encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
 
+            status_text = res.status_text
+            # PlayWright API sometimes gives an empty status text for some reason!
+            if not status_text:
+                status_text = StatusText.get(res.status)
+
             response = Response(
                 url=res.url,
                 text=page.content(),
                 body=page.content().encode('utf-8'),
                 status=res.status,
-                reason=res.status_text,
+                reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=res.all_headers(),
diff --git a/scrapling/engines/pw.py b/scrapling/engines/pw.py
index c83bfe3..b049090 100644
--- a/scrapling/engines/pw.py
+++ b/scrapling/engines/pw.py
@@ -6,6 +6,7 @@
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     js_bypass_path,
     intercept_route,
     generate_headers,
@@ -221,12 +222,17 @@ def fetch(self, url: str) -> Response:
             if 'charset=' in content_type.lower():
                 encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
 
+            status_text = res.status_text
+            # PlayWright API sometimes gives an empty status text for some reason!
+            if not status_text:
+                status_text = StatusText.get(res.status)
+
             response = Response(
                 url=res.url,
                 text=page.content(),
                 body=page.content().encode('utf-8'),
                 status=res.status,
-                reason=res.status_text,
+                reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=res.all_headers(),
diff --git a/scrapling/engines/toolbelt/__init__.py b/scrapling/engines/toolbelt/__init__.py
index ac3e03d..15fd80c 100644
--- a/scrapling/engines/toolbelt/__init__.py
+++ b/scrapling/engines/toolbelt/__init__.py
@@ -6,6 +6,7 @@
 from .custom import (
     Response,
     do_nothing,
+    StatusText,
     BaseFetcher,
     get_variable_name,
     check_type_validity,
diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py
index de34099..8957d1a 100644
--- a/scrapling/engines/toolbelt/custom.py
+++ b/scrapling/engines/toolbelt/custom.py
@@ -4,8 +4,9 @@
 import inspect
 import logging
 
-from scrapling.core.utils import setup_basic_logging
+from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
+from scrapling.core.utils import setup_basic_logging, cache
 from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
 
 
@@ -67,6 +68,83 @@ def __init__(
         self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
 
+class StatusText:
+    """A class that gets the status text of a response status code.
+
+    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+    """
+    _phrases = MappingProxyType({
+        100: "Continue",
+        101: "Switching Protocols",
+        102: "Processing",
+        103: "Early Hints",
+        200: "OK",
+        201: "Created",
+        202: "Accepted",
+        203: "Non-Authoritative Information",
+        204: "No Content",
+        205: "Reset Content",
+        206: "Partial Content",
+        207: "Multi-Status",
+        208: "Already Reported",
+        226: "IM Used",
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        400: "Bad Request",
+        401: "Unauthorized",
+        402: "Payment Required",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required"
+    })
+
+    @classmethod
+    @cache(maxsize=128)
+    def get(cls, status_code: int) -> str:
+        """Get the phrase for a given HTTP status code."""
+        return cls._phrases.get(status_code, "Unknown Status Code")
+
+
 def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
     """This function check if the passed engine can be used by a Fetcher-type class or not.
 

From 17d79344842b751df81c0a0a3a55039b8f2661f7 Mon Sep 17 00:00:00 2001
From: Karim shoair
Date: Wed, 20 Nov 2024 13:27:39 +0200
Subject: [PATCH 6/6] Bump version to 0.2.4

---
 scrapling/__init__.py | 2 +-
 setup.cfg             | 2 +-
 setup.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrapling/__init__.py b/scrapling/__init__.py
index dea5816..12b8d40 100644
--- a/scrapling/__init__.py
+++ b/scrapling/__init__.py
@@ -4,7 +4,7 @@
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
diff --git a/setup.cfg b/setup.cfg
index 2f4ff5a..bcc127c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.3
+version = 0.2.4
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
diff --git a/setup.py b/setup.py
index 42e670f..15c7bf3 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 
 
 setup(
     name="scrapling",
-    version="0.2.3",
+    version="0.2.4",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python.
 It simplifies the process of extracting data from websites, even when they undergo structural changes, and offers impressive speed improvements over many popular scraping tools.""",
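
Note on how the pieces fit together: after patches 4 and 5, both fetchers rebuild the response body from `page.content()` (so it reflects the DOM after waits such as `network_idle`) and fall back to `StatusText` whenever PlayWright returns an empty `status_text`. Below is a minimal sketch of that fallback, assuming scrapling 0.2.4 (or these patches) is installed; `FakeResponse` is a made-up stand-in for PlayWright's response object and is not part of the library:

    from scrapling.engines.toolbelt import StatusText  # exported by patch 5

    class FakeResponse:
        # Hypothetical stand-in for PlayWright's response object, for this sketch only
        status = 200
        status_text = ""  # PlayWright sometimes hands back an empty reason phrase

    res = FakeResponse()

    # The same fallback both fetch() methods now apply before building Response
    status_text = res.status_text
    if not status_text:
        status_text = StatusText.get(res.status)

    print(status_text)            # -> "OK"
    print(StatusText.get(999))    # -> "Unknown Status Code"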