Skip to content

Commit

Permalink
Merge branch 'main' into docs
Browse files Browse the repository at this point in the history
  • Loading branch information
D4Vinci committed Nov 20, 2024
2 parents f443d77 + e9b0102 commit 3f39bd5
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 9 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
include LICENSE
include *.db
include *.js
include scrapling/engines/toolbelt/bypasses/*.js
include scrapling/*.db
include scrapling/*.db*
include scrapling/py.typed
Expand Down
2 changes: 1 addition & 1 deletion scrapling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.2"
__version__ = "0.2.4"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


Expand Down
10 changes: 8 additions & 2 deletions scrapling/engines/camo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
get_os_name,
intercept_route,
check_type_validity,
Expand Down Expand Up @@ -111,12 +112,17 @@ def fetch(self, url: str) -> Response:
if 'charset=' in content_type.lower():
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

status_text = res.status_text
# The Playwright API sometimes gives an empty status text for some reason!
if not status_text:
status_text = StatusText.get(res.status)

response = Response(
url=res.url,
text=page.content(),
body=res.body(),
body=page.content().encode('utf-8'),
status=res.status,
reason=res.status_text,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
Expand Down
10 changes: 8 additions & 2 deletions scrapling/engines/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
js_bypass_path,
intercept_route,
generate_headers,
Expand Down Expand Up @@ -221,12 +222,17 @@ def fetch(self, url: str) -> Response:
if 'charset=' in content_type.lower():
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

status_text = res.status_text
# The Playwright API sometimes gives an empty status text for some reason!
if not status_text:
status_text = StatusText.get(res.status)

response = Response(
url=res.url,
text=page.content(),
body=res.body(),
body=page.content().encode('utf-8'),
status=res.status,
reason=res.status_text,
reason=status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
Expand Down
1 change: 1 addition & 0 deletions scrapling/engines/toolbelt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .custom import (
Response,
do_nothing,
StatusText,
BaseFetcher,
get_variable_name,
check_type_validity,
Expand Down
80 changes: 79 additions & 1 deletion scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import inspect
import logging

from scrapling.core.utils import setup_basic_logging
from scrapling.core.custom_types import MappingProxyType
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core.utils import setup_basic_logging, cache
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable


Expand Down Expand Up @@ -67,6 +68,83 @@ def __init__(
self.adaptor_arguments.update({'automatch_domain': automatch_domain})


class StatusText:
    """Map HTTP response status codes to their standard reason phrases.

    The table is stored as a read-only mapping and is queried through the
    :meth:`get` classmethod; codes that are not in the table resolve to the
    literal ``"Unknown Status Code"``.

    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
    """

    _phrases = MappingProxyType({
        # 1xx - informational
        100: "Continue",
        101: "Switching Protocols",
        102: "Processing",
        103: "Early Hints",
        # 2xx - success
        200: "OK",
        201: "Created",
        202: "Accepted",
        203: "Non-Authoritative Information",
        204: "No Content",
        205: "Reset Content",
        206: "Partial Content",
        207: "Multi-Status",
        208: "Already Reported",
        226: "IM Used",
        # 3xx - redirection
        300: "Multiple Choices",
        301: "Moved Permanently",
        302: "Found",
        303: "See Other",
        304: "Not Modified",
        305: "Use Proxy",
        307: "Temporary Redirect",
        308: "Permanent Redirect",
        # 4xx - client errors
        400: "Bad Request",
        401: "Unauthorized",
        402: "Payment Required",
        403: "Forbidden",
        404: "Not Found",
        405: "Method Not Allowed",
        406: "Not Acceptable",
        407: "Proxy Authentication Required",
        408: "Request Timeout",
        409: "Conflict",
        410: "Gone",
        411: "Length Required",
        412: "Precondition Failed",
        413: "Payload Too Large",
        414: "URI Too Long",
        415: "Unsupported Media Type",
        416: "Range Not Satisfiable",
        417: "Expectation Failed",
        418: "I'm a teapot",
        421: "Misdirected Request",
        422: "Unprocessable Entity",
        423: "Locked",
        424: "Failed Dependency",
        425: "Too Early",
        426: "Upgrade Required",
        428: "Precondition Required",
        429: "Too Many Requests",
        431: "Request Header Fields Too Large",
        451: "Unavailable For Legal Reasons",
        # 5xx - server errors
        500: "Internal Server Error",
        501: "Not Implemented",
        502: "Bad Gateway",
        503: "Service Unavailable",
        504: "Gateway Timeout",
        505: "HTTP Version Not Supported",
        506: "Variant Also Negotiates",
        507: "Insufficient Storage",
        508: "Loop Detected",
        510: "Not Extended",
        511: "Network Authentication Required",
    })

    @classmethod
    @cache(maxsize=128)
    def get(cls, status_code: int) -> str:
        """Return the reason phrase for ``status_code``.

        :param status_code: The numeric HTTP status code to look up.
        :return: The matching phrase from the table, or
            ``"Unknown Status Code"`` when the code is not listed.
        """
        try:
            phrase = cls._phrases[status_code]
        except KeyError:
            # Not a code the table knows about; fall back to a fixed marker.
            phrase = "Unknown Status Code"
        return phrase


def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
"""This function check if the passed engine can be used by a Fetcher-type class or not.
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scrapling
version = 0.2.2
version = 0.2.4
author = Karim Shoair
author_email = [email protected]
description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="scrapling",
version="0.2.2",
version="0.2.4",
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
impressive speed improvements over many popular scraping tools.""",
Expand Down Expand Up @@ -57,7 +57,7 @@
'httpx[brotli,zstd]',
'playwright',
'rebrowser-playwright',
'camoufox>=0.3.9',
'camoufox>=0.3.10',
'browserforge',
],
python_requires=">=3.8",
Expand Down

0 comments on commit 3f39bd5

Please sign in to comment.