Skip to content

Feature/scraping strategy - refactor: Remove WebScrapingStrategy and fix metadata extraction (#995) #1161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: next
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from .content_scraping_strategy import (
ContentScrapingStrategy,
- WebScrapingStrategy,
+ # WebScrapingStrategy,
LXMLWebScrapingStrategy,
)
from .async_logger import (
Expand Down Expand Up @@ -100,7 +100,7 @@
"CrawlerHub",
"CacheMode",
"ContentScrapingStrategy",
- "WebScrapingStrategy",
+ # "WebScrapingStrategy",
"LXMLWebScrapingStrategy",
"BrowserConfig",
"CrawlerRunConfig",
Expand Down
6 changes: 3 additions & 3 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .chunking_strategy import ChunkingStrategy, RegexChunking

from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
- from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
+ from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy

from .cache_context import CacheMode
Expand Down Expand Up @@ -725,7 +725,7 @@ class CrawlerRunConfig():
parser_type (str): Type of parser to use for HTML parsing.
Default: "lxml".
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
- Default: WebScrapingStrategy.
+ Default: LXMLWebScrapingStrategy.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.

Expand Down Expand Up @@ -979,7 +979,7 @@ def __init__(
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
- self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
+ self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
self.proxy_config = proxy_config
self.proxy_rotation_strategy = proxy_rotation_strategy

Expand Down
Loading