Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SOCKS Proxy support #682

Merged
merged 5 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ jobs:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"] # "3.13-dev"
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
env:
- MINIMAL: "true"
PROXY_TEST: "false"
- MINIMAL: "false"
PROXY_TEST: "true"
include:
# custom python versions
- os: ubuntu-20.04
Expand All @@ -36,6 +40,19 @@ jobs:
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.12"
services:
socks_proxy:
image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
ports:
- 1080:1080
socks_proxy_auth:
image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
env:
PROXY_USER: user
PROXY_PASSWORD: pass
ports:
- 1081:1080

steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
Expand Down Expand Up @@ -97,6 +114,8 @@ jobs:
run: |
python -m pip install pytest pytest-cov
pytest --cov=./ --cov-report=xml
env:
PROXY_TEST: ${{ matrix.env.PROXY_TEST }}

# coverage
- name: Upload coverage to Codecov
Expand Down
18 changes: 18 additions & 0 deletions compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Local SOCKS5 proxy services. The published host ports (1080 and 1081) are
# the ones the proxy URLs in tests/downloads_tests.py point at.
services:
  # Proxy with no credentials configured, published on host port 1080.
  socks_proxy:
    image: serjs/go-socks5-proxy
    ports:
      - 1080:1080
  # Same image with PROXY_USER / PROXY_PASSWORD set, published on host port
  # 1081 (container port 1080).
  # NOTE(review): presumably these variables switch on username/password
  # authentication inside the image - confirm against the image documentation.
  socks_proxy_auth:
    image: serjs/go-socks5-proxy
    ports:
      - 1081:1080
    environment:
      PROXY_USER: user
      PROXY_PASSWORD: pass
  # Disabled service definition, kept commented out.
  # tor_proxy:
  #   image: dperson/torproxy
  #   ports:
  #     - 9050:9050


1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def get_long_description():
"htmldate[speed] >= 1.8.1",
"py3langid >= 0.2.2",
"pycurl >= 7.45.3",
"urllib3[socks]",
"zstandard >= 0.20.0",
],
"gui": [
Expand Down
37 changes: 37 additions & 0 deletions tests/downloads_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
from trafilatura.utils import decode_file, decode_response, handle_compressed_file, load_html


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

ZERO_CONFIG = DEFAULT_CONFIG
Expand All @@ -59,6 +60,7 @@ def _reset_downloads_global_objects():
"""
Force global objects to be re-created
"""
trafilatura.downloads.PROXY_URL = None
trafilatura.downloads.HTTP_POOL = None
trafilatura.downloads.NO_CERT_POOL = None
trafilatura.downloads.RETRY_STRATEGY = None
Expand Down Expand Up @@ -154,6 +156,39 @@ def test_fetch():
_reset_downloads_global_objects()


# The proxy tests are opt-in: they only run when the environment sets
# PROXY_TEST to the exact string "true".
IS_PROXY_TEST = os.getenv("PROXY_TEST", "false") == "true"

# Pairs of (proxy URL, whether a download through it is expected to work).
PROXY_URLS = (
    # SOCKS5 URL without credentials, port 1080
    ("socks5://localhost:1080", True),
    # SOCKS5 URL carrying user:pass credentials, port 1081
    ("socks5://user:pass@localhost:1081", True),
    # SOCKS5 URL on port 10, expected to fail
    ("socks5://localhost:10/", False),
    # unknown URL scheme, expected to fail
    ("bogus://localhost:1080", False),
)


def proxied(f):
    """Run the download using a potentially malformed proxy address.

    Calls ``f`` once for every ``(proxy_url, is_working)`` pair in
    ``PROXY_URLS``. Before each call the download globals are reset and
    ``trafilatura.downloads.PROXY_URL`` is pointed at the proxy under test.

    Args:
        f: Zero-argument test callable to execute through each proxy.

    Raises:
        AssertionError: Propagated from ``f`` when a proxy marked as working
            fails. ``pytest.raises`` also fails the test when a proxy marked
            as broken does not make ``f`` raise ``AssertionError``.
    """
    try:
        for proxy_url, is_working in PROXY_URLS:
            # Start from fresh pools so the new proxy setting is picked up.
            _reset_downloads_global_objects()
            trafilatura.downloads.PROXY_URL = proxy_url
            if is_working:
                f()
            else:
                # A broken proxy has to make the wrapped test's asserts fail.
                with pytest.raises(AssertionError):
                    f()
    finally:
        # Always restore the un-proxied state, even when f() raised, so a
        # failing proxy test cannot leak its proxy into later tests.
        _reset_downloads_global_objects()


@pytest.mark.skipif(not IS_PROXY_TEST, reason="proxy tests disabled")
def test_proxied_is_live_page():
    """Run test_is_live_page through every proxy listed in PROXY_URLS.

    Skipped unless IS_PROXY_TEST is true.
    """
    proxied(test_is_live_page)


@pytest.mark.skipif(not IS_PROXY_TEST, reason="proxy tests disabled")
def test_proxied_fetch():
    """Run test_fetch through every proxy listed in PROXY_URLS.

    Skipped unless IS_PROXY_TEST is true.
    """
    proxied(test_fetch)


def test_config():
'''Test how configuration options are read and stored.'''
# default config is none
Expand Down Expand Up @@ -241,6 +276,8 @@ def test_queue():
test_response_object()
test_is_live_page()
test_fetch()
test_proxied_is_live_page()
test_proxied_fetch()
test_config()
test_decode()
test_queue()
47 changes: 32 additions & 15 deletions trafilatura/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import logging
import os
import random

from concurrent.futures import ThreadPoolExecutor, as_completed
Expand All @@ -16,9 +17,21 @@
import certifi
import urllib3

from courlan import UrlStore
from courlan.network import redirection_test

from .settings import DEFAULT_CONFIG, Extractor
from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks


try:
import pycurl
from urllib3.contrib.socks import SOCKSProxyManager
PROXY_URL = os.environ.get("http_proxy")
except ImportError:
PROXY_URL = None

try:
import pycurl
CURL_SHARE = pycurl.CurlShare()
# available options:
# https://curl.se/libcurl/c/curl_share_setopt.html
Expand All @@ -30,27 +43,28 @@
except ImportError:
HAS_PYCURL = False

from courlan import UrlStore
from courlan.network import redirection_test

try: # Python 3.8+
from importlib.metadata import version
except ImportError:
from importlib_metadata import version

from .settings import DEFAULT_CONFIG, Extractor
from .utils import URL_BLACKLIST_REGEX, decode_file, is_acceptable_length, make_chunks


LOGGER = logging.getLogger(__name__)

NUM_CONNECTIONS = 50

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HTTP_POOL = None
NO_CERT_POOL = None
RETRY_STRATEGY = None


def create_pool(**args):
    """Configure urllib3 download pool according to user-defined settings.

    A ``SOCKSProxyManager`` is built when the module-level ``PROXY_URL`` is
    set, a plain ``urllib3.PoolManager`` otherwise.

    Args:
        **args: Keyword arguments forwarded to the manager constructor.
            They take precedence over the defaults chosen here, so passing
            ``num_pools`` overrides the default of 50 instead of raising
            ``TypeError`` for a duplicated keyword argument.

    Returns:
        The newly created pool manager instance.
    """
    # Defaults first; caller-supplied values are merged on top of them.
    manager_args = {"num_pools": 50}
    if PROXY_URL:
        manager_class = SOCKSProxyManager
        manager_args["proxy_url"] = PROXY_URL
    else:
        manager_class = urllib3.PoolManager
    manager_args.update(args)
    return manager_class(**manager_args)


DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = (
"trafilatura/" + version("trafilatura") + " (+https://github.com/adbar/trafilatura)"
Expand Down Expand Up @@ -161,20 +175,18 @@ def _send_urllib_request(
try:
if no_ssl is False:
if not HTTP_POOL:
HTTP_POOL = urllib3.PoolManager(
HTTP_POOL = create_pool(
retries=RETRY_STRATEGY,
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
ca_certs=certifi.where(),
num_pools=NUM_CONNECTIONS,
ca_certs=certifi.where()
) # cert_reqs='CERT_REQUIRED'
pool_manager = HTTP_POOL
else:
if not NO_CERT_POOL:
NO_CERT_POOL = urllib3.PoolManager(
NO_CERT_POOL = create_pool(
retries=RETRY_STRATEGY,
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
cert_reqs="CERT_NONE",
num_pools=NUM_CONNECTIONS,
cert_reqs="CERT_NONE"
)
pool_manager = NO_CERT_POOL
# execute request
Expand Down Expand Up @@ -288,6 +300,8 @@ def _pycurl_is_live_page(url: str) -> bool:
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
# Set option to avoid getting the response body
curl.setopt(curl.NOBODY, True)
if PROXY_URL:
curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
# Perform the request
try:
curl.perform()
Expand Down Expand Up @@ -410,6 +424,9 @@ def _send_pycurl_request(
headerbytes = BytesIO()
curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)

if PROXY_URL:
curl.setopt(pycurl.PRE_PROXY, PROXY_URL)

# TCP_FASTOPEN
# curl.setopt(pycurl.FAILONERROR, 1)
# curl.setopt(pycurl.ACCEPT_ENCODING, '')
Expand Down
Loading