Skip to content

Commit

Permalink
Merge branch 'searxng:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
iamrony777 committed Jun 3, 2023
2 parents 235759f + 80aaef6 commit cbe8eb0
Show file tree
Hide file tree
Showing 169 changed files with 14,655 additions and 22,473 deletions.
2 changes: 1 addition & 1 deletion docs/admin/engines/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ Global Settings

``limiter`` :
Rate limit the number of request on the instance, block some bots. The
:ref:`limiter plugin` requires a :ref:`settings redis` database.
:ref:`limiter src` requires a :ref:`settings redis` database.

.. _image_proxy:

Expand Down
45 changes: 45 additions & 0 deletions docs/src/searx.botdetection.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
.. _botdetection:

=============
Bot Detection
=============

.. contents:: Contents
:depth: 2
:local:
:backlinks: entry

.. automodule:: searx.botdetection
:members:

.. automodule:: searx.botdetection.limiter
:members:


Rate limit
==========

.. automodule:: searx.botdetection.ip_limit
:members:

.. automodule:: searx.botdetection.link_token
:members:


Probe HTTP headers
==================

.. automodule:: searx.botdetection.http_accept
:members:

.. automodule:: searx.botdetection.http_accept_encoding
:members:

.. automodule:: searx.botdetection.http_accept_language
:members:

.. automodule:: searx.botdetection.http_connection
:members:

.. automodule:: searx.botdetection.http_user_agent
:members:
13 changes: 0 additions & 13 deletions docs/src/searx.plugins.limiter.rst

This file was deleted.

3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ httpx-socks[asyncio]==0.7.2
setproctitle==1.3.2
redis==4.5.5
markdown-it-py==2.2.0
typing_extensions==4.6.2
typing_extensions==4.6.3
fasttext-predict==0.9.2.1
pytomlpp==1.0.13
27 changes: 27 additions & 0 deletions searx/botdetection/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _botdetection src:
X-Forwarded-For
===============
.. attention::
A correct setup of the HTTP request headers ``X-Forwarded-For`` and
``X-Real-IP`` is essential to be able to assign a request to an IP correctly:
- `NGINX RequestHeader`_
- `Apache RequestHeader`_
.. _NGINX RequestHeader:
https://docs.searxng.org/admin/installation-nginx.html#nginx-s-searxng-site
.. _Apache RequestHeader:
https://docs.searxng.org/admin/installation-apache.html#apache-s-searxng-site
.. autofunction:: searx.botdetection.get_real_ip
"""

from ._helpers import dump_request
from ._helpers import get_real_ip
from ._helpers import too_many_requests
121 changes: 121 additions & 0 deletions searx/botdetection/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-module-docstring, invalid-name
from __future__ import annotations

from ipaddress import (
IPv4Network,
IPv6Network,
IPv6Address,
ip_address,
ip_network,
)
import flask
import werkzeug

from searx.tools import config
from searx import logger

logger = logger.getChild('botdetection')


def dump_request(request: flask.Request):
return (
request.path
+ " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For')
+ " || X-Real-IP: %s" % request.headers.get('X-Real-IP')
+ " || form: %s" % request.form
+ " || Accept: %s" % request.headers.get('Accept')
+ " || Accept-Language: %s" % request.headers.get('Accept-Language')
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding')
+ " || Content-Type: %s" % request.headers.get('Content-Type')
+ " || Content-Length: %s" % request.headers.get('Content-Length')
+ " || Connection: %s" % request.headers.get('Connection')
+ " || User-Agent: %s" % request.headers.get('User-Agent')
)


def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
"""Returns a HTTP 429 response object and writes a ERROR message to the
'botdetection' logger. This function is used in part by the filter methods
to return the default ``Too Many Requests`` response.
"""

logger.debug("BLOCK %s: %s", network.compressed, log_msg)
return flask.make_response(('Too Many Requests', 429))


def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network:
"""Returns the (client) network of whether the real_ip is part of."""

ip = ip_address(real_ip)
if isinstance(ip, IPv6Address):
prefix = cfg['real_ip.ipv6_prefix']
else:
prefix = cfg['real_ip.ipv4_prefix']
network = ip_network(f"{real_ip}/{prefix}", strict=False)
# logger.debug("get_network(): %s", network.compressed)
return network


def get_real_ip(request: flask.Request) -> str:
"""Returns real IP of the request. Since not all proxies set all the HTTP
headers and incoming headers can be faked it may happen that the IP cannot
be determined correctly.
.. sidebar:: :py:obj:`flask.Request.remote_addr`
SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``).
This function tries to get the remote IP in the order listed below,
additional some tests are done and if inconsistencies or errors are
detected, they are logged.
The remote IP of the request is taken from (first match):
- X-Forwarded-For_ header
- `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
- :py:obj:`flask.Request.remote_addr`
.. _ProxyFix:
https://werkzeug.palletsprojects.com/middleware/proxy_fix/
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
"""

forwarded_for = request.headers.get("X-Forwarded-For")
real_ip = request.headers.get('X-Real-IP')
remote_addr = request.remote_addr
# logger.debug(
# "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr
# )

if not forwarded_for:
logger.error("X-Forwarded-For header is not set!")
else:
from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import

forwarded_for = [x.strip() for x in forwarded_for.split(',')]
x_for: int = get_cfg()['real_ip.x_for']
forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]

if not real_ip:
logger.error("X-Real-IP header is not set!")

if forwarded_for and real_ip and forwarded_for != real_ip:
logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)

if forwarded_for and remote_addr and forwarded_for != remote_addr:
logger.warning(
"IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
)

if real_ip and remote_addr and real_ip != remote_addr:
logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)

request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
# logger.debug("get_real_ip() -> %s", request_ip)
return request_ip
39 changes: 39 additions & 0 deletions searx/botdetection/http_accept.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept``
----------------------
The ``http_accept`` method evaluates a request as the request of a bot if the
Accept_ header ..
- did not contain ``text/html``
.. _Accept:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
"""
# pylint: disable=unused-argument

from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)

import flask
import werkzeug

from searx.tools import config
from ._helpers import too_many_requests


def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:

if 'text/html' not in request.accept_mimetypes:
return too_many_requests(network, "HTTP header Accept did not contain text/html")
return None
41 changes: 41 additions & 0 deletions searx/botdetection/http_accept_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept_encoding``
-------------------------------
The ``http_accept_encoding`` method evaluates a request as the request of a
bot if the Accept-Encoding_ header ..
- did not contain ``gzip`` AND ``deflate`` (if both values are missed)
- did not contain ``text/html``
.. _Accept-Encoding:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
"""
# pylint: disable=unused-argument

from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)

import flask
import werkzeug

from searx.tools import config
from ._helpers import too_many_requests


def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:

accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list):
return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
return None
35 changes: 35 additions & 0 deletions searx/botdetection/http_accept_language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_accept_language``
-------------------------------
The ``http_accept_language`` method evaluates a request as the request of a bot
if the Accept-Language_ header is unset.
.. _Accept-Language:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
"""
# pylint: disable=unused-argument
from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)

import flask
import werkzeug

from searx.tools import config
from ._helpers import too_many_requests


def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:
if request.headers.get('Accept-Language', '').strip() == '':
return too_many_requests(network, "missing HTTP header Accept-Language")
return None
37 changes: 37 additions & 0 deletions searx/botdetection/http_connection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Method ``http_connection``
--------------------------
The ``http_connection`` method evaluates a request as the request of a bot if
the Connection_ header is set to ``close``.
.. _Connection:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
"""
# pylint: disable=unused-argument

from __future__ import annotations
from ipaddress import (
IPv4Network,
IPv6Network,
)

import flask
import werkzeug

from searx.tools import config
from ._helpers import too_many_requests


def filter_request(
network: IPv4Network | IPv6Network,
request: flask.Request,
cfg: config.Config,
) -> werkzeug.Response | None:

if request.headers.get('Connection', '').strip() == 'close':
return too_many_requests(network, "HTTP header 'Connection=close")
return None
Loading

0 comments on commit cbe8eb0

Please sign in to comment.