From 7dbfd7e2bad05d01039c7d7a3038bca6dee3b1a1 Mon Sep 17 00:00:00 2001 From: Elias Dorneles <eliasdorneles@gmail.com> Date: Tue, 29 Sep 2015 09:44:14 -0300 Subject: [PATCH 1/8] import tldextract lazily --- frontera/utils/url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/utils/url.py b/frontera/utils/url.py index 8a990d75a..129f7ab75 100644 --- a/frontera/utils/url.py +++ b/frontera/utils/url.py @@ -4,7 +4,6 @@ import hashlib from six import moves from w3lib.util import unicode_to_str -import tldextract # Python 2.x urllib.always_safe become private in Python 3.x; @@ -39,6 +38,7 @@ def parse_domain_from_url(url): https://google.es/mail google.es google.es https google es ------------------------------------------------------------------------------------------------------- """ + import tldextract extracted = tldextract.extract(url) scheme, _, _, _, _, _ = parse_url(url) From 619e27759fe8cd0220a0cbaecc4b23a20777015c Mon Sep 17 00:00:00 2001 From: Elias Dorneles <eliasdorneles@gmail.com> Date: Tue, 29 Sep 2015 12:59:38 -0300 Subject: [PATCH 2/8] add option for using tldextract, add tests for domain mware --- frontera/contrib/middlewares/domain.py | 14 ++++-- frontera/settings/default_settings.py | 5 ++ frontera/tests/test_domain_mware.py | 63 ++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 frontera/tests/test_domain_mware.py diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index b229bb52d..45c7dab45 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -1,16 +1,19 @@ import re from frontera.core.components import Middleware -from frontera.utils.url import parse_domain_from_url_fast +from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url -def parse_domain_info(url, test_mode=False): +def parse_domain_info(url, test_mode=False, use_tldextract=False): if test_mode: match = re.match('([A-Z])\w+', url) netloc = name = match.groups()[0] if match else '?' scheme = sld = tld = subdomain = '-' else: - netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url) + if use_tldextract: + netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url(url) + else: + netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url) return { 'netloc': netloc, 'name': name, @@ -99,8 +102,9 @@ def request_error(self, request, error): return self._add_domain(request) def _add_domain(self, obj): - obj.meta['domain'] = parse_domain_info(obj.url, self.manager.test_mode) + use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False) + obj.meta['domain'] = parse_domain_info(obj.url, self.manager.test_mode, use_tldextract) if 'redirect_urls' in obj.meta: - obj.meta['redirect_domains'] = [parse_domain_info(url, self.manager.test_mode) + obj.meta['redirect_domains'] = [parse_domain_info(url, self.manager.test_mode, use_tldextract) for url in obj.meta['redirect_urls']] return obj diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index bb8fe77db..86d7ba7ad 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -23,6 +23,11 @@ URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' +#-------------------------------------------------------- +# Domain mw +#-------------------------------------------------------- +TLDEXTRACT_DOMAIN_INFO = False + #-------------------------------------------------------- # Logging #-------------------------------------------------------- diff --git a/frontera/tests/test_domain_mware.py b/frontera/tests/test_domain_mware.py new file mode 100644 index 000000000..ad5cd8f33 --- /dev/null +++ b/frontera/tests/test_domain_mware.py @@ -0,0 +1,63 @@ +import unittest +from frontera.contrib.middlewares.domain import DomainMiddleware +from frontera.core.manager import FrontierManager +from frontera.core.models import Request + + +class FakeManager(object): + settings = {} + test_mode = False + + +class DomainMiddlewareTest(unittest.TestCase): + def setUp(self): + self.fake_manager = FakeManager() + + def test_create(self): + DomainMiddleware(self.fake_manager) + + def test_should_parse_domain_info(self): + seeds = [ + Request('http://example.com'), + Request('https://www.google.com'), + ] + + mware = DomainMiddleware(self.fake_manager) + result = mware.add_seeds(seeds) + + self.assertEquals(len(result), len(seeds)) + + for r in result: + self.assertIn('domain', r.meta, 'Missing domain info for %r' % r) + + expected = [ + {'name': 'example.com', 'netloc': 'example.com', 'scheme': 'http', + 'sld': '', 'subdomain': '', 'tld': ''}, + {'name': 'www.google.com', 'netloc': 'www.google.com', 'scheme': 'https', + 'sld': '', 'subdomain': '', 'tld': ''}, + ] + self.assertEquals(expected, [r.meta['domain'] for r in result]) + + def test_should_parse_tldextract_extra_domain_info(self): + seeds = [ + Request('http://example.com'), + Request('https://www.google.com'), + ] + + self.fake_manager.settings = {'TLDEXTRACT_DOMAIN_INFO': True} + + mware = DomainMiddleware(self.fake_manager) + result = mware.add_seeds(seeds) + + self.assertEquals(len(result), len(seeds)) + + for r in result: + self.assertIn('domain', r.meta, 'Missing domain info for %r' % r) + + expected = [ + {'name': 'example.com', 'netloc': 'example.com', 'scheme': 'http', + 'sld': 'example', 'subdomain': '', 'tld': 'com'}, + {'name': 'google.com', 'netloc': 'www.google.com', 'scheme': 'https', + 'sld': 'google', 'subdomain': 'www', 'tld': 'com'}, + ] + self.assertEquals(expected, [r.meta['domain'] for r in result]) From 27b50796c2f0370649fdacca706116aa59f1c18c Mon Sep 17 00:00:00 2001 From: Elias Dorneles <eliasdorneles@gmail.com> Date: Tue, 29 Sep 2015 13:02:53 -0300 Subject: [PATCH 3/8] declare tldextract as optional feature in setup.py --- requirements.txt | 3 +-- requirements/tldextract.txt | 1 + setup.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 requirements/tldextract.txt diff --git a/requirements.txt b/requirements.txt index db82a0e4a..31e22a3b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ six>=1.8.0 w3lib>=1.10.0 -tldextract>=1.5.1 -SQLAlchemy>=0.9.8 \ No newline at end of file +SQLAlchemy>=0.9.8 diff --git a/requirements/tldextract.txt b/requirements/tldextract.txt new file mode 100644 index 000000000..c616d8715 --- /dev/null +++ b/requirements/tldextract.txt @@ -0,0 +1 @@ +tldextract>=1.5.1 diff --git a/setup.py b/setup.py index 0008edc61..4ded0942d 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ install_requires=[ 'six>=1.8.0', 'w3lib>=1.10.0', - 'tldextract>=1.5.1', 'SQLAlchemy>=0.9.8' ], extras_require={ @@ -49,6 +48,9 @@ 'logging': [ "colorlog>=2.4.0", ], + 'tldextract': [ + 'tldextract>=1.5.1', + ] }, tests_require=[ "pytest>=2.6.4", From f0ebc374f9896f7e227dea46e31c759090477f8c Mon Sep 17 00:00:00 2001 From: Elias Dorneles <eliasdorneles@gmail.com> Date: Tue, 29 Sep 2015 13:14:28 -0300 Subject: [PATCH 4/8] update docs --- docs/source/topics/frontera-settings.rst | 15 +++++++++++++++ docs/source/topics/frontier-objects.rst | 3 +++ 2 files changed, 18 insertions(+) diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index 18c5f8c5d..c7b780355 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -225,6 +225,21 @@ Default: ``frontera.utils.fingerprint.sha1`` The function used to calculate the ``domain`` fingerprint. +.. setting:: TLDEXTRACT_DOMAIN_INFO + +TLDEXTRACT_DOMAIN_INFO +---------------------- + +Default: ``False`` + +If set to ``True``, will use `tldextract`_ to attach extra domain information +(second-level, top-level and subdomain) to meta field (see :ref:`frontier-objects-additional-data`). + + +.. _tldextract: https://pypi.python.org/pypi/tldextract + + + Default settings ================ diff --git a/docs/source/topics/frontier-objects.rst b/docs/source/topics/frontier-objects.rst index f386f134e..2e0cc644b 100644 --- a/docs/source/topics/frontier-objects.rst +++ b/docs/source/topics/frontier-objects.rst @@ -50,6 +50,9 @@ An example of a generated fingerprint for a :class:`Request <frontera.core.model '198d99a8b2284701d6c147174cd69a37a7dea90f' +.. _frontier-objects-additional-data: + + Adding additional data to objects ================================= From 13226de80f2a69b3f23d9c5aa4940147100f147f Mon Sep 17 00:00:00 2001 From: Elias Dorneles <eliasdorneles@gmail.com> Date: Tue, 29 Sep 2015 13:24:29 -0300 Subject: [PATCH 5/8] include tldextract reqs file in test requirements --- requirements/tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/tests.txt b/requirements/tests.txt index a3264e374..8f0fb3dc7 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -3,3 +3,4 @@ MySQL-python>=1.2.5 PyMySQL>=0.6.3 psycopg2>=2.5.4 scrapy>=0.24 +-r tldextract.txt From 70d1b026eda28b4c3f2f7c19dafcbcf40fb2fb97 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov <sixty-one@yandex.ru> Date: Tue, 29 Sep 2015 18:27:46 +0200 Subject: [PATCH 6/8] Adding tldextract in test requirements. --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4ded0942d..2401d00f0 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ "MySQL-python>=1.2.5", "PyMySQL>=0.6.3", "psycopg2>=2.5.4", - "scrapy>=0.24" + "scrapy>=0.24", + "tldextract>=1.5.1", ] ) From 25069c22ad09a6b8b05ba4d066757f18a7a90dc3 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov <sixty-one@yandex.ru> Date: Tue, 29 Sep 2015 18:45:35 +0200 Subject: [PATCH 7/8] A bit optimized option check. --- frontera/contrib/middlewares/domain.py | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index 45c7dab45..16a74d64d 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -4,24 +4,7 @@ from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url -def parse_domain_info(url, test_mode=False, use_tldextract=False): - if test_mode: - match = re.match('([A-Z])\w+', url) - netloc = name = match.groups()[0] if match else '?' - scheme = sld = tld = subdomain = '-' - else: - if use_tldextract: - netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url(url) - else: - netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url) - return { - 'netloc': netloc, - 'name': name, - 'scheme': scheme, - 'sld': sld, - 'tld': tld, - 'subdomain': subdomain, - } + class DomainMiddleware(Middleware): @@ -77,6 +60,8 @@ class DomainMiddleware(Middleware): def __init__(self, manager): self.manager = manager + use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False) + self.parse_domain_func = parse_domain_from_url if use_tldextract else parse_domain_from_url_fast @classmethod def from_manager(cls, manager): @@ -102,9 +87,24 @@ def request_error(self, request, error): return self._add_domain(request) def _add_domain(self, obj): - use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False) - obj.meta['domain'] = parse_domain_info(obj.url, self.manager.test_mode, use_tldextract) + obj.meta['domain'] = self.parse_domain_info(obj.url, self.manager.test_mode) if 'redirect_urls' in obj.meta: - obj.meta['redirect_domains'] = [parse_domain_info(url, self.manager.test_mode, use_tldextract) + obj.meta['redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode) for url in obj.meta['redirect_urls']] return obj + + def parse_domain_info(self, url, test_mode=False): + if test_mode: + match = re.match('([A-Z])\w+', url) + netloc = name = match.groups()[0] if match else '?' + scheme = sld = tld = subdomain = '-' + else: + netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url) + return { + 'netloc': netloc, + 'name': name, + 'scheme': scheme, + 'sld': sld, + 'tld': tld, + 'subdomain': subdomain, + } From 5544d5a286af0b1203ecc691ae1263ebce3a26b2 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov <sixty-one@yandex.ru> Date: Tue, 29 Sep 2015 18:55:12 +0200 Subject: [PATCH 8/8] Removing blank lines. --- frontera/contrib/middlewares/domain.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index 16a74d64d..8e94fd789 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -4,9 +4,6 @@ from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url - - - class DomainMiddleware(Middleware): """ This :class:`Middleware <frontera.core.components.Middleware>` will add a ``domain`` info field for every