From 7dbfd7e2bad05d01039c7d7a3038bca6dee3b1a1 Mon Sep 17 00:00:00 2001
From: Elias Dorneles <eliasdorneles@gmail.com>
Date: Tue, 29 Sep 2015 09:44:14 -0300
Subject: [PATCH 1/8] import tldextract lazily

---
 frontera/utils/url.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontera/utils/url.py b/frontera/utils/url.py
index 8a990d75a..129f7ab75 100644
--- a/frontera/utils/url.py
+++ b/frontera/utils/url.py
@@ -4,7 +4,6 @@
 import hashlib
 from six import moves
 from w3lib.util import unicode_to_str
-import tldextract
 
 
 # Python 2.x urllib.always_safe become private in Python 3.x;
@@ -39,6 +38,7 @@ def parse_domain_from_url(url):
      https://google.es/mail    google.es           google.es       https     google      es
     -------------------------------------------------------------------------------------------------------
     """
+    import tldextract
     extracted = tldextract.extract(url)
     scheme, _, _, _, _, _ = parse_url(url)
 

From 619e27759fe8cd0220a0cbaecc4b23a20777015c Mon Sep 17 00:00:00 2001
From: Elias Dorneles <eliasdorneles@gmail.com>
Date: Tue, 29 Sep 2015 12:59:38 -0300
Subject: [PATCH 2/8] add option for using tldextract, add tests for domain
 middleware

---
 frontera/contrib/middlewares/domain.py | 14 ++++--
 frontera/settings/default_settings.py  |  5 ++
 frontera/tests/test_domain_mware.py    | 63 ++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 5 deletions(-)
 create mode 100644 frontera/tests/test_domain_mware.py

diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py
index b229bb52d..45c7dab45 100644
--- a/frontera/contrib/middlewares/domain.py
+++ b/frontera/contrib/middlewares/domain.py
@@ -1,16 +1,19 @@
 import re
 
 from frontera.core.components import Middleware
-from frontera.utils.url import parse_domain_from_url_fast
+from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url
 
 
-def parse_domain_info(url, test_mode=False):
+def parse_domain_info(url, test_mode=False, use_tldextract=False):
     if test_mode:
         match = re.match('([A-Z])\w+', url)
         netloc = name = match.groups()[0] if match else '?'
         scheme = sld = tld = subdomain = '-'
     else:
-        netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url)
+        if use_tldextract:
+            netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url(url)
+        else:
+            netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url)
     return {
         'netloc': netloc,
         'name': name,
@@ -99,8 +102,9 @@ def request_error(self, request, error):
         return self._add_domain(request)
 
     def _add_domain(self, obj):
-        obj.meta['domain'] = parse_domain_info(obj.url, self.manager.test_mode)
+        use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False)
+        obj.meta['domain'] = parse_domain_info(obj.url, self.manager.test_mode, use_tldextract)
         if 'redirect_urls' in obj.meta:
-            obj.meta['redirect_domains'] = [parse_domain_info(url, self.manager.test_mode)
+            obj.meta['redirect_domains'] = [parse_domain_info(url, self.manager.test_mode, use_tldextract)
                                             for url in obj.meta['redirect_urls']]
         return obj
diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py
index bb8fe77db..86d7ba7ad 100644
--- a/frontera/settings/default_settings.py
+++ b/frontera/settings/default_settings.py
@@ -23,6 +23,11 @@
 URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'
 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1'
 
+#--------------------------------------------------------
+# Domain middleware
+#--------------------------------------------------------
+TLDEXTRACT_DOMAIN_INFO = False
+
 #--------------------------------------------------------
 # Logging
 #--------------------------------------------------------
diff --git a/frontera/tests/test_domain_mware.py b/frontera/tests/test_domain_mware.py
new file mode 100644
index 000000000..ad5cd8f33
--- /dev/null
+++ b/frontera/tests/test_domain_mware.py
@@ -0,0 +1,63 @@
+import unittest
+from frontera.contrib.middlewares.domain import DomainMiddleware
+from frontera.core.manager import FrontierManager
+from frontera.core.models import Request
+
+
+class FakeManager(object):
+    settings = {}
+    test_mode = False
+
+
+class DomainMiddlewareTest(unittest.TestCase):
+    def setUp(self):
+        self.fake_manager = FakeManager()
+
+    def test_create(self):
+        DomainMiddleware(self.fake_manager)
+
+    def test_should_parse_domain_info(self):
+        seeds = [
+            Request('http://example.com'),
+            Request('https://www.google.com'),
+        ]
+
+        mware = DomainMiddleware(self.fake_manager)
+        result = mware.add_seeds(seeds)
+
+        self.assertEquals(len(result), len(seeds))
+
+        for r in result:
+            self.assertIn('domain', r.meta, 'Missing domain info for %r' % r)
+
+        expected = [
+            {'name': 'example.com', 'netloc': 'example.com', 'scheme': 'http',
+             'sld': '', 'subdomain': '', 'tld': ''},
+            {'name': 'www.google.com', 'netloc': 'www.google.com', 'scheme': 'https',
+             'sld': '', 'subdomain': '', 'tld': ''},
+        ]
+        self.assertEquals(expected, [r.meta['domain'] for r in result])
+
+    def test_should_parse_tldextract_extra_domain_info(self):
+        seeds = [
+            Request('http://example.com'),
+            Request('https://www.google.com'),
+        ]
+
+        self.fake_manager.settings = {'TLDEXTRACT_DOMAIN_INFO': True}
+
+        mware = DomainMiddleware(self.fake_manager)
+        result = mware.add_seeds(seeds)
+
+        self.assertEquals(len(result), len(seeds))
+
+        for r in result:
+            self.assertIn('domain', r.meta, 'Missing domain info for %r' % r)
+
+        expected = [
+            {'name': 'example.com', 'netloc': 'example.com', 'scheme': 'http',
+             'sld': 'example', 'subdomain': '', 'tld': 'com'},
+            {'name': 'google.com', 'netloc': 'www.google.com', 'scheme': 'https',
+             'sld': 'google', 'subdomain': 'www', 'tld': 'com'},
+        ]
+        self.assertEquals(expected, [r.meta['domain'] for r in result])

From 27b50796c2f0370649fdacca706116aa59f1c18c Mon Sep 17 00:00:00 2001
From: Elias Dorneles <eliasdorneles@gmail.com>
Date: Tue, 29 Sep 2015 13:02:53 -0300
Subject: [PATCH 3/8] declare tldextract as optional feature in setup.py

---
 requirements.txt            | 3 +--
 requirements/tldextract.txt | 1 +
 setup.py                    | 4 +++-
 3 files changed, 5 insertions(+), 3 deletions(-)
 create mode 100644 requirements/tldextract.txt

diff --git a/requirements.txt b/requirements.txt
index db82a0e4a..31e22a3b2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 six>=1.8.0
 w3lib>=1.10.0
-tldextract>=1.5.1
-SQLAlchemy>=0.9.8
\ No newline at end of file
+SQLAlchemy>=0.9.8
diff --git a/requirements/tldextract.txt b/requirements/tldextract.txt
new file mode 100644
index 000000000..c616d8715
--- /dev/null
+++ b/requirements/tldextract.txt
@@ -0,0 +1 @@
+tldextract>=1.5.1
diff --git a/setup.py b/setup.py
index 0008edc61..4ded0942d 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,6 @@
     install_requires=[
         'six>=1.8.0',
         'w3lib>=1.10.0',
-        'tldextract>=1.5.1',
         'SQLAlchemy>=0.9.8'
     ],
     extras_require={
@@ -49,6 +48,9 @@
         'logging': [
             "colorlog>=2.4.0",
         ],
+        'tldextract': [
+            'tldextract>=1.5.1',
+        ]
     },
     tests_require=[
         "pytest>=2.6.4",

From f0ebc374f9896f7e227dea46e31c759090477f8c Mon Sep 17 00:00:00 2001
From: Elias Dorneles <eliasdorneles@gmail.com>
Date: Tue, 29 Sep 2015 13:14:28 -0300
Subject: [PATCH 4/8] update docs

---
 docs/source/topics/frontera-settings.rst | 15 +++++++++++++++
 docs/source/topics/frontier-objects.rst  |  3 +++
 2 files changed, 18 insertions(+)

diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst
index 18c5f8c5d..c7b780355 100644
--- a/docs/source/topics/frontera-settings.rst
+++ b/docs/source/topics/frontera-settings.rst
@@ -225,6 +225,21 @@ Default: ``frontera.utils.fingerprint.sha1``
 The function used to calculate the ``domain`` fingerprint.
 
 
+.. setting:: TLDEXTRACT_DOMAIN_INFO
+
+TLDEXTRACT_DOMAIN_INFO
+----------------------
+
+Default: ``False``
+
+If set to ``True``, the optional `tldextract`_ package is used to attach extra domain information
+(second-level domain, top-level domain and subdomain) to the ``meta`` field (see :ref:`frontier-objects-additional-data`).
+
+
+.. _tldextract: https://pypi.python.org/pypi/tldextract
+
+
+
 Default settings
 ================
 
diff --git a/docs/source/topics/frontier-objects.rst b/docs/source/topics/frontier-objects.rst
index f386f134e..2e0cc644b 100644
--- a/docs/source/topics/frontier-objects.rst
+++ b/docs/source/topics/frontier-objects.rst
@@ -50,6 +50,9 @@ An example of a generated fingerprint for a :class:`Request <frontera.core.model
     '198d99a8b2284701d6c147174cd69a37a7dea90f'
 
 
+.. _frontier-objects-additional-data:
+
+
 Adding additional data to objects
 =================================
 

From 13226de80f2a69b3f23d9c5aa4940147100f147f Mon Sep 17 00:00:00 2001
From: Elias Dorneles <eliasdorneles@gmail.com>
Date: Tue, 29 Sep 2015 13:24:29 -0300
Subject: [PATCH 5/8] include tldextract reqs file in test requirements

---
 requirements/tests.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/tests.txt b/requirements/tests.txt
index a3264e374..8f0fb3dc7 100644
--- a/requirements/tests.txt
+++ b/requirements/tests.txt
@@ -3,3 +3,4 @@ MySQL-python>=1.2.5
 PyMySQL>=0.6.3
 psycopg2>=2.5.4
 scrapy>=0.24
+-r tldextract.txt

From 70d1b026eda28b4c3f2f7c19dafcbcf40fb2fb97 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov <sixty-one@yandex.ru>
Date: Tue, 29 Sep 2015 18:27:46 +0200
Subject: [PATCH 6/8] Adding tldextract in test requirements.

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4ded0942d..2401d00f0 100644
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,7 @@
         "MySQL-python>=1.2.5",
         "PyMySQL>=0.6.3",
         "psycopg2>=2.5.4",
-        "scrapy>=0.24"
+        "scrapy>=0.24",
+        "tldextract>=1.5.1",
     ]
 )

From 25069c22ad09a6b8b05ba4d066757f18a7a90dc3 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov <sixty-one@yandex.ru>
Date: Tue, 29 Sep 2015 18:45:35 +0200
Subject: [PATCH 7/8] Slightly optimize the option check.

---
 frontera/contrib/middlewares/domain.py | 42 +++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py
index 45c7dab45..16a74d64d 100644
--- a/frontera/contrib/middlewares/domain.py
+++ b/frontera/contrib/middlewares/domain.py
@@ -4,24 +4,7 @@
 from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url
 
 
-def parse_domain_info(url, test_mode=False, use_tldextract=False):
-    if test_mode:
-        match = re.match('([A-Z])\w+', url)
-        netloc = name = match.groups()[0] if match else '?'
-        scheme = sld = tld = subdomain = '-'
-    else:
-        if use_tldextract:
-            netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url(url)
-        else:
-            netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(url)
-    return {
-        'netloc': netloc,
-        'name': name,
-        'scheme': scheme,
-        'sld': sld,
-        'tld': tld,
-        'subdomain': subdomain,
-    }
+
 
 
 class DomainMiddleware(Middleware):
@@ -77,6 +60,8 @@ class DomainMiddleware(Middleware):
 
     def __init__(self, manager):
         self.manager = manager
+        use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False)
+        self.parse_domain_func = parse_domain_from_url if use_tldextract else parse_domain_from_url_fast
 
     @classmethod
     def from_manager(cls, manager):
@@ -102,9 +87,24 @@ def request_error(self, request, error):
         return self._add_domain(request)
 
     def _add_domain(self, obj):
-        use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False)
-        obj.meta['domain'] = parse_domain_info(obj.url, self.manager.test_mode, use_tldextract)
+        obj.meta['domain'] = self.parse_domain_info(obj.url, self.manager.test_mode)
         if 'redirect_urls' in obj.meta:
-            obj.meta['redirect_domains'] = [parse_domain_info(url, self.manager.test_mode, use_tldextract)
+            obj.meta['redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode)
                                             for url in obj.meta['redirect_urls']]
         return obj
+
+    def parse_domain_info(self, url, test_mode=False):
+        if test_mode:
+            match = re.match('([A-Z])\w+', url)
+            netloc = name = match.groups()[0] if match else '?'
+            scheme = sld = tld = subdomain = '-'
+        else:
+            netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url)
+        return {
+            'netloc': netloc,
+            'name': name,
+            'scheme': scheme,
+            'sld': sld,
+            'tld': tld,
+            'subdomain': subdomain,
+        }

From 5544d5a286af0b1203ecc691ae1263ebce3a26b2 Mon Sep 17 00:00:00 2001
From: Alexander Sibiryakov <sixty-one@yandex.ru>
Date: Tue, 29 Sep 2015 18:55:12 +0200
Subject: [PATCH 8/8] Removing blank lines.

---
 frontera/contrib/middlewares/domain.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py
index 16a74d64d..8e94fd789 100644
--- a/frontera/contrib/middlewares/domain.py
+++ b/frontera/contrib/middlewares/domain.py
@@ -4,9 +4,6 @@
 from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url
 
 
-
-
-
 class DomainMiddleware(Middleware):
     """
     This :class:`Middleware <frontera.core.components.Middleware>` will add a ``domain`` info field for every