Skip to content

Commit

Permalink
Fix FF options deprecation warning + tagging tests
Browse files Browse the repository at this point in the history
Add tagger tests
  • Loading branch information
brandonrobertz committed Apr 17, 2020
1 parent 2db198d commit 179c48a
Show file tree
Hide file tree
Showing 6 changed files with 1,119 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ __pycache__
*.7z
*.log
graph*.dot
.idea

# default output directory
autoscrape-data*/
Expand Down
2 changes: 1 addition & 1 deletion autoscrape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__title__ = 'autoscrape-py'
__author__ = 'Brandon Roberts ([email protected])'
__license__ = 'AGPLv3'
__version__ = '1.6.5'
__version__ = '1.6.6'


from autoscrape.scrapers.test import TestScraper
Expand Down
2 changes: 1 addition & 1 deletion autoscrape/backends/selenium/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def __init__(self, driver="Firefox", leave_host=False,
logger.debug(" - Using binary: %s" % (browser_binary))
binary = FirefoxBinary(browser_binary)
self.driver = webdriver.Firefox(
firefox_options=firefox_options,
options=firefox_options,
firefox_profile=firefox_profile,
firefox_binary=binary,
)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def get_long_description():

setuptools.setup(
name='autoscrape',
version='1.6.5',
version='1.6.6',
description='An automated, programming-free web scraper for interactive sites',
long_description=get_long_description(),
author='Brandon Roberts',
Expand Down
1,044 changes: 1,044 additions & 0 deletions tests/data/test_page_large.cleaned.html

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions tests/test_tag_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import unittest
import urllib
import urllib.error
import urllib.request

import autoscrape


class TestTagGeneration(unittest.TestCase):
    """
    Compare how the requests and Selenium backends see the same test page.

    Every test fetches the fixture page from a local HTTP server, which
    must be started separately before running the suite:

        python -m http.server --directory ./tests/data/
    """

    @classmethod
    def setUpClass(cls):
        """
        Record the fixture file name and the URL it is served from.
        """
        cls.filename = "test_page_large.cleaned.html"
        cls.url = "http://localhost:8000/%s" % (cls.filename)

    def _get_page_source(self):
        """
        Return the fixture page as read from disk. The path is relative
        to the current working directory.
        """
        with open("tests/data/%s" % (self.filename), "r") as f:
            return f.read()

    def _start_selenium_browser(self):
        """
        Create a Selenium backed browser and make sure the underlying
        web driver is shut down once the current test finishes, so test
        runs do not leave stray browser processes behind.
        """
        browser = autoscrape.backends.selenium.browser.SeleniumBrowser()
        # NOTE(review): assumes the backend keeps its WebDriver on
        # `.driver` (as the Firefox setup in browser.py does) - confirm.
        self.addCleanup(browser.driver.quit)
        return browser

    def test_test_server_running(self):
        """
        Make sure our test HTML server is online. We need this to be
        loaded to continue the remaining tests.
        """
        html = None
        try:
            # The context manager closes the HTTP response once read.
            with urllib.request.urlopen(self.url) as response:
                html = response.read()
        except urllib.error.URLError:
            # An unreachable server is reported by the assertion below,
            # which carries a hint on how to start it.
            pass
        msg = "Test server not running! HINT: python -m http.server --directory ./tests/data/"
        self.assertIsNotNone(
            html, msg=msg
        )

    def test_requests_backend_can_load_page(self):
        """
        Load the test page and make sure it matches our test page.
        """
        self.requests_browser = autoscrape.backends.requests.browser.RequestsBrowser()
        self.requests_browser.fetch(self.url)
        loaded_html = self.requests_browser.page_html
        self.assertIsNotNone(loaded_html)
        raw_html = self._get_page_source()
        self.assertEqual(raw_html, loaded_html)

    def test_selenium_backend_can_load_page(self):
        """
        Test that we can load the test page and that it doesn't get
        mutated by the browser (it's clean so this shouldn't happen).
        We do this so that later we can measure differences in the CSS
        path algorithms, not differences between represented pages.
        """
        self.selenium_browser = self._start_selenium_browser()
        self.selenium_browser.fetch(self.url)
        loaded_html = self.selenium_browser.page_html
        self.assertIsNotNone(loaded_html)
        raw_html = self._get_page_source()
        self.assertEqual(raw_html, loaded_html)

    def test_backend_tags_match(self):
        """
        Fetch the test page with both backends and make sure that
        get_clickable() returns a non-empty, identical result for each.
        """
        self.requests_browser = autoscrape.backends.requests.browser.RequestsBrowser()
        self.requests_browser.fetch(self.url)
        self.selenium_browser = self._start_selenium_browser()
        self.selenium_browser.fetch(self.url)
        sel_clickable = self.selenium_browser.get_clickable()
        req_clickable = self.requests_browser.get_clickable()
        # assertGreater reports the actual length when it fails.
        self.assertGreater(len(sel_clickable), 0)
        self.assertGreater(len(req_clickable), 0)
        self.assertEqual(len(sel_clickable), len(req_clickable))
        self.assertEqual(sel_clickable, req_clickable)


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 179c48a

Please sign in to comment.