facilitate retrying of urls
lordlabuckdas committed Jun 23, 2021
1 parent 007903f commit 5b47118
Showing 3 changed files with 17 additions and 11 deletions.
13 changes: 8 additions & 5 deletions snare/cloner.py
@@ -94,7 +94,7 @@ async def process_link(self, url, level, check_host=False):
         ):
             return None
         if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
-            await self.new_urls.put((url, level + 1))
+            await self.new_urls.put({"url": url, "level": level + 1, "try_count": 0})

         res = None
         try:
@@ -154,7 +154,9 @@ async def get_body(self, session):
         while not self.new_urls.empty():
             print(animation[self.itr], end="\r")
             self.itr = (self.itr + 1) % len(animation)
-            current_url, level = await self.new_urls.get()
+            current_url, level, try_count = (await self.new_urls.get()).values()
+            if try_count > 2:
+                continue
             if current_url.human_repr() in self.visited_urls:
                 continue
             self.visited_urls.append(current_url.human_repr())
@@ -170,6 +172,7 @@ async def get_body(self, session):
                     data = await response.read()
             except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                 self.logger.error(client_error)
+                await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1})
             else:
                 await response.release()

@@ -190,7 +193,7 @@ async def get_body(self, session):
                     if not carved_url.is_absolute():
                         carved_url = self.root.join(carved_url)
                     if carved_url.human_repr() not in self.visited_urls:
-                        await self.new_urls.put((carved_url, level + 1))
+                        await self.new_urls.put({"url": carved_url, "level": level + 1, "try_count": 0})

             with open(os.path.join(self.target_path, hash_name), "wb") as index_fh:
                 index_fh.write(data)
@@ -209,8 +212,8 @@ async def get_root_host(self):
     async def run(self):
         session = aiohttp.ClientSession()
         try:
-            await self.new_urls.put((self.root, 0))
-            await self.new_urls.put((self.error_page, 0))
+            await self.new_urls.put({"url": self.root, "level": 0, "try_count": 0})
+            await self.new_urls.put({"url": self.error_page, "level": 0, "try_count": 0})
             await self.get_body(session)
         except KeyboardInterrupt:
             raise
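The change above swaps the plain (url, level) tuples in the new_urls queue for dicts that also carry a try_count, so a URL that fails with a client error or timeout is put back on the queue instead of being dropped, and is skipped once it has failed more than twice. Below is a minimal, standalone sketch of that retry pattern; it is not code from the repository, and fetch, MAX_RETRIES, and the example URL are hypothetical placeholders.

import asyncio

MAX_RETRIES = 2  # hypothetical constant; the commit hard-codes the same limit via "try_count > 2"

async def fetch(url):
    """Placeholder download that always fails, to exercise the retry path."""
    raise ConnectionError(f"could not fetch {url}")

async def crawl(queue):
    while not queue.empty():
        entry = await queue.get()
        url, level, try_count = entry["url"], entry["level"], entry["try_count"]
        if try_count > MAX_RETRIES:
            continue  # this URL has already been retried enough; give up on it
        try:
            await fetch(url)
        except ConnectionError:
            # Re-queue the failed URL with an incremented retry counter.
            await queue.put({"url": url, "level": level, "try_count": try_count + 1})

async def main():
    queue = asyncio.Queue()
    await queue.put({"url": "http://example.com/", "level": 0, "try_count": 0})
    await crawl(queue)

if __name__ == "__main__":
    asyncio.run(main())

The sketch reads the entry fields by key for clarity; the commit itself unpacks them with .values(), as the diffs of the test files below also show.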
8 changes: 4 additions & 4 deletions snare/tests/test_cloner_get_body.py
@@ -67,7 +67,7 @@ def test_get_body(self):
         }

         async def test():
-            await self.handler.new_urls.put((yarl.URL(self.root), 0))
+            await self.handler.new_urls.put({"url": yarl.URL(self.root), "level": 0, "try_count": 0})
             await self.handler.get_body(self.session)

         with self.assertLogs(level="DEBUG") as log:
@@ -105,7 +105,7 @@ def test_get_body_css_validate(self):
         }

         async def test():
-            await self.handler.new_urls.put((yarl.URL(self.root), 0))
+            await self.handler.new_urls.put({"url": yarl.URL(self.root), "level": 0, "try_count": 0})
             await self.handler.get_body(self.session)
             self.q_size = self.handler.new_urls.qsize()

@@ -134,7 +134,7 @@ def test_get_body_css_validate_scheme(self):
         self.expected_content = "http://example.com/"

         async def test():
-            await self.handler.new_urls.put((yarl.URL(self.root), 0))
+            await self.handler.new_urls.put({"url": yarl.URL(self.root), "level": 0, "try_count": 0})
             await self.handler.get_body(self.session)
             self.q_size = self.handler.new_urls.qsize()

@@ -149,7 +149,7 @@ def test_client_error(self):
         self.session.get = AsyncMock(side_effect=aiohttp.ClientError)

         async def test():
-            await self.handler.new_urls.put((yarl.URL(self.root), 0))
+            await self.handler.new_urls.put({"url": yarl.URL(self.root), "level": 0, "try_count": 0})
             await self.handler.get_body(self.session)

         with self.assertLogs(level="ERROR") as log:
7 changes: 5 additions & 2 deletions snare/tests/test_cloner_process_links.py
@@ -19,6 +19,7 @@ def setUp(self):
         self.return_url = None
         self.return_level = None
         self.qsize = None
+        self.try_count = None

     def test_process_link_scheme(self):
         test_urls = [
@@ -45,12 +46,13 @@ def test_process_link_relative(self):

         async def test():
             self.return_content = await self.handler.process_link(self.url, self.level)
-            self.return_url, self.return_level = await self.handler.new_urls.get()
+            self.return_url, self.return_level, self.try_count = (await self.handler.new_urls.get()).values()

         self.loop.run_until_complete(test())
         self.assertEqual(self.return_content, "/foo/путь/")
         self.assertEqual(yarl.URL(self.return_url).human_repr(), self.expected_content)
         self.assertEqual(self.return_level, self.level + 1)
+        self.assertLess(self.try_count, 2, "URL retried for more than 2 times")

         self.handler.moved_root = yarl.URL("http://example2.com")
         self.expected_content = "http://example2.com/foo/путь/"
@@ -66,12 +68,13 @@ def test_process_link_absolute(self):

         async def test():
             self.return_content = await self.handler.process_link(self.url, self.level)
-            self.return_url, self.return_level = await self.handler.new_urls.get()
+            self.return_url, self.return_level, self.try_count = (await self.handler.new_urls.get()).values()

         self.loop.run_until_complete(test())
         self.assertEqual(self.return_content, self.expected_content)
         self.assertEqual(yarl.URL(self.url), self.return_url)
         self.assertEqual(self.return_level, self.level + 1)
+        self.assertLess(self.try_count, 2, "URL retried for more than 2 times")

     def test_check_host(self):
         self.url = "http://foo.com"
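Both the cloner and the updated tests unpack the queue entry with .values(), which works because Python dicts preserve insertion order (guaranteed since Python 3.7), so the three values come back as url, level, try_count. A tiny illustration, assuming nothing from the repository:

entry = {"url": "http://example.com/", "level": 1, "try_count": 0}

# dict.values() yields values in insertion order (guaranteed since Python 3.7),
# so this unpack mirrors the order in which the keys were inserted above.
url, level, try_count = entry.values()
assert (url, level, try_count) == ("http://example.com/", 1, 0)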
