From 6d0b2ef58ce42edb5a45dff7d92e08357b2b0879 Mon Sep 17 00:00:00 2001 From: dev Date: Thu, 19 Dec 2024 13:23:38 +0530 Subject: [PATCH] Updated entity URL fetcher algorithm to stop more consistently --- src/lib/entity_fetcher.rb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/lib/entity_fetcher.rb b/src/lib/entity_fetcher.rb index 341b91f..f5b6053 100644 --- a/src/lib/entity_fetcher.rb +++ b/src/lib/entity_fetcher.rb @@ -37,12 +37,14 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_enti href = entity["href"] entity_urls << (href.start_with?('http') ? href : base_url + (href.start_with?('/') ? href : "/#{href}")) end - - break if entity_urls.length == number_of_entities || page_number.nil? - + entity_urls = entity_urls.uniq + if entity_urls.length == number_of_entities || page_number.nil? + puts "All entity URLs have been successfully fetched. Total entities: #{entity_urls.length}." + break + end page_number += offset end - entity_urls.uniq + entity_urls end def self.fetch_entity_urls_headful(url, headers) @@ -55,7 +57,7 @@ def self.fetch_entity_urls_headful(url, headers) if retry_count < max_retries retry else - puts "Max retries reached. Unable to fetch the content for page #{page_number}." + puts "Max retries reached. Unable to fetch the content for page #{url}." puts e.message end end