From 221ff0b6226a02bf4120b7ae1b037e4468ea5b51 Mon Sep 17 00:00:00 2001 From: dev Date: Mon, 18 Nov 2024 11:53:35 +0530 Subject: [PATCH] Updated workflow to add an offset value for pagination --- action.yml | 6 +++++- src/lib/entity_fetcher.rb | 10 ++++++++-- src/main.rb | 6 +++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/action.yml b/action.yml index aaf5f8b..9e3996e 100644 --- a/action.yml +++ b/action.yml @@ -40,6 +40,8 @@ inputs: description: 'URL to the SHACL file' fetch-urls-headlessly: description: 'Set as true to fetch the entity URLs headlessly' + offset: + description: 'Offset for paginated pages' runs: using: 'composite' @@ -98,6 +100,7 @@ runs: isPaginated=${{ inputs.is-paginated || 'false' }} headless=${{ inputs.headless || 'false' }} fetchUrlsHeadlessly=${{ inputs.fetch-urls-headlessly || 'false' }} + offset=${{ inputs.offset || '1' }} docker pull ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:latest docker run --shm-size=1g -v $(pwd)/output:/usr/src/app/output ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:latest \ @@ -106,7 +109,8 @@ runs: "output/${{ inputs.downloadFile }}" \ "$isPaginated" \ "$headless" \ - "$fetchUrlsHeadlessly" + "$fetchUrlsHeadlessly" \ + "$offset" shell: bash diff --git a/src/lib/entity_fetcher.rb b/src/lib/entity_fetcher.rb index 47a292f..cd5493f 100644 --- a/src/lib/entity_fetcher.rb +++ b/src/lib/entity_fetcher.rb @@ -2,7 +2,7 @@ require 'open-uri' module EntityFetcher - def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_entity_urls_headlessly, headers) + def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_entity_urls_headlessly, headers, offset) base_url = page_url.split('/')[0..2].join('/') entity_urls = [] @@ -14,6 +14,12 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_enti page_number = is_paginated.to_i end + if offset + offset = offset.to_i + else + offset = 1 + end + loop do url = "#{page_url}#{page_number}" puts "Fetching entity urls from #{url}..." @@ -33,7 +39,7 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_enti break if entity_urls.length == number_of_entities || page_number.nil? - page_number += 1 + page_number += offset end entity_urls.uniq end diff --git a/src/main.rb b/src/main.rb index 75a4867..7a42063 100644 --- a/src/main.rb +++ b/src/main.rb @@ -3,16 +3,16 @@ require_relative 'lib/headless_browser' if ARGV.length < 4 - puts "Usage: ruby script_name.rb " + puts "Usage: ruby script_name.rb " exit end -page_url, entity_identifier, file_name, is_paginated, headless, fetch_urls_headlessly = ARGV[0..5] +page_url, entity_identifier, file_name, is_paginated, headless, fetch_urls_headlessly, offset = ARGV[0..6] linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s headers = {"User-Agent" => "artsdata-crawler/#{linkeddata_version}"} -entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers) +entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers, offset) base_url = page_url.split('/')[0..2].join('/') sparql_paths = [