Skip to content

Commit

Permalink
Updated workflow to add an offset value for pagination
Browse files: browse the repository at this point in the history
  • Loading branch information
dev-aravind committed Nov 18, 2024
1 parent 2ebfffb commit 221ff0b
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 6 deletions.
6 changes: 5 additions & 1 deletion action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ inputs:
description: 'URL to the SHACL file'
fetch-urls-headlessly:
description: 'Set as true to fetch the entity URLs headlessly'
offset:
description: 'Offset for paginated pages'

runs:
using: 'composite'
Expand Down Expand Up @@ -98,6 +100,7 @@ runs:
isPaginated=${{ inputs.is-paginated || 'false' }}
headless=${{ inputs.headless || 'false' }}
fetchUrlsHeadlessly=${{ inputs.fetch-urls-headlessly || 'false' }}
offset=${{ inputs.offset || '1' }}
docker pull ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:latest
docker run --shm-size=1g -v $(pwd)/output:/usr/src/app/output ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:latest \
Expand All @@ -106,7 +109,8 @@ runs:
"output/${{ inputs.downloadFile }}" \
"$isPaginated" \
"$headless" \
"$fetchUrlsHeadlessly"
"$fetchUrlsHeadlessly" \
"$offset"
shell: bash

Expand Down
10 changes: 8 additions & 2 deletions src/lib/entity_fetcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
require 'open-uri'

module EntityFetcher
def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_entity_urls_headlessly, headers)
def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_entity_urls_headlessly, headers, offset)
base_url = page_url.split('/')[0..2].join('/')
entity_urls = []

Expand All @@ -14,6 +14,12 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_enti
page_number = is_paginated.to_i
end

if offset
offset = offset.to_i
else
offset = 1
end

loop do
url = "#{page_url}#{page_number}"
puts "Fetching entity urls from #{url}..."
Expand All @@ -33,7 +39,7 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_enti

break if entity_urls.length == number_of_entities || page_number.nil?

page_number += 1
page_number += offset
end
entity_urls.uniq
end
Expand Down
6 changes: 3 additions & 3 deletions src/main.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@
require_relative 'lib/headless_browser'

if ARGV.length < 4
puts "Usage: ruby script_name.rb <page_url> <entity_identifier> <file_name> <is_paginated> <headless> <fetch_urls_headlessly>"
puts "Usage: ruby script_name.rb <page_url> <entity_identifier> <file_name> <is_paginated> <headless> <fetch_urls_headlessly> <offset>"
exit
end

page_url, entity_identifier, file_name, is_paginated, headless, fetch_urls_headlessly = ARGV[0..5]
page_url, entity_identifier, file_name, is_paginated, headless, fetch_urls_headlessly, offset = ARGV[0..6]

linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
headers = {"User-Agent" => "artsdata-crawler/#{linkeddata_version}"}

entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers)
entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers, offset)
base_url = page_url.split('/')[0..2].join('/')

sparql_paths = [
Expand Down

0 comments on commit 221ff0b

Please sign in to comment.