diff --git a/action.yml b/action.yml
index 00b6bfb..7b4a1a5 100644
--- a/action.yml
+++ b/action.yml
@@ -38,6 +38,8 @@ inputs:
     description: 'URL to send back the data validation report asynchronously using POST "Content-Type: application/json"'
   shacl:
     description: 'URL to the SHACL file'
+  fetch-urls-headlessly:
+    description: 'Set as true to fetch the entity URLs headlessly'
 
 runs:
   using: 'composite'
@@ -95,6 +97,7 @@ runs:
       run: |
         isPaginated=${{ inputs.is-paginated || 'false' }}
         headless=${{ inputs.headless || 'false' }}
+        fetchUrlsHeadlessly=${{ inputs.fetch-urls-headlessly || 'false' }}
 
         docker pull ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:main
         docker run --shm-size=1g -v $(pwd)/output:/usr/src/app/output ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:main \
@@ -102,7 +105,8 @@
           "${{ inputs.entity-identifier }}" \
           "output/${{ inputs.downloadFile }}" \
           "$isPaginated" \
-          "$headless"
+          "$headless" \
+          "$fetchUrlsHeadlessly"
       shell: bash
diff --git a/src/lib/entity_fetcher.rb b/src/lib/entity_fetcher.rb
index 530432a..ceef2c2 100644
--- a/src/lib/entity_fetcher.rb
+++ b/src/lib/entity_fetcher.rb
@@ -2,7 +2,7 @@
 require 'open-uri'
 
 module EntityFetcher
-  def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
+  def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_entity_urls_headlessly, headers)
     base_url = page_url.split('/')[0..2].join('/')
     entity_urls = []
 
@@ -13,28 +13,15 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
     else
       page_number = is_paginated.to_i
     end
-
-    max_retries, retry_count = 3, 0
 
     loop do
       url = "#{page_url}#{page_number}"
       puts "Fetching entity urls from #{url}..."
-      begin
-        linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
-        headers = {"User-Agent" => "artsdata-crawler/#{linkeddata_version}"}
-        main_page_html_text = URI.open(url, headers).read
-      rescue StandardError => e
-        retry_count += 1
-        if retry_count < max_retries
-          retry
-        else
-          puts "Max retries reached. Unable to fetch the content for page #{page_number}."
-          puts e.message
-          break
-        end
+      if fetch_entity_urls_headlessly == 'true'
+        main_doc = Nokogiri::HTML(HeadlessBrowser.fetch_entity_urls_headless(url, headers))
+      else
+        main_doc = Nokogiri::HTML(self.fetch_entity_urls_headful(url, headers))
       end
-
-      main_doc = Nokogiri::HTML(main_page_html_text)
       entities_data = main_doc.css(entity_identifier)
       number_of_entities = entity_urls.length
       entities_data.each do |entity|
@@ -45,9 +32,24 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
       break if entity_urls.length == number_of_entities || page_number.nil?
 
       page_number += 1
-      retry_count = 0
     end
-
     entity_urls.uniq
   end
+
+  def self.fetch_entity_urls_headful(url, headers)
+    retry_count = 0
+    max_retries = 3
+    begin
+      main_page_html_text = URI.open(url, headers).read
+    rescue StandardError => e
+      retry_count += 1
+      if retry_count < max_retries
+        retry
+      else
+        puts "Max retries reached. Unable to fetch the content for #{url}."
+        puts e.message
+      end
+    end
+    main_page_html_text
+  end
 end
diff --git a/src/lib/headless_browser.rb b/src/lib/headless_browser.rb
index 14708fd..b9fa4fa 100644
--- a/src/lib/headless_browser.rb
+++ b/src/lib/headless_browser.rb
@@ -3,11 +3,10 @@
 require 'linkeddata'
 
 module HeadlessBrowser
-  def self.fetch_json_ld_objects(entity_urls, base_url)
+  def self.fetch_json_ld_objects(entity_urls, base_url, headers)
     puts "Loading browser..."
     browser = Ferrum::Browser.new(browser_path: "/usr/bin/google-chrome-stable", headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil })
-    linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
-    browser.headers.set({"User-Agent" => "artsdata-crawler/#{linkeddata_version}"})
+    browser.headers.set(headers)
     graph = RDF::Graph.new
     add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')
     entity_urls.each do |entity_url|
@@ -42,4 +41,13 @@ def self.fetch_json_ld_objects(entity_urls, base_url)
     SparqlProcessor.perform_sparql_transformations(graph, sparql_paths, base_url)
     graph
   end
+
+  def self.fetch_entity_urls_headless(url, headers)
+    puts "Loading browser..."
+    browser = Ferrum::Browser.new(browser_path: "/usr/bin/google-chrome-stable", headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil })
+    browser.headers.set(headers)
+    browser.go_to(url)
+    sleep 15
+    browser.body
+  end
 end
diff --git a/src/lib/rdf_processor.rb b/src/lib/rdf_processor.rb
index 0af10f8..930ca64 100644
--- a/src/lib/rdf_processor.rb
+++ b/src/lib/rdf_processor.rb
@@ -2,7 +2,7 @@
 require 'sparql'
 require_relative 'sparql_processor'
 
 module RDFProcessor
-  def self.process_rdf(entity_urls, base_url)
+  def self.process_rdf(entity_urls, base_url, headers)
     graph = RDF::Graph.new
     add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')
@@ -10,8 +10,7 @@ def self.process_rdf(entity_urls, base_url)
       begin
         puts "Processing #{entity_url} in non-headless mode"
         entity_url = entity_url.gsub(' ', '+')
-        linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
-        options = { headers: { 'User-Agent' => "artsdata-crawler/#{linkeddata_version}" } }
+        options = { headers: headers }
         loaded_graph = RDF::Graph.load(entity_url, **options)
         sparql_file_with_url = add_url_sparql_file.gsub("subject_url", entity_url)
         loaded_graph.query(SPARQL.parse(sparql_file_with_url, update: true))
diff --git a/src/main.rb b/src/main.rb
index ab6d00d..73a6e40 100644
--- a/src/main.rb
+++ b/src/main.rb
@@ -3,22 +3,25 @@
 require_relative 'lib/headless_browser'
 
 if ARGV.length < 4
-  puts "Usage: ruby script_name.rb <page_url> <entity_identifier> <file_name> <is_paginated> <headless>"
+  puts "Usage: ruby script_name.rb <page_url> <entity_identifier> <file_name> <is_paginated> <headless> <fetch_urls_headlessly>"
   exit
 end
 
-page_url, entity_identifier, file_name, is_paginated, headless = ARGV[0..4]
+page_url, entity_identifier, file_name, is_paginated, headless, fetch_urls_headlessly = ARGV[0..5]
 
-entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated)
+linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
+headers = {"User-Agent" => "artsdata-crawler/#{linkeddata_version}"}
+
+entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers)
 base_url = page_url.split('/')[0..2].join('/')
 
 if headless == 'true'
-  graph = HeadlessBrowser.fetch_json_ld_objects(entity_urls, base_url)
+  graph = HeadlessBrowser.fetch_json_ld_objects(entity_urls, base_url, headers)
   File.open(file_name, 'w') do |file|
     file.puts(graph.dump(:jsonld))
   end
 else
-  graph = RDFProcessor.process_rdf(entity_urls, base_url)
+  graph = RDFProcessor.process_rdf(entity_urls, base_url, headers)
   File.open(file_name, 'w') do |file|
     file.puts(graph.dump(:jsonld))
   end