diff --git a/README.md b/README.md index 3aba58b..03aebb0 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,8 @@ artsdata-push: version: reportCallbackUrl: shacl: + fetch-urls-headlessly: + offset: ```
@@ -51,10 +53,27 @@ artsdata-push: | `version` | Version of the artifact. Usually a date (e.g., 2020-10-23). Use unreserved characters. (If not provided, version will be set as the current date). | `reportCallbackUrl ` | URL to send back the data validation report asynchronously using POST "Content-Type: application/json". | `shacl` | URL to the SHACL file to perform validations. - +| `fetch-urls-headlessly` | Fetch the URLs of entities using a headless browser (defaults to false). +| `offset` | Offset for pagination strategy (defaults to 1).
## Potential Issues Remember to use only unreserved characters ([0-9a-zA-Z-._]) for input variables where mentioned. + +# Release Instructions + +When preparing a release for the artsdata-pipeline-action, please follow these versioning guidelines: + +## Minor release (e.g., 2.0.7 → 2.0.8): + +For small feature additions or bug fixes. + +## Major release (e.g., 2.0.7 → 2.1.0): + +For larger changes or significant improvements that could impact compatibility. + +## Significant Update (e.g., 2.0.7 → 3.0.0): + +For major overhauls or breaking changes. If there's a drastic change in functionality or usage, increment to the next "big update" version. \ No newline at end of file diff --git a/src/lib/graph_fetcher.rb b/src/lib/graph_fetcher.rb index 10de379..f20ed3d 100644 --- a/src/lib/graph_fetcher.rb +++ b/src/lib/graph_fetcher.rb @@ -10,7 +10,7 @@ def self.load(entity_urls: [], base_url: nil, headers: nil, headless: false) @entity_urls = entity_urls @base_url = base_url @headers = headers ||= {"User-Agent" => "artsdata-crawler"} - @graph = if headless + @graph = if headless == "true" headless_browser = HeadlessBrowser.new(headers) headless_browser.fetch_json_ld_objects(entity_urls) else diff --git a/src/lib/rdf_processor.rb b/src/lib/rdf_processor.rb index f892195..687bc9c 100644 --- a/src/lib/rdf_processor.rb +++ b/src/lib/rdf_processor.rb @@ -7,17 +7,18 @@ def self.process_rdf(entity_urls, base_url, headers) add_url_sparql_file = File.read('./sparql/add_derived_from.sparql') entity_urls.each do |entity_url| - puts "Processing #{entity_url} in non-headless mode" - entity_url = entity_url.gsub(' ', '+') - options = { rdfstar: true, headers: headers } - loaded_graph = RDF::Graph.load(entity_url, **options) - sparql_file_with_url = add_url_sparql_file.gsub("subject_url", entity_url) - loaded_graph.query(SPARQL.parse(sparql_file_with_url, update: true)) - graph << loaded_graph - graph + begin + puts "Processing #{entity_url} in non-headless mode" + entity_url = 
entity_url.gsub(' ', '+') + options = { headers: headers } + loaded_graph = RDF::Graph.load(entity_url, **options) + sparql_file_with_url = add_url_sparql_file.gsub("subject_url", entity_url) + loaded_graph.query(SPARQL.parse(sparql_file_with_url, update: true)) + graph << loaded_graph + rescue StandardError => e + puts "Error loading RDF from #{entity_url}: #{e.message}" + end end graph - rescue StandardError => e - puts "Error loading RDF from #{entity_url}: #{e.message}" end -end +end \ No newline at end of file