-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.rb
75 lines (68 loc) · 1.96 KB
/
main.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
require 'nokogiri'
require 'open-uri'
require 'linkeddata'
# Require exactly six positional arguments. On misuse, `abort` writes the usage
# line to stderr and exits with status 1, so calling shells and pipelines can
# tell the run failed (a bare `exit` would report success).
abort "Usage: ruby script_name.rb <page_url> <base_url> <entity_identifier> <file_name> <is_paginated> <href_tag>" unless ARGV.length == 6
# Bind the six positional command-line arguments, in order, to named locals.
page_url, base_url, entity_identifier, file_name, is_paginated, href_tag = ARGV

# Retry budget for fetching a listing page, and the attempts used so far.
max_retries = 3
retry_count = 0

# Page counter: starts at 1 when is_paginated is the literal string 'true',
# otherwise stays nil (which interpolates to an empty string in the page URL).
page_number = (1 if is_paginated == 'true')
# In-memory RDF graph that the crawl loop appends each loaded entity graph to.
graph = RDF::Graph.new

# SPARQL update templates, read once up front. The paths are relative to the
# current working directory. Each template contains the placeholder text
# "subject_url", which is substituted per entity later in the script.
add_url_sparql_file, replace_blank_nodes_sparql_file = [
  "./sparqls/add_derived_from.sparql",
  "./sparqls/replace_blank_nodes.sparql"
].map { |path| File.read(path) }
# Crawl the listing page(s), load every linked entity as RDF, and merge the
# results into `graph`.
#
# Each pass fetches "#{page_url}#{page_number}" (page_number is nil for a
# non-paginated source, so the URL is just page_url), selects nodes with the
# `entity_identifier` CSS selector, and reads each node's `href_tag` attribute
# to build the entity URLs. The loop ends when a page cannot be fetched after
# `max_retries` attempts, when a page yields no entity URLs, or after the first
# pass when pagination is off.
loop do
  url = "#{page_url}#{page_number}"

  begin
    # Block form so open-uri closes the underlying handle once it has been read.
    main_page_html_text = URI.open(url, &:read)
  rescue StandardError => e
    retry_count += 1
    retry if retry_count < max_retries

    puts "Max retries reached. Unable to fetch the content for page #{page_number}."
    # Surface the cause instead of silently discarding the exception.
    puts "Last error: #{e.message}"
    break
  end

  main_doc = Nokogiri::HTML(main_page_html_text)

  # A matched node without the requested attribute yields nil; drop those
  # rather than crashing with a TypeError on `base_url + nil`.
  entity_urls = main_doc.css(entity_identifier)
                        .map { |entity| entity[href_tag] }
                        .compact
                        .map { |href| base_url + href }

  if entity_urls.empty?
    puts "No more entities found on page #{page_number}. Exiting..."
    break
  end

  entity_urls.each do |raw_entity_url|
    # Spaces are not valid in a URL; encode them as '+'.
    entity_url = raw_entity_url.tr(' ', '+')

    begin
      loaded_graph = RDF::Graph.load(entity_url)

      # The block form of gsub inserts the URL literally; a plain replacement
      # string would have any backslashes interpreted as back-references.

      # add derivedFrom
      sparql = SPARQL.parse(add_url_sparql_file.gsub("subject_url") { entity_url }, update: true)
      loaded_graph.query(sparql)

      # replace blank nodes
      sparql = SPARQL.parse(replace_blank_nodes_sparql_file.gsub("subject_url") { entity_url }, update: true)
      loaded_graph.query(sparql)

      graph << loaded_graph
    rescue StandardError => e
      # Report the failure and move on to the next entity. One bad entity must
      # not discard the remaining entities on this page.
      puts "Error loading RDF from #{entity_url}: #{e.message}"
    end
  end

  # Non-paginated sources are processed exactly once.
  break if page_number.nil?

  page_number += 1
  # Each page gets a fresh retry budget.
  retry_count = 0
end
# Serialize the accumulated graph as JSON-LD and write it to the output file,
# replacing any existing content. The block form closes the file on exit.
File.open(file_name, 'w') { |output| output.puts(graph.dump(:jsonld)) }