From 09607ed4356dbebf4f22fda104ad1826f4701309 Mon Sep 17 00:00:00 2001
From: dev
Date: Tue, 3 Dec 2024 17:57:31 +0530
Subject: [PATCH] Added workflow to import gtq data with concepts

---
 .github/workflows/fetch-entities.yml |  64 ++++++++
 .ruby-version                        |   1 +
 .rvmrc                               |   1 +
 Gemfile                              |   6 +
 Gemfile.lock                         | 223 +++++++++++++++++++++++++++
 main.rb                              | 111 +++++++++++++
 6 files changed, 406 insertions(+)
 create mode 100644 .github/workflows/fetch-entities.yml
 create mode 100644 .ruby-version
 create mode 100644 .rvmrc
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock
 create mode 100644 main.rb

diff --git a/.github/workflows/fetch-entities.yml b/.github/workflows/fetch-entities.yml
new file mode 100644
index 0000000..195b266
--- /dev/null
+++ b/.github/workflows/fetch-entities.yml
@@ -0,0 +1,64 @@
+# Weekly pipeline: fetch GTQ events, add concept URIs, then import into Artsdata.
+name: Fetch GrandTheatreQuebec Events
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 4 * * 1' # every Monday at 04:00 UTC
+
+jobs:
+  fetch-data:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Fetch data using artsdata pipeline action
+        uses: culturecreates/artsdata-pipeline-action@v2
+        with:
+          mode: "fetch"
+          downloadFile: "grandtheatrequebec-events.jsonld"
+          page-url: "https://grandtheatre.qc.ca/programmation/"
+          entity-identifier: "article.show a"
+          token: "${{ secrets.GITHUB_TOKEN }}"
+
+  add-concepts:
+    # Jobs run in parallel by default; main.rb loads the events file fetch-data downloads.
+    needs: fetch-data
+    runs-on: ubuntu-latest
+    steps:
+
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          # Use the branch tip, not the triggering commit, so that anything
+          # fetch-data pushed during this run is included in the checkout.
+          ref: ${{ github.ref_name }}
+
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          bundler-cache: true
+
+      - name: Run ruby code
+        run: |
+          bundle exec ruby main.rb
+
+      - name: Commit and push changes
+        run: |
+          git config --local user.email "actions@github.com"
+          git config --local user.name "GitHub Actions"
+          git pull
+          git add "output/grandtheatrequebec-events-with-concept.jsonld"
+          # Commit only when the output changed; "nothing to commit" exits non-zero.
+          git diff --cached --quiet || git commit -m "Add data generated by the script"
+          git push
+
+  import-to-artsdata:
+    # downloadUrl points at the file pushed by add-concepts, so run after it.
+    needs: add-concepts
+    runs-on: ubuntu-latest
+    steps:
+      - name: Import data using artsdata pipeline action
+        uses: culturecreates/artsdata-pipeline-action@v2
+        with:
+          artifact: "grandtheatrequebec-ca"
+          publisher: "${{ secrets.PUBLISHER_URI_GREGORY }}"
+          downloadUrl: "https://raw.githubusercontent.com/culturecreates/artsdata-planet-gtq/refs/heads/main/output/grandtheatrequebec-events-with-concept.jsonld"
diff --git a/.ruby-version b/.ruby-version
new file mode 100644
index 0000000..6ebad14
--- /dev/null
+++ b/.ruby-version
@@ -0,0 +1 @@
+3.1.2
\ No newline at end of file
diff --git a/.rvmrc b/.rvmrc
new file mode 100644
index 0000000..fb21d3c
--- /dev/null
+++ b/.rvmrc
@@ -0,0 +1 @@
+rvm use 3.1.2@artsdata-orion --create
\ No newline at end of file
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..4a9b552
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,6 @@
+source "https://rubygems.org"
+
+gem 'linkeddata', '~> 3.3'
+gem 'nokogiri'
+gem 'open-uri'
+
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..130d25b
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,223 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    addressable (2.8.7)
+      public_suffix (>= 2.0.2, < 7.0)
+    bcp47_spec (0.2.1)
+    bigdecimal (3.1.8)
+    builder (3.3.0)
+    concurrent-ruby (1.3.4)
+    connection_pool (2.4.1)
+    date (3.4.1)
+    ebnf (2.4.0)
+      htmlentities (~> 4.3)
+      rdf (~> 3.3)
+      scanf (~> 1.0)
+      sxp (~> 1.3)
+      unicode-types (~> 1.8)
+    haml (6.3.0)
+      temple (>= 0.8.2)
+      thor
+      tilt
+    hamster (3.0.0)
+      concurrent-ruby (~> 1.0)
+    htmlentities (4.3.4)
+    json-canonicalization (1.0.0)
+    json-ld (3.3.2)
+      htmlentities (~> 4.3)
+      json-canonicalization (~> 1.0)
+      link_header (~> 0.0, >= 0.0.8)
+      multi_json (~> 1.15)
+      rack (>= 2.2, < 4)
+      rdf (~> 3.3)
+      rexml (~> 3.2)
+    json-ld-preloaded (3.3.1)
+      json-ld (~> 3.3)
+      rdf (~> 3.3)
+    ld-patch (3.3.0)
+      ebnf (~> 2.4)
+      rdf (~> 3.3)
+      rdf-xsd (~> 3.3)
+      sparql (~> 3.3)
+      sxp (~> 1.3)
+    link_header (0.0.8)
+    linkeddata (3.3.1)
+      json-ld (~> 3.3)
+      json-ld-preloaded (~> 3.3)
+      ld-patch (~> 3.3)
+      nokogiri (~> 1.15, >= 1.15.4)
+      rdf (~> 3.2, >= 3.2.1)
+      rdf-aggregate-repo (~> 3.2)
+      rdf-hamster-repo (~> 3.3)
+      rdf-isomorphic (~> 3.3)
+      rdf-json (~> 3.3)
+      rdf-microdata (~> 3.3)
+      rdf-n3 (~> 3.3)
+      rdf-normalize (~> 0.7)
+      rdf-ordered-repo (~> 3.3)
+      rdf-rdfa (~> 3.3)
+      rdf-rdfxml (~> 3.3)
+      rdf-reasoner (~> 0.9)
+      rdf-tabular (~> 3.3)
+      rdf-trig (~> 3.3)
+      rdf-trix (~> 3.3)
+      rdf-turtle (~> 3.3)
+      rdf-vocab (~> 3.3)
+      rdf-xsd (~> 3.3)
+      shacl (~> 0.4)
+      shex (~> 0.8)
+      sparql (~> 3.3)
+      sparql-client (~> 3.3)
+      yaml-ld (~> 0.0)
+    logger (1.6.2)
+    matrix (0.4.2)
+    multi_json (1.15.0)
+    net-http-persistent (4.0.4)
+      connection_pool (~> 2.2)
+    nokogiri (1.16.8-aarch64-linux)
+      racc (~> 1.4)
+    nokogiri (1.16.8-arm-linux)
+      racc (~> 1.4)
+    nokogiri (1.16.8-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.16.8-x86-linux)
+      racc (~> 1.4)
+    nokogiri (1.16.8-x86_64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.16.8-x86_64-linux)
+      racc (~> 1.4)
+    open-uri (0.5.0)
+      stringio
+      time
+      uri
+    psych (5.2.1)
+      date
+      stringio
+    public_suffix (6.0.1)
+    racc (1.8.1)
+    rack (3.1.8)
+    rdf (3.3.2)
+      bcp47_spec (~> 0.2)
+      bigdecimal (~> 3.1, >= 3.1.5)
+      link_header (~> 0.0, >= 0.0.8)
+    rdf-aggregate-repo (3.3.0)
+      rdf (~> 3.3)
+    rdf-hamster-repo (3.3.0)
+      hamster (~> 3.0)
+      rdf (~> 3.3)
+    rdf-isomorphic (3.3.0)
+      rdf (~> 3.3)
+    rdf-json (3.3.0)
+      rdf (~> 3.3)
+    rdf-microdata (3.3.0)
+      htmlentities (~> 4.3)
+      nokogiri (~> 1.15, >= 1.15.4)
+      rdf (~> 3.3)
+      rdf-rdfa (~> 3.3)
+      rdf-xsd (~> 3.3)
+    rdf-n3 (3.3.0)
+      ebnf (~> 2.4)
+      rdf (~> 3.3)
+      sparql (~> 3.3)
+      sxp (~> 1.3)
+    rdf-normalize (0.7.0)
+      rdf (~> 3.3)
+    rdf-ordered-repo (3.3.0)
+      rdf (~> 3.3)
+    rdf-rdfa (3.3.0)
+      haml (~> 6.1)
+      htmlentities (~> 4.3)
+      rdf (~> 3.3)
+      rdf-aggregate-repo (~> 3.3)
+      rdf-vocab (~> 3.3)
+      rdf-xsd (~> 3.3)
+    rdf-rdfxml (3.3.0)
+      builder (~> 3.2, >= 3.2.4)
+      htmlentities (~> 4.3)
+      rdf (~> 3.3)
+      rdf-xsd (~> 3.3)
+    rdf-reasoner (0.9.0)
+      rdf (~> 3.3)
+      rdf-xsd (~> 3.3)
+    rdf-tabular (3.3.0)
+      addressable (~> 2.8)
+      bcp47_spec (~> 0.2)
+      json-ld (~> 3.3)
+      rdf (~> 3.3)
+      rdf-vocab (~> 3.3)
+      rdf-xsd (~> 3.3)
+    rdf-trig (3.3.0)
+      ebnf (~> 2.4)
+      rdf (~> 3.3)
+      rdf-turtle (~> 3.3)
+    rdf-trix (3.3.0)
+      rdf (~> 3.3)
+      rdf-xsd (~> 3.3)
+    rdf-turtle (3.3.0)
+      ebnf (~> 2.4)
+      rdf (~> 3.3)
+    rdf-vocab (3.3.2)
+      rdf (~> 3.3)
+    rdf-xsd (3.3.0)
+      rdf (~> 3.3)
+      rexml (~> 3.2)
+    rexml (3.3.9)
+    scanf (1.0.0)
+    shacl (0.4.1)
+      json-ld (~> 3.3)
+      rdf (~> 3.3)
+      sparql (~> 3.3)
+      sxp (~> 1.2)
+    shex (0.8.0)
+      ebnf (~> 2.4)
+      htmlentities (~> 4.3)
+      json-ld (~> 3.3)
+      json-ld-preloaded (~> 3.3)
+      rdf (~> 3.3)
+      rdf-xsd (~> 3.3)
+      sparql (~> 3.3)
+      sxp (~> 1.3)
+    sparql (3.3.0)
+      builder (~> 3.2, >= 3.2.4)
+      ebnf (~> 2.4)
+      logger (~> 1.5)
+      rdf (~> 3.3)
+      rdf-aggregate-repo (~> 3.3)
+      rdf-xsd (~> 3.3)
+      sparql-client (~> 3.3)
+      sxp (~> 1.3)
+    sparql-client (3.3.0)
+      net-http-persistent (~> 4.0, >= 4.0.2)
+      rdf (~> 3.3)
+    stringio (3.1.2)
+    sxp (1.3.0)
+      matrix (~> 0.4)
+      rdf (~> 3.3)
+    temple (0.10.3)
+    thor (1.3.2)
+    tilt (2.4.0)
+    time (0.4.1)
+      date
+    unicode-types (1.10.0)
+    uri (1.0.2)
+    yaml-ld (0.0.3)
+      json-ld (~> 3.3)
+      psych (>= 3.3)
+      rdf (~> 3.3)
+      rdf-xsd (~> 3.3)
+
+PLATFORMS
+  aarch64-linux
+  arm-linux
+  arm64-darwin
+  x86-linux
+  x86_64-darwin
+  x86_64-linux
+
+DEPENDENCIES
+  linkeddata (~> 3.3)
+  nokogiri
+  open-uri
+
+BUNDLED WITH
+   2.5.3
diff --git a/main.rb b/main.rb
new file mode 100644
index 0000000..f852ba0
--- /dev/null
+++ b/main.rb
@@ -0,0 +1,111 @@
+require 'linkeddata'
+require 'nokogiri'
+require 'open-uri'
+
+# Enriches the fetched Grand Theatre de Quebec events with concept URIs:
+# each event's page is scraped for its category label, the label is matched
+# against the SKOS mapping graph, and the resulting concept is attached to the
+# event as schema:additionalType.
+
+# Initialize the RDF vocabularies
+SCHEMA = RDF::Vocab::SCHEMA
+PROV = RDF::Vocab::PROV
+SKOS = RDF::Vocab::SKOS
+
+# Scrapes the category label from an event page.
+#
+# @param event_page_url [#to_s] URL of the event page
+# @return [String, nil] stripped text of the first div.show-category element,
+#   or nil when the page has no such element or it is blank
+def get_event_concept_from_web_page(event_page_url)
+  # Block form closes the response IO as soon as the body has been read.
+  main_page_html_text = URI.open(event_page_url.to_s, &:read)
+  main_doc = Nokogiri::HTML(main_page_html_text)
+  # at_css returns nil when nothing matches; calling .text on that nil used to
+  # raise NoMethodError, which the caller then retried to no effect.
+  event_concept = main_doc.at_css('div.show-category')&.text&.strip
+  return nil if event_concept.nil? || event_concept.empty?
+
+  puts "Event concept: #{event_concept}"
+  event_concept
+end
+
+# Finds the concept whose French skos:prefLabel equals the given label.
+#
+# @param event_concept [String] label scraped from the event page
+# @param concept_graph [RDF::Graph] graph holding the label-to-concept mapping
+# @return [RDF::Resource, nil] subject of the first matching statement, or nil
+def fetch_concept_uri_from_concept_graph(event_concept, concept_graph)
+  object = RDF::Literal.new(event_concept, language: :fr)
+  concept = concept_graph.query([nil, SKOS.prefLabel, object]).first&.subject
+  puts "Concept URI: #{concept}" if concept
+  concept
+end
+
+# Adds a schema:additionalType statement linking the event to the concept.
+#
+# @param event [RDF::Statement] statement whose subject is the event
+# @param concept_uri [RDF::Resource] concept to attach
+# @param events_graph [RDF::Graph] graph that receives the new statement
+def insert_concept_uri_to_event_graph(event, concept_uri, events_graph)
+  events_graph.insert([event.subject, SCHEMA.additionalType, concept_uri])
+  puts "Event concept added to graph\n\n"
+end
+
+# Load the events graph and the concept graph
+events_graph = RDF::Graph.load("output/grandtheatrequebec-events.jsonld")
+concept_graph = RDF::Graph.load("gtq-event-type-mapping.ttl")
+
+# Materialize the matches up front: the loop below inserts into events_graph,
+# so it should not be driven by a live query over that same graph.
+events = events_graph.query([nil, RDF.type, SCHEMA.Event]).to_a +
+         events_graph.query([nil, RDF.type, SCHEMA.EventSeries]).to_a
+
+puts "Total events found: #{events.count}"
+
+# For each event, extract the event concept from the event page
+events.each do |event|
+  # Extract the URL of the event page. A missing link is not transient, so
+  # skip the event instead of raising into the retry loop.
+  page_url = events_graph.query([event.subject, PROV.wasDerivedFrom, nil]).first&.object
+  unless page_url
+    puts "No source page found for #{event.subject}. Skipping..."
+    next
+  end
+
+  retry_count = 0
+  max_retries = 3
+  begin
+    puts "Processing #{page_url}"
+
+    # Extract the event concept from the event page
+    event_concept = get_event_concept_from_web_page(page_url)
+
+    if event_concept
+      concept_uri = fetch_concept_uri_from_concept_graph(event_concept, concept_graph)
+      if concept_uri
+        insert_concept_uri_to_event_graph(event, concept_uri, events_graph)
+      else
+        puts "Concept URI not found in the concept graph"
+      end
+    else
+      puts "No event concept found"
+    end
+  rescue StandardError => e
+    puts "An error occurred while processing #{page_url}: #{e.message}"
+    retry_count += 1
+    if retry_count < max_retries
+      # Retry after 1 second
+      puts "Retrying..."
+      sleep 1
+      retry
+    else
+      puts "Max retries reached. Skipping..."
+    end
+  end
+end
+
+# Save the updated events graph
+File.open("output/grandtheatrequebec-events-with-concept.jsonld", 'w') do |file|
+  file.puts(events_graph.dump(:jsonld))
+end