diff --git a/.github/workflows/ruby-unit-tests.yml b/.github/workflows/ruby-unit-tests.yml index 192774d1..cde331e3 100644 --- a/.github/workflows/ruby-unit-tests.yml +++ b/.github/workflows/ruby-unit-tests.yml @@ -6,9 +6,12 @@ on: jobs: test: + strategy: + matrix: + backend: ['ruby', 'ruby-agraph'] # ruby runs tests with 4store backend and ruby-agraph runs with AllegroGraph backend runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: copy config.rb file from template run: cp config/config.test.rb config/config.rb - name: Build docker-compose @@ -16,5 +19,13 @@ jobs: run: docker-compose build - name: Run unit tests working-directory: ./test - run: docker-compose run unit-test wait-for-it solr-ut:8983 -- rake test TESTOPTS='-v' + run: | + ci_env=`bash <(curl -s https://codecov.io/env)` + docker-compose run $ci_env -e CI --rm ${{ matrix.backend }} bundle exec rake test TESTOPTS='-v' + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + with: + flags: unittests + verbose: true + fail_ci_if_error: false # optional (default = false) diff --git a/.gitignore b/.gitignore index a7b2058f..c98b8d52 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ +.bundle/ # Ignore RubyMine editor files .idea/ config/config.rb +config/appliance.rb config/config_*.rb config/*.p12 projectFilesBackup/ @@ -11,6 +13,9 @@ repo* .DS_Store tmp +# Code coverage reports +coverage* + # Ignore eclipse .project .project .pmd diff --git a/Dockerfile b/Dockerfile index 1c463704..cd191621 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,23 @@ -FROM ruby:2.6 +ARG RUBY_VERSION +ARG DISTRO_NAME=bullseye -RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends openjdk-11-jre-headless raptor2-utils wait-for-it +FROM ruby:$RUBY_VERSION-$DISTRO_NAME + +RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends \ + openjdk-11-jre-headless \ + raptor2-utils \ + && rm -rf /var/lib/apt/lists/* -# The Gemfile Caching Trick -# we install gems before copying the code in its own layer so that gems would not have to get -# installed every single time code is updated RUN mkdir -p /srv/ontoportal/ncbo_cron +RUN mkdir -p /srv/ontoportal/bundle COPY Gemfile* *.gemspec /srv/ontoportal/ncbo_cron/ + WORKDIR /srv/ontoportal/ncbo_cron -RUN gem install bundler -v "$(grep -A 1 "BUNDLED WITH" Gemfile.lock | tail -n 1)" + +RUN gem update --system +RUN gem install bundler +ENV BUNDLE_PATH=/srv/ontoportal/bundle RUN bundle install + COPY . 
/srv/ontoportal/ncbo_cron +CMD ["/bin/bash"] diff --git a/Gemfile b/Gemfile index dfa7e64c..810f575f 100644 --- a/Gemfile +++ b/Gemfile @@ -3,7 +3,7 @@ source 'https://rubygems.org' gemspec gem 'faraday', '~> 1.9' -gem 'ffi' +gem 'ffi', '~> 1.15.5' gem "google-apis-analytics_v3" gem 'mail', '2.6.6' gem 'minitest', '< 5.0' @@ -20,15 +20,16 @@ gem 'sys-proctable' # Monitoring gem 'cube-ruby', require: 'cube' -# NCBO -gem 'goo', github: 'ncbo/goo', branch: 'master' -gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'master' -gem 'ncbo_resource_index', github: 'ncbo/resource_index' -gem 'ontologies_linked_data', github: 'lifewatch-eric/ontologies_linked_data', branch: 'master' -gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'master' - +gem 'goo', git: 'https://github.com/ontoportal-lirmm/goo.git', branch: 'ecoportal' +gem 'sparql-client', github: 'ontoportal-lirmm/sparql-client', branch: 'master' +gem 'ontologies_linked_data', git: 'https://github.com/lifewatch-eric/ontologies_linked_data.git', branch: 'master' +gem 'ncbo_annotator', github: 'ontoportal-lirmm/ncbo_annotator', branch: 'master' +# Testing group :test do gem 'email_spec' + gem 'simplecov' + gem 'simplecov-cobertura' # for codecov.io gem 'test-unit-minitest' end +gem "binding_of_caller", "~> 1.0" diff --git a/Gemfile.lock b/Gemfile.lock index 09c35148..23de97c3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/lifewatch-eric/ontologies_linked_data.git - revision: 2b5d8ebf6bc13ae182db549da8785a1b49a2e5e2 + revision: 24700f186e2de4196ce471627338c9fc6ebdf73f branch: master specs: ontologies_linked_data (0.0.1) @@ -10,8 +10,6 @@ GIT json libxml-ruby multi_json - ncbo_resource_index - oauth2 oj omni_logger pony @@ -21,9 +19,9 @@ GIT rubyzip GIT - remote: https://github.com/ncbo/goo.git - revision: d0dd816a0afd5b85a02c4b909502cf3a93e6be58 - branch: master + remote: https://github.com/ontoportal-lirmm/goo.git + revision: c310e3854705b241a6259faad14cf6cd4eb97053 + branch: ecoportal specs: goo (0.0.2) addressable (~> 2.8) @@ -36,8 +34,8 @@ GIT uuid GIT - remote: https://github.com/ncbo/ncbo_annotator.git - revision: ed325ae9f79e3b0a0061b1af0b02f624de1d0eef + remote: https://github.com/ontoportal-lirmm/ncbo_annotator.git + revision: 57204d8e54432ba660af4c49806e2a3019a23fa2 branch: master specs: ncbo_annotator (0.0.1) @@ -47,42 +45,8 @@ GIT ruby-xxHash GIT - remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 052e17698a59ff548a749fa7e365156c39aa7305 - branch: master - specs: - ontologies_linked_data (0.0.1) - activesupport - bcrypt - goo - json - libxml-ruby - multi_json - ncbo_resource_index - oj - omni_logger - pony - rack - rack-test - rsolr - rubyzip - -GIT - remote: https://github.com/ncbo/resource_index.git - revision: 24a7f14a6da4f4a0eaba1016ca5a378dfccd7441 - specs: - ncbo_resource_index (0.0.1) - elasticsearch (= 2.0.0) - mysql2 (= 0.5.2) - pony - ref - ruby-xxHash - sequel - typhoeus - -GIT - remote: https://github.com/ncbo/sparql-client.git - revision: fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9 + remote: https://github.com/ontoportal-lirmm/sparql-client.git + revision: aed51baf4106fd0f3d0e3f9238f0aad9406aa3f0 branch: master specs: sparql-client (1.0.1) @@ -107,29 +71,30 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (4.0.13) - i18n (~> 0.6, >= 0.6.9) - minitest (~> 4.2) - multi_json (~> 1.3) - thread_safe (~> 0.1) - tzinfo (~> 0.3.37) - addressable (2.3.5) - bcrypt (3.1.16) + activesupport (3.2.22.5) + i18n (~> 0.6, >= 0.6.4) + 
multi_json (~> 1.0) + addressable (2.8.5) + public_suffix (>= 2.0.2, < 6.0) + bcrypt (3.1.19) + binding_of_caller (1.0.0) + debug_inspector (>= 0.0.1) builder (3.2.4) coderay (1.1.3) - concurrent-ruby (1.1.9) + concurrent-ruby (1.2.2) + connection_pool (2.4.1) cube-ruby (0.0.3) dante (0.2.0) + debug_inspector (1.1.0) declarative (0.0.20) + docile (1.4.0) domain_name (0.5.20190701) unf (>= 0.0.5, < 1.0.0) email_spec (2.1.1) htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - ethon (0.15.0) - ffi (>= 1.15.0) - faraday (1.10.0) + faraday (1.10.3) faraday-em_http (~> 1.0) faraday-em_synchrony (~> 1.0) faraday-excon (~> 1.1) @@ -153,9 +118,9 @@ GEM faraday-rack (1.0.0) faraday-retry (1.0.3) ffi (1.15.5) - google-apis-analytics_v3 (0.10.0) - google-apis-core (>= 0.7, < 2.a) - google-apis-core (0.7.0) + google-apis-analytics_v3 (0.13.0) + google-apis-core (>= 0.11.0, < 2.a) + google-apis-core (0.11.1) addressable (~> 2.5, >= 2.5.1) googleauth (>= 0.16.2, < 2.a) httpclient (>= 2.8.1, < 3.a) @@ -164,13 +129,12 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - googleauth (1.2.0) + googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) - memoist (~> 0.16) multi_json (~> 1.11) os (>= 0.9, < 2.0) - signet (~> 0.15) + signet (>= 0.16, < 2.a) htmlentities (4.3.4) http-accept (1.7.0) http-cookie (1.0.5) @@ -178,38 +142,29 @@ GEM httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.6.1) - json_pure (2.6.1) - jwt (2.3.0) - launchy (2.4.3) - addressable (~> 2.3) - libxml-ruby (3.2.2) - logger (1.5.0) + json (2.6.3) + json_pure (2.6.3) + jwt (2.7.1) + launchy (2.5.2) + addressable (~> 2.8) + libxml-ruby (4.1.1) + logger (1.5.3) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) mime-types (>= 1.16, < 4) - memoist (0.16.2) method_source (1.0.0) - mime-types (3.4.1) + mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2022.0105) - mini_mime (1.1.2) + mime-types-data (3.2023.0808) + mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis multi_json (1.15.0) - multipart-post (2.1.1) - mysql2 (0.5.2) + multipart-post (2.3.0) net-http-persistent (2.9.4) netrc (0.11.0) - oauth2 (2.0.9) - faraday (>= 0.17.3, < 3.0) - jwt (>= 1.0, < 3.0) - multi_xml (~> 0.5) - rack (>= 1.2, < 4) - snaky_hash (~> 2.0) - version_gem (~> 1.1) oj (2.18.5) omni_logger (0.1.4) logger @@ -220,15 +175,18 @@ GEM pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - rack (2.2.3) - rack-test (1.1.0) - rack (>= 1.0, < 3) + public_suffix (5.0.3) + rack (3.0.8) + rack-test (2.1.0) + rack (>= 1.3) rake (13.0.6) rdf (1.0.8) addressable (>= 2.2) - redis (4.6.0) - ref (2.0.0) - representable (3.1.1) + redis (5.0.7) + redis-client (>= 0.9.0) + redis-client (0.17.0) + connection_pool + representable (3.2.0) declarative (< 0.1.0) trailblazer-option (>= 0.1.1, < 0.2.0) uber (< 0.2.0) @@ -238,7 +196,7 @@ GEM mime-types (>= 1.16, < 4.0) netrc (~> 0.8) retriable (3.1.2) - rexml (3.2.5) + rexml (3.2.6) rsolr (2.5.0) builder (>= 2.1.2) faraday (>= 0.9, < 3, != 2.0.0) @@ -247,41 +205,45 @@ GEM rubyzip (2.3.2) rufus-scheduler (2.0.24) tzinfo (>= 0.3.22) - sequel (5.54.0) - signet (0.15.0) - addressable (~> 2.3) - faraday (>= 0.17.3, < 2.0) + signet (0.18.0) + addressable (~> 2.8) + faraday (>= 0.17.5, < 3.a) jwt (>= 1.5, < 3.0) multi_json (~> 1.10) - snaky_hash (2.0.1) - hashie - version_gem (~> 1.1, >= 1.1.1) - sys-proctable (1.2.6) - ffi + simplecov (0.22.0) + docile (~> 1.1) + simplecov-html (~> 0.11) + simplecov_json_formatter (~> 0.1) + simplecov-cobertura (2.1.0) + rexml + simplecov (~> 0.19) + 
+    simplecov-html (0.12.3)
+    simplecov_json_formatter (0.1.4)
+    sys-proctable (1.3.0)
+      ffi (~> 1.1)
     systemu (2.6.5)
     test-unit-minitest (0.9.1)
       minitest (~> 4.7)
     trailblazer-option (0.1.2)
-    typhoeus (1.4.0)
-      ethon (>= 0.9.0)
-    tzinfo (0.3.60)
+    tzinfo (2.0.6)
+      concurrent-ruby (~> 1.0)
     uber (0.1.0)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.8.2)
     uuid (2.3.9)
       macaddr (~> 1.0)
+    webrick (1.8.1)

 PLATFORMS
-  ruby
   x86_64-darwin-21
-  x86_64-linux

 DEPENDENCIES
+  binding_of_caller (~> 1.0)
   cube-ruby
   email_spec
   faraday (~> 1.9)
-  ffi
+  ffi (~> 1.15.5)
   goo!
   google-apis-analytics_v3
   mail (= 2.6.6)
@@ -297,9 +259,11 @@ DEPENDENCIES
   rake
   redis
   rest-client
+  simplecov
+  simplecov-cobertura
   sparql-client!
   sys-proctable
   test-unit-minitest

 BUNDLED WITH
-   2.3.14
+   2.3.23
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..150667ae
--- /dev/null
+++ b/README.md
@@ -0,0 +1,81 @@
+# NCBO CRON
+
+A project with CRON jobs for the NCBO BioPortal:
+
+- ncbo_cron daemon
+- Process or delete ontology
+- Generate annotator dictionary and cache
+- Calculate metrics
+- Process mapping counts
+- Bulk load mappings
+
+## Run the ncbo_cron daemon
+
+To run it, use the `bin/ncbo_cron` command.
+
+Running this command without options will run the jobs according to the settings defined in the NcboCron config file, or by default in [ncbo_cron/lib/ncbo_cron/config.rb](https://github.com/ncbo/ncbo_cron/blob/master/lib/ncbo_cron/config.rb).
+
+The user can add arguments to override some settings.
+
+Here is an example that runs the flush-old-graphs job every 3 hours and disables the automatic pull of new submissions:
+
+```
+bin/ncbo_cron --flush-old-graphs "0 */3 * * *" --disable-pull
+```
+
+It will run as a daemon by default.
+
+But it will not run as a daemon if you use one of the following options:
+
+* console (open a pry console)
+* view_queue (view the queue of jobs waiting for processing)
+* queue_submission (add a submission to the submission processing queue)
+* kill (stop the ncbo_cron daemon)
+
+
+## Stop the ncbo_cron daemon
+
+The PID of the ncbo_cron process is in /var/run/ncbo_cron/ncbo_cron.pid
+
+To stop the ncbo_cron daemon:
+```
+bin/ncbo_cron -k
+```
+
+
+## Run manually
+
+### Process an ontology
+bin/ncbo_ontology_process -o STY
+
+### Mappings bulk load
+To load a large number of mappings without using the REST API (which can take a long time):
+
+- Put the mappings, described in JSON, in a file. Example (the first mapping has the minimum required information):
+```javascript
+[
+  {
+    "creator":"admin",
+    "relation" : ["http://www.w3.org/2004/02/skos/core#exactMatch"],
+    "classes" : { "http://class_id1/id1" : "ONT_ACRONYM1",
+                  "http://class_id2/id2" : "ONT_ACRONYM2"}
+  },
+  {
+    "creator":"admin",
+    "source_contact_info":"admin@my_bioportal.org",
+    "relation" : ["http://www.w3.org/2004/02/skos/core#exactMatch", "http://purl.org/linguistics/gold/freeTranslation"],
+    "Source":"REST",
+    "source_name":"Reconciliation of multilingual mapping",
+    "comment" : "Interportal mapping with all possible information (to the NCBO bioportal)",
+    "classes" : { "http://purl.lirmm.fr/ontology/STY/T071" : "STY",
+                  "http://purl.bioontology.org/ontology/STY/T071" : "ncbo:STY"}
+  }
+]
+```
+
+- Run the job:
+
+bin/ncbo_mappings_bulk_load -b /path/to/mapping/file.json -l /path/to/log/file.log
+
+
+
diff --git a/bin/ecoportal_model_migration b/bin/ecoportal_model_migration
new file mode 100755
index 00000000..5713a747
--- /dev/null
+++ b/bin/ecoportal_model_migration
@@ -0,0 +1,362 @@
+#!/usr/bin/env ruby
+
+# Exit cleanly from an early interrupt
+Signal.trap("INT") { exit 1 }
+
+# Setup the bundled gems in our environment
+require 'bundler/setup'
+require 'binding_of_caller'
+# Configure the process for the current cron configuration.
+require_relative '../lib/ncbo_cron'
+config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__))
+abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists
+require_relative '../config/config'
+
+require 'optparse'
+options = {}
+opt_parser = OptionParser.new do |opts|
+  # Set a banner, displayed at the top of the help screen.
+  # opts.banner = "Usage: ecoportal_model_migration [options]"
+  opts.separator 'A script that migrates data from the old EcoPortal model to the new (AgroPortal) model'
+  opts.on('-o', '--ontologies ACRONYM[,ACRONYM,...]', 'Migrate the submission model of this ontology acronym(s).') do |acronym|
+    options[:ontologies] = acronym.split(',')
+  end
+  opts.on('--migrate-all', 'Migrate all submission models') do |d|
+    options[:migrate_all] = true
+  end
+
+  options[:logfile] = "ecoportal_migration.log"
+  opts.on('-l', '--logfile FILE', "Write log to FILE (default is 'ecoportal_migration.log')") do |filename|
+    options[:logfile] = filename
+  end
+
+  # Display the help screen, all programs are assumed to have this option.
+  opts.on('-h', '--help', 'Display this screen') do
+    puts opts
+    exit
+  end
+end
+# Parse the command-line. The 'parse' method simply parses ARGV, while the 'parse!' method parses ARGV and removes
+# any options found there, as well as any parameters for the options.
+opt_parser.parse!
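+# Usage sketch (hedged; ACRONYM1 and ACRONYM2 are placeholder acronyms, not
+# ontologies guaranteed to exist on a given portal):
+#
+#   bin/ecoportal_model_migration -o ACRONYM1,ACRONYM2 -l ecoportal_migration.log
+#   bin/ecoportal_model_migration --migrate-all
+#
+# Both forms migrate the latest submission of each selected ontology (see the
+# main block at the end of this script).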
+ +### OLD Models to Migrate +class LinkedData::Models::Affiliation < LinkedData::Models::Base + AFFILIATION_IDENTIFIER_SCHEMES = { ISNI: 'https://isni.org/', ROR: 'https://ror.org/', GRID: 'https://www.grid.ac/' } + model :affiliation, name_with: lambda { |cc| uuid_uri_generator(cc) } + attribute :affiliationIdentifierScheme, enforce: [:existence], enforcedValues: AFFILIATION_IDENTIFIER_SCHEMES.keys + attribute :affiliationIdentifier, enforce: [:existence] + attribute :affiliation, enforce: [:existence] + attribute :acronym + attribute :homepage, enforce: [:uri] + attribute :email + + attribute :schemeURI, handler: :scheme_uri_infer + + embedded true + + def scheme_uri_infer + self.bring(:affiliationIdentifierScheme) if self.bring?(:affiliationIdentifierScheme) + AFFILIATION_IDENTIFIER_SCHEMES[self.affiliationIdentifierScheme.to_sym] if self.affiliationIdentifierScheme + end + +end + +class LinkedData::Models::CreatorIdentifier < LinkedData::Models::Base + IDENTIFIER_SCHEMES = { ORCID: 'https://orcid.org', ISNI: 'https://isni.org/', ROR: 'https://ror.org/', GRID: 'https://www.grid.ac/' } + model :creator_identifier, name_with: lambda { |cc| uuid_uri_generator(cc) } + attribute :nameIdentifierScheme, enforce: [:existence], enforcedValues: IDENTIFIER_SCHEMES.keys + attribute :nameIdentifier, enforce: [:existence] + attribute :schemeURI, handler: :scheme_uri_infer + + embedded true + + def scheme_uri_infer + self.bring(:nameIdentifierScheme) if self.bring?(:nameIdentifierScheme) + IDENTIFIER_SCHEMES[self.nameIdentifierScheme.to_sym] if self.nameIdentifierScheme + end +end + +class LinkedData::Models::Creator < LinkedData::Models::Base + model :creator, name_with: lambda { |c| uuid_uri_generator(c) } + attribute :nameType, default: lambda { |_| "Personal" }, enforcedValues: %w[Organizational Personal] + attribute :givenName + attribute :familyName + attribute :creatorName, enforce: [:existence] + attribute :creatorIdentifiers, enforce: [:creator_identifier, :list] + attribute :affiliations, enforce: [:affiliation, :list] + attribute :email + embedded true + embed :creatorIdentifiers, :affiliations +end + +class LinkedData::Models::Title < LinkedData::Models::Base + model :title, name_with: lambda { |cc| uuid_uri_generator(cc) } + attribute :title, enforce: [:existence] + attribute :lang, enforce: [:existence] + attribute :titleType, enforce: [:existence], enforcedValues: ["AlternativeTitle", "Subtitle", "TranslatedTitle", "Other"] + + embedded true +end + +class LinkedData::Models::OntologySubmission + attribute :resourceType, namespace: :datacite + attribute :oldIdentifier, property: :identifier, namespace: :datacite + attribute :titles, namespace: :datacite, enforce: [:existence, :title, :list] + attribute :creators, namespace: :datacite, enforce: [:existence, :creator, :list] + attribute :oldPublisher, property: :publisher, namespace: :datacite + attribute :oldPublication, property: :publication +end + +# Migrator class +class EcoPortalMigrator + + def initialize(submission:, logger:) + @submission = submission + @logger = logger + end + + def migrate_submission + sub = @submission + sub.bring_remaining + + if not_valid?(sub, "submission not valid") + sub.URI = sub.id if sub.URI.nil? 
+ sub.publication = [RDF::URI.new(sub.publication.first)] + end + + log_info "Start submission #{sub.id} migration" + + stop_to_fix('submission not valid') if not_valid?(sub, "is not valid can't be migrated") + + log_info "Submission #{sub.id} is valid" + + array_migrate(sub, 'identifier', sub.oldIdentifier) + sub.identifier = sub.identifier.map{|p| RDF::URI.new(p)} + stop_to_fix('identifier not valid') if not_valid?(sub, 'identifier migration failed') + log_info "Submission #{sub.id} identifier migrated: #{sub.identifier} (old: #{sub.oldIdentifier})" + + + if sub.resourceType + case sub.resourceType + when 'SKOS', 'Thesaurus' + url = "http://w3id.org/nkos/nkostype#thesaurus" + when "Controlled vocabulary" + url = 'http://w3id.org/nkos/nkostype#terminology' + when 'Taxonomy' + url = 'http://w3id.org/nkos/nkostype#terminology' + else + url = 'http://w3id.org/nkos/nkostype#ontology' + end + sub.hasFormalityLevel = RDF::URI.new(url) + end + + stop_to_fix('hasFormalityLevel not valid') if not_valid?(sub, 'resourceType to hasFormalityLevel migration failed') + log_info ">> Submission #{sub.id} hasFormalityLevel migrated: #{sub.hasFormalityLevel} (old: #{sub.resourceType})" + + migrate_title(sub) + stop_to_fix('alternative not valid') if not_valid?(sub, 'titles to alternative migration failed') + log_info ">> Submission #{sub.id} alternative migrated: #{sub.alternative} (old: #{sub.titles})" + + log_info "Start submission #{sub.id} creators migration" + migrate_creators(sub) + log_info "#{sub.id} creators migration ended" + + if sub.valid? + sub.save rescue stop_to_fix('not valid submission') + log_info ">> #{sub.id} migrated successfully" + else + stop_to_fix "#{sub.id} migration failed submission not valid" + end + + log_info "> Submission #{sub.id} migration ended" + end + + private + + + def logger + @logger + end + + def admin_user(username = 'admin') + user = LinkedData::Models::User.find(username).first + raise "The user #{username} does not exist" if user.nil? + user.bring_remaining + end + + def array_migrate(sub, attr, new_val) + old_val = sub.send(attr.to_s) + sub.send("#{attr}=", (Array(old_val) + [new_val]).uniq) if new_val + end + + def create_identifier(notation, schema_agency) + new_id = LinkedData::Models::AgentIdentifier.new + new_id.notation = notation + new_id.schemaAgency = schema_agency + new_id.creator = admin_user + + id = LinkedData::Models::AgentIdentifier.generate_identifier(new_id.notation, new_id.schemaAgency) + log_info "Start identifier #{id} migration" + identifier = LinkedData::Models::AgentIdentifier.find(RDF::URI.new(id)).first + + if identifier + new_id = identifier + log_info "Identifier #{id} already exist, re-use it." + else + if new_id.valid? + new_id.save + log_info "Identifier #{id} created." + else + old_id = new_id + new_id = nil + stop_to_fix "identifier with id #{id} migration failed" + end + end + + new_id + end + + def migrate_creator_identifier(creator, agent) + Array(creator.creatorIdentifiers).each do |id| + id.bring_remaining + new_id = create_identifier(id.nameIdentifier, id.nameIdentifierScheme) + array_migrate(agent, 'identifiers', new_id) + log_info "Creator #{creator.id} added identifier: #{new_id.id})" + end + end + + def not_valid?(sub, msg) + unless sub.valid? 
+ log_error msg + log_error "Submission #{sub.id} errors: #{sub.errors}" + true + end + end + + def stop_to_fix(msg) + log_error "Stop to fix #{msg}" + caller_binding = binding.of_caller(1) + binding.pry(caller_binding) + log_error "End stop to fix" + end + + + def log_error(msg) + logger.error "> #{msg}" + end + + def log_info(msg) + logger.info "> #{msg}" + end + + def migrate_affiliations(creator, agent) + Array(creator.affiliations).each do |affiliation| + affiliation.bring_remaining + log_info "Start affiliation #{affiliation.id} migration" + + new_affiliation = LinkedData::Models::Agent.new + new_affiliation.agentType = 'organization' + new_affiliation.name = affiliation.affiliation + new_affiliation.creator = admin_user + + scheme_url = LinkedData::Models::AgentIdentifier::IDENTIFIER_SCHEMES[affiliation.affiliationIdentifierScheme.to_sym] + + stop_to_fix("#{affiliation.id} has not a good scheme url: #{affiliation.affiliationIdentifierScheme}") unless scheme_url + + log_info "Start identifiers of #{affiliation.id} migration" + new_id = create_identifier(affiliation.affiliationIdentifier, affiliation.affiliationIdentifierScheme) + array_migrate(new_affiliation, 'identifiers', new_id) + + if new_affiliation.valid? + new_affiliation.save + array_migrate(agent, 'affiliations', new_affiliation) + else + affiliation = LinkedData::Models::Agent.where(name: new_affiliation.name).first + if affiliation + log_info "found an existant affiliation with the same name #{new_affiliation.name}" + array_migrate(agent, 'affiliations', affiliation) + else + stop_to_fix "affiliation migration failed" + end + end + + log_info "Affiliation #{new_affiliation.name} migration ended" + end + + end + + def migrate_creators(sub) + Array(sub.creators).each do |creator| + creator.bring_remaining + log_info "Start creator #{creator.id} migration" + new_agent = LinkedData::Models::Agent.new + new_agent.agentType = creator.nameType.eql?('Personal') ? 'person' : 'organization' + new_agent.name = creator.creatorName + new_agent.creator = admin_user + + log_info "Start identifiers of #{creator.id} migration" + migrate_creator_identifier(creator, new_agent) + log_info "All identifiers of #{creator.id} are migrated." + + log_info "Start affiliations of #{creator.id} migration" + migrate_affiliations(creator, new_agent) + log_info "All affiliations of #{creator.id} are migrated." + + if new_agent.valid? 
+ new_agent.save + array_migrate(sub, 'hasCreator', new_agent) + else + creator = LinkedData::Models::Agent.where(name: new_agent.name).first + if creator + log_info "found an existant creator with the same name #{new_agent.name}" + array_migrate(sub, 'hasCreator', creator) + else + stop_to_fix "creator with id #{creator.id.to_s} migration failed" + end + end + log_info "Creator #{creator.id} migration ended" + end + + end + + def migrate_title(sub) + Array(sub.titles).each do |title| + title.bring_remaining + title = title.title + array_migrate(sub, 'alternative', title) + end + end + +end + +logger = Logger.new(options[:logfile]) +# a formatter to write simultaneously into a file and stout +logger.formatter = proc do |severity, datetime, progname, msg| + out = "#{severity} [#{datetime}] #{msg} \n" + puts out + out +end + +begin + puts "EcoPortal migration details are logged to #{options[:logfile]}" + + if options[:migrate_all] + submissions = LinkedData::Models::Ontology.all.each { |o| o.latest_submission } + else + submissions = options[:ontologies].map do |acronym| + ont = LinkedData::Models::Ontology.find(acronym).first + ont.latest_submission(status: :any) + end + end + + submissions.each do |sub| + EcoPortalMigrator.new(submission: sub, logger: logger).migrate_submission + end + +rescue Exception => e + logger.error "Failed, exception: #{e.to_json}." + binding.pry + exit(1) +end + diff --git a/bin/ncbo_cron b/bin/ncbo_cron index 625502b7..82ec9c04 100755 --- a/bin/ncbo_cron +++ b/bin/ncbo_cron @@ -133,9 +133,15 @@ opt_parser = OptionParser.new do |opts| opts.on("-c", "--pull-cron SCHED", String, "cron schedule for ontology pull", "(default: #{options[:pull_schedule]})") do |c| options[:pull_schedule] = c end + opts.on("-cl", "--pull-long-cron SCHED", String, "cron schedule for ontology less frequent pull", "(default: #{options[:pull_schedule_long]})") do |c| + options[:pull_schedule_long] = c + end opts.on("-f", "--flush-old-graphs SCHED", String, "cron schedule to delete class graphs of archive submissions", "(default: #{options[:cron_flush]})") do |c| options[:cron_flush] = c end + opts.on("--remove-zombie-graphs", "flush class graphs from deleted ontologies") do |v| + options[:remove_zombie_graphs] = true + end opts.on("-w", "--warm-long-queries SCHED", String, "cron schedule to warmup long time running queries", "(default: #{options[:cron_warmq]})") do |c| options[:cron_warmq] = c end @@ -279,8 +285,8 @@ runner.execute do |opts| if options[:enable_pull] pull_thread = Thread.new do - logger.debug "Setting up pull cron job"; logger.flush pull_options = options.dup + logger.info "Setting up pull cron job: #{pull_options[:pull_schedule]}"; logger.flush pull_options.delete(:minutes_between) pull_options.delete(:seconds_between) pull_options[:job_name] = "ncbo_cron_pull_thread" @@ -292,15 +298,32 @@ runner.execute do |opts| logger.info "Starting ncbo pull"; logger.flush logger.info "Logging pull details to #{pull_log_path}"; logger.flush puller = NcboCron::Models::OntologyPull.new - pulled_onts = puller.do_remote_ontology_pull(logger: pull_logger, - enable_pull_umls: options[:enable_pull_umls], - cache_clear: true) + pulled_onts = puller.do_remote_ontology_pull(false, logger: pull_logger, enable_pull_umls: options[:enable_pull_umls], pull_long_ontologies: pull_options[:pull_long_ontologies]) logger.info "Finished ncbo pull"; logger.flush logger.info "Pull summary:\n#{pulled_onts.map {|o| o.id.to_s}}" end end + + long_pull_thread = Thread.new do + long_pull_options = options.dup + 
logger.info "Setting up pull long cron job: #{long_pull_options[:pull_schedule_long]}"; logger.flush + long_pull_options[:job_name] = "ncbo_cron_pull_long_thread" + long_pull_options[:scheduler_type] = :cron + long_pull_options[:cron_schedule] = long_pull_options[:pull_schedule_long] + pull_log_path = File.join(log_path, "#{log_filename_noExt}-long-pull.log") + pull_logger = Logger.new(pull_log_path) + NcboCron::Scheduler.scheduled_locking_job(long_pull_options) do + logger.info "Starting ncbo long pull"; logger.flush + logger.info "Logging pull details to #{pull_log_path}"; logger.flush + long_puller = NcboCron::Models::OntologyPull.new + pulled_long_onts = long_puller.do_remote_ontology_pull(true, logger: pull_logger, enable_pull_umls: options[:enable_pull_umls], pull_long_ontologies: long_pull_options[:pull_long_ontologies]) + logger.info "Finished ncbo pull"; logger.flush + logger.info "Pull summary:\n#{pulled_long_onts.map {|o| o.id.to_s}}" + end + end end + # Flush old graphs. Remove zombie graphs if config.remove_zombie_graphs = true if options[:enable_flush] flush_thread = Thread.new do flush_options = options.dup @@ -317,7 +340,7 @@ runner.execute do |opts| logger.info "Logging flush details to #{flush_log_path}"; logger.flush t0 = Time.now parser = NcboCron::Models::OntologySubmissionParser.new - flush_onts = parser.process_flush_classes(flush_logger) + flush_onts = parser.process_flush_classes(flush_logger, flush_options[:remove_zombie_graphs]) logger.info "Flushed #{flush_onts.length} submissions in #{Time.now - t0} sec."; logger.flush logger.info "Finished flush"; logger.flush end @@ -544,6 +567,7 @@ runner.execute do |opts| # Need to join here to avoid dropping out of the process parsing_thread.join if parsing_thread pull_thread.join if pull_thread + long_pull_thread.join if long_pull_thread flush_thread.join if flush_thread warmq_thread.join if warmq_thread analytics_thread.join if analytics_thread diff --git a/bin/ncbo_mappings_bulk_load b/bin/ncbo_mappings_bulk_load new file mode 100755 index 00000000..bdeb520b --- /dev/null +++ b/bin/ncbo_mappings_bulk_load @@ -0,0 +1,97 @@ +#!/usr/bin/env ruby + +# Exit cleanly from an early interrupt +Signal.trap("INT") { exit 1 } + +# Setup the bundled gems in our environment +require 'bundler/setup' + +# Configure the process for the current cron configuration. +require_relative '../lib/ncbo_cron' +config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__)) +abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists +require_relative '../config/config'; + +require 'optparse' +options = {} +opt_parser = OptionParser.new do |opts| + # Set a banner, displayed at the top of the help screen. + #opts.banner = "Usage: ncbo_mappings_bulk_load [options]" + opts.on('-b', '--load FILEPATH', 'Load mappings from this file') do |filepath| + options[:loadfile] = filepath + end + options[:logfile] = "logs/bulk_load_mapping.log" + opts.on( '-l', '--logfile FILE', "Write log to FILE (default is 'bulk_load_mapping.log')" ) do |filename| + options[:logfile] = filename + end + # Display the help screen, all programs are assumed to have this option. + opts.on( '-h', '--help', 'Display this screen' ) do + puts opts + exit + end +end +# Parse the command-line. The 'parse' method simply parses ARGV, while the 'parse!' method parses ARGV and removes +# any options found there, as well as any parameters for the options. +opt_parser.parse! 
+ + +####################################################################################################################### +# +# MAIN +# + +logger = Logger.new(options[:logfile]) +puts "Processing details are logged to #{options[:logfile]}" + +if options[:loadfile] + logger.info("Loading file: #{options[:loadfile].to_s}") + t0 = Time.now + logger.info("Begin: #{t0.to_s}") + mapping_count = 0 + json_load = JSON.parse(File.read(options[:loadfile]), {:symbolize_names => true}) + mappings_failed = [] + json_load.each do |mapping| + begin + loaded_mapping = LinkedData::Mappings.bulk_load_mapping(mapping, logger) + str_result = "Mapping ##{mapping_count} loaded: " + loaded_mapping.classes.each do |cls| + if cls.respond_to?"id" + # case it is an internal mapping + str_result += cls.id.to_s + else + # case it is an external or interportal mapping + str_result += cls[:id].to_s + end + str_result += " and " + end + logger.info(str_result[0..-5]) + # remove the mapping from the error array if successfully added + if mappings_failed.include? mapping[:classes].to_s + mappings_failed.delete(mapping[:classes].to_s) + end + mapping_count += 1 + sleep 0.4 + rescue => e + logger.error("MAPPING BETWEEN: #{mapping[:classes].to_s} HAS FAILED. Message: #{e.message.to_s}") + mappings_failed.push(mapping[:classes].to_s) + if e.class.to_s != "ArgumentError" + # If unexpected error: retry + #system "bprestart" => could generate error because not run by a user with enough permissions + # add the mappings to the error array (removed if manage to add the mapping after) + logger.info(e.backtrace) + logger.info("sleeping for 40s...") + sleep 40 + retry + end + next + end + end + logger.info("End: #{Time.now.to_s}") + logger.info("Execution time: #{Time.now - t0}") + logger.info("Mappings failed: #{mappings_failed.to_s}") + logger.info("Mappinds uploaded count: #{mapping_count.to_s}") +else + puts opt_parser.help + exit(1) +end + diff --git a/bin/ncbo_mappings_delete b/bin/ncbo_mappings_delete new file mode 100755 index 00000000..2242ff04 --- /dev/null +++ b/bin/ncbo_mappings_delete @@ -0,0 +1,100 @@ +#!/usr/bin/env ruby + +# Exit cleanly from an early interrupt +Signal.trap("INT") { exit 1 } + +# Setup the bundled gems in our environment +require 'bundler/setup' + +# Configure the process for the current cron configuration. +require_relative '../lib/ncbo_cron' +config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__)) +abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists +require_relative '../config/config'; + +require 'optparse' +options = {} +opt_parser = OptionParser.new do |opts| + # Set a banner, displayed at the top of the help screen. + #opts.banner = "Usage: ncbo_ontology_process [options]" + opts.on('-o', '--ontologies ACRONYM[,ACRONYM,...]', 'Delete all mappings from/to this ontology acronym(s).') do |acronym| + options[:ontologies] = acronym.split(',') + end + opts.on('--delete-all', 'Delete ALL REST mapping of the BioPortal instance') do |d| + options[:delete_all] = true + end + options[:logfile] = "delete_mapping.log" + opts.on( '-l', '--logfile FILE', "Write log to FILE (default is 'delete_mapping.log')" ) do |filename| + options[:logfile] = filename + end + # Display the help screen, all programs are assumed to have this option. + opts.on( '-h', '--help', 'Display this screen' ) do + puts opts + exit + end +end +# Parse the command-line. The 'parse' method simply parses ARGV, while the 'parse!' 
method parses ARGV and removes +# any options found there, as well as any parameters for the options. +opt_parser.parse! + + +####################################################################################################################### +# +# MAIN +# + +logger = Logger.new(options[:logfile]) +puts "Processing details are logged to #{options[:logfile]}" + +if options[:delete_all] + puts "WARNING: you are going to delete ALL REST MAPPINGS from the Bioportal appliance." + puts "Type 'yes' to continue: " + $stdout.flush + confirm = $stdin.gets + abort("Aborting...\n\n") unless (confirm.strip == 'yes') + + logger.info("DELETING ALL MAPPINGS") + # Go through all RestBackupMapping and delete all of them + LinkedData::Models::RestBackupMapping.all.each do |m| + backup = LinkedData::Models::RestBackupMapping.find(RDF::URI.new(m.id.to_s)).first + backup.delete + end + # Go through all MappingProcess and delete all of them + LinkedData::Models::MappingProcess.all.each do |m| + process = LinkedData::Models::MappingProcess.find(RDF::URI.new(m.id.to_s)).first + process.delete + end + # Go through all triples that link mappings to class to delete them (use of metadata/def/mappingRest predicate) + LinkedData::Mappings.delete_all_rest_mappings_from_sparql + +elsif options[:ontologies] + # Go through all REST mapping and delete only the one corresponding to the ontology acronyms + options[:ontologies].each do |acronym| + ont = LinkedData::Models::Ontology.find(acronym).first + if ont.nil? + msg = "Error, ontology not found: #{acronym}" + logger.error(msg) + puts msg + next + end + logger.info("DELETING #{acronym} MAPPINGS") + LinkedData::Models::RestBackupMapping.all.each do |m| + mapping_id = RDF::URI.new(m.id.to_s) + begin + backup = LinkedData::Models::RestBackupMapping.find(mapping_id).include(:class_urns).first + backup.class_urns.each do |urn| + if urn.split(":")[1] == acronym + logger.info("#{LinkedData::Mappings.delete_rest_mapping(mapping_id)} has been deleted") + end + end + rescue + logger.error("#{m.id.to_s} FAILED") + next + end + end + end +else + puts opt_parser.help + exit(1) +end + diff --git a/bin/ncbo_ontology_annotate_generate_cache b/bin/ncbo_ontology_annotate_generate_cache index 07286e7c..b030bafb 100755 --- a/bin/ncbo_ontology_annotate_generate_cache +++ b/bin/ncbo_ontology_annotate_generate_cache @@ -86,12 +86,13 @@ elsif (options[:remove_cache]) puts "You are about to clear the alternate Annotator cache repository and kick off a lengthy re-population process. Are you sure?" end +=begin if (options[:remove_cache]) puts "Type 'yes' to continue: " $stdout.flush confirm = $stdin.gets abort("Aborting...\n\n") unless (confirm.strip == 'yes') -end +=end puts "Processing details are logged to #{options[:logfile]}" msg = "" diff --git a/bin/ncbo_ontology_index b/bin/ncbo_ontology_index index 746d8d95..a1a9d67f 100755 --- a/bin/ncbo_ontology_index +++ b/bin/ncbo_ontology_index @@ -124,12 +124,13 @@ elsif options[:all] puts "You are about to clear the index on #{options[:solr_core_url]} and kick off a lengthy re-indexing process. Are you sure?" 
end +=begin if options[:all] puts "Type 'yes' to continue: " $stdout.flush confirm = $stdin.gets abort("Aborting...\n\n") unless (confirm.strip == 'yes') -end +=end begin logger = Logger.new(options[:logfile]) diff --git a/bin/ncbo_ontology_process b/bin/ncbo_ontology_process index d96f0d87..05a1169b 100755 --- a/bin/ncbo_ontology_process +++ b/bin/ncbo_ontology_process @@ -67,10 +67,10 @@ end # if options[:all] puts "About to perform the following tasks on ALL ontologies: #{options[:tasks]}" - puts "Type 'yes' to continue: " - $stdout.flush - confirm = $stdin.gets - abort("Aborting...\n") unless (confirm.strip == 'yes') + #puts "Type 'yes' to continue: " + #$stdout.flush + #confirm = $stdin.gets + #abort("Aborting...\n") unless (confirm.strip == 'yes') else puts "Processing the following tasks: #{options[:tasks]} on ontologies: #{options[:ontologies]}" end diff --git a/bin/ncbo_ontology_property_index b/bin/ncbo_ontology_property_index index 0d908777..8998a322 100755 --- a/bin/ncbo_ontology_property_index +++ b/bin/ncbo_ontology_property_index @@ -114,12 +114,13 @@ elsif options[:all] puts "You are about to clear the property index on #{options[:solr_core_url]} and kick off a full re-indexing process. Are you sure?" end +=begin if options[:all] puts "Type 'yes' to continue: " $stdout.flush confirm = $stdin.gets abort("Aborting...\n\n") unless (confirm.strip == 'yes') -end +=end begin logger = Logger.new(options[:logfile]) diff --git a/bin/ncbo_ontology_pull b/bin/ncbo_ontology_pull new file mode 100755 index 00000000..428f3bd4 --- /dev/null +++ b/bin/ncbo_ontology_pull @@ -0,0 +1,45 @@ +#!/usr/bin/env ruby + +$0 = "ncbo_ontology_pull" + +# Exit cleanly from an early interrupt +Signal.trap("INT") { exit 1 } + +# Setup the bundled gems in our environment +require 'bundler/setup' +# redis store for looking up queued jobs +require 'redis' + +require_relative '../lib/ncbo_cron' +require_relative '../config/config' +require 'optparse' + +ontology_acronym = '' +opt_parser = OptionParser.new do |opts| + opts.on('-o', '--ontology ACRONYM', 'Ontology acronym to pull if new version exist') do |acronym| + ontology_acronym = acronym + end + + # Display the help screen, all programs are assumed to have this option. + opts.on( '-h', '--help', 'Display this screen') do + puts opts + exit + end +end +opt_parser.parse! 
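+# Usage sketch (hedged; STY is the example acronym used in the README):
+#
+#   bin/ncbo_ontology_pull -o STY
+#
+# The ontology is pulled only if a new version of its remote file exists.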
+ +logger = Logger.new($stdout) +logger.info "Starting ncbo pull"; logger.flush +puller = NcboCron::Models::OntologyPull.new +begin + puller.do_ontology_pull(ontology_acronym, logger: logger , isLong: false ,enable_pull_umls:true, options: {}) +rescue NcboCron::Models::OntologyPull::RemoteFileException => e + logger.error e.message + logger.flush +rescue StandardError => e + logger.error e.message + logger.flush +end +logger.info "Finished ncbo pull"; logger.flush + + diff --git a/bin/ncbo_ontology_submissions_eradicate b/bin/ncbo_ontology_submissions_eradicate new file mode 100755 index 00000000..ef2c7a19 --- /dev/null +++ b/bin/ncbo_ontology_submissions_eradicate @@ -0,0 +1,107 @@ +#!/usr/bin/env ruby + +$0 = 'ncbo_cron' + +# Exit cleanly from an early interrupt +Signal.trap('INT') { exit 1 } + +# Setup the bundled gems in our environment +require 'bundler/setup' +# redis store for looking up queued jobs +require 'redis' + +require_relative '../lib/ncbo_cron' +require_relative '../config/config' +require 'optparse' +ontology_acronym = '' +submission_id_from = 0 +submission_id_to = 0 + +opt_parser = OptionParser.new do |opts| + opts.banner = 'Usage: ncbo_ontology_sumissions_eradicate [options]' + opts.on('-o', '--ontology ACRONYM', 'Ontology acronym which we want to eradicate (remove triples+files) specific submissions') do |acronym| + ontology_acronym = acronym + end + + opts.on('--from id', 'Submission id to start from deleting (included)') do |id| + submission_id_from = id.to_i + end + + opts.on('--to id', 'Submission id to end deleting (included)') do |id| + submission_id_to = id.to_i + end + # Display the help screen, all programs are assumed to have this option. + opts.on( '-h', '--help', 'Display this screen') do + puts opts + exit + end +end +opt_parser.parse! + + + + + +def ontology_exists?(ontology_acronym) + ont = LinkedData::Models::Ontology.find(ontology_acronym) + .include(submissions: [:submissionId]) + .first + if ont.nil? + logger.error "ontology not found: #{options[:ontology]}" + exit(1) + end + ont.bring(:submissions) if ont.bring?(:submissions) + ont +end + + +def get_submission_to_delete(submissions, from, to) + min, max = [from, to].minmax + submissions.select { |s| s.submissionId.between?(min, max) }.sort { |s1, s2| s1.submissionId <=> s2.submissionId} +end + +def eradicate(ontology_acronym, submissions , logger) + logger ||= Logger.new($stderr) + submissions.each do |submission| + begin + logger.info "Start removing submission #{submission.submissionId.to_s}" + NcboCron::Models::OntologySubmissionEradicator.new.eradicate submission + logger.info"Submission #{submission.submissionId.to_s} deleted successfully" + rescue NcboCron::Models::OntologySubmissionEradicator::RemoveNotArchivedSubmissionException + logger.info "Submission #{submission.submissionId.to_s} is not archived" + ask? logger, 'Do you want to force remove ? 
(Y/n)' + NcboCron::Models::OntologySubmissionEradicator.new.eradicate submission, true + logger.info"Submission #{submission.submissionId.to_s} deleted successfully" + rescue NcboCron::Models::OntologySubmissionEradicator::RemoveSubmissionFileException => e + logger.error "RemoveSubmissionFileException in submission #{submission.submissionId.to_s} : #{e.message}" + rescue NcboCron::Models::OntologySubmissionEradicator::RemoveSubmissionDataException => e + logger.error "RemoveSubmissionDataException in submission #{submission.submissionId.to_s} : #{e.message}" + rescue Exception => e + logger.error "Error in submission #{submission.submissionId.to_s} remove: #{e.message}" + end + end +end + +def ask?(logger, prompt) + logger.info prompt + choice = gets.chomp.downcase + exit(1) if choice.eql? 'n' +end + +begin + logger = Logger.new($stderr) + + logger.info 'Start of NCBO ontology submissions eradicate' + + ont = ontology_exists? ontology_acronym + + submissions = ont.submissions + submissions_to_delete = get_submission_to_delete submissions, submission_id_from, submission_id_to + + logger.info "You are attempting to remove the following submissions of #{ontology_acronym} : #{submissions_to_delete.map{ |s| s.submissionId }.join(', ')}" + logger.info 'They will be deleted from the triple store and local files' + ask? logger, 'Do you confirm ? (Y/n)' + + eradicate ontology_acronym , submissions_to_delete, logger + exit(0) +end \ No newline at end of file diff --git a/config/config.rb.old b/config/config.rb.old new file mode 100644 index 00000000..e35d9647 --- /dev/null +++ b/config/config.rb.old @@ -0,0 +1,52 @@ +LinkedData.config do |config| + config.enable_monitoring = false + config.cube_host = "localhost" + config.goo_host = "localhost" + config.goo_port = 8081 + config.search_server_url = "http://localhost:8082/solr/core1" + config.repository_folder = "/srv/ncbo/repository" + config.goo_redis_host = "localhost" + config.goo_redis_port = 6380 + config.http_redis_host = "localhost" + config.http_redis_port = 6382 + config.enable_http_cache = true + + #Email notifications + config.enable_notifications = false + config.email_sender = "admin@example.org" # Default sender for emails + config.email_override = "override@example.org" # all email gets sent here. Disable with email_override_disable. 
+ config.email_disable_override = true + config.smtp_host = "localhost" + config.smtp_port = 25 + config.smtp_auth_type = :none # :none, :plain, :login, :cram_md5 + config.smtp_domain = "example.org" + + #PURL server config parameters + config.enable_purl = false + config.purl_host = "purl.example.org" + config.purl_port = 80 + config.purl_username = "admin" + config.purl_password = "password" + config.purl_maintainers = "admin" + config.purl_target_url_prefix = "http://example.org" + +end + +Annotator.config do |config| + config.mgrep_dictionary_file = "/srv/mgrep/dictionary/dictionary.txt" + config.mgrep_host = "localhost" + config.mgrep_port = 55555 + config.annotator_redis_host = "localhost" + config.annotator_redis_port = 6379 +end + +NcboCron.config do |config| + config.redis_host = Annotator.settings.annotator_redis_host + config.redis_port = Annotator.settings.annotator_redis_port + config.pull_umls_url = "http://localhost:8080/umls_turtle/" + config.enable_pull_umls = false + config.search_index_all_url = "http://localhost:8082/solr/core2" + config.enable_flush = false +end + +Goo.use_cache = true diff --git a/config/config.rb.sample b/config/config.rb.sample index 15125224..b954b77d 100644 --- a/config/config.rb.sample +++ b/config/config.rb.sample @@ -1,58 +1,164 @@ -LinkedData.config do |config| - config.enable_monitoring = false - config.cube_host = "localhost" - config.goo_host = "localhost" - config.goo_port = 8080 - config.search_server_url = "http://localhost:8983/solr/term_search_core1" - config.property_search_server_url = "http://localhost:8983/solr/prop_search_core1" - config.repository_folder = "./test/data/ontology_files/repo" - config.http_redis_host = "localhost" - config.http_redis_port = 6379 - config.goo_redis_host = "localhost" - config.goo_redis_port = 6379 - - # Email notifications. - config.enable_notifications = true - config.email_sender = "sender@domain.com" # Default sender for emails - config.email_override = "test@domain.com" # By default, all email gets sent here. Disable with email_override_disable. 
- config.smtp_host = "smtp-unencrypted.stanford.edu" - config.smtp_user = nil - config.smtp_password = nil - config.smtp_auth_type = :none - config.smtp_domain = "localhost.localhost" -end - -Annotator.config do |config| - config.mgrep_dictionary_file ||= "./test/tmp/dict" - config.stop_words_default_file ||= "./config/default_stop_words.txt" - config.mgrep_host ||= "localhost" - config.mgrep_port ||= 55555 - config.annotator_redis_host ||= "localhost" - config.annotator_redis_port ||= 6379 -end - -NcboCron.config do |config| - config.redis_host ||= "localhost" - config.redis_port ||= 6379 - config.search_index_all_url = "http://localhost:8983/solr/term_search_core2" - config.property_search_index_all_url = "http://localhost:8983/solr/prop_search_core2" - - # Ontologies Report config - config.ontology_report_path = "./test/reports/ontologies_report.json" - - # Google Analytics config - config.analytics_service_account_email_address = "123456789999-sikipho0wk8q0atflrmw62dj4kpwoj3c@developer.gserviceaccount.com" - config.analytics_path_to_key_file = "config/bioportal-analytics.p12" - config.analytics_profile_id = "ga:1234567" - config.analytics_app_name = "BioPortal" - config.analytics_app_version = "1.0.0" - config.analytics_start_date = "2013-10-01" - config.analytics_filter_str = "ga:networkLocation!@stanford;ga:networkLocation!@amazon" - - # this is a Base64.encode64 encoded personal access token - # you need to run Base64.decode64 on it before using it in your code - # this is a workaround because Github does not allow storing access tokens in a repo - config.git_repo_access_token = "YOUR GITHUB REPO PERSONAL ACCESS TOKEN, encoded using Base64" +# Sample for Bioportal (with the good subdomain created: sparql.bioportal.lirmm.fr, data.bioportal.lirmm.fr) + +$SITE_URL = "bioportal.lirmm.fr" + +begin + LinkedData.config do |config| + config.repository_folder = "/srv/ncbo/repository" + config.goo_host = "localhost" + config.goo_port = 8081 + config.search_server_url = "http://localhost:8082/solr/term_search_core1" + config.property_search_server_url = "http://localhost:8082/solr/prop_search_core1" + config.rest_url_prefix = "http://data.#{$SITE_URL}/" + config.replace_url_prefix = true + config.id_url_prefix = "http://data.bioontology.org/" + config.enable_security = true # enable private ontologies hiding + config.apikey = "" + config.ui_host = "#{$SITE_URL}" + config.sparql_endpoint_url = "http://sparql.#{$SITE_URL}/test" + config.enable_monitoring = false + config.cube_host = "localhost" + config.enable_slices = true + config.enable_resource_index = false + + # Used to define other bioportal that can be mapped to + # Example to map to ncbo bioportal : {"ncbo" => {"api" => "http://data.bioontology.org", "ui" => "http://bioportal.bioontology.org", "apikey" => ""} + # Then create the mapping using the following class in JSON : "http://purl.bioontology.org/ontology/MESH/C585345": "ncbo:MESH" + # Where "ncbo" is the namespace used as key in the interportal_hash + config.interportal_hash = {"ncbo" => {"api" => "http://data.bioontology.org", "ui" => "http://bioportal.bioontology.org", "apikey" => ""}, + "agroportal" => {"api" => "http://data.agroportal.lirmm.fr", "ui" => "http://agroportal.lirmm.fr", "apikey" => ""}} + + # Caches + config.http_redis_host = "localhost" + config.http_redis_port = 6380 + config.enable_http_cache = true + config.goo_redis_host = "localhost" + config.goo_redis_port = 6382 + + # Email notifications + config.enable_notifications = true + config.email_sender = 
"notifications@#{$SITE_URL}" # Default sender for emails + config.email_override = "override@example.org" # all email gets sent here. Disable with email_override_disable. + config.email_disable_override = true + config.smtp_host = "smtp.lirmm.fr" + config.smtp_port = 25 + config.smtp_auth_type = :none # :none, :plain, :login, :cram_md5 + config.smtp_domain = "lirmm.fr" + # Emails of the instance administrators to get mail notifications when new user or new ontology + config.admin_emails = ["my.mail@example.org"] + + # PURL server config parameters + config.enable_purl = false + config.purl_host = "purl.example.org" + config.purl_port = 80 + config.purl_username = "admin" + config.purl_password = "password" + config.purl_maintainers = "admin" + config.purl_target_url_prefix = "http://example.org" + + # Ontology Google Analytics Redis + # disabled + config.ontology_analytics_redis_host = "localhost" + config.enable_ontology_analytics = true + config.ontology_analytics_redis_port = 6379 +end +rescue NameError + puts "(CNFG) >> LinkedData not available, cannot load config" +end + +begin + Annotator.config do |config| + config.mgrep_dictionary_file = "/srv/mgrep/dictionary/dictionary.txt" + config.stop_words_default_file = "/srv/ncbo/ncbo_cron/config/default_stop_words.txt" + config.mgrep_host = "localhost" + config.mgrep_port = 55555 + config.mgrep_alt_host = "localhost" + config.mgrep_alt_port = 55555 + config.annotator_redis_host = "localhost" + config.annotator_redis_port = 6379 + config.annotator_redis_prefix = "" + config.annotator_redis_alt_prefix = "c2" + config.enable_recognizer_param = true + # This setting allows you to ask for other recognizer in URL params (if installed and class with "annotate_direct" created). Example: ?recognizer=alvis or mallet +end +rescue NameError + puts "(CNFG) >> Annotator not available, cannot load config" +end + +begin + OntologyRecommender.config do |config| +end +rescue NameError + puts "(CNFG) >> OntologyRecommender not available, cannot load config" +end + +begin + LinkedData::OntologiesAPI.config do |config| + config.enable_unicorn_workerkiller = true + config.enable_throttling = false + config.enable_monitoring = false + config.cube_host = "localhost" + config.http_redis_host = "localhost" + config.http_redis_port = 6380 + config.ontology_rank = "" +end +rescue NameError + puts "(CNFG) >> OntologiesAPI not available, cannot load config" +end + +begin + NcboCron.config do |config| + config.redis_host = Annotator.settings.annotator_redis_host + config.redis_port = Annotator.settings.annotator_redis_port + # If no URL has been specified when reindexing ontologies, use the following + config.search_index_all_url = "http://localhost:8082/solr/term_search_core2" + config.property_search_index_all_url = "http://localhost:8082/solr/prop_search_core2" + + # Minutes between every process new ontologies check + config.minutes_between = 3 + + # Schedules: run every 4 hours, starting at 00:30 + config.cron_schedule = "30 */4 * * *" + # Pull schedule: run daily at 6 a.m. (18:00) + config.pull_schedule = "00 18 * * *" + # Pull long schedule for ontology that are pulled less frequently: run weekly on monday at 11 a.m. (23:00) + config.pull_schedule_long = "00 23 * * 1" + config.pull_long_ontologies = ["BIOREFINERY", "TRANSMAT", "GO"] + # Delete class graphs of archive submissions: run twice per week on tuesday and friday at 10 a.m. 
(22:00) + config.cron_flush = "00 22 * * 2,5" + # Remove graphs from deleted ontologies when flushing class graphs + config.remove_zombie_graphs = true + # Warmup long time running queries: run every 3 hours (beginning at 00:00) + config.cron_warmq = "00 */3 * * *" + # Create mapping counts schedule: run twice per week on Wednesday and Saturday at 12:30AM + config.cron_mapping_counts = "30 0 * * 3,6" + + config.enable_ontologies_report = true + # Ontologies report generation schedule: run daily at 1:30 a.m. + config.cron_ontologies_report = "30 1 * * *" + # Ontologies Report file location + config.ontology_report_path = "/srv/ncbo/reports/ontologies_report.json" + + # Ontology analytics refresh schedule: run daily at 4:30 a.m. + config.cron_ontology_analytics = "30 4 * * *" + config.enable_ontology_analytics = true + config.analytics_service_account_email_address = "account-1@bioportal.iam.gserviceaccount.com" + config.analytics_path_to_key_file = "/srv/bioportal-ff92c5b03b63.p12" # you have to get this file from Google + config.analytics_profile_id = "ga:111823321" # replace with your ga view id + config.analytics_app_name = "bioportal" + config.analytics_app_version = "1.0.0" + config.analytics_start_date = "2015-11-13" + # To filter connexions coming from Stanford + config.analytics_filter_str = "ga:networkLocation!@stanford;ga:networkLocation!@amazon" + + # this is a Base64.encode64 encoded personal access token + # you need to run Base64.decode64 on it before using it in your code + # this is a workaround because Github does not allow storing access tokens in a repo + config.git_repo_access_token = "YOUR GITHUB REPO PERSONAL ACCESS TOKEN, encoded using Base64" + end +rescue NameError + #binding.pry + puts "(CNFG) >> NcboCron not available, cannot load config" end Goo.use_cache = true diff --git a/config/config.test.rb b/config/config.test.rb index 97eaf1f7..0729a4b0 100644 --- a/config/config.test.rb +++ b/config/config.test.rb @@ -1,37 +1,42 @@ # This file is designed to be used for unit testing with docker-compose # -GOO_PATH_QUERY = ENV.include?('GOO_PATH_QUERY') ? ENV['GOO_PATH_QUERY'] : '/sparql/' -GOO_PATH_DATA = ENV.include?('GOO_PATH_DATA') ? ENV['GOO_PATH_DATA'] : '/data/' -GOO_PATH_UPDATE = ENV.include?('GOO_PATH_UPDATE') ? ENV['GOO_PATH_UPDATE'] : '/update/' -GOO_BACKEND_NAME = ENV.include?('GOO_BACKEND_NAME') ? ENV['GOO_BACKEND_NAME'] : 'localhost' -GOO_PORT = ENV.include?('GOO_PORT') ? ENV['GOO_PORT'] : 9000 -GOO_HOST = ENV.include?('GOO_HOST') ? ENV['GOO_HOST'] : 'localhost' -SOLR_HOST = ENV.include?('SOLR_HOST') ? ENV['SOLR_HOST'] : 'localhost' -REDIS_HOST = ENV.include?('REDIS_HOST') ? ENV['REDIS_HOST'] : 'localhost' -REDIS_PORT = ENV.include?('REDIS_PORT') ? ENV['REDIS_PORT'] : 6379 -MGREP_HOST = ENV.include?('MGREP_HOST') ? ENV['MGREP_HOST'] : 'localhost' -MGREP_PORT = ENV.include?('MGREP_PORT') ? ENV['MGREP_PORT'] : 55555 +GOO_PATH_QUERY = ENV.include?("GOO_PATH_QUERY") ? ENV["GOO_PATH_QUERY"] : "/sparql/" +GOO_PATH_DATA = ENV.include?("GOO_PATH_DATA") ? ENV["GOO_PATH_DATA"] : "/data/" +GOO_PATH_UPDATE = ENV.include?("GOO_PATH_UPDATE") ? ENV["GOO_PATH_UPDATE"] : "/update/" +GOO_BACKEND_NAME = ENV.include?("GOO_BACKEND_NAME") ? ENV["GOO_BACKEND_NAME"] : "localhost" +GOO_PORT = ENV.include?("GOO_PORT") ? ENV["GOO_PORT"] : 9000 +GOO_HOST = ENV.include?("GOO_HOST") ? ENV["GOO_HOST"] : "localhost" +REDIS_HOST = ENV.include?("REDIS_HOST") ? ENV["REDIS_HOST"] : "localhost" +REDIS_PORT = ENV.include?("REDIS_PORT") ? 
ENV["REDIS_PORT"] : 6379 +MGREP_HOST = ENV.include?("MGREP_HOST") ? ENV["MGREP_HOST"] : "localhost" +MGREP_PORT = ENV.include?("MGREP_PORT") ? ENV["MGREP_PORT"] : 55555 +SOLR_TERM_SEARCH_URL = ENV.include?("SOLR_TERM_SEARCH_URL") ? ENV["SOLR_TERM_SEARCH_URL"] : "http://localhost:8983/solr/term_search_core1" +SOLR_PROP_SEARCH_URL = ENV.include?("SOLR_PROP_SEARCH_URL") ? ENV["SOLR_PROP_SEARCH_URL"] : "http://localhost:8983/solr/prop_search_core1" LinkedData.config do |config| config.goo_host = GOO_HOST.to_s config.goo_port = GOO_PORT.to_i + config.goo_backend_name = GOO_BACKEND_NAME.to_s + config.goo_path_query = GOO_PATH_QUERY.to_s + config.goo_path_data = GOO_PATH_DATA.to_s + config.goo_path_update = GOO_PATH_UPDATE.to_s config.goo_redis_host = REDIS_HOST.to_s config.goo_redis_port = REDIS_PORT.to_i config.http_redis_host = REDIS_HOST.to_s config.http_redis_port = REDIS_PORT.to_i config.ontology_analytics_redis_host = REDIS_HOST.to_s config.ontology_analytics_redis_port = REDIS_PORT.to_i - config.search_server_url = "http://#{SOLR_HOST}:8983/solr/term_search_core1".to_s - config.property_search_server_url = "http://#{SOLR_HOST}:8983/solr/prop_search_core1".to_s + config.search_server_url = SOLR_TERM_SEARCH_URL.to_s + config.property_search_server_url = SOLR_PROP_SEARCH_URL.to_s # Email notifications. config.enable_notifications = true - config.email_sender = 'sender@domain.com' # Default sender for emails - config.email_override = 'test@domain.com' # By default, all email gets sent here. Disable with email_override_disable. - config.smtp_host = 'smtp-unencrypted.stanford.edu' + config.email_sender = "sender@domain.com" # Default sender for emails + config.email_override = "test@domain.com" # By default, all email gets sent here. Disable with email_override_disable. 
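Note on the ENV-guarded defaults above: they follow the "ENV.include?(KEY) ? ENV[KEY] : default" pattern. A minimal equivalent sketch using ENV.fetch, not part of the patch itself; the constant names and fallback values are the ones already defined in config.test.rb:

# Sketch only: ENV.fetch returns the second argument when the variable is unset.
GOO_HOST             = ENV.fetch("GOO_HOST", "localhost")
GOO_PORT             = ENV.fetch("GOO_PORT", 9000)
REDIS_HOST           = ENV.fetch("REDIS_HOST", "localhost")
REDIS_PORT           = ENV.fetch("REDIS_PORT", 6379)
SOLR_TERM_SEARCH_URL = ENV.fetch("SOLR_TERM_SEARCH_URL", "http://localhost:8983/solr/term_search_core1")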
+ config.smtp_host = "smtp-unencrypted.stanford.edu" config.smtp_user = nil config.smtp_password = nil config.smtp_auth_type = :none - config.smtp_domain = 'localhost.localhost' + config.smtp_domain = "localhost.localhost" end Annotator.config do |config| @@ -39,11 +44,11 @@ config.annotator_redis_port = REDIS_PORT.to_i config.mgrep_host = MGREP_HOST.to_s config.mgrep_port = MGREP_PORT.to_i - config.mgrep_dictionary_file = './test/data/dictionary.txt' + config.mgrep_dictionary_file = "./test/data/dictionary.txt" end NcboCron.config do |config| config.redis_host = REDIS_HOST.to_s config.redis_port = REDIS_PORT.to_i - config.ontology_report_path = './test/ontologies_report.json' + config.ontology_report_path = "./test/ontologies_report.json" end diff --git a/config/french_stop_words.txt b/config/french_stop_words.txt new file mode 100644 index 00000000..ae1a5b09 --- /dev/null +++ b/config/french_stop_words.txt @@ -0,0 +1,125 @@ +alors +au +aucuns +aussi +autre +avant +avec +avoir +bon +car +ce +cela +ces +ceux +chaque +ci +comme +comment +dans +des +du +dedans +dehors +depuis +deux +devrait +doit +donc +dos +début +elle +elles +en +encore +essai +est +et +eu +fait +faites +fois +font +force +haut +hors +ici +il +ils +je +juste +la +le +les +leur +là +ma +maintenant +mais +mes +mine +moins +mon +mot +même +ni +nommés +notre +nous +nouveaux +ou +où +par +parce +parole +pas +personnes +peut +peu +pièce +plupart +pour +pourquoi +quand +que +quel +quelle +quelles +quels +qui +sa +sans +ses +seulement +si +sien +son +sont +sous +soyez +sujet +sur +ta +tandis +tellement +tels +tes +ton +tous +tout +trop +très +tu +valeur +voie +voient +vont +votre +vous +vu +ça +étaient +état +étions +été +être diff --git a/lib/ncbo_cron.rb b/lib/ncbo_cron.rb index 309b15db..884e6b33 100644 --- a/lib/ncbo_cron.rb +++ b/lib/ncbo_cron.rb @@ -6,6 +6,7 @@ require 'ncbo_annotator' require_relative 'ncbo_cron/config' require_relative 'ncbo_cron/ontology_submission_parser' +require_relative 'ncbo_cron/ontology_submission_eradicator' require_relative 'ncbo_cron/ontology_pull' require_relative 'ncbo_cron/scheduler' require_relative 'ncbo_cron/query_caching' diff --git a/lib/ncbo_cron/config.rb b/lib/ncbo_cron/config.rb index 49db0fb4..798768b2 100644 --- a/lib/ncbo_cron/config.rb +++ b/lib/ncbo_cron/config.rb @@ -28,6 +28,8 @@ def config(&block) @settings.enable_processing ||= true @settings.enable_pull ||= true @settings.enable_flush ||= true + # Don't remove graphs from deleted ontologies by default when flushing classes + @settings.remove_zombie_graphs ||= false @settings.enable_warmq ||= true @settings.enable_mapping_counts ||= true # enable ontology analytics @@ -56,13 +58,20 @@ def config(&block) @settings.enable_pull_umls ||= false @settings.enable_obofoundry_sync ||= true - # Schedulues + # Schedules + # 30 */4 * * * - run every 4 hours, starting at 00:30 @settings.cron_schedule ||= "30 */4 * * *" - # Pull schedule + # Pull schedule. + # 00 18 * * * - run daily at 6 a.m. (18:00) @settings.pull_schedule ||= "00 18 * * *" - # Delete class graphs of archive submissions + # run weekly on monday at 11 a.m. (23:00) + @settings.pull_schedule_long ||= "00 23 * * 1" + @settings.pull_long_ontologies ||= [] + # Delete class graphs of archive submissions. + # 00 22 * * 2 - run once per week on tuesday at 10 a.m. (22:00) @settings.cron_flush ||= "00 22 * * 2" - # Warmup long time running queries + # Warmup long time running queries. 
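Note on the new defaults added in this hunk (pull_schedule_long, pull_long_ontologies, remove_zombie_graphs): they are meant to be overridden per deployment. A minimal sketch of such overrides in config/config.rb; the acronym list and flag values are illustrative only:

# Sketch: deployment-level overrides for the new NcboCron settings.
NcboCron.config do |config|
  config.pull_schedule_long   = "00 23 * * 1"          # weekly, Monday at 23:00
  config.pull_long_ontologies = ["BIOREFINERY", "GO"]  # acronyms pulled only on the long schedule
  config.remove_zombie_graphs = true                   # opt in to deleting zombie class graphs on flush
end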
+ # 00 */3 * * * - run every 3 hours (beginning at 00:00) @settings.cron_warmq ||= "00 */3 * * *" # Create mapping counts schedule # 30 0 * * 6 - run once per week on Saturday at 12:30AM diff --git a/lib/ncbo_cron/ontologies_report.rb b/lib/ncbo_cron/ontologies_report.rb index 43f0505f..99463a0a 100644 --- a/lib/ncbo_cron/ontologies_report.rb +++ b/lib/ncbo_cron/ontologies_report.rb @@ -345,7 +345,7 @@ def good_classes(submission, report) page_size = 1000 classes_size = 10 good_classes = Array.new - paging = LinkedData::Models::Class.in(submission).include(:prefLabel, :synonym, metrics: :classes).page(page_num, page_size) + paging = LinkedData::Models::Class.in(submission).include(:prefLabel, :synonym, submission: [metrics: :classes]).page(page_num, page_size) cls_count = submission.class_count(@logger).to_i # prevent a COUNT SPARQL query if possible paging.page_count_set(cls_count) if cls_count > -1 diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb index e06fcd77..097821fe 100644 --- a/lib/ncbo_cron/ontology_analytics.rb +++ b/lib/ncbo_cron/ontology_analytics.rb @@ -26,6 +26,13 @@ def fetch_ontology_analytics # ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"] filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? "" : ";#{NcboCron.settings.analytics_filter_str}" + # If the user add filter through the configuration file + if !NcboCron.settings.analytics_filter_str.nil? && NcboCron.settings.analytics_filter_str != "" + analytics_filter = ";" + NcboCron.settings.analytics_filter_str + else + analytics_filter = "" + end + ont_acronyms.each do |acronym| max_results = 10000 num_results = 10000 diff --git a/lib/ncbo_cron/ontology_pull.rb b/lib/ncbo_cron/ontology_pull.rb index ac6da70e..31f8cba9 100644 --- a/lib/ncbo_cron/ontology_pull.rb +++ b/lib/ncbo_cron/ontology_pull.rb @@ -8,14 +8,17 @@ module Models class OntologyPull class RemoteFileException < StandardError - end + attr_reader :submission - def initialize() + def initialize(submission) + super + @submission = submission + end end - def do_remote_ontology_pull(options = {}) + def do_remote_ontology_pull(isLong = false, options = {}) logger = options[:logger] || Logger.new($stdout) - logger.info "UMLS auto-pull #{options[:enable_pull_umls] == true}" + logger.info "UMLS auto-pull #{options[:enable_pull_umls] == true}. Is long: #{isLong}" logger.flush ontologies = LinkedData::Models::Ontology.where.include(:acronym).all ont_to_include = [] @@ -23,65 +26,28 @@ def do_remote_ontology_pull(options = {}) ontologies.select! { |ont| ont_to_include.include?(ont.acronym) } unless ont_to_include.empty? enable_pull_umls = options[:enable_pull_umls] umls_download_url = options[:pull_umls_url] - ontologies.sort! {|a, b| a.acronym.downcase <=> b.acronym.downcase} + ontologies.sort! { |a, b| a.acronym.downcase <=> b.acronym.downcase } new_submissions = [] ontologies.each do |ont| begin - last = ont.latest_submission(status: :any) - next if last.nil? - last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage) - if !enable_pull_umls && last.hasOntologyLanguage.umls? - next - end - last.bring(:pullLocation) if last.bring?(:pullLocation) - next if last.pullLocation.nil? - last.bring(:uploadFilePath) if last.bring?(:uploadFilePath) - - if last.hasOntologyLanguage.umls? 
&& umls_download_url - last.pullLocation= RDF::URI.new(umls_download_url + last.pullLocation.split("/")[-1]) - logger.info("Using alternative download for umls #{last.pullLocation.to_s}") + begin + new_submission = self.do_ontology_pull(ont.acronym, + isLong: isLong, + enable_pull_umls: enable_pull_umls, + umls_download_url: umls_download_url, + logger: logger, options: options) + new_submissions << new_submission if new_submission + rescue RemoteFileException => error + logger.info "RemoteFileException: No submission file at pull location #{error.submission.pullLocation.to_s} for ontology #{ont.acronym}." logger.flush + LinkedData::Utils::Notifications.remote_ontology_pull(error.submission) end - - if last.remote_file_exists?(last.pullLocation.to_s) - logger.info "Checking download for #{ont.acronym}" - logger.info "Location: #{last.pullLocation.to_s}"; logger.flush - file, filename = last.download_ontology_file() - file = File.open(file.path, "rb") - remote_contents = file.read - md5remote = Digest::MD5.hexdigest(remote_contents) - - if last.uploadFilePath && File.exist?(last.uploadFilePath) - file_contents = open(last.uploadFilePath) { |f| f.read } - md5local = Digest::MD5.hexdigest(file_contents) - new_file_exists = (not md5remote.eql?(md5local)) - else - # There is no existing file, so let's create a submission with the downloaded one - new_file_exists = true - end - - if new_file_exists - logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}" - logger.flush() - new_submissions << create_submission(ont, last, file, filename, logger) - end - - file.close - else - begin - raise RemoteFileException - rescue RemoteFileException - logger.info "RemoteFileException: No submission file at pull location #{last.pullLocation.to_s} for ontology #{ont.acronym}." - logger.flush - LinkedData::Utils::Notifications.remote_ontology_pull(last) - end - end - rescue Exception => e - logger.error "Problem retrieving #{ont.acronym} in OntologyPull:\n" + e.message + "\n" + e.backtrace.join("\n\t") - logger.flush() - next end + rescue Exception => e + logger.error "Problem retrieving #{ont.acronym} in OntologyPull:\n" + e.message + "\n" + e.backtrace.join("\n\t") + logger.flush() + next end if options[:cache_clear] == true @@ -93,8 +59,63 @@ def do_remote_ontology_pull(options = {}) new_submissions end - def create_submission(ont, sub, file, filename, logger=nil, - add_to_pull=true,new_version=nil,new_released=nil) + def do_ontology_pull(ontology_acronym, enable_pull_umls: false, isLong: false, umls_download_url: '', logger: nil, options:) + ont = LinkedData::Models::Ontology.find(ontology_acronym).include(:acronym).first + new_submission = nil + raise StandardError, "Ontology #{ontology_acronym} not found" if ont.nil? + + last = ont.latest_submission(status: :any) + raise StandardError, "No submission found for #{ontology_acronym}" if last.nil? + + last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage) + + last.bring(:pullLocation) if last.bring?(:pullLocation) + raise StandardError, "#{ontology_acronym} has no pullLocation" if last.pullLocation.nil? + + last.bring(:uploadFilePath) if last.bring?(:uploadFilePath) + + if not_pull_submission(last, ont, isLong, enable_pull_umls, options) + raise StandardError, "Pull umls not enabled" + end + + if isLong && !options[:pull_long_ontologies].nil? + return nil unless options[:pull_long_ontologies].include?(ont.acronym) + else + unless options[:pull_long_ontologies].nil? 
+ return nil if options[:pull_long_ontologies].include?(ont.acronym) + end + end + + if last.hasOntologyLanguage.umls? && umls_download_url + last.pullLocation = RDF::URI.new(umls_download_url + last.pullLocation.split("/")[-1]) + logger.info("Using alternative download for umls #{last.pullLocation.to_s}") + logger.flush + end + + if last.remote_file_exists?(last.pullLocation.to_s) + logger.info "Checking download for #{ont.acronym}" + logger.info "Location: #{last.pullLocation.to_s}"; logger.flush + file, filename = last.download_ontology_file + file, md5local, md5remote, new_file_exists = new_file_exists?(file, last) + + if new_file_exists + logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}" + logger.flush() + new_submission = create_submission(ont, last, file, filename, logger) + else + logger.info "There is no new file found for #{ont.acronym}" + logger.flush() + end + + file.close + new_submission + else + raise RemoteFileException.new(last) + end + end + + def create_submission(ont, sub, file, filename, logger = nil, + add_to_pull = true, new_version = nil, new_released = nil) logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT) new_sub = LinkedData::Models::OntologySubmission.new @@ -118,33 +139,18 @@ def create_submission(ont, sub, file, filename, logger=nil, new_sub.submissionStatus = nil new_sub.creationDate = nil new_sub.missingImports = nil + new_sub.masterFileName = nil new_sub.metrics = nil full_file_path = File.expand_path(file_location) # check if OWLAPI is able to parse the file before creating a new submission - owlapi = LinkedData::Parser::OWLAPICommand.new( - full_file_path, - File.expand_path(new_sub.data_folder.to_s), - logger: logger) - owlapi.disable_reasoner - parsable = true - - begin - owlapi.parse - rescue Exception => e - logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.error("A new submission has NOT been created.") - logger.flush - parsable = false - end - - if parsable + if new_sub.parsable?(logger: logger) if new_sub.valid? - new_sub.save() + new_sub.save if add_to_pull submission_queue = NcboCron::Models::OntologySubmissionParser.new - submission_queue.queue_submission(new_sub, {all: true}) + submission_queue.queue_submission(new_sub, { all: true }) logger.info("OntologyPull created a new submission (#{submission_id}) for ontology #{ont.acronym}") end else @@ -152,12 +158,47 @@ def create_submission(ont, sub, file, filename, logger=nil, logger.flush end else + logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") + logger.error("A new submission has NOT been created.") + logger.flush + # delete the bad file File.delete full_file_path if File.exist? full_file_path end + new_sub end + private + + def not_pull_submission(submission, ontology, isLong, enable_pull_umls, options) + if !enable_pull_umls && submission.hasOntologyLanguage.umls? + return true + end + + if isLong && !options[:pull_long_ontologies].nil? + !options[:pull_long_ontologies].include?(ontology.acronym) + else + !options[:pull_long_ontologies].nil? 
&& options[:pull_long_ontologies].include?(ontology.acronym) + end + end + + def new_file_exists?(file, last) + file = File.open(file.path, "rb") + remote_contents = file.read + md5remote = Digest::MD5.hexdigest(remote_contents) + + if last.uploadFilePath && File.exist?(last.uploadFilePath) + file_contents = open(last.uploadFilePath) { |f| f.read } + md5local = Digest::MD5.hexdigest(file_contents) + new_file_exists = (not md5remote.eql?(md5local)) + else + # There is no existing file, so let's create a submission with the downloaded one + new_file_exists = true + end + return file, md5local, md5remote, new_file_exists + end + def redis_goo Redis.new(host: LinkedData.settings.goo_redis_host, port: LinkedData.settings.goo_redis_port, timeout: 30) end @@ -168,7 +209,6 @@ def redis_http end end end - # require 'ontologies_linked_data' # require 'goo' # require 'ncbo_annotator' diff --git a/lib/ncbo_cron/ontology_submission_eradicator.rb b/lib/ncbo_cron/ontology_submission_eradicator.rb new file mode 100644 index 00000000..40f8ef4d --- /dev/null +++ b/lib/ncbo_cron/ontology_submission_eradicator.rb @@ -0,0 +1,39 @@ +module NcboCron + module Models + + class OntologySubmissionEradicator + class RemoveSubmissionFileException < StandardError + end + + class RemoveSubmissionDataException < StandardError + end + + class RemoveNotArchivedSubmissionException < StandardError + end + + def initialize() + end + + def eradicate(submission , force=false) + submission.bring(:submissionStatus) if submission.bring(:submissionStatus) + if submission.archived? || force + delete_submission_data submission + else submission.ready? + raise RemoveNotArchivedSubmissionException, "Submission #{submission.submissionId} is not an archived submission" + end + + end + + private + def delete_submission_data(submission) + begin + submission.delete + rescue Exception => e + raise RemoveSubmissionDataException, e.message + end + end + + + end + end +end diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb index fe7a3e06..b0a3309c 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -10,21 +10,30 @@ class OntologySubmissionParser ACTIONS = { :process_rdf => true, + :extract_metadata => true, :index_search => true, :index_properties => true, :run_metrics => true, :process_annotator => true, - :diff => true + :diff => true, + :params => nil } def initialize() end + # Add a submission in the queue def queue_submission(submission, actions={:all => true}) redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) - if actions[:all] - actions = ACTIONS.dup + if !actions[:params].nil? + # Retrieve params added by the user + user_params = actions[:params].dup + actions = ACTIONS.dup + actions[:params] = user_params.dup + else + actions = ACTIONS.dup + end else actions.delete_if {|k, v| !ACTIONS.has_key?(k)} end @@ -32,6 +41,7 @@ def queue_submission(submission, actions={:all => true}) redis.hset(QUEUE_HOLDER, get_prefixed_id(submission.id), actionStr) unless actions.empty? end + # Process submissions waiting in the queue def process_queue_submissions(options = {}) logger = options[:logger] logger ||= Kernel.const_defined?("LOGGER") ? 
Kernel.const_get("LOGGER") : Logger.new(STDOUT) @@ -79,6 +89,7 @@ def get_prefixed_id(id) "#{IDPREFIX}#{id}" end + # Zombie graphs are submission graphs from ontologies that have been deleted def zombie_classes_graphs query = "SELECT DISTINCT ?g WHERE { GRAPH ?g { ?s ?p ?o }}" class_graphs = [] @@ -98,7 +109,7 @@ def zombie_classes_graphs zombies end - def process_flush_classes(logger) + def process_flush_classes(logger, remove_zombie_graphs=false) onts = LinkedData::Models::Ontology.where.include(:acronym,:summaryOnly).all status_archived = LinkedData::Models::SubmissionStatus.find("ARCHIVED").first deleted = [] @@ -141,6 +152,12 @@ def process_flush_classes(logger) zombie_classes_graphs.each do |zg| logger.info("Zombie class graph #{zg}"); logger.flush + # Not deleting zombie graph by default. Enable it with config.remove_zombie_graphs = true + if !remove_zombie_graphs.nil? && remove_zombie_graphs == true + Goo.sparql_data_client.delete_graph(RDF::URI.new(zg)) + logger.info "DELETED #{zg} graph" + deleted << zg + end end logger.info("finish process_flush_classes"); logger.flush @@ -212,6 +229,7 @@ def archive_old_submissions(logger, sub) logger.debug "Completed archiving submissions previous to #{sub.id.to_s}" end + # Add new ontology terms to the Annotator def process_annotator(logger, sub) parsed = sub.ready?(status: [:rdf, :rdf_labels]) diff --git a/lib/ncbo_cron/scheduler.rb b/lib/ncbo_cron/scheduler.rb index 9abc60e0..75badc71 100644 --- a/lib/ncbo_cron/scheduler.rb +++ b/lib/ncbo_cron/scheduler.rb @@ -31,6 +31,7 @@ def self.scheduled_locking_job(options = {}, &block) seconds_between = options[:seconds_between] scheduler_type = options[:scheduler_type] || :every cron_schedule = options[:cron_schedule] + cron_schedule_long = options[:cron_schedule_long] if scheduler_type == :every # Minutes/seconds string prep diff --git a/test/docker-compose.yml b/test/docker-compose.yml index 5bdb51f5..db957907 100644 --- a/test/docker-compose.yml +++ b/test/docker-compose.yml @@ -1,33 +1,85 @@ -version: '3.8' +x-app: &app + build: + context: ../. + args: + RUBY_VERSION: '2.7' + # Increase the version number in the image tag every time Dockerfile or its arguments is changed + image: ncbo_cron-dev:0.0.1 + environment: &env + # default bundle config resolves to /usr/local/bundle/config inside of the container + # we are setting it to local app directory if we need to use 'bundle config local' + BUNDLE_APP_CONFIG: /srv/ontoportal/ncbo_cron/.bundle + BUNDLE_PATH: /srv/ontoportal/bundle + COVERAGE: 'true' # enable simplecov code coverage + REDIS_HOST: redis-ut + REDIS_PORT: 6379 + SOLR_TERM_SEARCH_URL: http://solr-ut:8983/solr/term_search_core1 + SOLR_PROP_SEARCH_URL: http://solr-ut:8983/solr/prop_search_core1 + MGREP_HOST: mgrep-ut + MGREP_PORT: 55555 + stdin_open: true + tty: true + command: /bin/bash + volumes: + # bundle volume for hosting gems installed by bundle; it speeds up gem install in local development + - bundle:/srv/ontoportal/bundle + - ../.:/srv/ontoportal/ncbo_cron + # mount directory containing development version of the gems if you need to use 'bundle config local' + #- /Users/alexskr/ontoportal:/Users/alexskr/ontoportal + depends_on: &depends_on + solr-ut: + condition: service_healthy + redis-ut: + condition: service_healthy + mgrep-ut: + condition: service_healthy + services: - unit-test: - build: ../. 
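Note on the submission-queue change above: the new :params action carries user-supplied attributes through Redis alongside the standard processing actions. A minimal usage sketch, assuming a console with the ncbo_cron environment loaded and an existing submission object; the homepage value is a placeholder:

# Sketch: queue a submission with extra params, then drain the queue.
parser = NcboCron::Models::OntologySubmissionParser.new
parser.queue_submission(submission, { all: true, params: { homepage: "https://example.org" } })
parser.process_queue_submissions(logger: Logger.new($stdout))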
+ # environment wtih 4store backend + ruby: + <<: *app environment: - - GOO_BACKEND_NAME=4store - - GOO_PORT=9000 - - GOO_HOST=4store-ut - - REDIS_HOST=redis-ut - - REDIS_PORT=6379 - - SOLR_HOST=solr-ut - - MGREP_HOST=mgrep-ut - - MGREP_PORT=55555 + <<: *env + GOO_BACKEND_NAME: 4store + GOO_PORT: 9000 + GOO_HOST: 4store-ut + GOO_PATH_QUERY: /sparql/ + GOO_PATH_DATA: /data/ + GOO_PATH_UPDATE: /update/ + profiles: + - 4store depends_on: - - solr-ut - - redis-ut - - 4store-ut - - mgrep-ut - #command: "bundle exec rake test TESTOPTS='-v' TEST='./test/parser/test_owl_api_command.rb'" - command: "wait-for-it solr-ut:8983 -- bundle exec rake test TESTOPTS='-v'" + <<: *depends_on + 4store-ut: + condition: service_started - solr-ut: - image: ontoportal/solr-ut:0.1 + # environment with AllegroGraph backend + ruby-agraph: + <<: *app + environment: + <<: *env + GOO_BACKEND_NAME: ag + GOO_PORT: 10035 + GOO_HOST: agraph-ut + GOO_PATH_QUERY: /repositories/bioportal_test + GOO_PATH_DATA: /repositories/bioportal_test/statements + GOO_PATH_UPDATE: /repositories/bioportal_test/statements + # profiles: + #- agraph + depends_on: + <<: *depends_on + agraph-ut: + condition: service_started redis-ut: image: redis - - mgrep-ut: - image: ontoportal/mgrep-ncbo:0.1 + command: ["redis-server", "--save", "", "--appendonly", "no"] + healthcheck: + test: redis-cli ping + interval: 10s + timeout: 3s + retries: 10 4store-ut: image: bde2020/4store @@ -35,4 +87,43 @@ services: bash -c "4s-backend-setup --segments 4 ontoportal_kb && 4s-backend ontoportal_kb && 4s-httpd -D -s-1 -p 9000 ontoportal_kb" + profiles: + - 4store + + solr-ut: + image: ontoportal/solr-ut:0.1 + healthcheck: + test: ["CMD-SHELL", "curl -sf http://localhost:8983/solr/term_search_core1/admin/ping?wt=json | grep -iq '\"status\":\"OK\"}' || exit 1"] + start_period: 3s + interval: 10s + timeout: 5s + retries: 5 + + mgrep-ut: + image: ontoportal/mgrep-ncbo:0.1 + healthcheck: + test: ["CMD", "nc", "-z", "-v", "localhost", "55555"] + start_period: 3s + interval: 10s + timeout: 5s + retries: 5 + + agraph-ut: + image: franzinc/agraph:v7.3.0 + environment: + - AGRAPH_SUPER_USER=test + - AGRAPH_SUPER_PASSWORD=xyzzy + shm_size: 1g + # ports: + # - 10035:10035 + command: > + bash -c "/agraph/bin/agraph-control --config /agraph/etc/agraph.cfg start + ; agtool repos create bioportal_test + ; agtool users add anonymous + ; agtool users grant anonymous root:bioportal_test:rw + ; tail -f /agraph/data/agraph.log" + # profiles: + #- agraph +volumes: + bundle: diff --git a/test/run-unit-tests.sh b/test/run-unit-tests.sh index 385898e6..b2c119da 100755 --- a/test/run-unit-tests.sh +++ b/test/run-unit-tests.sh @@ -3,10 +3,10 @@ # # add config for unit testing [ -f ../config/config.rb ] || cp ../config/config.test.rb ../config/config.rb -docker-compose build +docker compose build # wait-for-it is useful since solr container might not get ready quick enough for the unit tests -docker-compose run --rm unit-test wait-for-it solr-ut:8983 -- rake test TESTOPTS='-v' -#docker-compose run --rm unit-test wait-for-it solr-ut:8983 -- bundle exec rake test TESTOPTS='-v' TEST='./test/controllers/test_annotator_controller.rb' -#docker-compose up --exit-code-from unit-test -docker-compose kill +docker compose run --rm ruby bundle exec rake test TESTOPTS='-v' +#docker compose run --rm ruby-agraph bundle exec rake test TESTOPTS='-v' +#docker-compose run --rm ruby bundle exec rake test TESTOPTS='-v' TEST='./test/controllers/test_annotator_controller.rb' +docker compose kill diff --git 
a/test/test_case.rb b/test/test_case.rb index 81a10aa6..5f164ecd 100644 --- a/test/test_case.rb +++ b/test/test_case.rb @@ -1,3 +1,21 @@ +# Start simplecov if this is a coverage task or if it is run in the CI pipeline +if ENV['COVERAGE'] == 'true' || ENV['CI'] == 'true' + require 'simplecov' + require 'simplecov-cobertura' + # https://github.com/codecov/ruby-standard-2 + # Generate HTML and Cobertura reports which can be consumed by codecov uploader + SimpleCov.formatters = SimpleCov::Formatter::MultiFormatter.new([ + SimpleCov::Formatter::HTMLFormatter, + SimpleCov::Formatter::CoberturaFormatter + ]) + SimpleCov.start do + add_filter '/test/' + add_filter 'app.rb' + add_filter 'init.rb' + add_filter '/config/' + end +end + require 'ontologies_linked_data' require_relative '../lib/ncbo_cron' require_relative '../config/config' @@ -7,7 +25,7 @@ require 'test/unit' # Check to make sure you want to run if not pointed at localhost -safe_host = Regexp.new(/localhost|-ut|ncbo-dev*|ncbo-unittest*/) +safe_host = Regexp.new(/localhost|-ut/) unless LinkedData.settings.goo_host.match(safe_host) && LinkedData.settings.search_server_url.match(safe_host) && NcboCron.settings.redis_host.match(safe_host) diff --git a/test/test_ontology_pull.rb b/test/test_ontology_pull.rb index 57fa9f47..450ae582 100644 --- a/test/test_ontology_pull.rb +++ b/test/test_ontology_pull.rb @@ -41,14 +41,14 @@ def self.after_suite @@redis.del NcboCron::Models::OntologySubmissionParser::QUEUE_HOLDER end - def test_remote_ontology_pull() + def test_remote_ontology_pull ontologies = init_ontologies(1) ont = LinkedData::Models::Ontology.find(ontologies[0].id).first ont.bring(:submissions) if ont.bring?(:submissions) assert_equal 1, ont.submissions.length pull = NcboCron::Models::OntologyPull.new - pull.do_remote_ontology_pull() + pull.do_remote_ontology_pull # check that the pull creates a new submission when the file has changed ont = LinkedData::Models::Ontology.find(ontologies[0].id).first @@ -72,7 +72,7 @@ def test_remote_ontology_pull() ont = LinkedData::Models::Ontology.find(ontologies[0].id).first ont.bring(:submissions) if ont.bring?(:submissions) assert_equal 2, ont.submissions.length - pull.do_remote_ontology_pull() + pull.do_remote_ontology_pull assert_equal 2, ont.submissions.length end @@ -130,7 +130,7 @@ def test_pull_error_notification pull = NcboCron::Models::OntologyPull.new pull.do_remote_ontology_pull - assert last_email_sent.subject.include? "[BioPortal] Load from URL failure for #{ont.name}" + assert last_email_sent.subject.include? "Load from URL failure for #{ont.name}" user = ont.administeredBy[0] user.bring(:email) assert (last_email_sent.to.first.include? user.email) || (last_email_sent.header['Overridden-Sender'].value.include? 
user.email) @@ -172,7 +172,7 @@ def init_ontologies(submission_count) sub.pullLocation = RDF::IRI.new(@@url) sub.save() rescue binding.pry end - return ontologies + ontologies end end diff --git a/test/test_ontology_submission_parser.rb b/test/test_ontology_submission_parser.rb index 8cc8d668..cbd97ba6 100644 --- a/test/test_ontology_submission_parser.rb +++ b/test/test_ontology_submission_parser.rb @@ -152,4 +152,49 @@ def test_parse_submissions assert zombies.first["/TEST-ONT-0/submissions/2"] end + def test_extract_metadata + parser = NcboCron::Models::OntologySubmissionParser.new + archived_submissions = [] + not_archived_submissions = [] + + o1 = @@ontologies[0] + o1.bring(:submissions) + o1_sub1 = o1.submissions.select { |x| x.id.to_s["/submissions/1"]}.first + o1_sub1.bring(:submissionStatus) + o1_sub2 = o1.submissions.select { |x| x.id.to_s["/submissions/2"]}.first + o1_sub2.bring(:submissionStatus) + + o2 = @@ontologies[1] + o2.bring(:submissions) + o2_sub1 = o2.submissions.select { |x| x.id.to_s["/submissions/1"]}.first + o2_sub1.bring(:submissionStatus) + o2_sub2 = o2.submissions.select { |x| x.id.to_s["/submissions/2"]}.first + o2_sub2.bring(:submissionStatus) + + options_o1 = { :all => true, :params => { :homepage => "o1 homepage" }} + + options_o2 = { + dummy_action: false, process_rdf: true, index_search: false, :diff => true, + dummy_metrics: true, run_metrics: false, process_annotator: true, + another_dummy_action: true, all: false, :params => { "homepage" => "o2 homepage" } + } + + parser.queue_submission(o1_sub1, options_o1) + parser.queue_submission(o2_sub1, options_o2) + + parser.process_queue_submissions + + o1_sub1 = LinkedData::Models::OntologySubmission.find(RDF::IRI.new(o1_sub1.id)).first + o1_sub1.bring(:submissionStatus) + + o2_sub1 = LinkedData::Models::OntologySubmission.find(RDF::IRI.new(o2_sub1.id)).first + o2_sub1.bring(:submissionStatus) + + o1_sub1_statusCodes = LinkedData::Models::SubmissionStatus.get_status_codes(o1_sub1.submissionStatus) + o2_sub1_statusCodes = LinkedData::Models::SubmissionStatus.get_status_codes(o2_sub1.submissionStatus) + + assert_equal [], ["UPLOADED", "RDF", "RDF_LABELS", "INDEXED"] - o1_sub1_statusCodes + assert_equal [], ["UPLOADED", "RDF", "RDF_LABELS", "ANNOTATOR"] - o2_sub1_statusCodes + end + end
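Note on the pull refactor exercised by these tests: do_remote_ontology_pull now delegates to do_ontology_pull per ontology, and RemoteFileException carries the affected submission. A minimal sketch of pulling a single ontology directly, assuming the ncbo_cron environment is loaded; the acronym and options are illustrative:

# Sketch: single-ontology pull with the refactored NcboCron::Models::OntologyPull API.
require "logger"

pull   = NcboCron::Models::OntologyPull.new
logger = Logger.new($stdout)
begin
  new_sub = pull.do_ontology_pull("BIOREFINERY",
                                  enable_pull_umls: false,
                                  isLong: false,
                                  logger: logger,
                                  options: { pull_long_ontologies: [] })
  logger.info("Created submission #{new_sub.id}") if new_sub
rescue NcboCron::Models::OntologyPull::RemoteFileException => e
  logger.error("No submission file at #{e.submission.pullLocation} for this ontology")
end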