From 91ab9af5f918f3fef3b4fbd705b5e216a59acd7a Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Fri, 19 Apr 2024 16:01:02 -0400 Subject: [PATCH] Update Dockerfile, dependencies * Update to ruby 3.3 * Update gems * Use healthcheck instead of wait-for * Remove activesupport (appears to only be used in tests) * update versions for dependent github actions in ci --- Dockerfile | 17 ++- Gemfile | 3 +- Gemfile.lock | 110 ++++++++-------- README.md | 7 +- bin/wait-for | 118 ------------------ docker-compose.yml | 84 ++++++------- .../rights_feed_volume_repo_spec.rb | 12 +- spec/repository/rights_volume_repo_spec.rb | 5 +- spec/spec_helper.rb | 3 - 9 files changed, 128 insertions(+), 231 deletions(-) delete mode 100755 bin/wait-for diff --git a/Dockerfile b/Dockerfile index 31141c1..9deda09 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ -FROM ruby:3.1 +FROM ruby:3.3 AS base +LABEL org.opencontainers.image.source="https://github.com/hathitrust/datasets" + ARG UNAME=app -ARG UID=1000 -ARG GID=1000 # for rotatelogs RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends \ @@ -11,14 +11,19 @@ RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends \ # sdrN for volumes and symlinks RUN bash -c 'for i in $(seq 1 24); do ln -s /sdr/$i /sdr$i; done' +WORKDIR /usr/src/app +ENV BUNDLE_PATH /gems +ENV RUBYLIB /usr/src/app/lib + +FROM base AS production + +ARG UID=1000 +ARG GID=1000 RUN gem install bundler RUN groupadd -g $GID -o $UNAME RUN useradd -m -d /usr/src/app -u $UID -g $GID -o -s /bin/bash $UNAME RUN mkdir -p /gems && chown $UID:$GID /gems USER $UNAME COPY --chown=$UID:$GID Gemfile* /usr/src/app/ -WORKDIR /usr/src/app -ENV BUNDLE_PATH /gems -ENV RUBYLIB /usr/src/app/lib RUN bundle install COPY --chown=$UID:$GID . 
/usr/src/app diff --git a/Gemfile b/Gemfile index 5843582..18ca82e 100644 --- a/Gemfile +++ b/Gemfile @@ -1,6 +1,5 @@ source "https://rubygems.org" -gem "activesupport" gem "mysql2" gem "puma" gem "pairtree", "~> 0.3" @@ -9,6 +8,8 @@ gem "sequel" gem "rubyzip" gem "thor" gem "rake", "~> 12.3" +gem "csv" +gem "base64" group :development, :test do gem "rspec", "~> 3.0" diff --git a/Gemfile.lock b/Gemfile.lock index 23ae5e2..a717dc6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,73 +1,72 @@ GEM remote: https://rubygems.org/ specs: - activesupport (7.0.4.3) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 1.6, < 2) - minitest (>= 5.1) - tzinfo (~> 2.0) ast (2.4.2) + base64 (0.2.0) + bigdecimal (3.1.7) byebug (11.1.3) coderay (1.1.3) - concurrent-ruby (1.2.2) - connection_pool (2.4.0) - diff-lcs (1.5.0) + connection_pool (2.4.1) + csv (3.3.0) + diff-lcs (1.5.1) docile (1.4.0) - i18n (1.12.0) - concurrent-ruby (~> 1.0) - json (2.6.3) + json (2.7.2) language_server-protocol (3.17.0.3) - method_source (1.0.0) - minitest (5.18.0) - mysql2 (0.5.5) - nio4r (2.5.8) + lint_roller (1.1.0) + method_source (1.1.0) + mysql2 (0.5.6) + nio4r (2.7.1) pairtree (0.3.0) - parallel (1.22.1) - parser (3.2.1.1) + parallel (1.24.0) + parser (3.3.0.5) ast (~> 2.4.1) + racc pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - puma (6.2.1) + puma (6.4.2) nio4r (~> 2.0) - rack (2.2.6.4) + racc (1.7.3) + rack (2.2.9) rainbow (3.1.1) rake (12.3.3) redis (4.8.1) - regexp_parser (2.7.0) - rexml (3.2.5) - rspec (3.12.0) - rspec-core (~> 3.12.0) - rspec-expectations (~> 3.12.0) - rspec-mocks (~> 3.12.0) - rspec-core (3.12.1) - rspec-support (~> 3.12.0) - rspec-expectations (3.12.2) + regexp_parser (2.9.0) + rexml (3.2.6) + rspec (3.13.0) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.0) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.0) diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.12.0) - rspec-mocks (3.12.5) + rspec-support (~> 
3.13.0) + rspec-mocks (3.13.0) diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.12.0) - rspec-support (3.12.0) - rubocop (1.48.1) + rspec-support (~> 3.13.0) + rspec-support (3.13.1) + rubocop (1.62.1) json (~> 2.3) + language_server-protocol (>= 3.17.0) parallel (~> 1.10) - parser (>= 3.2.0.0) + parser (>= 3.3.0.2) rainbow (>= 2.2.2, < 4.0) regexp_parser (>= 1.8, < 3.0) rexml (>= 3.2.5, < 4.0) - rubocop-ast (>= 1.26.0, < 2.0) + rubocop-ast (>= 1.31.1, < 2.0) ruby-progressbar (~> 1.7) unicode-display_width (>= 2.4.0, < 3.0) - rubocop-ast (1.28.0) - parser (>= 3.2.1.0) - rubocop-performance (1.16.0) - rubocop (>= 1.7.0, < 2.0) - rubocop-ast (>= 0.4.0) + rubocop-ast (1.31.2) + parser (>= 3.3.0.4) + rubocop-performance (1.20.2) + rubocop (>= 1.48.1, < 2.0) + rubocop-ast (>= 1.30.0, < 2.0) ruby-progressbar (1.13.0) rubyzip (2.3.2) - sequel (5.66.0) - sidekiq (6.5.8) + sequel (5.79.0) + bigdecimal + sidekiq (6.5.12) connection_pool (>= 2.2.5, < 3) rack (~> 2.0) redis (>= 4.5.0, < 5) @@ -78,22 +77,29 @@ GEM simplecov-html (0.12.3) simplecov-lcov (0.8.0) simplecov_json_formatter (0.1.4) - standard (1.25.3) + standard (1.35.1) language_server-protocol (~> 3.17.0.2) - rubocop (~> 1.48.1) - rubocop-performance (~> 1.16.0) - thor (1.2.1) - timecop (0.9.6) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) - unicode-display_width (2.4.2) + lint_roller (~> 1.0) + rubocop (~> 1.62.0) + standard-custom (~> 1.0.0) + standard-performance (~> 1.3) + standard-custom (1.0.2) + lint_roller (~> 1.0) + rubocop (~> 1.50) + standard-performance (1.3.1) + lint_roller (~> 1.1) + rubocop-performance (~> 1.20.2) + thor (1.3.1) + timecop (0.9.8) + unicode-display_width (2.5.0) PLATFORMS ruby DEPENDENCIES - activesupport + base64 byebug + csv mysql2 pairtree (~> 0.3) pry @@ -110,4 +116,4 @@ DEPENDENCIES timecop BUNDLED WITH - 2.3.25 + 2.5.9 diff --git a/README.md b/README.md index 323190b..002bbaa 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,12 @@ The list of changes is filtered into queues. 
There is a queue for each subset a For each volume in a queue, a job is scheduled to apply the changes to the filesystem. ## Use -Scheduled job to be run daily? weekly? +Deployed via [private ArgoCD control repository](https://github.com/hathitrust/ht_tanka/tree/main/environments/datasets/production) + +This creates a set of workers for handling data set jobs, as well as a set of +cron jobs to generate the dataset full inventory, fetch metadata, queue jobs +for updating the data set, and compile and process the logs generated by +the workers. ## Assumptions & Dependencies Atomic filesystem moves. This remains to be tested. diff --git a/bin/wait-for b/bin/wait-for deleted file mode 100755 index 9fd63d8..0000000 --- a/bin/wait-for +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -# From https://github.com/eficode/wait-for - -# The MIT License (MIT) -# -# Copyright (c) 2017 Eficode Oy -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -TIMEOUT=15 -QUIET=0 - -echoerr() { - if [ "$QUIET" -ne 1 ]; then printf "%s\n" "$*" 1>&2; fi -} - -usage() { - exitcode="$1" - cat << USAGE >&2 -Usage: - $cmdname host:port [host:port ...] [-t timeout] [-- command args] - -q | --quiet Do not output any status messages - -t TIMEOUT | --timeout=timeout Timeout in seconds, zero for no timeout - -- COMMAND ARGS Execute command with args after the test finishes -USAGE - exit "$exitcode" -} - -wait_for() { - command="$*" - for i in `seq $TIMEOUT` ; do - - result=0 - - for dep in $DEPENDENCIES; do - host=$(printf "%s\n" "$dep"| cut -d : -f 1) - port=$(printf "%s\n" "$dep"| cut -d : -f 2) - if [ "$host" = "" -o "$port" = "" ]; then - echoerr "Error: you need to provide a host and port to test." - usage 2 - fi - nc -z "$host" "$port" > /dev/null 2>&1 - nc_result=$? - if [ $nc_result -ne 0 ] ; then - result=1 - fi - done - - if [ $result -eq 0 ] ; then - if [ -n "$command" ] ; then - exec $command - fi - exit 0 - fi - sleep 1 - done - echo "Operation timed out" >&2 - exit 1 -} - -DEPENDENCIES="" -while [ $# -gt 0 ] -do - case "$1" in - *:* ) - DEPENDENCIES+=" $1" - shift 1 - ;; - -q | --quiet) - QUIET=1 - shift 1 - ;; - -t) - TIMEOUT="$2" - if [ "$TIMEOUT" = "" ]; then break; fi - shift 2 - ;; - --timeout=*) - TIMEOUT="${1#*=}" - shift 1 - ;; - --) - shift - break - ;; - --help) - usage 0 - ;; - *) - echoerr "Unknown argument: $1" - usage 1 - ;; - esac -done - -if [ "${#DEPENDENCIES[@]}" -eq "0" ]; then - echoerr "Error: you need to provide a host and port to test." - usage 2 -fi - -wait_for "$@" diff --git a/docker-compose.yml b/docker-compose.yml index 903ae73..0fba686 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,63 +1,60 @@ -version: '3' +--- +x-condition-healthy: &healthy + condition: service_healthy + +x-healthcheck-defaults: &healthcheck-defaults + interval: 5s + timeout: 10s + start_period: 10s + retries: 5 + +x-common-service: &common-service + build: + context: . 
+ target: base + volumes: + - .:/usr/src/app + - gem_cache:/gems + - ./example/datasets:/tmp/datasets + environment: + REDIS_URL: redis://redis/ services: test: - build: . + <<: *common-service restart: never - volumes: - - .:/usr/src/app - - gem_cache:/gems - command: bin/wait-for --timeout=300 mariadb-test:3306 redis:6379 -- bundle exec rspec + command: bundle exec rspec depends_on: - - redis - - mariadb-test - environment: - REDIS_URL: redis://redis/ + redis: *healthy + mariadb-test: *healthy processor: - build: . + <<: *common-service restart: always - volumes: - - .:/usr/src/app - - gem_cache:/gems - - ./example/datasets:/tmp/datasets command: bundle exec sidekiq -r ./lib/datasets/sidekiq_jobs.rb depends_on: - - redis - - mariadb-dev - environment: - REDIS_URL: redis://redis/ + redis: *healthy + mariadb-dev: *healthy sidekiq_web: - build: . + <<: *common-service restart: always - volumes: - - .:/usr/src/app - - gem_cache:/gems command: bundle exec puma bin/sidekiq_web.ru depends_on: - - redis + redis: *healthy ports: - 9292:9292 - environment: - REDIS_URL: redis://redis/ queuer: - build: . 
+ <<: *common-service restart: never - volumes: - - .:/usr/src/app - - gem_cache:/gems - - ./example/datasets:/tmp/datasets command: bin/datasets.rb depends_on: - - redis - - mariadb-dev - environment: - REDIS_URL: redis://redis/ + redis: *healthy + mariadb-dev: *healthy - mariadb-test: + mariadb-dev: &mariadb image: ghcr.io/hathitrust/db-image restart: always environment: @@ -65,19 +62,18 @@ services: MYSQL_DATABASE: ht MYSQL_USER: datasets MYSQL_PASSWORD: datasets + healthcheck: + <<: *healthcheck-defaults + test: ["CMD", "healthcheck.sh", "--su-mysql", "--connect", "--innodb_initialized"] - mariadb-dev: - image: ghcr.io/hathitrust/db-image - restart: always - environment: - MYSQL_ROOT_PASSWORD: mysqlroot - MYSQL_DATABASE: ht - MYSQL_USER: datasets - MYSQL_PASSWORD: datasets + mariadb-test: *mariadb redis: image: redis restart: always + healthcheck: + <<: *healthcheck-defaults + test: ["CMD", "redis-cli", "ping"] volumes: gem_cache: diff --git a/spec/repository/rights_feed_volume_repo_spec.rb b/spec/repository/rights_feed_volume_repo_spec.rb index 765d7c9..0438fe8 100644 --- a/spec/repository/rights_feed_volume_repo_spec.rb +++ b/spec/repository/rights_feed_volume_repo_spec.rb @@ -4,6 +4,10 @@ module Datasets RSpec.describe Repository::RightsFeedVolumeRepo do let(:start_time) { Time.at(0) } let(:end_time) { Time.now } + let(:one_day) { 86400 } + let(:two_minutes) { 120 } + let(:yesterday) { Time.now - one_day } + let(:tomorrow) { Time.now + one_day } before(:all) do @connection = Sequel.connect(adapter: "mysql2", @@ -81,7 +85,7 @@ def volume_from(hash) it "returns Volume objects" do feed_table.insert(vol_feed_1) rights_table.insert(vol_rights_1) - volumes = repo.changed_between(Time.at(0), vol_rights_1[:time] + 1.day) + volumes = repo.changed_between(Time.at(0), vol_rights_1[:time] + one_day) expect(volumes.first).to be_an_instance_of Volume end @@ -103,7 +107,7 @@ def volume_from(hash) it "does not return volumes with md5check_ok: false" do feed_table.insert 
vol_feed_1.merge(md5check_ok: false, zip_date: Time.now) rights_table.insert vol_rights_1 - expect(repo.changed_between(1.day.ago, 1.day.from_now)).to be_empty + expect(repo.changed_between(yesterday, tomorrow)).to be_empty end it "does return volumes with md5check_ok: true or null" do @@ -111,13 +115,13 @@ def volume_from(hash) rights_table.insert vol_rights_1 feed_table.insert vol_feed_2.merge(md5check_ok: nil, zip_date: Time.now) rights_table.insert vol_rights_2 - expect(repo.changed_between(1.day.ago, 1.day.from_now)) + expect(repo.changed_between(yesterday, tomorrow)) .to contain_exactly(volume_from(vol_rights_1), volume_from(vol_rights_2)) end it "returns an empty set when nothing to find" do feed_table.insert vol_feed_1 - expect(repo.changed_between(vol_feed_1[:zip_date] + 1.day, vol_feed_1[:zip_date] + 2.minutes)) + expect(repo.changed_between(vol_feed_1[:zip_date] + one_day, vol_feed_1[:zip_date] + two_minutes)) .to be_empty end diff --git a/spec/repository/rights_volume_repo_spec.rb b/spec/repository/rights_volume_repo_spec.rb index 2599db6..3b11584 100644 --- a/spec/repository/rights_volume_repo_spec.rb +++ b/spec/repository/rights_volume_repo_spec.rb @@ -1,12 +1,13 @@ require "spec_helper" require "repository/rights_volume_repo" require "volume" - require "set" require "sequel" module Datasets RSpec.describe Repository::RightsVolumeRepo do + let(:one_day) { 86400 } + before(:all) do @connection = Sequel.connect(adapter: "mysql2", database: "ht", @@ -64,7 +65,7 @@ def volume_from(hash) describe "#rights_changed_between" do it "returns Volume objects" do table.insert(tuple_1) - volumes = repo.changed_between(Time.at(0), tuple_1[:time] + 1.day) + volumes = repo.changed_between(Time.at(0), tuple_1[:time] + one_day) expect(volumes.first).to be_an_instance_of Volume end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 51afa9f..61af326 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -2,9 +2,6 @@ require "simplecov" require 
"simplecov-lcov" -require "active_support/isolated_execution_state" -require "active_support/core_ext/numeric/time" -require "active_support/core_ext/hash/slice" require "sidekiq" SimpleCov::Formatter::LcovFormatter.config do |c|