HYC-1937 - Expand Migration Utility to Include Matomo Stats (#1119)
* use date inserted for dimensions queries

* separate local stat aggregation into a helper, implement fetching Matomo stats in DownloadStatsMigrationService

* update rake task, expand test suite, update arguments 

* error handling for unsupported sources

* require before and after timestamps for migrations from ga4 and matomo

* WorkUtilsHelper debugging, update unit tests to accommodate changes
davidcam-src authored Sep 25, 2024
1 parent 4c583d2 commit f673b03
Showing 10 changed files with 436 additions and 186 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
@@ -138,12 +138,12 @@ jobs:
        POSTGRES_PASSWORD: password
        TMPDIR: /tmp

-     - uses: actions/upload-artifact@v2
+     - uses: actions/upload-artifact@v3
        with:
          name: test-coverage
          path: coverage

-     - uses: actions/upload-artifact@v2
+     - uses: actions/upload-artifact@v3
        with:
          name: test-coverage-report
          path: coverage/coverage.json
28 changes: 8 additions & 20 deletions app/controllers/concerns/hyc/download_analytics_behavior.rb
@@ -18,16 +18,16 @@ def track_download
      client_ip = request.remote_ip
      user_agent = request.user_agent

-     matomo_id_site = site_id
-     matomo_security_token = auth_token
-     uri = URI("#{base_url}/matomo.php")
+     matomo_site_id = ENV['MATOMO_SITE_ID']
+     matomo_security_token = ENV['MATOMO_AUTH_TOKEN']
+     tracking_uri = URI("#{ENV['MATOMO_BASE_URL']}/matomo.php")

      # Some parameters are optional, but included since tracking would not work otherwise
      # https://developer.matomo.org/api-reference/tracking-api
      uri_params = {
        token_auth: matomo_security_token,
        rec: '1',
-       idsite: matomo_id_site,
+       idsite: matomo_site_id,
        action_name: 'Download',
        url: request.url,
        urlref: request.referrer,
@@ -44,11 +44,11 @@ def track_download
        dimension1: work_data[:work_id],
        dimension2: work_data[:title]
      }
-     uri.query = URI.encode_www_form(uri_params)
-     response = HTTParty.get(uri.to_s)
-     Rails.logger.debug("Matomo download tracking URL: #{uri}")
+     tracking_uri.query = URI.encode_www_form(uri_params)
+     response = HTTParty.get(tracking_uri.to_s)
+     Rails.logger.debug("Matomo download tracking URL: #{tracking_uri}")
      if response.code >= 300
-       Rails.logger.error("DownloadAnalyticsBehavior received an error response #{response.code} for matomo query: #{uri}")
+       Rails.logger.error("DownloadAnalyticsBehavior received an error response #{response.code} for matomo query: #{tracking_uri}")
      end
      # Send download events to db
      create_download_stat
@@ -92,18 +92,6 @@ def work_data
      @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
    end

-   def site_id
-     @site_id ||= ENV['MATOMO_SITE_ID']
-   end
-
-   def auth_token
-     @auth_token ||= ENV['MATOMO_AUTH_TOKEN']
-   end
-
-   def base_url
-     @base_url ||= ENV['MATOMO_BASE_URL']
-   end
-
    def client_id
      cookie = cookies.find { |key, _| key.start_with?('_pk_id') }&.last
      if cookie.present?
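For context, the tracking logic above reduces to a single GET against Matomo's HTTP Tracking API. A minimal standalone sketch, with illustrative stand-ins for the MATOMO_* environment variables and request attributes (none of these values come from the commit):

    require 'httparty'
    require 'uri'

    # Illustrative values only; the real code reads these from ENV and the request.
    tracking_uri = URI('https://analytics.example.edu/matomo.php')
    tracking_uri.query = URI.encode_www_form(
      token_auth: 'example-token',  # stand-in for MATOMO_AUTH_TOKEN
      rec: '1',                     # tells Matomo to record the hit
      idsite: '5',                  # stand-in for MATOMO_SITE_ID
      action_name: 'Download',
      url: 'https://repo.example.edu/downloads/abc123'
    )
    response = HTTParty.get(tracking_uri.to_s)
    puts response.code # anything >= 300 is logged as an error above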
25 changes: 13 additions & 12 deletions app/helpers/work_utils_helper.rb
@@ -1,20 +1,21 @@
# frozen_string_literal: true
module WorkUtilsHelper
  def self.fetch_work_data_by_fileset_id(fileset_id)
-   work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
-   Rails.logger.warn("No work found for fileset id: #{fileset_id}") if work.blank?
-   # Fetch the admin set related to the work
-   admin_set_name = work['admin_set_tesim']&.first
-   # If the admin set name is not nil, fetch the admin set
-   # Set the admin set to an empty hash if the solr query returns nil
-   admin_set = admin_set_name ? ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", { :rows => 1, 'df' => 'title_tesim'})['response']['docs'].first || {} : {}
-   Rails.logger.warn(self.generate_warning_message(admin_set_name, fileset_id)) if admin_set.blank?
+   # Retrieve the fileset data
+   fileset_data = ActiveFedora::SolrService.get("id:#{fileset_id}", rows: 1)['response']['docs'].first || {}
+   Rails.logger.warn("No fileset data found for fileset id: #{fileset_id}") if fileset_data.blank?
+   # Retrieve the work related to the fileset
+   work_data = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
+   Rails.logger.warn("No work found associated with fileset id: #{fileset_id}") if work_data.blank?
+   admin_set_name = work_data['admin_set_tesim']&.first
+   admin_set_data = admin_set_name ? ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", { :rows => 1, 'df' => 'title_tesim'})['response']['docs'].first || {} : {}
+   Rails.logger.warn(self.generate_warning_message(admin_set_name, fileset_id)) if admin_set_data.blank?

    {
-     work_id: work['id'],
-     work_type: work.dig('has_model_ssim', 0),
-     title: work['title_tesim']&.first,
-     admin_set_id: admin_set['id'],
+     work_id: fileset_data['id'],
+     work_type: work_data.dig('has_model_ssim', 0),
+     title: work_data['title_tesim']&.first,
+     admin_set_id: admin_set_data['id'],
      admin_set_name: admin_set_name
    }
  end
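As a usage sketch, the reworked helper returns a hash shaped roughly like the following (all identifiers are invented for illustration); note that work_id is now populated from the fileset document's id:

    WorkUtilsHelper.fetch_work_data_by_fileset_id('fs-1a2b3c')
    # => {
    #      work_id: 'fs-1a2b3c',            # taken from the fileset document's id
    #      work_type: 'Article',
    #      title: 'Sample Work Title',
    #      admin_set_id: 'as-9z8y',
    #      admin_set_name: 'Open Access Articles'
    #    }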
2 changes: 1 addition & 1 deletion app/services/tasks/dimensions_query_service.rb
@@ -195,7 +195,7 @@ def solr_query_builder(pub)

    # Query with parameters to retrieve publications related to UNC
    def generate_query_string(start_date, end_date, page_size, cursor)
-     search_clauses = ['where type = "article"', "date >= \"#{start_date}\"", "date < \"#{end_date}\""].join(' and ')
+     search_clauses = ['where type = "article"', "date_inserted >= \"#{start_date}\"", "date_inserted < \"#{end_date}\""].join(' and ')
      return_fields = ['basics', 'extras', 'abstract', 'issn', 'publisher', 'journal_title_raw', 'linkout', 'concepts'].join(' + ')
      unc_affiliation_variants = ['"UNC-CH"', '"University of North Carolina at Chapel Hill"', '"UNC-Chapel Hill"', '"University of North Carolina-Chapel Hill"', '"University of North Carolina, Chapel Hill"'].join(' OR ')
      <<~QUERY
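With illustrative bounds of '2024-01-01' and '2024-02-01', the joined clause now evaluates to:

    search_clauses
    # => 'where type = "article" and date_inserted >= "2024-01-01" and date_inserted < "2024-02-01"'

That is, publications are windowed by when Dimensions indexed them rather than by publication date, matching the "use date inserted" commit note.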
137 changes: 107 additions & 30 deletions app/services/tasks/download_stats_migration_service.rb
@@ -2,38 +2,32 @@
module Tasks
  class DownloadStatsMigrationService
    PAGE_SIZE = 1000
-   def list_work_stat_info(output_path, after_timestamp = nil)
-     begin
-       query = FileDownloadStat.all
-       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
-       total_work_stats = query.count
-       timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
-
-       # Log number of work stats retrieved and timestamp clause
-       Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")
-
-       aggregated_data = {}
-       work_stats_retrieved_from_query_count = 0
-
-       Rails.logger.info('Retrieving work_stats from the database')
-       # Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries
-       query.find_each(batch_size: PAGE_SIZE) do |stat|
-         truncated_date = stat.date.beginning_of_month
-         # Group the file_id and truncated date to be used as a key
-         key = [stat.file_id, truncated_date]
-         # Initialize the hash for the key if it doesn't exist
-         aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 }
-         # Sum the downloads for each key
-         aggregated_data[key][:downloads] += stat.downloads
-         work_stats_retrieved_from_query_count += 1
-         log_progress(work_stats_retrieved_from_query_count, total_work_stats)
-       end
-
-       aggregated_work_stats = aggregated_data.values
-       Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats")
-
-       # Write the work_stats to the specified CSV file
-       write_to_csv(output_path, aggregated_work_stats)
+   module DownloadMigrationSource
+     MATOMO = :matomo
+     GA4 = :ga4
+     CACHE = :cache
+
+     def self.all_sources
+       [MATOMO, GA4, CACHE]
+     end
+
+     def self.valid?(source)
+       all_sources.include?(source)
+     end
+   end
+
+   def list_work_stat_info(output_path, after_timestamp = nil, before_timestamp = nil, source)
+     aggregated_work_stats = []
+     begin
+       case source
+       when DownloadMigrationSource::CACHE
+         aggregated_work_stats = fetch_local_cache_stats(after_timestamp, output_path)
+         write_to_csv(output_path, aggregated_work_stats)
+       when DownloadMigrationSource::MATOMO
+         aggregated_work_stats = fetch_matomo_stats(after_timestamp, before_timestamp, output_path)
+         write_to_csv(output_path, aggregated_work_stats)
+       else
+         raise ArgumentError, "Unsupported source: #{source}"
+       end
      rescue StandardError => e
        Rails.logger.error("An error occurred while listing work stats: #{e.message}")
        Rails.logger.error(e.backtrace.join("\n"))
@@ -68,6 +62,89 @@ def migrate_to_new_table(csv_path)

    private

+   # Fetch and aggregate work stats from Matomo
+   def fetch_matomo_stats(after_timestamp, before_timestamp, output_path)
+     aggregated_data = {}
+     # Keeps count of stats retrieved from Matomo across all queries
+     all_query_stat_total = 0
+     # Log the range of work stats being fetched
+     timestamp_clause = "in specified range #{after_timestamp} to #{before_timestamp}"
+     Rails.logger.info("Fetching work stats #{timestamp_clause} from Matomo.")
+
+     # Query the Matomo API for each month in the range and aggregate the data.
+     # Setting period to month returns stats for each month in the range, regardless of the specified date
+     reporting_uri = URI("#{ENV['MATOMO_BASE_URL']}/index.php")
+     # Fetch the first of each month in the range
+     months_array = first_of_each_month_in_range(after_timestamp, before_timestamp)
+     months_array.each_with_index do |first_date_of_month, index|
+       uri_params = {
+         module: 'API',
+         idSite: ENV['MATOMO_SITE_ID'],
+         method: 'Events.getName',
+         period: 'month',
+         date: first_date_of_month,
+         format: 'JSON',
+         token_auth: ENV['MATOMO_AUTH_TOKEN'],
+         flat: '1',
+         filter_pattern: 'DownloadIR',
+         filter_limit: -1,
+         showColumns: 'nb_events'
+       }
+       reporting_uri.query = URI.encode_www_form(uri_params)
+       response = HTTParty.get(reporting_uri.to_s)
+       month_year_string = first_date_of_month.to_date.strftime('%B %Y')
+       Rails.logger.info("Processing Matomo response for #{month_year_string}. (#{index + 1}/#{months_array.count})")
+       response.parsed_response.each do |stat|
+         # Events_EventName is the file_id, nb_events is the number of downloads
+         update_aggregate_stats(aggregated_data, first_date_of_month, stat['Events_EventName'], stat['nb_events'])
+       end
+       monthly_stat_total = response.parsed_response.length
+       all_query_stat_total += monthly_stat_total
+     end
+     Rails.logger.info("Aggregated #{aggregated_data.values.count} monthly stats from #{all_query_stat_total} total retrieved stats")
+     # Return the aggregated data
+     aggregated_data.values
+   end
+
+   def update_aggregate_stats(aggregated_data, truncated_date, file_id, downloads)
+     # Combine the file_id and truncated date into a key
+     key = [file_id, truncated_date]
+     # Initialize the hash for the key if it doesn't exist
+     aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 }
+     # Sum the downloads for each key
+     aggregated_data[key][:downloads] += downloads
+   end
+
+   def first_of_each_month_in_range(after_timestamp, before_timestamp)
+     after_date = after_timestamp.to_date.beginning_of_month
+     before_date = before_timestamp.to_date.beginning_of_month
+     (after_date..before_date).select { |d| d.day == 1 }.map(&:to_s)
+   end
+
+   # Fetch and aggregate work stats from the local cache
+   def fetch_local_cache_stats(after_timestamp, output_path)
+     aggregated_data = {}
+     work_stats_retrieved_from_query_count = 0
+     query = FileDownloadStat.all
+     query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
+     total_work_stats = query.count
+     timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
+
+     # Log the number of work stats to fetch and the timestamp clause
+     Rails.logger.info("Fetching #{total_work_stats} work stats #{timestamp_clause} from the hyrax local cache.")
+
+     # Fetch the work_stats and aggregate them into monthly stats in Ruby; SQL-level aggregation ran into issues
+     query.find_each(batch_size: PAGE_SIZE) do |stat|
+       update_aggregate_stats(aggregated_data, stat.date.beginning_of_month, stat.file_id, stat.downloads)
+       work_stats_retrieved_from_query_count += 1
+       log_progress(work_stats_retrieved_from_query_count, total_work_stats)
+     end
+
+     Rails.logger.info("Aggregated #{aggregated_data.values.count} monthly stats from #{total_work_stats} daily stats")
+     # Return the aggregated data
+     aggregated_data.values
+   end
+
    # Log progress at 25%, 50%, 75%, and 100%
    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation')
      percentages = [0.25, 0.5, 0.75, 1.0]
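A hedged console sketch of driving the expanded service directly (output path and timestamps are illustrative, not from the commit):

    service = Tasks::DownloadStatsMigrationService.new
    source = Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO
    # Matomo (and GA4) migrations need both bounds; the cache source only uses 'after'.
    service.list_work_stat_info('tmp/matomo_stats.csv', '2024-01-15', '2024-03-20', source)
    # Internally, first_of_each_month_in_range('2024-01-15', '2024-03-20') expands to
    # ['2024-01-01', '2024-02-01', '2024-03-01'], one Events.getName query per month.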
18 changes: 16 additions & 2 deletions lib/tasks/migrate_download_stats.rake
@@ -5,7 +5,7 @@ require 'optparse/date'

namespace :migrate_download_stats do
  desc 'output rows for download stat migration into a csv'
- task :list_rows, [:output_dir, :after] => :environment do |_t, _args|
+ task :list_rows, [:output_dir, :after, :before, :source] => :environment do |_t, _args|
    start_time = Time.now
    puts "[#{start_time.utc.iso8601}] starting listing of work data"
    options = {}
@@ -14,6 +14,8 @@ namespace :migrate_download_stats do
    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]'
    opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
    opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
+   opts.on('-b', '--before ARG', String, 'List objects updated before this timestamp, only meant for matomo and ga4 migrations') { |val| options[:before] = val }
+   opts.on('-s', '--source ARG', String, 'Data source (matomo, ga4, cache)') { |val| options[:source] = val.to_sym }
    args = opts.order!(ARGV) {}
    opts.parse!(args)

@@ -22,8 +24,20 @@
      exit 1
    end

+   unless Tasks::DownloadStatsMigrationService::DownloadMigrationSource.valid?(options[:source])
+     puts "Please provide a valid source: #{Tasks::DownloadStatsMigrationService::DownloadMigrationSource.all_sources.join(', ')}"
+     exit 1
+   end
+
+   # Require both 'before' and 'after' arguments if the source is not 'cache'
+   if options[:source] != Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE && (options[:before].blank? || options[:after].blank?)
+     puts "Both 'before' and 'after' timestamps are required for sources other than #{Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE}"
+     exit 1
+   end
+
    migration_service = Tasks::DownloadStatsMigrationService.new
-   old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after])
+   old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after], options[:before], options[:source])
    puts "Listing completed in #{Time.now - start_time}s"
    puts "Stored id list to file: #{options[:output_dir]}"
    exit 0
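Putting the new flags together, a Matomo migration might be invoked like this (directory and dates are illustrative):

    bundle exec rake migrate_download_stats:list_rows -- \
      --output-dir=tmp/download_stats.csv \
      --after=2024-01-01 \
      --before=2024-03-31 \
      --source=matomo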
5 changes: 0 additions & 5 deletions spec/controllers/hyrax/downloads_controller_spec.rb
@@ -345,9 +345,4 @@
    end
  end

-  describe '#site_id' do
-    it 'returns the site id from ENV' do
-      expect(controller.send(:site_id)).to eq('5')
-    end
-  end
end
80 changes: 80 additions & 0 deletions spec/fixtures/files/matomo_stats_migration_fixture.json
@@ -0,0 +1,80 @@
{
"2024-01-01": [
{
"label": "file_id_1 - DownloadIR",
"nb_events": 120,
"Events_EventName": "file_id_1",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_1 - DownloadIR",
"nb_events": 70,
"Events_EventName": "file_id_1",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_2 - DownloadIR",
"nb_events": 100,
"Events_EventName": "file_id_2",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_2 - DownloadIR",
"nb_events": 50,
"Events_EventName": "file_id_2",
"Events_EventAction": "DownloadIR"
}
],
"2024-02-01": [
{
"label": "file_id_3 - DownloadIR",
"nb_events": 10,
"Events_EventName": "file_id_3",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_3 - DownloadIR",
"nb_events": 90,
"Events_EventName": "file_id_3",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_4 - DownloadIR",
"nb_events": 50,
"Events_EventName": "file_id_4",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_4 - DownloadIR",
"nb_events": 30,
"Events_EventName": "file_id_4",
"Events_EventAction": "DownloadIR"
}
],
"2024-03-01": [
{
"label": "file_id_5 - DownloadIR",
"nb_events": 80,
"Events_EventName": "file_id_5",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_5 - DownloadIR",
"nb_events": 100,
"Events_EventName": "file_id_5",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_6 - DownloadIR",
"nb_events": 250,
"Events_EventName": "file_id_6",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_6 - DownloadIR",
"nb_events": 300,
"Events_EventName": "file_id_6",
"Events_EventAction": "DownloadIR"
}
]
}
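Folding these rows through update_aggregate_stats should collapse each month to one stat per file; the hand-computed expectation for this fixture is:

    file_id_1  2024-01-01  190 downloads (120 + 70)
    file_id_2  2024-01-01  150 downloads (100 + 50)
    file_id_3  2024-02-01  100 downloads (10 + 90)
    file_id_4  2024-02-01   80 downloads (50 + 30)
    file_id_5  2024-03-01  180 downloads (80 + 100)
    file_id_6  2024-03-01  550 downloads (250 + 300)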