HYC-1937 - Expand Migration Utility to Include Matomo Stats (#1119)
* use date inserted for dimensions queries

* separate local stat aggregation into a helper, implement fetching Matomo stats in DownloadStatsMigrationService

* update rake task, expand test suite, update arguments 

* error handling for unsupported sources

* require before and after timestamps for migrations from ga4 and matomo

* WorkUtilsHelper debugging, update unit tests to accommodate changes
davidcam-src authored Sep 25, 2024
1 parent 4c583d2 commit f673b03
Showing 10 changed files with 436 additions and 186 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
@@ -138,12 +138,12 @@ jobs:
        POSTGRES_PASSWORD: password
        TMPDIR: /tmp

-     - uses: actions/upload-artifact@v2
+     - uses: actions/upload-artifact@v3
        with:
          name: test-coverage
          path: coverage

-     - uses: actions/upload-artifact@v2
+     - uses: actions/upload-artifact@v3
        with:
          name: test-coverage-report
          path: coverage/coverage.json
28 changes: 8 additions & 20 deletions app/controllers/concerns/hyc/download_analytics_behavior.rb
@@ -18,16 +18,16 @@ def track_download
      client_ip = request.remote_ip
      user_agent = request.user_agent

-     matomo_id_site = site_id
-     matomo_security_token = auth_token
-     uri = URI("#{base_url}/matomo.php")
+     matomo_site_id = ENV['MATOMO_SITE_ID']
+     matomo_security_token = ENV['MATOMO_AUTH_TOKEN']
+     tracking_uri = URI("#{ENV['MATOMO_BASE_URL']}/matomo.php")

      # Some parameters are optional, but included since tracking would not work otherwise
      # https://developer.matomo.org/api-reference/tracking-api
      uri_params = {
        token_auth: matomo_security_token,
        rec: '1',
-       idsite: matomo_id_site,
+       idsite: matomo_site_id,
        action_name: 'Download',
        url: request.url,
        urlref: request.referrer,
@@ -44,11 +44,11 @@ def track_download
        dimension1: work_data[:work_id],
        dimension2: work_data[:title]
      }
-     uri.query = URI.encode_www_form(uri_params)
-     response = HTTParty.get(uri.to_s)
-     Rails.logger.debug("Matomo download tracking URL: #{uri}")
+     tracking_uri.query = URI.encode_www_form(uri_params)
+     response = HTTParty.get(tracking_uri.to_s)
+     Rails.logger.debug("Matomo download tracking URL: #{tracking_uri}")
      if response.code >= 300
-       Rails.logger.error("DownloadAnalyticsBehavior received an error response #{response.code} for matomo query: #{uri}")
+       Rails.logger.error("DownloadAnalyticsBehavior received an error response #{response.code} for matomo query: #{tracking_uri}")
      end
      # Send download events to db
      create_download_stat
@@ -92,18 +92,6 @@ def work_data
      @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
    end

-   def site_id
-     @site_id ||= ENV['MATOMO_SITE_ID']
-   end
-
-   def auth_token
-     @auth_token ||= ENV['MATOMO_AUTH_TOKEN']
-   end
-
-   def base_url
-     @base_url ||= ENV['MATOMO_BASE_URL']
-   end
-
    def client_id
      cookie = cookies.find { |key, _| key.start_with?('_pk_id') }&.last
      if cookie.present?
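For context, the tracking logic above reduces to a single GET against Matomo's HTTP Tracking API. A minimal standalone sketch, with illustrative stand-ins for the MATOMO_* environment variables and request attributes (none of these values come from the commit):

    require 'httparty'
    require 'uri'

    # Illustrative values only; the real code reads these from ENV and the request.
    tracking_uri = URI('https://analytics.example.edu/matomo.php')
    tracking_uri.query = URI.encode_www_form(
      token_auth: 'example-token',  # stand-in for MATOMO_AUTH_TOKEN
      rec: '1',                     # tells Matomo to record the hit
      idsite: '5',                  # stand-in for MATOMO_SITE_ID
      action_name: 'Download',
      url: 'https://repo.example.edu/downloads/abc123'
    )
    response = HTTParty.get(tracking_uri.to_s)
    puts response.code # anything >= 300 is logged as an error above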
25 changes: 13 additions & 12 deletions app/helpers/work_utils_helper.rb
@@ -1,20 +1,21 @@
# frozen_string_literal: true
module WorkUtilsHelper
  def self.fetch_work_data_by_fileset_id(fileset_id)
-   work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
-   Rails.logger.warn("No work found for fileset id: #{fileset_id}") if work.blank?
-   # Fetch the admin set related to the work
-   admin_set_name = work['admin_set_tesim']&.first
-   # If the admin set name is not nil, fetch the admin set
-   # Set the admin set to an empty hash if the solr query returns nil
-   admin_set = admin_set_name ? ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", { :rows => 1, 'df' => 'title_tesim'})['response']['docs'].first || {} : {}
-   Rails.logger.warn(self.generate_warning_message(admin_set_name, fileset_id)) if admin_set.blank?
+   # Retrieve the fileset data
+   fileset_data = ActiveFedora::SolrService.get("id:#{fileset_id}", rows: 1)['response']['docs'].first || {}
+   Rails.logger.warn("No fileset data found for fileset id: #{fileset_id}") if fileset_data.blank?
+   # Retrieve the work related to the fileset
+   work_data = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
+   Rails.logger.warn("No work found associated with fileset id: #{fileset_id}") if work_data.blank?
+   admin_set_name = work_data['admin_set_tesim']&.first
+   admin_set_data = admin_set_name ? ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", { :rows => 1, 'df' => 'title_tesim'})['response']['docs'].first || {} : {}
+   Rails.logger.warn(self.generate_warning_message(admin_set_name, fileset_id)) if admin_set_data.blank?

    {
-     work_id: work['id'],
-     work_type: work.dig('has_model_ssim', 0),
-     title: work['title_tesim']&.first,
-     admin_set_id: admin_set['id'],
+     work_id: fileset_data['id'],
+     work_type: work_data.dig('has_model_ssim', 0),
+     title: work_data['title_tesim']&.first,
+     admin_set_id: admin_set_data['id'],
      admin_set_name: admin_set_name
    }
  end
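As a usage sketch, the reworked helper returns a hash shaped roughly like the following (all identifiers are invented for illustration); note that work_id is now populated from the fileset document's id:

    WorkUtilsHelper.fetch_work_data_by_fileset_id('fs-1a2b3c')
    # => {
    #      work_id: 'fs-1a2b3c',            # taken from the fileset document's id
    #      work_type: 'Article',
    #      title: 'Sample Work Title',
    #      admin_set_id: 'as-9z8y',
    #      admin_set_name: 'Open Access Articles'
    #    }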
2 changes: 1 addition & 1 deletion app/services/tasks/dimensions_query_service.rb
@@ -195,7 +195,7 @@ def solr_query_builder(pub)

    # Query with parameters to retrieve publications related to UNC
    def generate_query_string(start_date, end_date, page_size, cursor)
-     search_clauses = ['where type = "article"', "date >= \"#{start_date}\"", "date < \"#{end_date}\""].join(' and ')
+     search_clauses = ['where type = "article"', "date_inserted >= \"#{start_date}\"", "date_inserted < \"#{end_date}\""].join(' and ')
      return_fields = ['basics', 'extras', 'abstract', 'issn', 'publisher', 'journal_title_raw', 'linkout', 'concepts'].join(' + ')
      unc_affiliation_variants = ['"UNC-CH"', '"University of North Carolina at Chapel Hill"', '"UNC-Chapel Hill"', '"University of North Carolina-Chapel Hill"', '"University of North Carolina, Chapel Hill"'].join(' OR ')
      <<~QUERY
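With illustrative bounds of '2024-01-01' and '2024-02-01', the joined clause now evaluates to:

    search_clauses
    # => 'where type = "article" and date_inserted >= "2024-01-01" and date_inserted < "2024-02-01"'

That is, publications are windowed by when Dimensions indexed them rather than by publication date, matching the "use date inserted" commit note.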
137 changes: 107 additions & 30 deletions app/services/tasks/download_stats_migration_service.rb
@@ -2,38 +2,32 @@
module Tasks
  class DownloadStatsMigrationService
    PAGE_SIZE = 1000
-   def list_work_stat_info(output_path, after_timestamp = nil)
-     begin
-       query = FileDownloadStat.all
-       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
-       total_work_stats = query.count
-       timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
-
-       # Log number of work stats retrieved and timestamp clause
-       Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")
-
-       aggregated_data = {}
-       work_stats_retrieved_from_query_count = 0
-
-       Rails.logger.info('Retrieving work_stats from the database')
-       # Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries
-       query.find_each(batch_size: PAGE_SIZE) do |stat|
-         truncated_date = stat.date.beginning_of_month
-         # Group the file_id and truncated date to be used as a key
-         key = [stat.file_id, truncated_date]
-         # Initialize the hash for the key if it doesn't exist
-         aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 }
-         # Sum the downloads for each key
-         aggregated_data[key][:downloads] += stat.downloads
-         work_stats_retrieved_from_query_count += 1
-         log_progress(work_stats_retrieved_from_query_count, total_work_stats)
-       end
-
-       aggregated_work_stats = aggregated_data.values
-       Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats")
-
-       # Write the work_stats to the specified CSV file
-       write_to_csv(output_path, aggregated_work_stats)
+   module DownloadMigrationSource
+     MATOMO = :matomo
+     GA4 = :ga4
+     CACHE = :cache
+
+     def self.all_sources
+       [MATOMO, GA4, CACHE]
+     end
+
+     def self.valid?(source)
+       all_sources.include?(source)
+     end
+   end
+
+   def list_work_stat_info(output_path, after_timestamp = nil, before_timestamp = nil, source)
+     aggregated_work_stats = []
+     begin
+       case source
+       when DownloadMigrationSource::CACHE
+         aggregated_work_stats = fetch_local_cache_stats(after_timestamp, output_path)
+         write_to_csv(output_path, aggregated_work_stats)
+       when DownloadMigrationSource::MATOMO
+         aggregated_work_stats = fetch_matomo_stats(after_timestamp, before_timestamp, output_path)
+         write_to_csv(output_path, aggregated_work_stats)
+       else
+         raise ArgumentError, "Unsupported source: #{source}"
+       end
      rescue StandardError => e
        Rails.logger.error("An error occurred while listing work stats: #{e.message}")
        Rails.logger.error(e.backtrace.join("\n"))
@@ -68,6 +62,89 @@ def migrate_to_new_table(csv_path)

    private

+   # Fetch and aggregate work stats from Matomo
+   def fetch_matomo_stats(after_timestamp, before_timestamp, output_path)
+     aggregated_data = {}
+     # Keeps count of stats retrieved from Matomo across all queries
+     all_query_stat_total = 0
+     # Log the range of work stats being fetched
+     timestamp_clause = "in specified range #{after_timestamp} to #{before_timestamp}"
+     Rails.logger.info("Fetching work stats #{timestamp_clause} from Matomo.")
+
+     # Query the Matomo API for each month in the range and aggregate the data.
+     # Setting period to month returns stats for each month in the range, regardless of the specified date
+     reporting_uri = URI("#{ENV['MATOMO_BASE_URL']}/index.php")
+     # Fetch the first of each month in the range
+     months_array = first_of_each_month_in_range(after_timestamp, before_timestamp)
+     months_array.each_with_index do |first_date_of_month, index|
+       uri_params = {
+         module: 'API',
+         idSite: ENV['MATOMO_SITE_ID'],
+         method: 'Events.getName',
+         period: 'month',
+         date: first_date_of_month,
+         format: 'JSON',
+         token_auth: ENV['MATOMO_AUTH_TOKEN'],
+         flat: '1',
+         filter_pattern: 'DownloadIR',
+         filter_limit: -1,
+         showColumns: 'nb_events'
+       }
+       reporting_uri.query = URI.encode_www_form(uri_params)
+       response = HTTParty.get(reporting_uri.to_s)
+       month_year_string = first_date_of_month.to_date.strftime('%B %Y')
+       Rails.logger.info("Processing Matomo response for #{month_year_string}. (#{index + 1}/#{months_array.count})")
+       response.parsed_response.each do |stat|
+         # Events_EventName is the file_id, nb_events is the number of downloads
+         update_aggregate_stats(aggregated_data, first_date_of_month, stat['Events_EventName'], stat['nb_events'])
+       end
+       monthly_stat_total = response.parsed_response.length
+       all_query_stat_total += monthly_stat_total
+     end
+     Rails.logger.info("Aggregated #{aggregated_data.values.count} monthly stats from #{all_query_stat_total} total retrieved stats")
+     # Return the aggregated data
+     aggregated_data.values
+   end
+
+   def update_aggregate_stats(aggregated_data, truncated_date, file_id, downloads)
+     # Combine the file_id and truncated date into a key
+     key = [file_id, truncated_date]
+     # Initialize the hash for the key if it doesn't exist
+     aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 }
+     # Sum the downloads for each key
+     aggregated_data[key][:downloads] += downloads
+   end
+
+   def first_of_each_month_in_range(after_timestamp, before_timestamp)
+     after_date = after_timestamp.to_date.beginning_of_month
+     before_date = before_timestamp.to_date.beginning_of_month
+     (after_date..before_date).select { |d| d.day == 1 }.map(&:to_s)
+   end
+
+   # Fetch and aggregate work stats from the local cache
+   def fetch_local_cache_stats(after_timestamp, output_path)
+     aggregated_data = {}
+     work_stats_retrieved_from_query_count = 0
+     query = FileDownloadStat.all
+     query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
+     total_work_stats = query.count
+     timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
+
+     # Log the number of work stats to fetch and the timestamp clause
+     Rails.logger.info("Fetching #{total_work_stats} work stats #{timestamp_clause} from the hyrax local cache.")
+
+     # Fetch the work_stats and aggregate them into monthly stats in Ruby; SQL-level aggregation ran into issues
+     query.find_each(batch_size: PAGE_SIZE) do |stat|
+       update_aggregate_stats(aggregated_data, stat.date.beginning_of_month, stat.file_id, stat.downloads)
+       work_stats_retrieved_from_query_count += 1
+       log_progress(work_stats_retrieved_from_query_count, total_work_stats)
+     end
+
+     Rails.logger.info("Aggregated #{aggregated_data.values.count} monthly stats from #{total_work_stats} daily stats")
+     # Return the aggregated data
+     aggregated_data.values
+   end
+
    # Log progress at 25%, 50%, 75%, and 100%
    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation')
      percentages = [0.25, 0.5, 0.75, 1.0]
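A hedged console sketch of driving the expanded service directly (output path and timestamps are illustrative, not from the commit):

    service = Tasks::DownloadStatsMigrationService.new
    source = Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO
    # Matomo (and GA4) migrations need both bounds; the cache source only uses 'after'.
    service.list_work_stat_info('tmp/matomo_stats.csv', '2024-01-15', '2024-03-20', source)
    # Internally, first_of_each_month_in_range('2024-01-15', '2024-03-20') expands to
    # ['2024-01-01', '2024-02-01', '2024-03-01'], one Events.getName query per month.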
18 changes: 16 additions & 2 deletions lib/tasks/migrate_download_stats.rake
@@ -5,7 +5,7 @@ require 'optparse/date'

namespace :migrate_download_stats do
  desc 'output rows for download stat migration into a csv'
- task :list_rows, [:output_dir, :after] => :environment do |_t, _args|
+ task :list_rows, [:output_dir, :after, :before, :source] => :environment do |_t, _args|
    start_time = Time.now
    puts "[#{start_time.utc.iso8601}] starting listing of work data"
    options = {}
@@ -14,6 +14,8 @@ namespace :migrate_download_stats do
    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]'
    opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
    opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
+   opts.on('-b', '--before ARG', String, 'List objects updated before this timestamp, only meant for matomo and ga4 migrations') { |val| options[:before] = val }
+   opts.on('-s', '--source ARG', String, 'Data source (matomo, ga4, cache)') { |val| options[:source] = val.to_sym }
    args = opts.order!(ARGV) {}
    opts.parse!(args)

@@ -22,8 +24,20 @@
      exit 1
    end

+   unless Tasks::DownloadStatsMigrationService::DownloadMigrationSource.valid?(options[:source])
+     puts "Please provide a valid source: #{Tasks::DownloadStatsMigrationService::DownloadMigrationSource.all_sources.join(', ')}"
+     exit 1
+   end
+
+   # Require both 'before' and 'after' arguments if the source is not 'cache'
+   if options[:source] != Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE && (options[:before].blank? || options[:after].blank?)
+     puts "Both 'before' and 'after' timestamps are required for sources other than #{Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE}"
+     exit 1
+   end
+
    migration_service = Tasks::DownloadStatsMigrationService.new
-   old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after])
+   old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after], options[:before], options[:source])
    puts "Listing completed in #{Time.now - start_time}s"
    puts "Stored id list to file: #{options[:output_dir]}"
    exit 0
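Putting the new flags together, a Matomo migration might be invoked like this (directory and dates are illustrative):

    bundle exec rake migrate_download_stats:list_rows -- \
      --output-dir=tmp/download_stats.csv \
      --after=2024-01-01 \
      --before=2024-03-31 \
      --source=matomo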
5 changes: 0 additions & 5 deletions spec/controllers/hyrax/downloads_controller_spec.rb
@@ -345,9 +345,4 @@
    end
  end

-  describe '#site_id' do
-    it 'returns the site id from ENV' do
-      expect(controller.send(:site_id)).to eq('5')
-    end
-  end
end
80 changes: 80 additions & 0 deletions spec/fixtures/files/matomo_stats_migration_fixture.json
@@ -0,0 +1,80 @@
{
"2024-01-01": [
{
"label": "file_id_1 - DownloadIR",
"nb_events": 120,
"Events_EventName": "file_id_1",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_1 - DownloadIR",
"nb_events": 70,
"Events_EventName": "file_id_1",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_2 - DownloadIR",
"nb_events": 100,
"Events_EventName": "file_id_2",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_2 - DownloadIR",
"nb_events": 50,
"Events_EventName": "file_id_2",
"Events_EventAction": "DownloadIR"
}
],
"2024-02-01": [
{
"label": "file_id_3 - DownloadIR",
"nb_events": 10,
"Events_EventName": "file_id_3",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_3 - DownloadIR",
"nb_events": 90,
"Events_EventName": "file_id_3",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_4 - DownloadIR",
"nb_events": 50,
"Events_EventName": "file_id_4",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_4 - DownloadIR",
"nb_events": 30,
"Events_EventName": "file_id_4",
"Events_EventAction": "DownloadIR"
}
],
"2024-03-01": [
{
"label": "file_id_5 - DownloadIR",
"nb_events": 80,
"Events_EventName": "file_id_5",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_5 - DownloadIR",
"nb_events": 100,
"Events_EventName": "file_id_5",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_6 - DownloadIR",
"nb_events": 250,
"Events_EventName": "file_id_6",
"Events_EventAction": "DownloadIR"
},
{
"label": "file_id_6 - DownloadIR",
"nb_events": 300,
"Events_EventName": "file_id_6",
"Events_EventAction": "DownloadIR"
}
]
}
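Folding these rows through update_aggregate_stats should collapse each month to one stat per file; the hand-computed expectation for this fixture is:

    file_id_1  2024-01-01  190 downloads (120 + 70)
    file_id_2  2024-01-01  150 downloads (100 + 50)
    file_id_3  2024-02-01  100 downloads (10 + 90)
    file_id_4  2024-02-01   80 downloads (50 + 30)
    file_id_5  2024-03-01  180 downloads (80 + 100)
    file_id_6  2024-03-01  550 downloads (250 + 300)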