From 3266c0019dd43f7271dbc2d465c9e4e64b51651f Mon Sep 17 00:00:00 2001 From: John Ferlito Date: Tue, 3 Dec 2024 10:01:04 +1100 Subject: [PATCH] Fix style --- app/models/item.rb | 16 +-- app/services/junk_service.rb | 240 +++++++++++++++++++++-------------- 2 files changed, 153 insertions(+), 103 deletions(-) diff --git a/app/models/item.rb b/app/models/item.rb index f39451ff..38e53909 100644 --- a/app/models/item.rb +++ b/app/models/item.rb @@ -376,7 +376,7 @@ def search_data # TODO: Should this be allowed in the data? data[:bounds] = if north_limit == south_limit && east_limit == west_limit { type: 'point', coordinates: [west_limit, north_limit] } - else + else { type: 'polygon', coordinates: [[ @@ -387,7 +387,7 @@ def search_data [west_limit, north_limit] ]] } - end + end end # Things we want to check blankness of @@ -425,16 +425,16 @@ def citation cite += type cite += if index == last '. ' - else + else '/' - end + end end cite += " #{collection.identifier}-#{identifier} at catalog.paradisec.org.au." cite += if doi " https://dx.doi.org/#{doi}" - else + else " #{full_path}" - end + end cite end @@ -639,9 +639,9 @@ def center_coordinate lng = if east_limit < west_limit 180 + ((west_limit + east_limit) / 2) - else + else (west_limit + east_limit) / 2 - end + end lat = (north_limit + south_limit) / 2 diff --git a/app/services/junk_service.rb b/app/services/junk_service.rb index f7c71693..78525745 100644 --- a/app/services/junk_service.rb +++ b/app/services/junk_service.rb @@ -5,118 +5,168 @@ class JunkService attr_reader :catalog_dir, :verbose - def initialize(env) - @bucket = "nabu-meta-#{env}" - @prefix = "inventories/catalog/nabu-catalog-#{env}/CatalogBucketInventory0/" - - # Strange bug in dev docker - ENV.delete('AWS_SECRET_ACCESS_KEY') - ENV.delete('AWS_ACCESS_KEY_ID') - ENV.delete('AWS_SESSION_TOKEN') - - @s3 = Aws::S3::Client.new(region: 'ap-southeast-2') + def initialize(env, verbose: false) end def run - inventory_dir = find_recent_inventory_dir - inventory_csv = fetch_inventory_csv(inventory_dir) - - s3_files = extract_s3_files(inventory_csv) - - s3_files.select! { |filename| filename.ends_with?('-deposit.pdf') } - - puts "Found #{s3_files.size} df files" - - s3_files.each do |filename| - puts "Marking #{filename}" - identifier = filename.split('/')[0] - collection = Collection.find_by(identifier: identifier) - throw "Could not find collection for #{identifier}" unless collection - collection.has_deposit_form = true - collection.save! - end - end - - private + filenames = Essence.pluck(:id, :filename) + filenames_hash = {} - def extract_s3_files(inventory_csv) - s3_files = [] - - CSV.parse(inventory_csv, headers: false) do |row| - _bucket_name, filename, _version_id, is_latest, delete_marker, _size, _last_modified, _etag, - storage_class, multiple_upload, multipart_upload_flag, replication_status, checksum_algo = row - - next if is_latest == 'false' || delete_marker == 'true' - - s3_files << CGI.unescape(filename) + filenames.each do |id, filename| + item_name, extension = filename.split('.', 2) + filenames_hash[item_name] ||= { extensions: [], errors: [] } + filenames_hash[item_name][:extensions] << extension.downcase end - if s3_files.size != s3_files.uniq.size - raise 'Duplicate files in S3 inventory' + ext_map = { + audio_ok: [ + 'mp3', + 'wav' + ], + audio: [ + 'mpg' + ], + video_ok: [ + 'mp4', + 'mxf', + 'mkv' + ], + video: [ + 'dv', + 'mov', + 'webm', + 'm4v', + 'avi', + 'mts' + ], + image_ok: [ + 'jpg', + 'tif' + ], + image: [ + 'png' + ], + lang: [ + 'eaf', + 'trs', + 'xml', + 'cha', + 'fwbackup', + 'pfsx', + 'ixt', + 'cmdi', + 'lbl', + 'textgrid', + 'srt', + 'flextext', + 'tex', + 'imdi', + 'version', + 'annis', + 'opex' + ], + standalone: [ + 'txt', + 'pdf', + 'rtf', + 'xlsx', + 'docx', + 'img', + 'tab', + 'odt', + 'html', + 'csv', + 'ods', + 'kml', + 'zip' + ], + broken: [ + 'mov.eaf', + 'eopas1.ixt', + 'eopas2.ixt', + 'mp4_good_audio.mp3', + 'mp4_good_audio.mp4', + 'mp4_good_audio.mxf', + 'mp4_good_audio.wav', + 'masing.pdf', + 'masing.rtf', + 'masing.txt', + '5.mp3', + '5.wav', + 'txt.txt', + 'wav.eaf', + 'wav.mp3', + 'wav.wav' + ] + } + + filenames_hash.each do |item_name, data| + filenames_hash[item_name][:extensions] = data[:extensions].sort end - s3_files - end - - def fetch_inventory_csv(inventory_dir) - manifest_json = @s3.get_object(bucket: @bucket, key: "#{inventory_dir}manifest.json").body.read - manifest = JSON.parse(manifest_json) + filenames_hash.each do |item_name, data| + extensions = data[:extensions].clone - files = manifest['files'] - if files.size > 1 - raise 'Multiple files in manifest' - end - - file = files.first['key'] - - # Download the S3 Inventory CSV file - puts "Downloading S3 Inventory CSV file: #{file}" - inventory_gzipped = @s3.get_object(bucket: @bucket, key: file).body.read - puts "Unzipping file: #{file}\n\n" - inventory_csv = Zlib::GzipReader.new(StringIO.new(inventory_gzipped)).read - end - - def find_recent_inventory_dir - inventory_files = fetch_inventory_files - - # Extract the timestamp part from each key and convert it to Time object - timestamped_files = inventory_files.map do |key| - match = key.match(/CatalogBucketInventory0\/(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})Z/) - if match - year, month, day, hour, minute = match.captures - time = Time.new(year, month, day, hour, minute) - { key: key, time: time } + # Audio + if extensions.include?('mp3') && !extensions.include?('wav') + data[:errors] << '+MP3-WAV' end - end.compact - - # Find the most recent file - most_recent_dir = timestamped_files.max_by { |file| file[:time] } + if extensions.include?('wav') && !extensions.include?('mp3') + data[:errors] << '+WAV-MP3' + end + extensions = extensions - ext_map[:audio_ok] + if (extensions & ext_map[:audio]).any? + data[:errors] << '+AUDIO' + end + extensions = extensions - ext_map[:audio] - puts "Most recent inventory file: #{most_recent_dir[:key]}" + # Normal Paragest Video (plus old video) + if extensions.include?('mkv') && !extensions.include?('mp4') + data[:errors] << '+MKV-MP4' + end + if extensions.include?('mxf') && !extensions.include?('mp4') + data[:errors] << '+MXF-MP4' + end + if extensions.include?('mp4') && !(extensions.include?('mkv') || extensions.include?('mxf')) + data[:errors] << '+MP4-MKV' + end + extensions = extensions - ext_map[:video_ok] + if (extensions & ext_map[:video]).any? + data[:errors] << '+VIDEO' + end + extensions = extensions - ext_map[:video] - most_recent_dir[:key] - end + # Image + if extensions.include?('jpg') && !extensions.include?('tif') + data[:errors] << '+JPG-TIF' + end + if extensions.include?('tif') && !extensions.include?('jpg') + data[:errors] << '+TIF-JPG' + end + extensions = extensions - ext_map[:image_ok] + if (extensions & ext_map[:image]).any? + data[:errors] << '+IMAGE' + end + extensions = extensions - ext_map[:image] - def fetch_inventory_files - inventory_files = [] - next_token = nil + extensions = extensions - ext_map[:lang] + extensions = extensions - ext_map[:standalone] + if (extensions & ext_map[:broken]).any? + data[:errors] << '+BROKEN' + end + extensions = extensions - ext_map[:broken] - loop do - response = @s3.list_objects_v2( - bucket: @bucket, - prefix: @prefix, - delimiter: '/', - continuation_token: next_token - ) - # Collect all object keys - inventory_files += response.common_prefixes.map(&:prefix) + if extensions.any? + abort "Item: #{item_name}, Extensions: #{extensions.join(', ')}" + end + end - break unless response.is_truncated + puts '# Error Summary' + filenames_hash.each do |item_name, data| + next if data[:errors].empty? - next_token = response.next_continuation_token + puts "Item: #{item_name}, Extensions: #{data[:extensions].join(', ')}, Errors: #{data[:errors].join(', ')}" end - - inventory_files end end