Skip to content

Commit

Permalink
Fix style
Browse files Browse the repository at this point in the history
  • Loading branch information
johnf committed Dec 2, 2024
1 parent 25fc918 commit 3266c00
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 103 deletions.
16 changes: 8 additions & 8 deletions app/models/item.rb
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def search_data
# TODO: Should this be allowed in the data?
data[:bounds] = if north_limit == south_limit && east_limit == west_limit
{ type: 'point', coordinates: [west_limit, north_limit] }
else
else
{
type: 'polygon',
coordinates: [[
Expand All @@ -387,7 +387,7 @@ def search_data
[west_limit, north_limit]
]]
}
end
end
end

# Things we want to check blankness of
Expand Down Expand Up @@ -425,16 +425,16 @@ def citation
cite += type
cite += if index == last
'. '
else
else
'/'
end
end
end
cite += " #{collection.identifier}-#{identifier} at catalog.paradisec.org.au."
cite += if doi
" https://dx.doi.org/#{doi}"
else
else
" #{full_path}"
end
end
cite
end

Expand Down Expand Up @@ -639,9 +639,9 @@ def center_coordinate

lng = if east_limit < west_limit
180 + ((west_limit + east_limit) / 2)
else
else
(west_limit + east_limit) / 2
end
end

lat = (north_limit + south_limit) / 2

Expand Down
240 changes: 145 additions & 95 deletions app/services/junk_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,118 +5,168 @@
class JunkService
attr_reader :catalog_dir, :verbose

def initialize(env)
@bucket = "nabu-meta-#{env}"
@prefix = "inventories/catalog/nabu-catalog-#{env}/CatalogBucketInventory0/"

# Strange bug in dev docker
ENV.delete('AWS_SECRET_ACCESS_KEY')
ENV.delete('AWS_ACCESS_KEY_ID')
ENV.delete('AWS_SESSION_TOKEN')

@s3 = Aws::S3::Client.new(region: 'ap-southeast-2')
def initialize(env, verbose: false)
end

def run
inventory_dir = find_recent_inventory_dir
inventory_csv = fetch_inventory_csv(inventory_dir)

s3_files = extract_s3_files(inventory_csv)

s3_files.select! { |filename| filename.ends_with?('-deposit.pdf') }

puts "Found #{s3_files.size} df files"

s3_files.each do |filename|
puts "Marking #{filename}"
identifier = filename.split('/')[0]
collection = Collection.find_by(identifier: identifier)
throw "Could not find collection for #{identifier}" unless collection
collection.has_deposit_form = true
collection.save!
end
end

private
filenames = Essence.pluck(:id, :filename)
filenames_hash = {}

def extract_s3_files(inventory_csv)
s3_files = []

CSV.parse(inventory_csv, headers: false) do |row|
_bucket_name, filename, _version_id, is_latest, delete_marker, _size, _last_modified, _etag,
storage_class, multiple_upload, multipart_upload_flag, replication_status, checksum_algo = row

next if is_latest == 'false' || delete_marker == 'true'

s3_files << CGI.unescape(filename)
filenames.each do |id, filename|
item_name, extension = filename.split('.', 2)
filenames_hash[item_name] ||= { extensions: [], errors: [] }
filenames_hash[item_name][:extensions] << extension.downcase
end

if s3_files.size != s3_files.uniq.size
raise 'Duplicate files in S3 inventory'
ext_map = {
audio_ok: [
'mp3',
'wav'
],
audio: [
'mpg'
],
video_ok: [
'mp4',
'mxf',
'mkv'
],
video: [
'dv',
'mov',
'webm',
'm4v',
'avi',
'mts'
],
image_ok: [
'jpg',
'tif'
],
image: [
'png'
],
lang: [
'eaf',
'trs',
'xml',
'cha',
'fwbackup',
'pfsx',
'ixt',
'cmdi',
'lbl',
'textgrid',
'srt',
'flextext',
'tex',
'imdi',
'version',
'annis',
'opex'
],
standalone: [
'txt',
'pdf',
'rtf',
'xlsx',
'docx',
'img',
'tab',
'odt',
'html',
'csv',
'ods',
'kml',
'zip'
],
broken: [
'mov.eaf',
'eopas1.ixt',
'eopas2.ixt',
'mp4_good_audio.mp3',
'mp4_good_audio.mp4',
'mp4_good_audio.mxf',
'mp4_good_audio.wav',
'masing.pdf',
'masing.rtf',
'masing.txt',
'5.mp3',
'5.wav',
'txt.txt',
'wav.eaf',
'wav.mp3',
'wav.wav'
]
}

filenames_hash.each do |item_name, data|
filenames_hash[item_name][:extensions] = data[:extensions].sort
end

s3_files
end

def fetch_inventory_csv(inventory_dir)
manifest_json = @s3.get_object(bucket: @bucket, key: "#{inventory_dir}manifest.json").body.read
manifest = JSON.parse(manifest_json)
filenames_hash.each do |item_name, data|
extensions = data[:extensions].clone

files = manifest['files']
if files.size > 1
raise 'Multiple files in manifest'
end

file = files.first['key']

# Download the S3 Inventory CSV file
puts "Downloading S3 Inventory CSV file: #{file}"
inventory_gzipped = @s3.get_object(bucket: @bucket, key: file).body.read
puts "Unzipping file: #{file}\n\n"
inventory_csv = Zlib::GzipReader.new(StringIO.new(inventory_gzipped)).read
end

def find_recent_inventory_dir
inventory_files = fetch_inventory_files

# Extract the timestamp part from each key and convert it to Time object
timestamped_files = inventory_files.map do |key|
match = key.match(/CatalogBucketInventory0\/(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})Z/)
if match
year, month, day, hour, minute = match.captures
time = Time.new(year, month, day, hour, minute)
{ key: key, time: time }
# Audio
if extensions.include?('mp3') && !extensions.include?('wav')
data[:errors] << '+MP3-WAV'
end
end.compact

# Find the most recent file
most_recent_dir = timestamped_files.max_by { |file| file[:time] }
if extensions.include?('wav') && !extensions.include?('mp3')
data[:errors] << '+WAV-MP3'
end
extensions = extensions - ext_map[:audio_ok]
if (extensions & ext_map[:audio]).any?
data[:errors] << '+AUDIO'
end
extensions = extensions - ext_map[:audio]

puts "Most recent inventory file: #{most_recent_dir[:key]}"
# Normal Paragest Video (plus old video)
if extensions.include?('mkv') && !extensions.include?('mp4')
data[:errors] << '+MKV-MP4'
end
if extensions.include?('mxf') && !extensions.include?('mp4')
data[:errors] << '+MXF-MP4'
end
if extensions.include?('mp4') && !(extensions.include?('mkv') || extensions.include?('mxf'))
data[:errors] << '+MP4-MKV'
end
extensions = extensions - ext_map[:video_ok]
if (extensions & ext_map[:video]).any?
data[:errors] << '+VIDEO'
end
extensions = extensions - ext_map[:video]

most_recent_dir[:key]
end
# Image
if extensions.include?('jpg') && !extensions.include?('tif')
data[:errors] << '+JPG-TIF'
end
if extensions.include?('tif') && !extensions.include?('jpg')
data[:errors] << '+TIF-JPG'
end
extensions = extensions - ext_map[:image_ok]
if (extensions & ext_map[:image]).any?
data[:errors] << '+IMAGE'
end
extensions = extensions - ext_map[:image]

def fetch_inventory_files
inventory_files = []
next_token = nil
extensions = extensions - ext_map[:lang]
extensions = extensions - ext_map[:standalone]
if (extensions & ext_map[:broken]).any?
data[:errors] << '+BROKEN'
end
extensions = extensions - ext_map[:broken]

loop do
response = @s3.list_objects_v2(
bucket: @bucket,
prefix: @prefix,
delimiter: '/',
continuation_token: next_token
)

# Collect all object keys
inventory_files += response.common_prefixes.map(&:prefix)
if extensions.any?
abort "Item: #{item_name}, Extensions: #{extensions.join(', ')}"
end
end

break unless response.is_truncated
puts '# Error Summary'
filenames_hash.each do |item_name, data|
next if data[:errors].empty?

next_token = response.next_continuation_token
puts "Item: #{item_name}, Extensions: #{data[:extensions].join(', ')}, Errors: #{data[:errors].join(', ')}"
end

inventory_files
end
end

0 comments on commit 3266c00

Please sign in to comment.