From b3753b4fc1c46b551fbb1d9bc257fe8062e97911 Mon Sep 17 00:00:00 2001 From: Rodolfo Zitellini Date: Tue, 10 Dec 2024 13:41:09 +0100 Subject: [PATCH] CMO Import --- housekeeping/correct/101_cmo.rb | 48 +++++++++++++++++++++++++ housekeeping/import/import_from_marc.rb | 3 ++ lib/marc_import.rb | 23 ++++++++---- lib/marc_node.rb | 42 ++++++++++++++++++---- 4 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 housekeeping/correct/101_cmo.rb diff --git a/housekeeping/correct/101_cmo.rb b/housekeeping/correct/101_cmo.rb new file mode 100644 index 000000000..c53abfafc --- /dev/null +++ b/housekeeping/correct/101_cmo.rb @@ -0,0 +1,48 @@ +def preprocess_cmo(marc, obj, options) + #puts "Callback to process #{obj.id}" + + # Remove the old 001 + marc.by_tags("001").each {|t2| t2.destroy_yourself} + + # And make a new one + # Add it in position 1 since there is a 000 in the original data + marc.root.add_at(MarcNode.new("person", "001", "__TEMP__", nil), 1) + + marc.by_tags("670").each do |t| + a = t.fetch_first_by_tag("a") + + if !a || !a.content || a.content.empty? 
+ puts "Remove empty #{t}" + t.destroy_yourself + else + # Keep the text before the first ", " in $a and move the second part to $9 (any further parts are dropped) + parts = a.content.split(", ") + a.content = parts[0] + + if parts[1] + t.add_at(MarcNode.new("person", "9", parts[1], nil), 0 ) + t.sort_alphabetically + end + + end + end + + return marc +end + +files = Dir.glob("CMO-MARCXML/Person/*.xml") + +#source_file = "CMO-MARCXML/Person/cmo_person_00000001.xml" + +# Minimal option set +options = {first: 0, last: 1000000, versioning: false, index:false} + +options[:new_ids] = true +options[:authorities] = true +options[:callback] = method(:preprocess_cmo) + +files.each do |source_file| + puts source_file + import = MarcImport.new(source_file, "Person", options) + import.import +end \ No newline at end of file diff --git a/housekeeping/import/import_from_marc.rb b/housekeeping/import/import_from_marc.rb index 0f75e3946..773d641d8 100644 --- a/housekeeping/import/import_from_marc.rb +++ b/housekeeping/import/import_from_marc.rb @@ -17,6 +17,7 @@ def display_help -v, --versioning update records version -u, --authorities create (scaffold) related Marc authorities records -x, --index index records as they are imported + -n, --new-ids don't preserve imported ids, but assign new ones (default is to preserve) -h, --help this help This script can also be run with positional arguments: @@ -97,6 +98,8 @@ def display_help options[:authorities] = true elsif ["-x", "--index"].include? arg options[:index] = true + elsif ["-n", "--new-ids"].include? arg + options[:new_ids] = true elsif ["-h", "--help"].include? 
arg display_help # The following options are for backward compatibility diff --git a/lib/marc_import.rb b/lib/marc_import.rb index 90a93021b..ec6f7f91b 100644 --- a/lib/marc_import.rb +++ b/lib/marc_import.rb @@ -10,7 +10,7 @@ def initialize(source_file, model, options) @source_file = source_file @model = model @options = options - @total_records = open(source_file) { |f| f.grep(//) }.size + @total_records = open(source_file) { |f| f.grep(/record /) }.size @import_results = Array.new @cnt = 0 @start_time = Time.now @@ -69,13 +69,20 @@ def create_record(buffer) model = Object.const_get(@model).find_by_id(marc.get_id) if !model status = "created" + + params = {:wf_owner => 1, :wf_stage => "published"} + if @model == "Publication" - model = Object.const_get(@model).new(:id => marc.get_id, :name => marc.get_name, :author => marc.get_author, :journal=> marc.get_journal, :title => marc.get_title, :wf_owner => 1, :wf_stage => "published") + params.merge!(name: marc.get_name, author: marc.get_author, journal: marc.get_journal, title: marc.get_title) elsif @model == "Source" - model = Object.const_get(@model).new(:id => marc.get_id, :lib_siglum => marc.get_siglum, :wf_owner => 1, :wf_stage => "published") - else - model = Object.const_get(@model).new(:id => marc.get_id, :wf_owner => 1, :wf_stage => "published") + params.merge!(lib_siglum: marc.get_siglum) end + + # Preserve the id, unless we specifically want to create new ones + params[:id] = marc.get_id unless @options.include?(:new_ids) && @options[:new_ids] + + model = Object.const_get(@model).new(params) + puts model.id else status = "updated" end @@ -119,6 +126,9 @@ def create_record(buffer) #$stderr.puts "No date information for #{model.id}" end + # Callback: lets the caller rewrite the record before conversion; its return value replaces marc + marc = @options[:callback].call(marc, model, @options) if @options[:callback] + # Make internal format marc.to_internal @@ -192,7 +202,8 @@ def create_record(buffer) # $stderr.puts "#{marc.to_marc}" # puts e.backtrace.join("\n") # end - print "\rStarted: 
" + @start_time.strftime("%Y-%m-%d %H:%M:%S").green + " -- Record #{@cnt} of #{@total_records} processed".yellow +puts model.id + print "\rStarted: " + @start_time.strftime("%Y-%m-%d %H:%M:%S").green + " -- Record #{@cnt} of #{@total_records} processed\r\n".yellow #puts "Last offset: #{@total_records}, Last "+@model+" RISM ID: #{marc.first_occurance('001').content}" else $stderr.puts "Marc is not valid! #{buffer}" diff --git a/lib/marc_node.rb b/lib/marc_node.rb index a0c680704..cb4e4df54 100644 --- a/lib/marc_node.rb +++ b/lib/marc_node.rb @@ -118,14 +118,24 @@ def resolve_externals # Try to get a foreign object using the id. If the object does not exist, # create it. It is used during import of a Marc record, so relations (ex People or Library) # are established and in case created - def find_or_new_foreign_object_by_foreign_field(class_name, field_name, search_value) + def find_or_new_foreign_object_by_foreign_field(class_name, field_name, search_value, force_create = true) new_foreign_object = nil if foreign_class = get_class(class_name) new_foreign_object = foreign_class.send("find_by_" + field_name, search_value) if !new_foreign_object + + # We need to make sure id is valid! 
+ if field_name == "id" && search_value.to_i == 0 + puts "find_or_new_foreign_object_by_foreign_field: #{foreign_class} #{search_value} is invalid as Muscat id".red + return false if !force_create + end + new_foreign_object = foreign_class.new new_foreign_object.send("#{field_name}=", search_value) new_foreign_object.send("wf_stage=", 'published') + puts "find_or_new_foreign_object_by_foreign_field: created new #{foreign_class} #{new_foreign_object.id} field:#{field_name}=#{search_value}".cyan + else + puts "find_or_new_foreign_object_by_foreign_field: matched #{foreign_class} #{new_foreign_object.id}".yellow end end return new_foreign_object @@ -133,18 +143,25 @@ def find_or_new_foreign_object_by_foreign_field(class_name, field_name, search_v # This works as find_or_new_foreign_object_by_foreign_field but instead of $0 id # it tries to use another field for the relation, as specified from the @marc_configuration. - def find_or_new_foreign_object_by_all_foreign_fields(class_name, tag, nmasters) + def find_or_new_foreign_object_by_all_foreign_fields(class_name, tag, nmasters, force = true) new_foreign_object = nil if foreign_class = get_class(class_name) conditions = Hash.new # put all the fields into a condition hash nmasters.each do |nmaster| + ap nmaster conditions[@marc_configuration.get_foreign_field(tag, nmaster.tag)] = nmaster.looked_up_content if !nmaster.looked_up_content.empty? end + # The imported fields are just... empty! + return false if !force && conditions.empty? 
+ new_foreign_object = foreign_class.send("where", conditions).first if !new_foreign_object new_foreign_object = foreign_class.new new_foreign_object.send("wf_stage=", 'published') + puts "find_or_new_foreign_object_by_all_foreign_fields: created new #{foreign_class}".cyan + else + puts "find_or_new_foreign_object_by_all_foreign_fields: matched #{foreign_class}:#{new_foreign_object.id}, conditions:#{conditions}".yellow end end return new_foreign_object @@ -222,7 +239,7 @@ def import(overwrite = false, reindex = false, user = nil) end else self.sort_alphabetically - + # Before resolving the master fields, process the lightwheight link_to populate_links_to(self.tag) @@ -235,11 +252,24 @@ def import(overwrite = false, reindex = false, user = nil) add_master = false # will be used to check if we need to add a $_ db_master or not (for 004 we don't have one) add_db_master = true - # If we have a master subfield, fo the lookup using that + # If we have a master subfield, do the lookup using that if master master_field = @marc_configuration.get_foreign_field(tag, master.tag) - self.foreign_object = find_or_new_foreign_object_by_foreign_field(@marc_configuration.get_foreign_class(tag, master.tag), master_field, master.looked_up_content) - # If we have no master subfiled but master is actually empty "" (e.g. 
004) with holding records + found_obj = find_or_new_foreign_object_by_foreign_field(@marc_configuration.get_foreign_class(tag, master.tag), master_field, master.looked_up_content, false) + + if found_obj + self.foreign_object = found_obj + else + # Try again, without using the master field + master_tag = @marc_configuration.get_master( self.tag ) + self.foreign_object = find_or_new_foreign_object_by_all_foreign_fields( @marc_configuration.get_foreign_class(tag, master_tag), tag, nmasters, false ) + end + #a = self.fetch_first_by_tag("a") + #if self.foreign_object.class == Publication + # puts "#{self.foreign_object.id}\t#{self.foreign_object.name}\t#{a.content}" if self.foreign_object.name && a && a.content && self.foreign_object.name.downcase.strip != a.content.downcase.strip + #end + + # If we have no master subfield but master is actually empty "" (e.g. 004) with holding records elsif !master && @marc_configuration.get_master( self.tag ) == "" add_db_master = false master_field = @marc_configuration.get_foreign_field(tag, "")