Skip to content

Commit

Permalink
CMO Import
Browse files Browse the repository at this point in the history
  • Loading branch information
xhero committed Dec 10, 2024
1 parent 25b2956 commit b3753b4
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 12 deletions.
48 changes: 48 additions & 0 deletions housekeeping/correct/101_cmo.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Import callback that normalises a CMO person record before it is stored.
#
# It does two things to +marc+, in place:
# 1. Every existing 001 is destroyed and a single placeholder 001
#    ("__TEMP__") is inserted at position 1 of the root node (position 0 is
#    taken by the 000 present in the original data).
#    NOTE(review): the placeholder is presumably overwritten once the
#    importer assigns the real id -- confirm against MarcImport.
# 2. Every 670 is cleaned up:
#    - a 670 with no $a, or with a nil/empty $a, is removed;
#    - otherwise $a is split at the FIRST ", ": the text before it stays in
#      $a and the whole remainder is moved to a new $9. Only the first
#      separator is used so that values with several commas
#      (e.g. "Grove, vol. 2, p. 15") do not lose their tail.
#
# @param marc    the Marc object of the record being imported (mutated)
# @param obj     the model being imported (unused, part of the callback signature)
# @param options the import options hash (unused, part of the callback signature)
# @return the same +marc+ object that was passed in
def preprocess_cmo(marc, obj, options)
  # Remove the old 001
  marc.by_tags("001").each(&:destroy_yourself)

  # And make a new one
  # Add it in position 1 since there is a 000 in the original data
  marc.root.add_at(MarcNode.new("person", "001", "__TEMP__", nil), 1)

  marc.by_tags("670").each do |field|
    subfield_a = field.fetch_first_by_tag("a")
    text = subfield_a&.content

    if text.nil? || text.empty?
      puts "Remove empty #{field}"
      field.destroy_yourself
      next
    end

    # partition splits on the first ", " only, so nothing after a second
    # comma is dropped; detail is "" when there is no separator.
    title, _separator, detail = text.partition(", ")
    subfield_a.content = title
    next if detail.empty?

    field.add_at(MarcNode.new("person", "9", detail, nil), 0)
    field.sort_alphabetically
  end

  marc
end

# Import every CMO Person MARCXML export found under CMO-MARCXML/Person,
# one file at a time, running preprocess_cmo on each record via the
# :callback option.

#source_file = "CMO-MARCXML/Person/cmo_person_00000001.xml"

# Minimal option set, plus: assign new ids, scaffold authorities, and
# hand each record to preprocess_cmo.
options = {
  first: 0,
  last: 1000000,
  versioning: false,
  index: false,
  new_ids: true,
  authorities: true,
  callback: method(:preprocess_cmo)
}

Dir.glob("CMO-MARCXML/Person/*.xml").each do |source_file|
  puts source_file
  MarcImport.new(source_file, "Person", options).import
end
3 changes: 3 additions & 0 deletions housekeeping/import/import_from_marc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def display_help
-v, --versioning update records version
-u, --authorities create (scaffold) related Marc authorities records
-x, --index index records as they are imported
-n, --new-ids don't preserve imported ids, but assign new ones (default is to preserve)
-h, --help this help
This script can also be run with positional arguments:
Expand Down Expand Up @@ -97,6 +98,8 @@ def display_help
options[:authorities] = true
elsif ["-x", "--index"].include? arg
options[:index] = true
elsif ["-n", "--new-ids"].include? arg
options[:new_ids] = true
elsif ["-h", "--help"].include? arg
display_help
# The following options are for backward compatibility
Expand Down
23 changes: 17 additions & 6 deletions lib/marc_import.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def initialize(source_file, model, options)
@source_file = source_file
@model = model
@options = options
@total_records = open(source_file) { |f| f.grep(/<record>/) }.size
@total_records = open(source_file) { |f| f.grep(/record /) }.size
@import_results = Array.new
@cnt = 0
@start_time = Time.now
Expand Down Expand Up @@ -69,13 +69,20 @@ def create_record(buffer)
model = Object.const_get(@model).find_by_id(marc.get_id)
if !model
status = "created"

params = {:wf_owner => 1, :wf_stage => "published"}

if @model == "Publication"
model = Object.const_get(@model).new(:id => marc.get_id, :name => marc.get_name, :author => marc.get_author, :journal=> marc.get_journal, :title => marc.get_title, :wf_owner => 1, :wf_stage => "published")
params += { name: marc.get_name, author: marc.get_author, journal: marc.get_journal, title: marc.get_title }
elsif @model == "Source"
model = Object.const_get(@model).new(:id => marc.get_id, :lib_siglum => marc.get_siglum, :wf_owner => 1, :wf_stage => "published")
else
model = Object.const_get(@model).new(:id => marc.get_id, :wf_owner => 1, :wf_stage => "published")
params += {lib_siglum: marc.get_siglum}
end

# Preserve the id, unless we specifically want to create new ones
params[:id] = marc.get_id unless @options.include?(:new_ids) && @options[:new_ids]

model = Object.const_get(@model).new(params)
puts model.id
else
status = "updated"
end
Expand Down Expand Up @@ -119,6 +126,9 @@ def create_record(buffer)
#$stderr.puts "No date information for #{model.id}"
end

# Callback
marc = @options[:callback]&.call(marc, model, @options) if @options.include?(:callback)

# Make internal format
marc.to_internal

Expand Down Expand Up @@ -192,7 +202,8 @@ def create_record(buffer)
# $stderr.puts "#{marc.to_marc}"
# puts e.backtrace.join("\n")
# end
print "\rStarted: " + @start_time.strftime("%Y-%m-%d %H:%M:%S").green + " -- Record #{@cnt} of #{@total_records} processed".yellow
puts model.id
print "\rStarted: " + @start_time.strftime("%Y-%m-%d %H:%M:%S").green + " -- Record #{@cnt} of #{@total_records} processed\r\n".yellow
#puts "Last offset: #{@total_records}, Last "+@model+" RISM ID: #{marc.first_occurance('001').content}"
else
$stderr.puts "Marc is not valid! #{buffer}"
Expand Down
42 changes: 36 additions & 6 deletions lib/marc_node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -118,33 +118,50 @@ def resolve_externals
# Try to get a foreign object using the id. If the object does not exist,
# create it. It is used during import of a Marc record, so relations (ex People or Library)
# are established and in case created
def find_or_new_foreign_object_by_foreign_field(class_name, field_name, search_value)
# Look up a foreign object through a single field, building a new one when
# nothing matches.
#
# class_name   - name resolved through get_class
# field_name   - attribute used both for the "find_by_<field_name>" lookup
#                and, on creation, for the "<field_name>=" assignment
# search_value - value looked up / assigned
# force_create - when false, an "id" lookup whose value converts to 0 with
#                to_i is rejected instead of creating a new object
#
# Returns nil when get_class yields nothing, false when the id is rejected
# as described above, the matched object when the lookup succeeds, and
# otherwise a freshly built object (wf_stage set to 'published'; it is not
# saved here).
def find_or_new_foreign_object_by_foreign_field(class_name, field_name, search_value, force_create = true)
  foreign_class = get_class(class_name)
  return nil unless foreign_class

  matched = foreign_class.send("find_by_" + field_name, search_value)
  if matched
    puts "find_or_new_foreign_object_by_foreign_field: matched #{foreign_class} #{matched.id}".yellow
    return matched
  end

  # We need to make sure id is valid!
  if field_name == "id" && search_value.to_i == 0
    puts "find_or_new_foreign_object_by_foreign_field: #{foreign_class} #{search_value} is invalid as Muscat id".red
    return false unless force_create
  end

  created = foreign_class.new
  created.send("#{field_name}=", search_value)
  created.send("wf_stage=", 'published')
  puts "find_or_new_foreign_object_by_foreign_field: created new #{foreign_class} #{created.id} field:#{field_name}=#{search_value}".cyan
  created
end

# This works as find_or_new_foreign_object_by_foreign_field but instead of $0 id
# it tries to use another field for the relation, as specified from the @marc_configuration.
def find_or_new_foreign_object_by_all_foreign_fields(class_name, tag, nmasters)
def find_or_new_foreign_object_by_all_foreign_fields(class_name, tag, nmasters, force = true)
new_foreign_object = nil
if foreign_class = get_class(class_name)
conditions = Hash.new
# put all the fields into a condition hash
nmasters.each do |nmaster|
ap nmaster
conditions[@marc_configuration.get_foreign_field(tag, nmaster.tag)] = nmaster.looked_up_content if !nmaster.looked_up_content.empty?
end
# The imported fields are just... empty!
return false if !force && conditions.empty?

new_foreign_object = foreign_class.send("where", conditions).first
if !new_foreign_object
new_foreign_object = foreign_class.new
new_foreign_object.send("wf_stage=", 'published')
puts "find_or_new_foreign_object_by_all_foreign_fields: created new #{foreign_class}".cyan
else
puts "find_or_new_foreign_object_by_all_foreign_fields: matched #{foreign_class}:#{new_foreign_object.id}, conditions:#{conditions}".yellow
end
end
return new_foreign_object
Expand Down Expand Up @@ -222,7 +239,7 @@ def import(overwrite = false, reindex = false, user = nil)
end
else
self.sort_alphabetically

# Before resolving the master fields, process the lightweight link_to
populate_links_to(self.tag)

Expand All @@ -235,11 +252,24 @@ def import(overwrite = false, reindex = false, user = nil)
add_master = false
# will be used to check if we need to add a $_ db_master or not (for 004 we don't have one)
add_db_master = true
# If we have a master subfield, fo the lookup using that
# If we have a master subfield, do the lookup using that
if master
master_field = @marc_configuration.get_foreign_field(tag, master.tag)
self.foreign_object = find_or_new_foreign_object_by_foreign_field(@marc_configuration.get_foreign_class(tag, master.tag), master_field, master.looked_up_content)
# If we have no master subfiled but master is actually empty "" (e.g. 004) with holding records
found_obj = find_or_new_foreign_object_by_foreign_field(@marc_configuration.get_foreign_class(tag, master.tag), master_field, master.looked_up_content, false)

if found_obj
self.foreign_object = found_obj
else
# Try again, without using the master field
master_tag = @marc_configuration.get_master( self.tag )
self.foreign_object = find_or_new_foreign_object_by_all_foreign_fields( @marc_configuration.get_foreign_class(tag, master_tag), tag, nmasters, false )
end
#a = self.fetch_first_by_tag("a")
#if self.foreign_object.class == Publication
# puts "#{self.foreign_object.id}\t#{self.foreign_object.name}\t#{a.content}" if self.foreign_object.name && a && a.content && self.foreign_object.name.downcase.strip != a.content.downcase.strip
#end

# If we have no master subfield but master is actually empty "" (e.g. 004) with holding records
elsif !master && @marc_configuration.get_master( self.tag ) == ""
add_db_master = false
master_field = @marc_configuration.get_foreign_field(tag, "")
Expand Down

0 comments on commit b3753b4

Please sign in to comment.