From 67bfa0e2af4575f40d6e08015a4489d558b9a312 Mon Sep 17 00:00:00 2001 From: wvengen Date: Fri, 31 May 2024 15:46:43 +0200 Subject: [PATCH] Support names with multiple parts in loose parser (#10) This allows the loose parser to include parts of the name before and after other things like mark, amount or children. In "cheese (MILK) with 2.3% fat" the whole ingredient name is now included. Also, the amount is now recognized in things like "foo* 50%". --- lib/food_ingredient_parser/loose/node.rb | 15 ++++++--- lib/food_ingredient_parser/loose/scanner.rb | 31 ++++++++++++------- .../loose/transform/amount.rb | 26 ++++++++++------ .../loose/transform/handle_missing_name.rb | 3 +- .../loose/transform/split_e_numbers.rb | 28 ++++++++++------- 5 files changed, 67 insertions(+), 36 deletions(-) diff --git a/lib/food_ingredient_parser/loose/node.rb b/lib/food_ingredient_parser/loose/node.rb index 6fc60b2..3fe649b 100644 --- a/lib/food_ingredient_parser/loose/node.rb +++ b/lib/food_ingredient_parser/loose/node.rb @@ -5,7 +5,7 @@ module FoodIngredientParser::Loose class Node include ToHtml - attr_accessor :name, :mark, :amount, :contains, :notes + attr_accessor :name_parts, :mark, :amount, :contains, :notes attr_reader :input, :interval, :auto_close def initialize(input, interval, auto_close: false) @@ -14,7 +14,8 @@ def initialize(input, interval, auto_close: false) @auto_close = auto_close @contains = [] @notes = [] - @name = @mark = @amount = nil + @name_parts = [] + @mark = @amount = nil end def ends(index) @@ -31,7 +32,8 @@ def text_value def to_h r = {} - r[:name] = name.text_value.strip if name && name.text_value.strip != '' + _name = name + r[:name] = _name if _name r[:marks] = [mark.text_value.strip] if mark r[:amount] = amount.text_value.strip if amount r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
@@ -39,6 +41,11 @@ def to_h r end + def name + strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' } + return strings.any? ? strings.join(" ") : nil + end + def inspect(indent="", variant="") inspect_self(indent, variant) + inspect_children(indent) @@ -47,7 +54,7 @@ def inspect(indent="", variant="") def inspect_self(indent="", variant="") [ indent + "Node#{variant} interval=#{@interval}", - name ? "name=#{name.text_value.strip.inspect}" : nil, + name ? "name=#{name.inspect}" : nil, mark ? "mark=#{mark.text_value.strip.inspect}" : nil, amount ? "amount=#{amount.text_value.strip.inspect}" : nil, auto_close ? "auto_close" : nil diff --git a/lib/food_ingredient_parser/loose/scanner.rb b/lib/food_ingredient_parser/loose/scanner.rb index 917dbd6..0f7da91 100644 --- a/lib/food_ingredient_parser/loose/scanner.rb +++ b/lib/food_ingredient_parser/loose/scanner.rb @@ -33,8 +33,9 @@ class Scanner def initialize(s, index: 0) @s = s # input string - @i = index # current index in string + @i = index # current index in string, the iterator looks at this character @cur = nil # current node we're populating + @curifree = nil # last index in string for current node that we haven't added to a child node yet @ancestors = [Node.new(@s, @i)] # nesting hierarchy @iterator = :beginning # scan_iteration_ to use for parsing @dest = :contains # append current node to this attribute on parent @@ -79,6 +80,7 @@ def scan_iteration_standard # after bracket check for 'and' to not lose text if is_and_sep?(@i+1) @i += and_sep_len(@i+1) + @curifree = @i # don't include 'and' in cur name add_child end elsif is_notes_start? 
# usually a dot marks the start of notes @@ -147,7 +149,11 @@ def parent end def cur - @cur ||= Node.new(@s, @i) + if !@cur + @cur ||= Node.new(@s, @i) + @curifree = @i + end + @cur end def is_sep?(chars: SEP_CHARS) @@ -201,16 +207,19 @@ def add_child cur.ends(@i-1) parent.send(@dest) << cur @cur = nil + @curifree = nil end def open_parent(**options) name_until_here @ancestors << cur @cur = Node.new(@s, @i + 1, **options) + @curifree = @i + 1 end def close_parent return unless @ancestors.count > 1 + @curifree = @i + 1 @cur = @ancestors.pop while @cur.auto_close add_child @@ -227,15 +236,15 @@ def close_all_ancestors end def name_until_here - cur.name ||= begin - i, j = cur.interval.first, @i - 1 - i += mark_len(i) # skip any mark in front - # Set name if there is any. There is one corner-case that needs to be avoided when - # a nesting was opened without a name, which would set the name to the nesting text. - # In this case, the name starts with an open-nesting symbol, which should never happen. - if j >= i && !"([:".include?(@s[i]) - Node.new(@s, i .. j) - end + return unless @curifree # no cur started yet + i, j = @curifree, @i - 1 + i += mark_len(i) # skip any mark in front + # Set name if there is any. There is one corner-case that needs to be avoided when + # a nesting was opened without a name, which would set the name to the nesting text. + # In this case, the name starts with an open-nesting symbol, which should never happen. + if j >= i && !"([:".include?(@s[i]) + cur.name_parts << Node.new(@s, i .. j) + @curifree = @i end end diff --git a/lib/food_ingredient_parser/loose/transform/amount.rb b/lib/food_ingredient_parser/loose/transform/amount.rb index 6c3f9f1..2a66ad9 100644 --- a/lib/food_ingredient_parser/loose/transform/amount.rb +++ b/lib/food_ingredient_parser/loose/transform/amount.rb @@ -29,18 +29,26 @@ def transform! # Extract amount from name, if any. 
def transform_name(node = @node) - if !node.amount && parsed = parse_amount(node.name&.text_value) - offset = node.name.interval.first + if !node.amount + node.name_parts.each_with_index do |name, i| + parsed = parse_amount(name.text_value) + next unless parsed + offset = name.interval.first - amount = parsed.amount.amount - node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1) + amount = parsed.amount.amount + node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1) - name = parsed.respond_to?(:name) && parsed.name - if name && name.interval.count > 0 - node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1) - else - node.name = nil + name = parsed.respond_to?(:name) && parsed.name + node.name_parts[i] = if name && name.interval.count > 0 + Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1) + else + nil + end + # found an amount, stop looking in other parts + break end + # remove cleared name parts + node.name_parts.reject!(&:nil?) end # recursively transform contained nodes diff --git a/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb b/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb index 362da2c..ff43529 100644 --- a/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb +++ b/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb @@ -42,7 +42,8 @@ def transform_children!(node) # Apply recursively. Do it before processing to handle multiple depth levels of missing names. transform_children!(child) if child.contains.any? - if child.name.nil? || child.name.text_value.strip == '' + name = child.name + if name.nil? || name == '' # Name is empty, we need to do something. 
if prev # there is a previous ingredient: move children to new parent diff --git a/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb b/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb index b36f5e4..0d4f67a 100644 --- a/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb +++ b/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb @@ -29,21 +29,27 @@ def transform! def transform_node!(node) if node.contains.any? node.contains.each {|n| transform_node!(n) } - elsif node.name && m = MATCH_RE.match(node.name.text_value) - i = 0 - while m = node.name.text_value.match(SPLIT_RE, i) - node.contains << new_node(node, i, m.begin(0)-1) - i = m.end(0) + else + node.name_parts.each_with_index do |name, name_index| + if m = MATCH_RE.match(name.text_value) + i = 0 + while m = name.text_value.match(SPLIT_RE, i) + node.contains << new_node(name, i, m.begin(0)-1) + i = m.end(0) + end + node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last + node.name_parts[name_index] = nil + end end - node.contains << new_node(node, i, node.name.interval.last) if i <= node.name.interval.last - node.name = nil + # remove cleared name parts + node.name_parts.reject!(&:nil?) end end - def new_node(node, begins, ends) - offset = node.name.interval.first - new_node = Node.new(node.input, offset + begins .. offset + ends) - new_node.name = Node.new(node.input, new_node.interval) + def new_node(name, begins, ends) + offset = name.interval.first + new_node = Node.new(name.input, offset + begins .. offset + ends) + new_node.name_parts = [Node.new(name.input, new_node.interval)] new_node end end