From 67bfa0e2af4575f40d6e08015a4489d558b9a312 Mon Sep 17 00:00:00 2001
From: wvengen <willem@thequestionmark.org>
Date: Fri, 31 May 2024 15:46:43 +0200
Subject: [PATCH] Support names with multiple parts in loose parser (#10)

This allows the loose parser to include parts of the name before and
after other things like mark, amount or children.

In "cheese (MILK) with 2.3% fat" the whole ingredient name is now
included. Also, the amount in inputs like "foo* 50%" is now recognized.
---
 lib/food_ingredient_parser/loose/node.rb      | 15 ++++++---
 lib/food_ingredient_parser/loose/scanner.rb   | 31 ++++++++++++-------
 .../loose/transform/amount.rb                 | 26 ++++++++++------
 .../loose/transform/handle_missing_name.rb    |  3 +-
 .../loose/transform/split_e_numbers.rb        | 28 ++++++++++-------
 5 files changed, 67 insertions(+), 36 deletions(-)
diff --git a/lib/food_ingredient_parser/loose/node.rb b/lib/food_ingredient_parser/loose/node.rb
index 6fc60b2..3fe649b 100644
--- a/lib/food_ingredient_parser/loose/node.rb
+++ b/lib/food_ingredient_parser/loose/node.rb
@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
   class Node
     include ToHtml
 
-    attr_accessor :name, :mark, :amount, :contains, :notes
+    attr_accessor :name_parts, :mark, :amount, :contains, :notes
     attr_reader :input, :interval, :auto_close
 
     def initialize(input, interval, auto_close: false)
@@ -14,7 +14,8 @@ def initialize(input, interval, auto_close: false)
       @auto_close = auto_close
       @contains = []
       @notes = []
-      @name = @mark = @amount = nil
+      @name_parts = []
+      @mark = @amount = nil
     end
 
     def ends(index)
@@ -31,7 +32,8 @@ def text_value
 
     def to_h
       r = {}
-      r[:name] = name.text_value.strip if name && name.text_value.strip != ''
+      _name = name
+      r[:name] = _name if _name
       r[:marks] = [mark.text_value.strip] if mark
       r[:amount] = amount.text_value.strip if amount
       r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
@@ -39,6 +41,11 @@ def to_h
       r
     end
 
+    def name # joined text of all non-blank name parts, or nil when there is none
+      strings = name_parts.map {|part| part.text_value.strip }.reject(&:empty?)
+      strings.empty? ? nil : strings.join(" ") # parts are joined with a single space
+    end
+
     def inspect(indent="", variant="")
       inspect_self(indent, variant) +
       inspect_children(indent)
@@ -47,7 +54,7 @@ def inspect(indent="", variant="")
     def inspect_self(indent="", variant="")
       [
         indent + "Node#{variant} interval=#{@interval}",
-        name ? "name=#{name.text_value.strip.inspect}" : nil,
+        name ? "name=#{name.inspect}" : nil,
         mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
         amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
         auto_close ? "auto_close" : nil
diff --git a/lib/food_ingredient_parser/loose/scanner.rb b/lib/food_ingredient_parser/loose/scanner.rb
index 917dbd6..0f7da91 100644
--- a/lib/food_ingredient_parser/loose/scanner.rb
+++ b/lib/food_ingredient_parser/loose/scanner.rb
@@ -33,8 +33,9 @@ class Scanner
 
     def initialize(s, index: 0)
       @s = s                           # input string
-      @i = index                       # current index in string
+      @i = index                       # current index in string, the iterator looks at this character
       @cur = nil                       # current node we're populating
+      @curifree = nil                  # last index in string for current node that we haven't added to a child node yet
       @ancestors = [Node.new(@s, @i)]  # nesting hierarchy
       @iterator = :beginning           # scan_iteration_<iterator> to use for parsing
       @dest = :contains                # append current node to this attribute on parent
@@ -79,6 +80,7 @@ def scan_iteration_standard
         # after bracket check for 'and' to not lose text
         if is_and_sep?(@i+1)
           @i += and_sep_len(@i+1)
+          @curifree = @i # don't include 'and' in cur name
           add_child
         end
       elsif is_notes_start?       # usually a dot marks the start of notes
@@ -147,7 +149,11 @@ def parent
     end
 
     def cur
-      @cur ||= Node.new(@s, @i)
+      unless @cur # lazily start a new node at the current index
+        @cur = Node.new(@s, @i)
+        @curifree = @i # the new node's text starts here; none of it is in a name part yet
+      end
+      @cur
     end
 
     def is_sep?(chars: SEP_CHARS)
@@ -201,16 +207,19 @@ def add_child
       cur.ends(@i-1)
       parent.send(@dest) << cur
       @cur = nil
+      @curifree = nil
     end
 
     def open_parent(**options)
       name_until_here
       @ancestors << cur
       @cur = Node.new(@s, @i + 1, **options)
+      @curifree = @i + 1 # same index the new current node starts at
     end
 
     def close_parent
       return unless @ancestors.count > 1
+      @curifree = @i + 1
       @cur = @ancestors.pop
       while @cur.auto_close
         add_child
@@ -227,15 +236,15 @@ def close_all_ancestors
     end
 
     def name_until_here
-      cur.name ||= begin
-        i, j = cur.interval.first, @i - 1
-        i += mark_len(i) # skip any mark in front
-        # Set name if there is any. There is one corner-case that needs to be avoided when
-        # a nesting was opened without a name, which would set the name to the nesting text.
-        # In this case, the name starts with an open-nesting symbol, which should never happen.
-        if j >= i && !"([:".include?(@s[i])
-          Node.new(@s, i .. j)
-        end
+      return unless @curifree # no cur started yet
+      i, j = @curifree, @i - 1 # candidate name part: from @curifree up to the character before @i
+      i += mark_len(i) # skip any mark in front
+      # Add a name part if there is any. There is one corner-case that needs to be avoided when
+      # a nesting was opened without a name, which would set the name to the nesting text.
+      # In this case, the name starts with an open-nesting symbol, which should never happen.
+      if j >= i && !"([:".include?(@s[i])
+        cur.name_parts << Node.new(@s, i .. j) # appended, so a node can collect several name parts
+        @curifree = @i # the next name part, if any, starts at the current index
       end
     end
 
diff --git a/lib/food_ingredient_parser/loose/transform/amount.rb b/lib/food_ingredient_parser/loose/transform/amount.rb
index 6c3f9f1..2a66ad9 100644
--- a/lib/food_ingredient_parser/loose/transform/amount.rb
+++ b/lib/food_ingredient_parser/loose/transform/amount.rb
@@ -29,18 +29,26 @@ def transform!
 
       # Extract amount from name, if any.
       def transform_name(node = @node)
-        if !node.amount && parsed = parse_amount(node.name&.text_value)
-          offset = node.name.interval.first
+        if !node.amount
+          node.name_parts.each_with_index do |name, i|
+            parsed = parse_amount(name.text_value)
+            next unless parsed
+            offset = name.interval.first
 
-          amount = parsed.amount.amount
-          node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
+            amount = parsed.amount.amount
+            node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
 
-          name = parsed.respond_to?(:name) && parsed.name
-          if name && name.interval.count > 0
-            node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
-          else
-            node.name = nil
+            name = parsed.respond_to?(:name) && parsed.name
+            node.name_parts[i] = if name && name.interval.count > 0
+              Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
+            else
+              nil
+            end
+            # found an amount, stop looking in other parts
+            break
           end
+          # remove cleared name parts
+          node.name_parts.reject!(&:nil?)
         end
 
         # recursively transform contained nodes
diff --git a/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb b/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
index 362da2c..ff43529 100644
--- a/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
+++ b/lib/food_ingredient_parser/loose/transform/handle_missing_name.rb
@@ -42,7 +42,8 @@ def transform_children!(node)
           # Apply recursively. Do it before processing to handle multiple depth levels of missing names.
           transform_children!(child) if child.contains.any?
 
-          if child.name.nil? || child.name.text_value.strip == ''
+          name = child.name
+          if name.nil? || name == ''
             # Name is empty, we need to do something.
             if prev
               # there is a previous ingredient: move children to new parent
diff --git a/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb b/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb
index b36f5e4..0d4f67a 100644
--- a/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb
+++ b/lib/food_ingredient_parser/loose/transform/split_e_numbers.rb
@@ -29,21 +29,27 @@ def transform!
       def transform_node!(node)
         if node.contains.any?
           node.contains.each {|n| transform_node!(n) }
-        elsif node.name && m = MATCH_RE.match(node.name.text_value)
-          i = 0
-          while m = node.name.text_value.match(SPLIT_RE, i)
-            node.contains << new_node(node, i, m.begin(0)-1)
-            i = m.end(0)
+        else
+          node.name_parts.each_with_index do |name, name_index|
+            if m = MATCH_RE.match(name.text_value)
+              i = 0 # offset into name.text_value where the next piece starts
+              while m = name.text_value.match(SPLIT_RE, i)
+                node.contains << new_node(name, i, m.begin(0)-1)
+                i = m.end(0)
+              end
+              node.contains << new_node(name, i, name.text_value.length - 1) if i < name.text_value.length # remainder; offsets are relative to the name part
+              node.name_parts[name_index] = nil # this part was split into child nodes
+            end
           end
-          node.contains << new_node(node, i, node.name.interval.last) if i <= node.name.interval.last
-          node.name = nil
+          # remove cleared name parts
+          node.name_parts.reject!(&:nil?)
         end
       end
 
-      def new_node(node, begins, ends)
-        offset = node.name.interval.first
-        new_node = Node.new(node.input, offset + begins .. offset + ends)
-        new_node.name = Node.new(node.input, new_node.interval)
+      def new_node(name, begins, ends) # begins/ends: inclusive offsets relative to the start of the name part
+        offset = name.interval.first # shifts the relative offsets to indices in the input
+        new_node = Node.new(name.input, offset + begins .. offset + ends)
+        new_node.name_parts = [Node.new(name.input, new_node.interval)] # single name part spanning the whole new node
         new_node
       end
     end