Refactor how to extract tocs

PopulateTools · Nov 19, 2024 · 65c9745 · 65c9745
1 parent c70a544
commit 65c9745
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 150 deletions.
diff --git a/lib/section_extractor/document_parser.rb b/lib/section_extractor/document_parser.rb
@@ -46,7 +46,8 @@ def extract_sections(content, tocs) # rubocop:disable Metrics/AbcSize
     end
 
     def extract_tocs(content)
-      SectionExtractor::TocParser.new(content).call
+      all_tocs = SectionExtractor::TocParser.new(content).call
+      all_tocs.values.flat_map(&:values) # {type => {separator => Toc}} -> flat list of Toc
     end
 
     def section_exists?(sections, section)

diff --git a/lib/section_extractor/toc.rb b/lib/section_extractor/toc.rb
@@ -4,14 +4,14 @@ module SectionExtractor
   class Toc
     attr_accessor :toc_series_type, :toc_separator_chars, :toc_items
 
-    def initialize
+    def initialize(toc_series_type, toc_separator_chars)
       @toc_items = []
       # The type of toc series can be:
       #   - numeric: 1, 2, 3, ...
       #   - roman: I, II, III, ...
       #   - alpha: a), b), c), ...
-      @toc_series_type = nil
-      @toc_separator_chars = ""
+      @toc_series_type = toc_series_type
+      @toc_separator_chars = toc_separator_chars
     end
 
     def add_item(raw_title, position)

diff --git a/lib/section_extractor/toc_parser.rb b/lib/section_extractor/toc_parser.rb
@@ -5,21 +5,28 @@ class TocParser
     ROMAN_SERIES = %w[I II III IV V VI VII VIII IX X XI XII XIII XIV XV].freeze
     ALPHA_SERIES = ("a".."z").to_a
     MAX_TOC_ITEM_SIZE = 60
-    RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?\-?\s+[^\n]+)\n/m
-    RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3})\.?\-?\s+[^\n]+)\n/m
-    RE_ALPHA = /\n([a-zA-Z][\).-]+\s+[^\n]+)\n/m
-    RE_SPECIAL = /\n((?:ANEXO|CAPITULO|CAPÍTULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi
-
-    attr_reader :content
+    RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?-?\s+[^\n]+)\n/m
+    RE_NUMERIC_WITH_CLAUSE = /\n((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m
+    RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?\.?-?\s+[^\n]+)\n/mi
+    RE_ROMAN_WITH_TITLE = /\n((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi
+    RE_ALPHA = /\n([a-zA-Z][).-]+\s+[^\n]+)\n/m
+    REGEXES_WITH_TYPES = { # series type => regex that #call scans the content with
+      numeric: RE_NUMERIC,
+      numeric_with_clause: RE_NUMERIC_WITH_CLAUSE,
+      roman: RE_ROMAN,
+      roman_with_title: RE_ROMAN_WITH_TITLE,
+      alpha: RE_ALPHA
+    }.freeze
+
+    attr_reader :content, :tocs
 
     def initialize(content)
       @content = content
+      @tocs = {}
     end
 
     def call
-      tocs = []
-      [RE_NUMERIC, RE_ROMAN, RE_ALPHA, RE_SPECIAL].map do |re|
-        toc = Toc.new
+      REGEXES_WITH_TYPES.map do |type, re|
         content.scan(re).each do |match|
           toc_item_title = match.first.strip.gsub(/\n/, "").gsub(/\s+/, " ")
           toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":")
@@ -28,162 +35,59 @@ def call
           next if toc_item_title.include?(".....") || toc_item_title.include?("_____")
 
           toc_item_title = toc_item_title.slice(0, MAX_TOC_ITEM_SIZE) if toc_item_title.size > MAX_TOC_ITEM_SIZE
+          separator_char = detect_separator_chars(toc_item_title, type)
+          if separator_char.nil?
+            puts " - Skipping #{toc_item_title} because separator_char is nil (type: #{type})"
+            next
+          end
 
+          tocs[type] ||= {}
+          tocs[type][separator_char] ||= Toc.new(type, separator_char)
           puts " - Adding TOC item: #{toc_item_title}"
-          toc.add_item(toc_item_title, content.rindex(toc_item_title) || content.rindex(match.first))
+          tocs[type][separator_char].add_item(
+            toc_item_title, content.rindex(toc_item_title) || content.rindex(match.first)
+          )
         end
-
-        tocs << toc if toc.toc_items.any?
       end
 
-      analyze_and_close(tocs)
-    end
-
-    def analyze_and_close(tocs)
-      tocs.map do |toc|
-        toc.toc_series_type = detect_series_type(toc)
-        toc_separator_chars = detect_separator_chars(toc)
-        if toc_separator_chars.size > 1
-          extract_tocs_with_different_separators(toc, toc_separator_chars)
-        else
-          toc.toc_separator_chars = toc_separator_chars.first
-          toc
-        end
-      end.flatten.map do |toc|
-        calculate_titles(toc)
-        # TODO, for the moment is not necessary
-        # cleanup_toc_items(toc)
-
-        toc
-      end
-    end
-
-    private
-
-    def extract_tocs_with_different_separators(toc, toc_separator_chars)
-      tocs = []
-      toc_separator_chars.sort_by(&:size).reverse.each do |separator_char|
-        new_toc = Toc.new
-        new_toc.toc_separator_chars = separator_char
-        next if new_toc.toc_separator_chars.empty?
-
-        new_toc.toc_series_type = toc.toc_series_type
-        toc.toc_items.each do |item|
-          new_toc.add_item(item.title, item.position) if item.title.include?(separator_char)
-        end
-
-        # Delete the items from the original TOC
-        new_toc.toc_items.each do |new_item|
-          toc.toc_items.delete_if { |item| item.title == new_item.title }
-        end
-        tocs << new_toc
-      end
       tocs
     end
 
-    def calculate_titles(toc)
-      toc.toc_items.each do |item|
-        item.title = item.raw_title.split(toc.toc_separator_chars).last&.strip
-        control = item.raw_title.split(toc.toc_separator_chars).first
-        toc.toc_items.delete(item) if control.size > 10
-        toc.toc_items.delete(item) if toc.toc_separator_chars.nil? || toc.toc_separator_chars.size > 5
-        case toc.toc_series_type
-        when :numeric
-          if control !~ /\A\d+/
-            puts " - Skipping #{item.title}, should start with a number"
-            toc.toc_items.delete(item)
-          end
-        when :roman, :alpha
-          if control !~ /\A[A-Za-z]/
-            puts " - Skipping #{item.title}, should start with a letter"
-            toc.toc_items.delete(item)
-          end
-        else
-          raise "series type not detected"
-        end
-      end
-    end
-
-    def detect_series_type(toc)
-      random_items = toc.toc_items.sample(5)
-      types = random_items.map { |item| detect_series_type_from_item(item) }
-      # return the most common type
-      types.max_by { |type| types.count(type) }
-    end
-
-    def cleanup_toc_items(toc)
-      raise "series type not detected" unless toc.toc_series_type
-
-      # toc_items are sorted,
-      current_series_item = nil
-      next_series_item_should_be = expected_next_series_item(current_series_item)
-      new_toc_items = []
-
-      puts " - Cleaning up TOC items"
-      puts " - Toc separator chars: #{toc.toc_separator_chars}"
-
-      toc.toc_items.each_with_index do |item, _i|
-        if item.title !~ /\A#{next_series_item_should_be}\s*#{Regexp.quote(toc_separator_chars)}/
-          puts "- Skipping #{item.title}, should be #{next_series_item_should_be}#{toc_separator_chars}"
-          next
-        end
-
-        new_toc_items << item
-        current_series_item = next_series_item_should_be
-        next_series_item_should_be = expected_next_series_item(current_series_item)
-      end
-
-      toc.toc_items = new_toc_items
-    end
+    private
 
-    def detect_series_type_from_item(item)
-      case item.title
-      when /\A\d+/
-        :numeric
-      when /\A\b(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|IVX|XV)+\b/
-        :roman
-      when /\A[a-zA-Z]+/
-        :alpha
-      else
-        raise "series type not detected from title #{item.title}"
+    def detect_separator_chars(title, toc_series_type) # rubocop:disable Metrics/MethodLength
+      case toc_series_type
+      when :numeric
+        detect_numeric_series_separator_chars(title)
+      when :numeric_with_clause
+        detect_numeric_with_clause_series_separator_chars(title)
+      when :roman
+        detect_roman_series_separator_chars(title)
+      when :roman_with_title
+        detect_roman_with_title_series_separator_chars(title)
+      when :alpha
+        detect_alpha_series_separator_chars(title)
       end
     end
 
-    def detect_separator_chars(toc)
-      separators_chars = case toc.toc_series_type
-                         when :numeric
-                           toc.toc_items.map { |item| detect_numeric_series_separator_chars(item) }
-                         when :roman
-                           toc.toc_items.map { |item| detect_roman_series_separator_chars(item) }
-                         when :alpha
-                           toc.toc_items.map { |item| detect_alpha_series_separator_chars(item) }
-                         else
-                           raise "series type not detected"
-                         end
-      separators_chars.compact.uniq
+    def detect_numeric_series_separator_chars(title)
+      title.match(/(\d+(?:\.\d+)*(\.?-?)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil
     end
 
-    def detect_numeric_series_separator_chars(item)
-      item.title.split(" ")[0].match(/.*\d([^\d]*)\z/) ? ::Regexp.last_match(1) : nil
+    def detect_numeric_with_clause_series_separator_chars(title)
+      title.match(/(?:Cláusula\s+)(\d+(?:\.\d+)*\s*(\.?-?)\s+[^\n]+)/m) ? ::Regexp.last_match(2) : nil
     end
 
-    def detect_roman_series_separator_chars(item)
-      item.title.match(/\b(IX|IV|V|VI|I|II|III)\b([^\s]+)\s/) ? ::Regexp.last_match(2) : nil
+    def detect_roman_series_separator_chars(title)
+      title.match(/((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?(\.?-?)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil
     end
 
-    def detect_alpha_series_separator_chars(item)
-      item.title.match(/([a-zA-Z])([^\s]+)\s/) ? ::Regexp.last_match(2) : nil
+    def detect_roman_with_title_series_separator_chars(title)
+      title.match(/((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?(\.?-?)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil # rubocop:disable Layout/LineLength
     end
 
-    def expected_next_series_item(current_item)
-      case @toc_series_type
-      when :numeric
-        (current_item || 0) + 1
-      when :roman
-        ROMAN_SERIES[(ROMAN_SERIES.index(current_item) || -1) + 1]
-      when :alpha
-        ALPHA_SERIES[(ALPHA_SERIES.index(current_item) || -1) + 1]
-      end
+    def detect_alpha_series_separator_chars(title)
+      title.match(/([a-zA-Z]([).-]+)\s+[^\n]+)/) ? ::Regexp.last_match(2) : nil
     end
   end
 end
diff --git a/spec/document_parser_spec.rb b/spec/document_parser_spec.rb
@@ -134,7 +134,6 @@ def assert_sections_absent(unexpected_sections)
     let(:file_path) { "spec/files/66067442.txt" }
 
     it "has these sections" do
-      binding.pry
       assert_sections_present([
         ["TÍTULO I. DISPOSICIONES GENERALES", ""],
         ["Cláusula 1 Régimen jurídico.", ""],