Skip to content

Commit

Permalink
Refactor how to extract tocs
Browse files Browse the repository at this point in the history
  • Loading branch information
ferblape committed Nov 19, 2024
1 parent c70a544 commit 65c9745
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 150 deletions.
3 changes: 2 additions & 1 deletion lib/section_extractor/document_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ def extract_sections(content, tocs) # rubocop:disable Metrics/AbcSize
end

def extract_tocs(content)
SectionExtractor::TocParser.new(content).call
all_tocs = SectionExtractor::TocParser.new(content).call
all_tocs.values.map(&:values).flatten
end

def section_exists?(sections, section)
Expand Down
6 changes: 3 additions & 3 deletions lib/section_extractor/toc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ module SectionExtractor
class Toc
attr_accessor :toc_series_type, :toc_separator_chars, :toc_items

def initialize
def initialize(toc_series_type, toc_separator_chars)
@toc_items = []
# The type of toc series can be:
# - numeric: 1, 2, 3, ...
# - roman: I, II, III, ...
# - alpha: a), b), c), ...
@toc_series_type = nil
@toc_separator_chars = ""
@toc_series_type = toc_series_type
@toc_separator_chars = toc_separator_chars
end

def add_item(raw_title, position)
Expand Down
194 changes: 49 additions & 145 deletions lib/section_extractor/toc_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,28 @@ class TocParser
ROMAN_SERIES = %w[I II III IV V VI VII VIII IX X XI XII XIII XIV XV].freeze
ALPHA_SERIES = ("a".."z").to_a
MAX_TOC_ITEM_SIZE = 60
RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?\-?\s+[^\n]+)\n/m
RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3})\.?\-?\s+[^\n]+)\n/m
RE_ALPHA = /\n([a-zA-Z][\).-]+\s+[^\n]+)\n/m
RE_SPECIAL = /\n((?:ANEXO|CAPITULO|CAPÍTULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi

attr_reader :content
# Patterns that pick out candidate TOC lines. Every pattern requires the
# candidate to sit between two newlines and captures the whole line as group 1.

# Numeric series: digits with optional dotted sub-levels, then an optional
# "." and/or "-" before the title text, e.g. "1 Foo", "1.2. Foo", "3.- Foo".
RE_NUMERIC = /\n(\d+(?:\.\d+)*\.?-?\s+[^\n]+)\n/m
# Same numeric shape prefixed by the word "Cláusula". Group 1 is the whole
# line; group 2 is the part after the prefix.
RE_NUMERIC_WITH_CLAUSE = /\n((?:Cláusula\s+)(\d+(?:\.\d+)*\.?-?\s+[^\n]+))\n/m
# Roman numeral series, matched case-insensitively.
# NOTE(review): the alternation has no branch for a bare "V" nor for numerals
# such as "XIV"/"XIX" ("X" followed by "IV"/"IX"), so those lines are not
# captured here — confirm whether that is intended.
RE_ROMAN = /\n((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?\.?-?\s+[^\n]+)\n/mi
# Roman numeral preceded by a heading word (ANEXO, CAPITULO/CAPÍTULO,
# TÍTULO/TITULO), case-insensitive. Uses the same numeral alternation as
# RE_ROMAN, so the same numerals are unmatched.
RE_ROMAN_WITH_TITLE = /\n((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})[.-]*\s+[^\n]+)\n/mi
# Alphabetic series: one letter followed by one or more of ")", "." or "-".
RE_ALPHA = /\n([a-zA-Z][).-]+\s+[^\n]+)\n/m
# Maps each series type to the pattern that detects it. Frozen so the shared
# lookup table cannot be mutated at runtime, like the other constants here.
REGEXES_WITH_TYPES = {
  numeric: RE_NUMERIC,
  numeric_with_clause: RE_NUMERIC_WITH_CLAUSE,
  roman: RE_ROMAN,
  roman_with_title: RE_ROMAN_WITH_TITLE,
  alpha: RE_ALPHA
}.freeze

attr_reader :content, :tocs

# Stores the document text to be parsed and prepares an empty TOC registry.
#
# content - the text that #call later runs `scan` and `rindex` against.
def initialize(content)
@content = content
# Starts empty; #call fills it as a nested hash indexed by series type and
# then by separator string (tocs[type][separator_char]).
@tocs = {}
end

def call
tocs = []
[RE_NUMERIC, RE_ROMAN, RE_ALPHA, RE_SPECIAL].map do |re|
toc = Toc.new
REGEXES_WITH_TYPES.map do |type, re|
content.scan(re).each do |match|
toc_item_title = match.first.strip.gsub(/\n/, "").gsub(/\s+/, " ")
toc_item_title = toc_item_title.split(":").first.strip if toc_item_title.include?(":")
Expand All @@ -28,162 +35,59 @@ def call
next if toc_item_title.include?(".....") || toc_item_title.include?("_____")

toc_item_title = toc_item_title.slice(0, MAX_TOC_ITEM_SIZE) if toc_item_title.size > MAX_TOC_ITEM_SIZE
separator_char = detect_separator_chars(toc_item_title, type)
if separator_char.nil?
puts " - Skipping #{toc_item_title} because separator_char is nil (type: #{type})"
next
end

tocs[type] ||= {}
tocs[type][separator_char] ||= Toc.new(type, separator_char)
puts " - Adding TOC item: #{toc_item_title}"
toc.add_item(toc_item_title, content.rindex(toc_item_title) || content.rindex(match.first))
tocs[type][separator_char].add_item(
toc_item_title, content.rindex(toc_item_title) || content.rindex(match.first)
)
end

tocs << toc if toc.toc_items.any?
end

analyze_and_close(tocs)
end

def analyze_and_close(tocs)
tocs.map do |toc|
toc.toc_series_type = detect_series_type(toc)
toc_separator_chars = detect_separator_chars(toc)
if toc_separator_chars.size > 1
extract_tocs_with_different_separators(toc, toc_separator_chars)
else
toc.toc_separator_chars = toc_separator_chars.first
toc
end
end.flatten.map do |toc|
calculate_titles(toc)
# TODO, for the moment is not necessary
# cleanup_toc_items(toc)

toc
end
end

private

def extract_tocs_with_different_separators(toc, toc_separator_chars)
tocs = []
toc_separator_chars.sort_by(&:size).reverse.each do |separator_char|
new_toc = Toc.new
new_toc.toc_separator_chars = separator_char
next if new_toc.toc_separator_chars.empty?

new_toc.toc_series_type = toc.toc_series_type
toc.toc_items.each do |item|
new_toc.add_item(item.title, item.position) if item.title.include?(separator_char)
end

# Delete the items from the original TOC
new_toc.toc_items.each do |new_item|
toc.toc_items.delete_if { |item| item.title == new_item.title }
end
tocs << new_toc
end
tocs
end

def calculate_titles(toc)
toc.toc_items.each do |item|
item.title = item.raw_title.split(toc.toc_separator_chars).last&.strip
control = item.raw_title.split(toc.toc_separator_chars).first
toc.toc_items.delete(item) if control.size > 10
toc.toc_items.delete(item) if toc.toc_separator_chars.nil? || toc.toc_separator_chars.size > 5
case toc.toc_series_type
when :numeric
if control !~ /\A\d+/
puts " - Skipping #{item.title}, should start with a number"
toc.toc_items.delete(item)
end
when :roman, :alpha
if control !~ /\A[A-Za-z]/
puts " - Skipping #{item.title}, should start with a letter"
toc.toc_items.delete(item)
end
else
raise "series type not detected"
end
end
end

def detect_series_type(toc)
random_items = toc.toc_items.sample(5)
types = random_items.map { |item| detect_series_type_from_item(item) }
# return the most common type
types.max_by { |type| types.count(type) }
end

def cleanup_toc_items(toc)
raise "series type not detected" unless toc.toc_series_type

# toc_items are sorted,
current_series_item = nil
next_series_item_should_be = expected_next_series_item(current_series_item)
new_toc_items = []

puts " - Cleaning up TOC items"
puts " - Toc separator chars: #{toc.toc_separator_chars}"

toc.toc_items.each_with_index do |item, _i|
if item.title !~ /\A#{next_series_item_should_be}\s*#{Regexp.quote(toc_separator_chars)}/
puts "- Skipping #{item.title}, should be #{next_series_item_should_be}#{toc_separator_chars}"
next
end

new_toc_items << item
current_series_item = next_series_item_should_be
next_series_item_should_be = expected_next_series_item(current_series_item)
end

toc.toc_items = new_toc_items
end
private

def detect_series_type_from_item(item)
case item.title
when /\A\d+/
:numeric
when /\A\b(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|IVX|XV)+\b/
:roman
when /\A[a-zA-Z]+/
:alpha
else
raise "series type not detected from title #{item.title}"
# Routes a TOC item title to the separator detector for its series type.
#
# title           - the TOC item title to inspect.
# toc_series_type - one of :numeric, :numeric_with_clause, :roman,
#                   :roman_with_title or :alpha.
#
# Returns whatever the selected detector returns, or nil when the series type
# is not one of the values above.
def detect_separator_chars(title, toc_series_type)
  case toc_series_type
  when :numeric then detect_numeric_series_separator_chars(title)
  when :numeric_with_clause then detect_numeric_with_clause_series_separator_chars(title)
  when :roman then detect_roman_series_separator_chars(title)
  when :roman_with_title then detect_roman_with_title_series_separator_chars(title)
  when :alpha then detect_alpha_series_separator_chars(title)
  end
end

def detect_separator_chars(toc)
separators_chars = case toc.toc_series_type
when :numeric
toc.toc_items.map { |item| detect_numeric_series_separator_chars(item) }
when :roman
toc.toc_items.map { |item| detect_roman_series_separator_chars(item) }
when :alpha
toc.toc_items.map { |item| detect_alpha_series_separator_chars(item) }
else
raise "series type not detected"
end
separators_chars.compact.uniq
# Extracts the separator that follows the number of a numeric TOC title.
#
# title - the TOC item title, e.g. "1.2.- Objeto".
#
# Returns the text captured between the number and the following whitespace:
# "", ".", "-" or ".-". Returns nil when the title does not match the pattern.
def detect_numeric_series_separator_chars(title)
  pattern = /(\d+(?:\.\d+)*(\.?-?)\s+[^\n]+)/
  # String#[] with a capture index yields that group, or nil on no match.
  title[pattern, 2]
end

def detect_numeric_series_separator_chars(item)
item.title.split(" ")[0].match(/.*\d([^\d]*)\z/) ? ::Regexp.last_match(1) : nil
# Extracts the separator that follows the number in a "Cláusula <n>" title.
#
# title - the TOC item title, e.g. "Cláusula 2.- Objeto".
#
# Returns the text captured after the number: "", ".", "-" or ".-". Returns
# nil when the title does not contain the "Cláusula <number>" shape.
def detect_numeric_with_clause_series_separator_chars(title)
  pattern = /(?:Cláusula\s+)(\d+(?:\.\d+)*\s*(\.?-?)\s+[^\n]+)/m
  # String#[] with a capture index yields that group, or nil on no match.
  title[pattern, 2]
end

def detect_roman_series_separator_chars(item)
item.title.match(/\b(IX|IV|V|VI|I|II|III)\b([^\s]+)\s/) ? ::Regexp.last_match(2) : nil
# Extracts the separator that follows the Roman numeral of a TOC title.
#
# title - the TOC item title, e.g. "II.- Objeto".
#
# Returns the text captured after the numeral: "", ".", "-" or ".-". Returns
# nil when no upper-case numeral followed by title text is found.
def detect_roman_series_separator_chars(title)
  pattern = /((?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?(\.?-?)\s+[^\n]+)/
  # String#[] with a capture index yields that group, or nil on no match.
  title[pattern, 2]
end

def detect_alpha_series_separator_chars(item)
item.title.match(/([a-zA-Z])([^\s]+)\s/) ? ::Regexp.last_match(2) : nil
# Extracts the separator after the Roman numeral of a headed TOC title
# (ANEXO, CAPITULO/CAPÍTULO, TÍTULO/TITULO followed by a numeral).
#
# title - the TOC item title, e.g. "TÍTULO I. DISPOSICIONES GENERALES".
#
# Returns the text captured after the numeral: "", ".", "-" or ".-". Returns
# nil when the title lacks an upper-case heading word plus numeral.
def detect_roman_with_title_series_separator_chars(title)
  pattern = /((?:ANEXO|CAPITULO|CAPÍTULO|TÍTULO|TITULO)\s+(?:IX|IV|V?I{1,3}|VI{1,3}|X{1,3}V?I{0,3})\s?(\.?-?)\s+[^\n]+)/ # rubocop:disable Layout/LineLength
  # String#[] with a capture index yields that group, or nil on no match.
  title[pattern, 2]
end

def expected_next_series_item(current_item)
case @toc_series_type
when :numeric
(current_item || 0) + 1
when :roman
ROMAN_SERIES[(ROMAN_SERIES.index(current_item) || -1) + 1]
when :alpha
ALPHA_SERIES[(ALPHA_SERIES.index(current_item) || -1) + 1]
end
# Extracts the separator that follows the letter of an alphabetic TOC title.
#
# title - the TOC item title, e.g. "a) Objeto".
#
# Returns the run of ")", "." and "-" characters captured right after the
# letter. Returns nil when no letter is followed by such a run and title text.
def detect_alpha_series_separator_chars(title)
  pattern = /([a-zA-Z]([).-]+)\s+[^\n]+)/
  # String#[] with a capture index yields that group, or nil on no match.
  title[pattern, 2]
end
end
end
1 change: 0 additions & 1 deletion spec/document_parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,6 @@ def assert_sections_absent(unexpected_sections)
let(:file_path) { "spec/files/66067442.txt" }

it "has these sections" do
binding.pry
assert_sections_present([
["TÍTULO I. DISPOSICIONES GENERALES", ""],
["Cláusula 1 Régimen jurídico.", ""],
Expand Down

0 comments on commit 65c9745

Please sign in to comment.