Skip to content
This repository has been archived by the owner on Nov 29, 2019. It is now read-only.

Commit

Permalink
Pad column containers when detecting regions they contain.
Browse files Browse the repository at this point in the history
Also includes a refinement to possible header detection. Headers must be wider than they
are tall. Avoids confusing table columns and other bits as headers and incorrectly
splitting sections as a result.
  • Loading branch information
kjw committed Apr 12, 2012
1 parent 383e3c8 commit 18062fe
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 31 deletions.
10 changes: 5 additions & 5 deletions lib/analysis/columns.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ def self.columns_at y, body_regions
def self.include_in pdf
deps = [:regions, :bodies]
pdf.spatials :columns, :paged => true, :depends_on => deps do |parser|

body = nil
body_regions = []

parser.before do
body_regions = []
end

parser.objects :bodies do |b|
body = b
end
Expand All @@ -48,7 +48,7 @@ def self.include_in pdf

parser.after do
column_sample_count = pdf.settings[:column_sample_count]

step = 1.0 / (column_sample_count + 1)
column_ranges = []

Expand All @@ -59,7 +59,7 @@ def self.include_in pdf

# Discard those with a coverage of 0.
column_ranges.reject! { |r| r.covered.zero? }

# Discard those with more than x columns. They've probably hit a table.
column_ranges.reject! { |r| r.count > pdf.settings[:max_column_count] }

Expand All @@ -79,7 +79,7 @@ def self.include_in pdf
end
end
end

end
end

Expand Down
19 changes: 17 additions & 2 deletions lib/analysis/sections.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
require_relative '../language'
require_relative '../spatial'
require_relative '../kmeans'
Expand Down Expand Up @@ -27,6 +28,16 @@ def self.candidate? pdf, region, column
within_column && (region[:width].to_f / column[:width]) >= width_ratio
end

def self.possible_header? pdf, region, column
  # A region is a possible header when it is no wider than its column
  # and is at least as wide as it is tall. Regions that are taller than
  # they are wide may be table columns, which should be ignored for the
  # purposes of determining page flow. The pdf argument is not consulted.
  region_width = region[:width]
  return false unless region_width <= column[:width]

  region_width >= region[:height]
end

def self.reference_cluster clusters
# Find the cluster with name_ratio closest to 0.1
# Those are our reference sections.
Expand Down Expand Up @@ -76,13 +87,13 @@ def self.include_in pdf
columns = []

parser.objects :columns do |column|
columns << {:column => column, :regions => []}
columns << {:column => column, :regions => []}
end

parser.objects :regions do |region|
containers = columns.reject do |c|
column = c[:column]
not (column[:page] == region[:page] && Spatial.contains?(column, region))
not (column[:page] == region[:page] && Spatial.contains?(column, region, 1))
end

containers.first[:regions] << region unless containers.empty?
Expand Down Expand Up @@ -131,6 +142,10 @@ def self.include_in pdf
:components => [Spatial.get_dimensions(region)]
})
end
elsif possible_header? pdf, region, column
# Split sections, ignore the header
sections << merging_region if !merging_region.nil?
merging_region = nil
end
end
end
Expand Down
18 changes: 9 additions & 9 deletions lib/language.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def self.transliterate s
when "\ufb05" then r << "ft"
when "\ufb06" then r << "st"
when "\u1d6b" then r << "ue"

# Normalise some punctuation.
when "\u2018" then r << "'"
when "\u2019" then r << "'"
Expand All @@ -33,19 +33,19 @@ def self.transliterate s
r << c
end
end

r.gsub /\s+/, " "
end

def self.letter_ratio s
  # Fraction of the characters in s that are upper-case A-Z, digits 0-9
  # or one of the punctuation marks - [ ] , . " ' ( ).
  # The divisor is a Float, so an empty string yields NaN.
  matching = s.count("A-Z0-9\-[],.\"'()")
  matching / s.length.to_f
end

# TODO Ignore caps in middle of words
def self.cap_ratio s
sentence_end = true
cap_count = 0

s.each_char do |c|
if c =~ /\./
sentence_end = true
Expand All @@ -56,13 +56,13 @@ def self.cap_ratio s
sentence_end = false
end
end

cap_count / s.split.length.to_f
end

def self.year_ratio s
words = s.split

year_words = words.map do |word|
word =~ /[^\d]\d{4}[^\d]/
end
Expand All @@ -77,6 +77,6 @@ def self.name_ratio content
def self.word_count s
  # Number of whitespace-separated tokens in s.
  words = s.split
  words.length
end

end

9 changes: 0 additions & 9 deletions lib/model/regions.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,6 @@ def self.append_line_spacing region
end
end

# Resplitting looks at line y offsets from the origin of a region. If
# a line offset is not uniform, a region split occurs.
# Resplitting is handy because it preserves the maximum width bounds of the
# total region while still presenting paragraphs, headers etc. as separate
# text regions.
#
# NOTE: this is an unimplemented stub. The body is empty, so the method
# ignores its region argument and always returns nil.
def self.resplit region

end

def self.include_in pdf
pdf.spatials :regions, :paged => true, :depends_on => [:chunks] do |parser|
chunks = []
Expand Down
1 change: 0 additions & 1 deletion lib/references/references.rb
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ def self.include_in pdf
#if Score.reference?(section)
content = Spatial.get_text_content(section)
if numeric_sequence? pdf, content
puts "looks like numeric content"
refs += split_by_delimiter pdf, content
elsif multi_margin? section[:lines]
refs += split_by_margin section[:lines]
Expand Down
10 changes: 5 additions & 5 deletions lib/spatial.rb
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,11 @@ def self.collapse objs, options={}
end
end

def self.contains? a, b
a_x1 = a[:x]
a_x2 = a[:x] + a[:width]
a_y1 = a[:y]
a_y2 = a[:y] + a[:height]
def self.contains? a, b, padding=0
a_x1 = a[:x] - padding
a_x2 = a[:x] + a[:width] + (padding * 2)
a_y1 = a[:y] - padding
a_y2 = a[:y] + a[:height] + (padding * 2)

b_x1 = b[:x]
b_x2 = b[:x] + b[:width]
Expand Down

0 comments on commit 18062fe

Please sign in to comment.