Skip to content
This repository has been archived by the owner on Nov 29, 2019. It is now read-only.

Commit

Permalink
Fix packaging and some code.
Browse files Browse the repository at this point in the history
  • Loading branch information
pwnall committed Nov 27, 2013
1 parent 78f32c6 commit c2c7e7c
Show file tree
Hide file tree
Showing 35 changed files with 292 additions and 220 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
bin/*.pdf

*.sw*
.DS_Store
3 changes: 3 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
source 'https://rubygems.org'

gemspec
53 changes: 53 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
PATH
remote: .
specs:
pdf-extract (0.1.1)
commander (>= 4.0.4)
json (>= 1.5.1)
nokogiri (>= 1.5.0)
pdf-reader (= 1.3.3)
prawn (>= 0.11.1)
rb-libsvm (>= 1.1.3)
sqlite3 (>= 1.3.4)

GEM
remote: https://rubygems.org/
specs:
Ascii85 (1.0.2)
afm (0.2.0)
bson (1.9.2)
bson_ext (1.9.2)
bson (~> 1.9.2)
commander (4.1.5)
highline (~> 1.6.11)
hashery (2.1.1)
highline (1.6.20)
json (1.8.1)
mini_portile (0.5.2)
mongo (1.9.2)
bson (~> 1.9.2)
nokogiri (1.6.0)
mini_portile (~> 0.5.0)
pdf-reader (1.3.3)
Ascii85 (~> 1.0.0)
afm (~> 0.2.0)
hashery (~> 2.0)
ruby-rc4
ttfunk
prawn (0.12.0)
pdf-reader (>= 0.9.0)
ttfunk (~> 1.0.2)
rake (10.1.0)
rb-libsvm (1.1.3)
ruby-rc4 (0.1.5)
sqlite3 (1.3.8)
ttfunk (1.0.3)

PLATFORMS
ruby

DEPENDENCIES
bson_ext (>= 1.9.2)
mongo (>= 1.9.2)
pdf-extract!
rake (>= 10.1.0)
2 changes: 2 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
require_relative 'tasks/assign.rb'
require_relative 'tasks/train.rb'
72 changes: 0 additions & 72 deletions bin/assign.rb

This file was deleted.

6 changes: 3 additions & 3 deletions bin/pdf-extract
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

require 'commander/import'
require 'json'
require_relative '../lib/pdf-extract'
require_relative '../lib/references/resolve'
require_relative '../lib/pdf-extract.rb'
require_relative '../lib/pdf/extract/references/resolve.rb'

program :name, 'pdf-extract'
program :version, '0.0.1'
Expand Down Expand Up @@ -146,7 +146,7 @@ command :settings do |c|
else
say "<%= color('AT DEFAULT:', BOLD) %>\n\n"
s.unmodified.each_pair { |k, v| say "#{k}:\t#{v}" }

say "\n<%= color('OVERRIDDEN:', BOLD) %>\n\n"
s.modified.each_pair { |k, v| say "#{k}:\t#{v} (#{s.agent(k)})" }
end
Expand Down
48 changes: 0 additions & 48 deletions bin/train.rb

This file was deleted.

1 change: 1 addition & 0 deletions catalog.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
76 changes: 1 addition & 75 deletions lib/pdf-extract.rb
Original file line number Diff line number Diff line change
@@ -1,75 +1 @@
require_relative 'pdf'
require_relative 'model/characters'
require_relative 'model/chunks'
require_relative 'model/regions'
require_relative 'analysis/titles'
require_relative 'analysis/margins'
require_relative 'analysis/zones'
require_relative 'analysis/columns'
require_relative 'analysis/sections'
require_relative 'references/references'
require_relative 'references/resolved_references'
require_relative 'view/pdf_view'
require_relative 'view/xml_view'

# Entry point of the library: keeps a registry of parser classes and view
# classes on the module itself and exposes helpers to parse a file and render
# the result through a registered view.
module PdfExtract

# Registered view classes, keyed by the short name given to add_view.
@views = {}

# Registered parser classes, in registration order.
@parsers = []

# Registers view_class under name so view_class(name) can find it later.
def self.add_view name, view_class
@views[name] = view_class
end

# Appends parser_class to the list of parsers that parse hooks into each Pdf.
def self.add_parser parser_class
@parsers << parser_class
end

# Builds a new Pdf, lets every registered parser hook into it via include_in,
# yields the Pdf to the caller's block, then replays each entry of
# pdf.spatial_calls against filename. Returns the Pdf.
# A block is required: the method yields unconditionally.
def self.parse filename, &block
pdf = Pdf.new

@parsers.each do |p|
p.include_in pdf
end

# The caller's block runs before pdf.spatial_calls is read below.
# NOTE(review): presumably the block is what populates spatial_calls - confirm in Pdf.
yield pdf

pdf.spatial_calls.each do |spatial_call|
name = spatial_call[:name]
# A fresh Receiver is created for every spatial call.
receiver = Receiver.new pdf
pdf.spatial_builders[name].call receiver
receiver.invoke_calls filename, pdf.spatial_options[name]
end

pdf
end

# Looks up the view class registered under short_name (nil when unknown).
def self.view_class short_name
@views[short_name]
end

# Parses filename (forwarding the block to parse) and renders the result with
# the view class registered under options[:as]; the whole options hash is
# passed on to the view's render method. Returns whatever render returns.
def self.view filename, options = {}, &block
pdf = parse filename, &block
view_class(options[:as]).new(pdf, filename).render options
end

# Registers the built-in parsers (in this order) and the :pdf / :xml views.
def self.init
add_parser Characters
add_parser Chunks
add_parser Regions
add_parser Titles
add_parser Margins
add_parser Zones
add_parser Columns
add_parser Sections
add_parser References
add_parser ResolvedReferences

add_view :pdf, PdfView
add_view :xml, XmlView
end

# Populate the registries as soon as the module is loaded.
init

end
require_relative 'pdf/extract.rb'
75 changes: 75 additions & 0 deletions lib/pdf/extract.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
require_relative 'extract/pdf.rb'
require_relative 'extract/model/characters.rb'
require_relative 'extract/model/chunks.rb'
require_relative 'extract/model/regions.rb'
require_relative 'extract/analysis/titles.rb'
require_relative 'extract/analysis/margins.rb'
require_relative 'extract/analysis/zones.rb'
require_relative 'extract/analysis/columns.rb'
require_relative 'extract/analysis/sections.rb'
require_relative 'extract/references/references.rb'
require_relative 'extract/references/resolved_references.rb'
require_relative 'extract/view/pdf_view.rb'
require_relative 'extract/view/xml_view.rb'

# Library entry point. The module object itself holds two registries (parser
# classes and view classes) and offers helpers to parse a file and to render
# the parsed result through one of the registered views.
module PdfExtract

  # View classes keyed by the short name passed to add_view.
  @views = {}

  # Parser classes, kept in registration order.
  @parsers = []

  # Makes view_class available under name for later lookup by view_class.
  def self.add_view(name, view_class)
    @views.store(name, view_class)
  end

  # Adds parser_class to the parsers hooked into every Pdf created by parse.
  def self.add_parser(parser_class)
    @parsers.push(parser_class)
  end

  # Creates a Pdf, hooks every registered parser into it, hands it to the
  # caller's block and finally replays each recorded spatial call against
  # filename. Returns the Pdf. A block is mandatory (the method always yields).
  def self.parse(filename, &block)
    document = Pdf.new
    @parsers.each { |parser| parser.include_in(document) }

    # The block runs before spatial_calls is read below.
    yield document

    document.spatial_calls.each do |call|
      key = call[:name]
      # Each spatial call gets its own Receiver.
      receiver = Receiver.new(document)
      document.spatial_builders[key].call(receiver)
      receiver.invoke_calls(filename, document.spatial_options[key])
    end

    document
  end

  # Returns the view class registered under short_name, or nil if none is.
  def self.view_class(short_name)
    @views[short_name]
  end

  # Parses filename (the block is forwarded to parse) and renders the result
  # with the view registered under options[:as]. The full options hash is
  # handed to the view's render method, whose result is returned.
  def self.view(filename, options = {}, &block)
    document = parse(filename, &block)
    renderer = view_class(options[:as]).new(document, filename)
    renderer.render(options)
  end

  # Registers the built-in parsers, in this order, followed by the :pdf and
  # :xml views.
  def self.init
    [
      Characters, Chunks, Regions,
      Titles, Margins, Zones, Columns, Sections,
      References, ResolvedReferences
    ].each { |parser_class| add_parser(parser_class) }

    add_view(:pdf, PdfView)
    add_view(:xml, XmlView)
  end

  # Fill the registries as soon as the module is loaded.
  init

end
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
11 changes: 6 additions & 5 deletions lib/font_metrics.rb → lib/pdf/extract/font_metrics.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Taken from pdfminer, in turn extracted from AFM files at:
#
#
# http://www.ctan.org/tex-archive/fonts/adobe/afm/
#

Expand Down Expand Up @@ -39,12 +39,13 @@ def initialize font
@@base_fonts[base_font][:Widths].fetch(c.codepoints.first, 0)
}
else
@ascent = font.ascent
@descent = font.descent
@bbox = font.bbox
font_descriptor = font.font_descriptor
@ascent = font_descriptor.ascent
@descent = font_descriptor.descent
@bbox = font_descriptor.font_bounding_box
@glyph_width_lookup = proc do |c|
begin
font.glyph_width c.codepoints.first
font.glyph_width(c.codepoints.first) || 0
rescue TypeError => e
# It seems some fonts don't have a first char attribute in their
# descriptor and this causes problems for pdf-reader.
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion lib/language.rb → lib/pdf/extract/language.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
require_relative "names"
require_relative 'names.rb'

module PdfExtract::Language

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit c2c7e7c

Please sign in to comment.