diff --git a/bin/pdf-extract b/bin/pdf-extract index 13e685f..db3ea28 100755 --- a/bin/pdf-extract +++ b/bin/pdf-extract @@ -22,6 +22,7 @@ resolvers = { outputs = { :xml => proc { :stdout }, + :bib => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".refs.bib" }, :pdf => proc { |f| File::basename(f.sub /\.[a-zA-Z0-9]+\Z/, "") + ".mask.pdf" } } @@ -31,6 +32,11 @@ commands = [ :view => :xml, :description => "Extract objects as XML." }, + { + :name => "extract-bib", + :view => :bib, + :description => "Extract resolved references in BibTeX format." + }, { :name => "mark", :view => :pdf, diff --git a/lib/pdf/extract.rb b/lib/pdf/extract.rb index d6ad1fa..cd2658f 100644 --- a/lib/pdf/extract.rb +++ b/lib/pdf/extract.rb @@ -11,6 +11,7 @@ require_relative 'extract/references/resolved_references.rb' require_relative 'extract/view/pdf_view.rb' require_relative 'extract/view/xml_view.rb' +require_relative 'extract/view/bib_view.rb' module PdfExtract @@ -68,6 +69,7 @@ def self.init add_view :pdf, PdfView add_view :xml, XmlView + add_view :bib, BibView end init diff --git a/lib/pdf/extract/references/resolve.rb b/lib/pdf/extract/references/resolve.rb index b379016..fdb933d 100644 --- a/lib/pdf/extract/references/resolve.rb +++ b/lib/pdf/extract/references/resolve.rb @@ -13,8 +13,9 @@ def self.find ref url = "http://search.labs.crossref.org/dois?q=#{CGI.escape(ref)}&rows=1" query = JSON.parse(open(url).read()) unless query.nil? 
- resolved[:doi] = query[0]["doi"] + resolved[:doi] = query[0]["doi"].sub "http://dx.doi.org/","" resolved[:score] = query[0]["score"] + puts "Found DOI from Text: #{resolved[:doi]} (Score: #{resolved[:score]})" end resolved end
diff --git a/lib/pdf/extract/view/bib_view.rb b/lib/pdf/extract/view/bib_view.rb
new file mode 100644
index 0000000..3ab9c51
--- /dev/null
+++ b/lib/pdf/extract/view/bib_view.rb
@@ -0,0 +1,74 @@
+require 'net/http'
+require 'open-uri' # provides open() for URLs and OpenURI::HTTPError, both used below
+
+require_relative 'abstract_view'
+require_relative '../language'
+
+module PdfExtract
+  # Renders resolved references as BibTeX.
+  #
+  # Each object that carries both a :doi and a :score greater than 1 is
+  # looked up on the CrossRef REST API; the records that could be fetched
+  # are joined into a single string.
+  class BibView < AbstractView
+
+    # Builds the BibTeX output for every entry of `objects` (not defined in
+    # this file; presumably supplied by AbstractView).
+    #
+    # options - unused here.
+    #
+    # Returns the fetched BibTeX records joined by "\n".
+    # Raises RuntimeError at the first object lacking :doi or :score.
+    def render options={}
+      bibs = []
+
+      objects.each_pair do |_type, objs|
+        objs.each do |obj|
+          unless obj.key?(:doi) && obj.key?(:score)
+            raise "Must run extract-bib with --resolved_references flag"
+          end
+
+          # Objects scoring 1 or below are skipped without a lookup.
+          next unless obj[:score] > 1
+
+          bib = fetch_bibtex obj[:doi]
+          bibs << bib unless bib.nil?
+        end
+      end
+
+      bibs.join("\n")
+    end
+
+    # Writes the rendered string to filename, truncating any existing file.
+    def self.write render, filename
+      File.open filename, "w" do |file|
+        file.write render
+      end
+    end
+
+    private
+
+    # Downloads the BibTeX record for doi from CrossRef and reports the
+    # outcome on standard output.
+    #
+    # Returns the record as a String, or nil when the URL is rejected as
+    # invalid or CrossRef answers with an HTTP error.
+    #
+    # NOTE(review): URI.encode and Kernel#open on URLs were removed in
+    # Ruby 3.0; move to URI.open with an escaped DOI once the minimum
+    # supported Ruby version allows it.
+    def fetch_bibtex doi
+      url = "http://api.crossref.org/works/#{doi}/transform/application/x-bibtex"
+      bib = open(URI.encode(url)).read()
+      puts "Found BibTeX from DOI: #{doi}"
+      bib
+    rescue URI::InvalidURIError
+      puts "DOI not a valid URL: #{doi}"
+      nil
+    rescue OpenURI::HTTPError
+      puts "DOI not found on CrossRef: #{doi}"
+      nil
+    end
+
+  end
+end