diff --git a/lib/reverse_markdown.rb b/lib/reverse_markdown.rb
index 6450d64..9d766f7 100644
--- a/lib/reverse_markdown.rb
+++ b/lib/reverse_markdown.rb
@@ -33,15 +33,28 @@ module ReverseMarkdown
 
 
+  # Converts HTML into Markdown.
+  #
+  # input   - an HTML String, a Nokogiri::XML::Document or a Nokogiri::XML::Node;
+  #           anything else yields ''.
+  # options - per-call settings handed to config.with (e.g. force_encoding: true).
+  #
+  # Returns the tidied Markdown String, or '' when there is no root node.
   def self.convert(input, options = {})
-    root = case input
-      when String then Nokogiri::HTML(input).root
-      when Nokogiri::XML::Document then input.root
-      when Nokogiri::XML::Node then input
-    end
+    config.with(options) do
+      # Only Strings can carry broken bytes. Nokogiri documents and nodes are
+      # left as they are so the branches below stay reachable.
+      input = cleaner.force_encoding(input) if input.is_a?(String)
 
-    root or return ''
+      root = case input
+        when String then Nokogiri::HTML(input).root
+        when Nokogiri::XML::Document then input.root
+        when Nokogiri::XML::Node then input
+      end
+
+      # `next` leaves only the block, so config.with can finish whatever it
+      # does after yielding (its body is not part of this patch).
+      root or next ''
 
-    config.with(options) do
       result = ReverseMarkdown::Converters.lookup(root.name).convert(root)
       cleaner.tidy(result)
     end
diff --git a/lib/reverse_markdown/cleaner.rb b/lib/reverse_markdown/cleaner.rb
index 78f9a6c..404b91f 100644
--- a/lib/reverse_markdown/cleaner.rb
+++ b/lib/reverse_markdown/cleaner.rb
@@ -59,6 +59,17 @@ def clean_punctuation_characters(string)
       string.gsub(/(\*\*|~~|__)\s([\.!\?'"])/, "\\1".strip + "\\2")
     end
 
+    # Returns the string as UTF-8 with invalid byte sequences removed when the
+    # force_encoding option is enabled; otherwise returns it unchanged.
+    def force_encoding(string)
+      ReverseMarkdown.config.force_encoding or return string
+      # Do not transcode from 'binary': every valid non-ASCII character is
+      # undefined there and would be stripped too. encode converts input
+      # tagged with another encoding; scrub covers input already tagged as
+      # UTF-8, where encode may leave the bytes untouched.
+      string.encode('UTF-8', invalid: :replace, undef: :replace, replace: '').scrub('')
+    end
+
     private
 
     def preserve_border_whitespaces(string, options = {}, &block)
diff --git a/lib/reverse_markdown/config.rb b/lib/reverse_markdown/config.rb
index 04089b1..07db135 100644
--- a/lib/reverse_markdown/config.rb
+++ b/lib/reverse_markdown/config.rb
@@ -1,10 +1,11 @@
 module ReverseMarkdown
   class Config
-    attr_accessor :unknown_tags, :github_flavored, :tag_border
+    attr_accessor :unknown_tags, :github_flavored, :tag_border, :force_encoding
 
     def initialize
       @unknown_tags = :pass_through
       @github_flavored = false
+      @force_encoding = false
       @em_delimiter = '_'.freeze
       @strong_delimiter = '**'.freeze
       @inline_options = {}
@@ -29,5 +30,11 @@ def github_flavored
     def tag_border
       @inline_options[:tag_border] || @tag_border
     end
+
+    # A truthy per-call option wins; otherwise the global setting applies.
+    # Because of `||`, an inline `false` cannot switch off a global `true`.
+    def force_encoding
+      @inline_options[:force_encoding] || @force_encoding
+    end
   end
 end
diff --git a/spec/lib/reverse_markdown_spec.rb b/spec/lib/reverse_markdown_spec.rb
index 58a5057..a734cd4 100644
--- a/spec/lib/reverse_markdown_spec.rb
+++ b/spec/lib/reverse_markdown_spec.rb
@@ -33,5 +33,19 @@
       end
       expect(ReverseMarkdown.config.github_flavored).to eq true
     end
+
+    describe 'force_encoding option' do
+      it 'raises invalid byte sequence in UTF-8 exception' do
+        expect { ReverseMarkdown.convert("hi \255") }.to raise_error(ArgumentError)
+      end
+
+      it 'handles invalid byte sequence if option is set' do
+        expect(ReverseMarkdown.convert("hi \255", force_encoding: true)).to eq "hi\n\n"
+      end
+
+      it 'keeps valid non-ASCII characters if option is set' do
+        expect(ReverseMarkdown.convert("h\u00e9llo", force_encoding: true)).to eq "h\u00e9llo\n\n"
+      end
+    end
   end
 end