Skip to content

Commit

Permalink
Fix handling of whitespace/non-breaking spaces
Browse files Browse the repository at this point in the history
This is based on a currently unmerged PR to the main repo:
hgmnz#69

It fixes an issue where truncate_html would remove non-breaking spaces,
resulting in words being joined together incorrectly. Now they are
treated like any other whitespace.
  • Loading branch information
sonjapeterson committed Nov 30, 2016
1 parent a09ddcd commit a61fa95
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 7 deletions.
2 changes: 1 addition & 1 deletion lib/truncate_html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
TruncateHtml.configure do |config|
config.length = 100
config.omission = '...'
config.word_boundary = /\S/
config.word_boundary = /(?![[:space:]])./
end


Expand Down
6 changes: 2 additions & 4 deletions lib/truncate_html/html_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module TruncateHtml
class HtmlString < String

UNPAIRED_TAGS = %w(br hr img).freeze
REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|[[:space:]]+|[[:punct:]]/.freeze

def initialize(original_html)
super(original_html)
Expand All @@ -13,9 +13,7 @@ def html_tokens
scan(REGEX).map do |token|
HtmlString.new(
token.gsub(
/\n/,' ' #replace newline characters with a whitespace
).gsub(
/\s+/, ' ' #clean out extra consecutive whitespace
/[[:space:]]+/, ' ' #clean out extra consecutive whitespace
)
)
end
Expand Down
4 changes: 2 additions & 2 deletions spec/truncate_html/html_string_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ def html_string(original_string)

describe '#html_tokens' do
it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
html = '<h1>Hi there</h1> <p>This is sweet!</p> <p> squaremeter m² </p>'
html = "<h1>Hi there</h1> <p>This is sweet!</p> \r\n<p> squaremeter m² </p><div>Non-breaking\nspace here: </div>"
html_string(html).html_tokens.should == ['<h1>', 'Hi', ' ', 'there', '</h1>', ' ', '<p>', 'This', ' ', 'is', ' ', 'sweet!', '</p>',
' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>']
' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>', '<div>', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '</div>']
end
end

Expand Down

0 comments on commit a61fa95

Please sign in to comment.