Skip to content

Commit

Permalink
Improve robustness of charset handling when parsing HTML.
Browse files Browse the repository at this point in the history
Between the server-provided and the automatically detected Content-Type of
text types, prefer the value that includes a charset specification.

This handles the case where the server sends a plain "text/html" Content-Type,
but http.DetectContentType reports the more specific "text/html; charset=utf-8".
  • Loading branch information
artyom committed Jun 14, 2024
1 parent 404c3cf commit 69c2871
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions html_meta_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ import (

func basicParseHTML(chunk *pageChunk) *unfurlResult {
result := new(unfurlResult)
result.Type = http.DetectContentType(chunk.data)
sniffedContentType := http.DetectContentType(chunk.data)
result.Type = sniffedContentType
switch {
case strings.HasPrefix(result.Type, "image/"):
result.Type = "image"
Expand All @@ -26,7 +27,14 @@ func basicParseHTML(chunk *pageChunk) *unfurlResult {
result.Type = "website"
// pass Content-Type from response headers as it may have
// charset definition like "text/html; charset=windows-1251"
if title, desc, err := extractData(chunk.data, chunk.ct); err == nil {
ct := chunk.ct
// There are cases where Content-Type header is "text/html", but http.DetectContentType
// narrows it down to a more specific "text/html; charset=utf-8". In such a case use
// the latter.
if !strings.Contains(ct, "charset=") && strings.Contains(sniffedContentType, "charset=") {
ct = sniffedContentType
}
if title, desc, err := extractData(chunk.data, ct); err == nil {
result.Title = title
result.Description = desc
}
Expand Down

0 comments on commit 69c2871

Please sign in to comment.