diff --git a/html_meta_parser.go b/html_meta_parser.go index 44af992..8f401a3 100644 --- a/html_meta_parser.go +++ b/html_meta_parser.go @@ -17,7 +17,8 @@ import ( func basicParseHTML(chunk *pageChunk) *unfurlResult { result := new(unfurlResult) - result.Type = http.DetectContentType(chunk.data) + sniffedContentType := http.DetectContentType(chunk.data) + result.Type = sniffedContentType switch { case strings.HasPrefix(result.Type, "image/"): result.Type = "image" @@ -26,7 +27,14 @@ func basicParseHTML(chunk *pageChunk) *unfurlResult { result.Type = "website" // pass Content-Type from response headers as it may have // charset definition like "text/html; charset=windows-1251" - if title, desc, err := extractData(chunk.data, chunk.ct); err == nil { + ct := chunk.ct + // There are cases where Content-Type header is "text/html", but http.DetectContentType + // narrows it down to a more specific "text/html; charset=utf-8". In such a case use + // the latter. + if !strings.Contains(ct, "charset=") && strings.Contains(sniffedContentType, "charset=") { + ct = sniffedContentType + } + if title, desc, err := extractData(chunk.data, ct); err == nil { result.Title = title result.Description = desc }