Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor icon finder #2141

Merged
merged 1 commit into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions internal/reader/handler/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,8 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool

func checkFeedIcon(store *storage.Storage, feedID int64, websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) {
if !store.HasIcon(feedID) {
icon, err := icon.FindIcon(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
if err != nil {
iconFinder := icon.NewIconFinder(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
if icon, err := iconFinder.FindIcon(); err != nil {
slog.Warn("Unable to find feed icon",
slog.Int64("feed_id", feedID),
slog.String("website_url", websiteURL),
Expand Down
224 changes: 163 additions & 61 deletions internal/reader/icon/finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"encoding/base64"
"fmt"
"io"
"log/slog"
"net/url"
"strings"

Expand All @@ -19,106 +20,170 @@ import (
"github.com/PuerkitoBio/goquery"
)

// FindIcon try to find the website's icon.
func FindIcon(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (icon *model.Icon, err error) {
if feedIconURL == "" {
feedIconURL, err = fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
if err != nil {
return nil, err
type IconFinder struct {
websiteURL string
feedIconURL string
userAgent string
fetchViaProxy bool
allowSelfSignedCertificates bool
}

func NewIconFinder(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) *IconFinder {
return &IconFinder{
websiteURL: websiteURL,
feedIconURL: feedIconURL,
userAgent: userAgent,
fetchViaProxy: fetchViaProxy,
allowSelfSignedCertificates: allowSelfSignedCertificates,
}
}

func (f *IconFinder) FindIcon() (*model.Icon, error) {
slog.Debug("Begin icon discovery process",
slog.String("website_url", f.websiteURL),
slog.String("feed_icon_url", f.feedIconURL),
)

if f.feedIconURL != "" {
if icon, err := f.FetchFeedIcon(); err != nil {
slog.Debug("Unable to download icon from feed",
slog.String("website_url", f.websiteURL),
slog.String("feed_icon_url", f.feedIconURL),
slog.Any("error", err),
)
} else if icon != nil {
return icon, nil
}
}

if strings.HasPrefix(feedIconURL, "data:") {
return parseImageDataURL(feedIconURL)
if icon, err := f.FetchIconsFromHTMLDocument(); err != nil {
slog.Debug("Unable to fetch icons from HTML document",
slog.String("website_url", f.websiteURL),
slog.Any("error", err),
)
} else if icon != nil {
return icon, nil
}

feedIconURL, err = generateIconURL(websiteURL, feedIconURL)
return f.FetchDefaultIcon()
}

func (f *IconFinder) FetchDefaultIcon() (*model.Icon, error) {
slog.Debug("Fetching default icon",
slog.String("website_url", f.websiteURL),
)

iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
if err != nil {
return nil, err
return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
}

if icon, err = downloadIcon(feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates); err != nil {
icon, err := f.DownloadIcon(iconURL)
if err != nil {
return nil, err
}

return icon, nil
}

func generateIconURL(websiteURL, feedIconURL string) (iconURL string, err error) {
feedIconURL = strings.TrimSpace(feedIconURL)
func (f *IconFinder) FetchFeedIcon() (*model.Icon, error) {
slog.Debug("Fetching feed icon",
slog.String("website_url", f.websiteURL),
slog.String("feed_icon_url", f.feedIconURL),
)

if feedIconURL == "" {
iconURL, err = urllib.JoinBaseURLAndPath(urllib.RootURL(websiteURL), "favicon.ico")
if err != nil {
return "", fmt.Errorf(`icon: unable to join base URL and path: %w`, err)
}
} else {
iconURL, err = urllib.AbsoluteURL(websiteURL, feedIconURL)
if err != nil {
return "", fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
}
iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL)
if err != nil {
return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
}

return iconURL, nil
return f.DownloadIcon(iconURL)
}

func fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (string, error) {
rootURL := urllib.RootURL(websiteURL)
func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
slog.Debug("Searching icons from HTML document",
slog.String("website_url", f.websiteURL),
)

clt := client.NewClientWithConfig(rootURL, config.Opts)
clt.WithUserAgent(userAgent)
clt.AllowSelfSignedCertificates = allowSelfSignedCertificates

if fetchViaProxy {
clt.WithProxy()
documentBody, err := f.FetchRootDocument()
if err != nil {
return nil, err
}

response, err := clt.Get()
iconURLs, err := findIconURLsFromHTMLDocument(documentBody)
if err != nil {
return "", fmt.Errorf("icon: unable to download website index page: %v", err)
return nil, err
}

if response.HasServerFailure() {
return "", fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
slog.Debug("Searched icon from HTML document",
slog.String("website_url", f.websiteURL),
slog.String("icon_urls", strings.Join(iconURLs, ",")),
)

for _, iconURL := range iconURLs {
if strings.HasPrefix(iconURL, "data:") {
slog.Debug("Found icon with data URL",
slog.String("website_url", f.websiteURL),
)
return parseImageDataURL(iconURL)
}

iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL)
if err != nil {
return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
}

if icon, err := f.DownloadIcon(iconURL); err != nil {
slog.Debug("Unable to download icon from HTML document",
slog.String("website_url", f.websiteURL),
slog.String("icon_url", iconURL),
slog.Any("error", err),
)
} else if icon != nil {
slog.Debug("Found icon from HTML document",
slog.String("website_url", f.websiteURL),
slog.String("icon_url", iconURL),
)
return icon, nil
}
}

return findIconURLFromHTMLDocument(response.Body)
return nil, nil
}

func findIconURLFromHTMLDocument(body io.Reader) (string, error) {
queries := []string{
"link[rel='shortcut icon']",
"link[rel='Shortcut Icon']",
"link[rel='icon shortcut']",
"link[rel='icon']",
func (f *IconFinder) FetchRootDocument() (io.Reader, error) {
rootURL := urllib.RootURL(f.websiteURL)

clt := client.NewClientWithConfig(rootURL, config.Opts)
clt.WithUserAgent(f.userAgent)
clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates

if f.fetchViaProxy {
clt.WithProxy()
}

doc, err := goquery.NewDocumentFromReader(body)
response, err := clt.Get()
if err != nil {
return "", fmt.Errorf("icon: unable to read document: %v", err)
return nil, fmt.Errorf("icon: unable to download website index page: %v", err)
}

var iconURL string
for _, query := range queries {
doc.Find(query).Each(func(i int, s *goquery.Selection) {
if href, exists := s.Attr("href"); exists {
iconURL = strings.TrimSpace(href)
}
})

if iconURL != "" {
break
}
if response.HasServerFailure() {
return nil, fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
}

return iconURL, nil
return response.Body, nil
}

func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) {
func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
slog.Debug("Downloading icon",
slog.String("website_url", f.websiteURL),
slog.String("icon_url", iconURL),
)

clt := client.NewClientWithConfig(iconURL, config.Opts)
clt.WithUserAgent(userAgent)
clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
if fetchViaProxy {
clt.WithUserAgent(f.userAgent)
clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates
if f.fetchViaProxy {
clt.WithProxy()
}

Expand Down Expand Up @@ -149,6 +214,43 @@ func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCerti
return icon, nil
}

func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) {
queries := []string{
"link[rel='shortcut icon']",
"link[rel='Shortcut Icon']",
"link[rel='icon shortcut']",
"link[rel='icon']",
}

doc, err := goquery.NewDocumentFromReader(body)
if err != nil {
return nil, fmt.Errorf("icon: unable to read document: %v", err)
}

var iconURLs []string
for _, query := range queries {
slog.Debug("Searching icon URL in HTML document", slog.String("query", query))

doc.Find(query).Each(func(i int, s *goquery.Selection) {
var iconURL string

if href, exists := s.Attr("href"); exists {
iconURL = strings.TrimSpace(href)
}

if iconURL != "" {
iconURLs = append(iconURLs, iconURL)

slog.Debug("Found icon URL in HTML document",
slog.String("query", query),
slog.String("icon_url", iconURL))
}
})
}

return iconURLs, nil
}

// https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
// data:[<mediatype>][;base64],<data>
func parseImageDataURL(value string) (*model.Icon, error) {
Expand Down
53 changes: 5 additions & 48 deletions internal/reader/icon/finder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,59 +112,16 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) {
/static/img/favicon.ico
">`

iconURL, err := findIconURLFromHTMLDocument(strings.NewReader(html))
iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html))
if err != nil {
t.Fatal(err)
}

if iconURL != "/static/img/favicon.ico" {
t.Errorf(`Invalid icon URL, got %q`, iconURL)
}
}

func TestGenerateIconURL(t *testing.T) {
iconURL, err := generateIconURL("https://example.org/", "/favicon.png")
if err != nil {
t.Fatal(err)
}

if iconURL != "https://example.org/favicon.png" {
t.Errorf(`Invalid icon URL, got %q`, iconURL)
}

iconURL, err = generateIconURL("https://example.org/", "img/favicon.png")
if err != nil {
t.Fatal(err)
}

if iconURL != "https://example.org/img/favicon.png" {
t.Errorf(`Invalid icon URL, got %q`, iconURL)
}

iconURL, err = generateIconURL("https://example.org/", "https://example.org/img/favicon.png")
if err != nil {
t.Fatal(err)
}

if iconURL != "https://example.org/img/favicon.png" {
t.Errorf(`Invalid icon URL, got %q`, iconURL)
}

iconURL, err = generateIconURL("https://example.org/", "//example.org/img/favicon.png")
if err != nil {
t.Fatal(err)
}

if iconURL != "https://example.org/img/favicon.png" {
t.Errorf(`Invalid icon URL, got %q`, iconURL)
}

iconURL, err = generateIconURL("https://example.org/", " ")
if err != nil {
t.Fatal(err)
if len(iconURLs) != 1 {
t.Fatalf(`Invalid number of icon URLs, got %d`, len(iconURLs))
}

if iconURL != "https://example.org/favicon.ico" {
t.Errorf(`Invalid icon URL, got %q`, iconURL)
if iconURLs[0] != "/static/img/favicon.ico" {
t.Errorf(`Invalid icon URL, got %q`, iconURLs[0])
}
}