Skip to content

Commit

Permalink
Filter out irrelevant feed URLs (#3780)
Browse files Browse the repository at this point in the history
  • Loading branch information
Eakam1007 authored Dec 14, 2022
1 parent 465e1a2 commit f2ae958
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 10 deletions.
22 changes: 18 additions & 4 deletions src/api/feed-discovery/src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,17 @@ const getFeedUrlType = (feedUrl) => {
}
};

// return true if the feedURL is a relevant URL that we want to keep
const relevantFeedUrl = (feedUrl) => {
const invalidPathMatchers = [
/\/comments\/feed\/$/, // wordpress.com comments feed
/^https:\/\/public-api.wordpress.com\/oembed/,
/^https:\/\/www.blogger.com\/feeds\/[0-9]*\/posts\/default$/,
/\/feeds\/posts\/default\?alt=rss$/, // blogspot.com alternate rss feed
];
return invalidPathMatchers.every((matcher) => !matcher.test(feedUrl));
};

// Helper function to return the feed url of a given blog url
const getFeedUrls = (document) => {
try {
Expand Down Expand Up @@ -103,10 +114,12 @@ const getFeedUrls = (document) => {
if (links.length > 0) {
for (let i = 0; i < links.length; i += 1) {
const feedUrl = links[i].attribs.href;
feedUrls.push({
feedUrl,
type: getFeedUrlType(feedUrl),
});
if (relevantFeedUrl(feedUrl)) {
feedUrls.push({
feedUrl,
type: getFeedUrlType(feedUrl),
});
}
}
}

Expand All @@ -123,3 +136,4 @@ module.exports.getFeedUrls = getFeedUrls;
module.exports.isTwitchUrl = isTwitchUrl;
module.exports.toTwitchFeedUrl = toTwitchFeedUrl;
module.exports.isFeedUrl = isFeedUrl;
module.exports.relevantFeedUrl = relevantFeedUrl;
24 changes: 18 additions & 6 deletions src/api/feed-discovery/test/router.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -267,17 +267,25 @@ describe('POST /', () => {
expect(res.body).toEqual(result);
});

it('should return 200 and all feed urls if there are multiple link elements that could contain a feed url', async () => {
it('should return 200 and all relevant feed urls if there are multiple link elements that could contain a feed url', async () => {
const blogUrl = 'https://test321.blogspot.com/';
const mockBlogUrlResponseBody = `
<!doctype html>
<html lang="en">
<head>
<link rel="alternate" type="application/x.atom+xml" href="https://test321.blogspot.com/x.atom/feeds/posts/default/-/open-source"/>
<link rel="alternate" type="application/x-atom+xml" href="https://test321.blogspot.com/x-atom/feeds/posts/default/-/open-source"/>
<link rel="alternate" type="application/json" href="https://test321.blogspot.com/json"/>
<link rel="alternate" type="application/json+oembed" href="https://test321.blogspot.com/oembed/?format=json"/>
<link rel="alternate" type="application/xml+oembed" href="https://test321.blogspot.com/oembed/?format=xml"/>
<link rel="alternate" type="application/x.atom+xml" href="https://test321.blogspot.com/x.atom/feeds/posts/default/-/open-source" />
<link rel="alternate" type="application/x-atom+xml" href="https://test321.blogspot.com/x-atom/feeds/posts/default/-/open-source" />
<link rel="alternate" type="application/json" href="https://test321.blogspot.com/json" />
<link rel="alternate" type="application/json+oembed" href="https://test321.blogspot.com/oembed/?format=json" />
<link rel="alternate" type="application/xml+oembed" href="https://test321.blogspot.com/oembed/?format=xml" />
<link rel="alternate" type="application/rss+xml" href="https://test321.blogspot.com/feeds/posts/default" />
<link rel="alternate" type="application/rss+xml" href="https://test321.blogspot.com/feeds/posts/default?alt=rss" />
<link rel="alternate" type="application/atom+xml" href="https://www.blogger.com/feeds/123/posts/default" />
<link rel="alternate" type="application/rss+xml" href="https://test321.wordpress.com/feed/" />
<link rel="alternate" type="application/rss+xml" href="https://test321.wordpress.com/comments/feed/" />
<link rel="alternate" type="application/json+oembed" href="https://public-api.wordpress.com/oembed/?format=json&url=https%3A%2F%2Ftest321.wordpress.com%2F&for=wpcom-auto-discovery" />
<link rel="alternate" type="application/rss+xml" href="https://medium.com/feed/@test321" />
<link rel="alternate" type="application/rss+xml" href="https://dev.to/feed/test321" />
</head>
<body></body>
</html>
Expand All @@ -290,6 +298,10 @@ describe('POST /', () => {
'https://test321.blogspot.com/json',
'https://test321.blogspot.com/oembed/?format=json',
'https://test321.blogspot.com/oembed/?format=xml',
'https://test321.blogspot.com/feeds/posts/default',
'https://test321.wordpress.com/feed/',
'https://medium.com/feed/@test321',
'https://dev.to/feed/test321',
].map((feedUrl) => ({ feedUrl, type: 'blog' })),
};

Expand Down
49 changes: 49 additions & 0 deletions src/api/feed-discovery/test/util.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const {
isFeedUrl,
getBlogBody,
getFeedUrlType,
relevantFeedUrl,
getFeedUrls,
} = require('../src/util');

Expand Down Expand Up @@ -127,6 +128,27 @@ describe('util.js', () => {
expect(getFeedUrlType('not-valid')).toBe('blog');
});

test('relevantFeedUrl returns true for relevant feed URLs', () => {
[
'https://test321.com/feed/user',
'https://test321.workpress.com/feed/',
'https://test321.blogspot.com/feeds/posts/default',
].forEach((feedUrl) => {
expect(relevantFeedUrl(feedUrl)).toBe(true);
});
});

test('relevantFeedUrl returns false for irrelevant feed URLs', () => {
[
'https://test321.workpress.com/comments/feed/',
'https://public-api.wordpress.com/oembed/?format=json&url=https%3A%2F%2Ftest321.wordpress.com%2F&for=wpcom-auto-discovery',
'https://www.blogger.com/feeds/123/posts/default',
'https://test321.blogspot.com/feeds/posts/default?alt=rss',
].forEach((feedUrl) => {
expect(relevantFeedUrl(feedUrl)).toBe(false);
});
});

test('getFeedUrls returns expected atom+xml feed URL for a given document', () => {
const html = (type) => `
<html lang="en">
Expand Down Expand Up @@ -158,4 +180,31 @@ describe('util.js', () => {
test('getFeedUrls returns null if document cannot be parsed', () => {
expect(getFeedUrls(null)).toBe(null);
});

test('getFeedUrls filters irrelevant feed URLs', () => {
const html = `
<html lang="en">
<head>
<link rel="alternate" type="application/atom+xml" href="https://test321.blogspot.com/feeds/posts/default" />
<link rel="alternate" type="application/rss+xml" href="https://test321.blogspot.com/feeds/posts/default?alt=rss" />
<link rel="alternate" type="application/atom+xml" href="https://www.blogger.com/feeds/123/posts/default" />
<link rel="alternate" type="application/rss+xml" href="https://test321.wordpress.com/feed/" />
<link rel="alternate" type="application/rss+xml" href="https://test321.wordpress.com/comments/feed/" />
<link rel="alternate" type="application/json+oembed" href="https://public-api.wordpress.com/oembed/?format=json&url=https%3A%2F%2Ftest321.wordpress.com%2F&for=wpcom-auto-discovery" />
<link rel="alternate" type="application/rss+xml" href="https://medium.com/feed/@test321" />
<link rel="alternate" type="application/rss+xml" href="https://dev.to/feed/test321" />
</head>
<body></body>
</html>
`;

const expectedFeedUrls = [
'https://test321.blogspot.com/feeds/posts/default',
'https://test321.wordpress.com/feed/',
'https://medium.com/feed/@test321',
'https://dev.to/feed/test321',
].map((feedUrl) => ({ feedUrl, type: 'blog' }));

expect(getFeedUrls(html)).toEqual(expectedFeedUrls);
});
});

0 comments on commit f2ae958

Please sign in to comment.