Skip to content

Commit

Permalink
normalize relative URLs in JSONFeed items
Browse files Browse the repository at this point in the history
closes #77
  • Loading branch information
aaronpk committed Nov 9, 2018
1 parent b618f9a commit 9163341
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 14 deletions.
8 changes: 7 additions & 1 deletion lib/XRay/Formats/Format.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ protected static function _loadHTML($html) {
return [$doc, $xpath];
}

protected static function sanitizeHTML($html, $allowImg=true) {
protected static function sanitizeHTML($html, $allowImg=true, $baseURL=false) {
$allowed = [
'a',
'abbr',
Expand Down Expand Up @@ -68,6 +68,12 @@ protected static function sanitizeHTML($html, $allowImg=true) {
$config = HTMLPurifier_Config::createDefault();
$config->set('Cache.DefinitionImpl', null);
$config->set('HTML.AllowedElements', $allowed);

if($baseURL) {
$config->set('URI.MakeAbsolute', true);
$config->set('URI.Base', $baseURL);
}

$def = $config->getHTMLDefinition(true);
$def->addElement(
'time',
Expand Down
12 changes: 7 additions & 5 deletions lib/XRay/Formats/JSONFeed.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ public static function parse($feed, $url) {
$result['data']['type'] = 'feed';

foreach($feed['items'] as $item) {
$result['data']['items'][] = self::_hEntryFromFeedItem($item, $feed);
$result['data']['items'][] = self::_hEntryFromFeedItem($item, $feed, $url);
}
}

return $result;
}

private static function _hEntryFromFeedItem($item, $feed) {
private static function _hEntryFromFeedItem($item, $feed, $feedurl) {
$entry = [
'type' => 'entry',
'author' => [
Expand Down Expand Up @@ -64,14 +64,16 @@ private static function _hEntryFromFeedItem($item, $feed) {
$entry['name'] = trim($item['title']);
}

$baseURL = isset($entry['url']) ? $entry['url'] : $feedurl;

if(isset($item['content_html']) && isset($item['content_text'])) {
$entry['content'] = [
'html' => self::sanitizeHTML($item['content_html']),
'html' => self::sanitizeHTML($item['content_html'], true, $baseURL),
'text' => trim($item['content_text'])
];
} elseif(isset($item['content_html'])) {
$entry['content'] = [
'html' => self::sanitizeHTML($item['content_html']),
'html' => self::sanitizeHTML($item['content_html'], true, $baseURL),
'text' => self::stripHTML($item['content_html'])
];
} elseif(isset($item['content_text'])) {
Expand All @@ -93,7 +95,7 @@ private static function _hEntryFromFeedItem($item, $feed) {
}

if(isset($item['image'])) {
$entry['photo'] = $item['image'];
$entry['photo'] = \Mf2\resolveUrl($baseURL, $item['image']);
}

if(isset($item['tags'])) {
Expand Down
32 changes: 26 additions & 6 deletions tests/FeedTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ public function testJSONFeed() {
$this->assertEquals('feed+json', $result->{'source-format'});
$data = $result->data;

$this->assertEquals(10, count($data->items));
$this->assertEquals(11, count($data->items));
for($i=0; $i<8; $i++) {
$this->assertEquals('entry', $data->items[$i]->type);
$this->assertEquals('manton', $data->items[$i]->author->name);
Expand All @@ -213,15 +213,35 @@ public function testJSONFeed() {
$this->assertEquals('note', $data->items[0]->{'post-type'});
$this->assertEquals('article', $data->items[4]->{'post-type'});

$this->assertEquals('<p>Lots of good feedback on <a href="http://help.micro.blog/2017/wordpress-import/">the WordPress import</a>. Made a couple improvements this morning. Overall, pretty good.</p>', $data->items[9]->content->html);
$this->assertEquals('Lots of good feedback on the WordPress import. Made a couple improvements this morning. Overall, pretty good.', $data->items[9]->content->text);
$this->assertEquals('http://www.manton.org/2017/11/5975.html', $data->items[9]->url);
$this->assertEquals('http://www.manton.org/2017/11/5975.html', $data->items[9]->uid);
$this->assertEquals('2017-11-07T15:04:01+00:00', $data->items[9]->published);
$this->assertEquals('<p>Coming up on a year since I wrote about how <a href="http://www.manton.org/2016/11/todays-social-networks-are-broken.html">today’s social networks are broken</a>. Still what I believe.</p>', $data->items[7]->content->html);
$this->assertEquals('Coming up on a year since I wrote about how today’s social networks are broken. Still what I believe.', $data->items[7]->content->text);
$this->assertEquals('http://www.manton.org/2017/11/5979.html', $data->items[7]->url);
$this->assertEquals('http://www.manton.org/2017/11/5979.html', $data->items[7]->uid);
$this->assertEquals('2017-11-07T21:00:42+00:00', $data->items[7]->published);

$this->assertEquals('feed', $data->type);
}

/**
 * Checks that relative image references in the JSON Feed fixture come back
 * as absolute URLs, both for the item-level photo and for images embedded
 * in the item's HTML content.
 */
public function testJSONFeedRelativeImages() {
  $feedUrl = 'http://feed.example.com/jsonfeed';
  $resp = $this->parse(['url' => $feedUrl, 'expect' => 'feed']);

  $json = $resp->getContent();
  $this->assertEquals(200, $resp->getStatusCode());

  $parsed = json_decode($json);
  $this->assertEquals('feed+json', $parsed->{'source-format'});
  $feed = $parsed->data;

  // Item 9 has its own url, so its relative image is resolved against that url
  $withUrl = $feed->items[9];
  $this->assertEquals('http://www.manton.org/2017/11/image.jpg', $withUrl->photo);

  // Item 10 has no url, so the feed URL is used as the base instead
  $withoutUrl = $feed->items[10];
  $this->assertEquals('http://feed.example.com/image.jpg', $withoutUrl->photo);

  // The <img> inside the content html of item 9 is made absolute as well
  $this->assertContains('http://www.manton.org/2017/11/img.jpg', $feed->items[9]->content->html);
}

public function testAtomFeed() {
$url = 'http://feed.example.com/atom';
$response = $this->parse(['url' => $url, 'expect' => 'feed']);
Expand Down
22 changes: 22 additions & 0 deletions tests/SanitizeTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,28 @@ public function testPhotoInTextContentNoAlt() {
}
*/

/**
 * Checks that a relative <img src> inside the entry content appears as an
 * absolute URL in the returned content html.
 */
public function testRelativePhotoInContent() {
  $pageUrl = 'http://sanitize.example/photo-in-content-relative';
  $resp = $this->parse(['url' => $pageUrl]);

  $json = $resp->getContent();
  $this->assertEquals(200, $resp->getStatusCode());

  $parsed = json_decode($json);
  $html = $parsed->data->content->html;

  // photo1.jpg in the fixture is expected relative to the page URL
  $this->assertContains('http://sanitize.example/photo1.jpg', $html);
}

/**
 * Checks that a relative u-photo image is returned as an absolute URL in
 * the entry's photo property.
 */
public function testRelativePhotoProperty() {
  $pageUrl = 'http://sanitize.example/photo-relative';
  $resp = $this->parse(['url' => $pageUrl]);

  $json = $resp->getContent();
  $this->assertEquals(200, $resp->getStatusCode());

  $parsed = json_decode($json);
  $photos = $parsed->data->photo;

  // The first photo is expected to be the fixture's photo.jpg, made absolute
  $this->assertEquals('http://sanitize.example/photo.jpg', $photos[0]);
}

public function testPhotoInContentEmptyAltAttribute() {
// https://github.com/aaronpk/XRay/issues/52

Expand Down
17 changes: 15 additions & 2 deletions tests/data/feed.example.com/jsonfeed
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,25 @@ Content-Type: application/json; charset=UTF-8
"id": "http://www.manton.org/2017/11/5975.html",
"url": "http://www.manton.org/2017/11/5975.html",
"title": "",
"content_html": "<p>Lots of good feedback on <a href=\"http://help.micro.blog/2017/wordpress-import/\">the WordPress import</a>. Made a couple improvements this morning. Overall, pretty good.</p>\n",
"content_html": "<p><img src=\"img.jpg\"></p>\n",
"image": "image.jpg",
"banner_image": "banner_image.jpg",
"date_published": "2017-11-07T15:04:01+00:00",
"date_modified": "2017-11-07T15:04:01+00:00",
"author": {
"name": "manton"
}
},
{
"id": "http://www.manton.org/2017/11/5975.html",
"title": "",
"content_html": "<p><img src=\"img.jpg\"></p>\n",
"image": "image.jpg",
"date_published": "2017-11-07T15:04:01+00:00",
"date_modified": "2017-11-07T15:04:01+00:00",
"author": {
"name": "manton"
}
}
]
}
}
14 changes: 14 additions & 0 deletions tests/data/sanitize.example/photo-in-content-relative
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive

<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content">Test of <b>relative URL resolution</b> with two <img src="photo1.jpg"> images <img src="photo2.jpg"> inside the content</p>
</body>
</html>
15 changes: 15 additions & 0 deletions tests/data/sanitize.example/photo-relative
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive

<html>
<head>
<title>Test</title>
</head>
<body class="h-entry">
<p class="e-content">Test of <b>relative URL resolution</b> with a photo property</p>
<img class="u-photo" src="photo.jpg">
</body>
</html>

0 comments on commit 9163341

Please sign in to comment.