Skip to content

Commit

Permalink
Fix HtmlSource meta tags behaviour
Browse files Browse the repository at this point in the history
The "content" attribute should be treated as user-visible text only for
"keywords" and "description" meta tags.
  • Loading branch information
mekras committed Mar 11, 2017
1 parent 1938cf0 commit b58d2c3
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 8 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## 1.5.1 - 2017-03-11

### Fixed

- HtmlSource: the "content" attribute is now treated as user-visible text only for "keywords" and
"description" meta tags.

## 1.5 - 2017-03-11

### Added
Expand Down
38 changes: 31 additions & 7 deletions src/Source/HtmlSource.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ class HtmlSource implements Source
static private $textAttributes = [
'abbr',
'alt',
'content',
'label',
'placeholder',
'title'
Expand Down Expand Up @@ -61,7 +60,7 @@ public function getAsString()
$document = new \DOMDocument('1.0');
$document->loadHTML($this->html);

return $this->extractText($document->documentElement);
return $this->extractFromNode($document->documentElement);
}

/**
Expand All @@ -71,26 +70,51 @@ public function getAsString()
*
* @return string
*/
// NOTE(review): this hunk is shown as a diff view. Where two near-identical lines are adjacent,
// the first appears to be the removed version and the second its replacement (extractText was
// renamed to extractFromNode, and $text changed from a string to a list of fragments) — confirm
// against the repository before editing.
private function extractText(\DOMNode $node)
private function extractFromNode(\DOMNode $node)
{
// A text node contributes only its own trimmed content.
if ($node instanceof \DOMText) {
return trim($node->textContent);
}

$text = '';
$text = [];

if ($node instanceof \DOMElement) {
// Collect the values of attributes whose names are listed in self::$textAttributes
// (strict comparison on the attribute name).
foreach ($node->attributes as $attr) {
/** @var \DOMAttr $attr */
if (in_array($attr->name, self::$textAttributes, true)) {
$text .= ' ' . trim($attr->textContent);
$text[] = trim($attr->textContent);
}
}
// extractFromMeta() returns '' for anything that is not a qualifying meta element,
// so this fragment is frequently an empty string.
$text[] = $this->extractFromMeta($node);
// Recurse into every child node and collect whatever text it yields.
foreach ($node->childNodes as $child) {
$text .= ' ' . $this->extractText($child);
$text[] = $this->extractFromNode($child);
}
}

// Fragments are joined with single spaces and only the outer ends are trimmed.
// NOTE(review): empty fragments are not filtered out, so consecutive spaces can appear
// inside the result.
return trim($text);
return trim(implode(' ', $text));
}

/**
 * Return the user-visible text carried by a meta element.
 *
 * Only elements named "meta" that have both a "name" and a "content" attribute, and whose
 * "name" is "description" or "keywords" (compared case-insensitively), contribute text.
 * Every other element yields an empty string.
 *
 * @param \DOMElement $node Element to inspect.
 *
 * @return string Trimmed "content" value, or '' when the element does not qualify.
 */
private function extractFromMeta(\DOMElement $node)
{
    $isMeta = strtolower($node->nodeName) === 'meta';

    if ($isMeta && $node->hasAttribute('name') && $node->hasAttribute('content')) {
        $metaName = strtolower($node->getAttribute('name'));

        if ($metaName === 'description' || $metaName === 'keywords') {
            return trim($node->getAttribute('content'));
        }
    }

    return '';
}
}
15 changes: 14 additions & 1 deletion tests/Source/HtmlSourceTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ class HtmlSourceTest extends TestCase
public function testBasics()
{
// The input mixes an attribute value (title="Foo"), element text ("Bar") and trailing
// text (" Baz"); the expected string contains all three, separated by single spaces.
$source = new HtmlSource('<a href="#" title="Foo">Bar</a> Baz');
// NOTE(review): the assertion appears twice in this diff view — presumably the removed and
// the added version of the same line (whitespace-only change); confirm against the repository.
static::assertEquals('Foo Bar Baz', $source->getAsString());
static::assertEquals('Foo Bar Baz', $source->getAsString());
}

/**
 * The "content" attribute of a meta tag counts as user-visible text only when the tag is
 * named "keywords" or "description" (the name is matched case-insensitively).
 */
public function testMetaContent()
{
    // The http-equiv tag must be ignored; the two named tags must each contribute their content.
    $tags = [
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">',
        '<meta name="Keywords" content="Foo">',
        '<meta name="description" content="Bar">',
    ];
    $source = new HtmlSource(implode('', $tags));

    static::assertEquals('Foo Bar', $source->getAsString());
}
}

0 comments on commit b58d2c3

Please sign in to comment.