Skip to content

Commit

Permalink
Improve decoding JSON
Browse files Browse the repository at this point in the history
The `Json` class from the crwlr/utils is now used to decode JSON
strings. It tries to fix keys without quotes, which is allowed in
relaxed JSON. Further, JSON-LD <script> blocks containing an invalid
JSON string are ignored and don't lead to an error anymore.

Also, you can now pass a PSR-3 LoggerInterface to the `SchemaOrg` class,
so it'll log decoding errors.
  • Loading branch information
otsch committed May 17, 2023
1 parent 0d6b80e commit eecf44b
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 5 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.2.0] - 2023-05-17
### Added
* You can now optionally pass a PSR-3 LoggerInterface to the `SchemaOrg` class, so it'll log decoding errors.

### Fixed
* The `Json` class from the crwlr/utils package is now used to decode JSON strings. It tries to fix keys without quotes, which are allowed in relaxed JSON. Furthermore, JSON-LD `<script>` blocks containing an invalid JSON string are now ignored and no longer lead to an error.

## [0.1.0] - 2022-09-22
Initial version containing `SchemaOrg` class that finds schema.org JSON-LD objects in HTML documents and converts them to instances of the classes from the spatie schema-org package.
4 changes: 3 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
"require": {
"php": "^8.0",
"spatie/schema-org": "~3.11.0",
"symfony/dom-crawler": "^6.1"
"symfony/dom-crawler": "^6.1",
"crwlr/utils": "^1.0",
"psr/log": "^2.0|^3.0"
},
"require-dev": {
"pestphp/pest": "^1.22",
Expand Down
25 changes: 21 additions & 4 deletions src/SchemaOrg.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

namespace Crwlr\SchemaOrg;

use Crwlr\Utils\Exceptions\InvalidJsonException;
use Crwlr\Utils\Json;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use Spatie\SchemaOrg\BaseType;
use Symfony\Component\DomCrawler\Crawler;

Expand All @@ -12,18 +15,20 @@ class SchemaOrg

private static ?self $singletonInstance = null;

/**
 * @param LoggerInterface|null $logger Optional PSR-3 logger. When set, a warning is logged for
 *                                     every JSON-LD script block whose content can't be decoded.
 */
public function __construct(protected ?LoggerInterface $logger = null)
{
    $this->types = new TypeList();
}

/**
 * Gets the schema.org objects from an HTML document, using a shared instance of this class.
 *
 * The shared instance is created on first use. A logger passed to this method replaces the one
 * held by the shared instance and stays in effect for later calls that don't pass a logger.
 *
 * @param string $html
 * @param LoggerInterface|null $logger
 * @return BaseType[]
 */
public static function fromHtml(string $html, ?LoggerInterface $logger = null): array
{
    self::$singletonInstance ??= new self($logger);

    if ($logger !== null) {
        self::$singletonInstance->logger = $logger;
    }

    return self::$singletonInstance->getFromHtml($html);
}
Expand Down Expand Up @@ -54,7 +59,19 @@ public function getFromHtml(string $html): array

/**
 * Decodes the content of a JSON-LD script block and converts it to a schema.org object.
 *
 * Content that can't be decoded as JSON is skipped: a warning containing the first 100 characters
 * of the content is sent to the logger (if one is set) and null is returned.
 */
private function getSchemaOrgObjectFromScriptBlock(Crawler $domCrawler): ?BaseType
{
    // Read the node text once; it is needed for decoding and for the log message.
    $scriptContent = $domCrawler->text();

    try {
        $jsonData = Json::stringToArray($scriptContent);
    } catch (InvalidJsonException) {
        // preg_replace() returns null on failure, in that case fall back to the unmodified content.
        $snippetWithReducedSpaces = preg_replace('/\s+/', ' ', $scriptContent) ?? $scriptContent;

        // Cut by characters where possible, so a multibyte character is never split in the log message.
        $snippet = function_exists('mb_substr')
            ? mb_substr($snippetWithReducedSpaces, 0, 100)
            : substr($snippetWithReducedSpaces, 0, 100);

        $this->logger?->warning('Failed to parse content of JSON-LD script block as JSON: ' . $snippet);

        return null;
    }

    return $this->convertJsonDataToSchemaOrgObject($jsonData);
}

/**
Expand Down
114 changes: 114 additions & 0 deletions tests/SchemaOrgTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Tests;

use Crwlr\SchemaOrg\SchemaOrg;
use Psr\Log\LoggerInterface;
use Spatie\SchemaOrg\Article;
use Spatie\SchemaOrg\FAQPage;
use Spatie\SchemaOrg\JobPosting;
Expand Down Expand Up @@ -156,3 +157,116 @@

expect($schemaOrgObjects[0]->getProperty('publisher')->getProperty('name'))->toBe('Some Organization, Inc.');
});

test('there is no error if a json-ld script block contains an invalid JSON string', function () {
    // The JSON-LD block below has unquoted values and is closed with "]" instead of "}".
    $documentWithBrokenJsonLd = <<<HTML
        <!DOCTYPE html>
        <html lang="de-AT">
        <head>
        <title>Some Article</title>
        </head>
        <body>
        <h1>Some Article</h1>
        <h2>This is some article about something.</h2>
        <script type="application/ld+json">
        {
        "@context": "https:\/\/schema.org",
        "@type": "Article",
        name: Some Article,
        url: https://de.example.org/articles/some,
        ]
        </script>
        </body>
        </html>
        HTML;

    // The broken block must simply be skipped, so nothing is found and nothing is thrown.
    expect(SchemaOrg::fromHtml($documentWithBrokenJsonLd))->toBeEmpty();
});

test('you can pass it a PSR-3 LoggerInterface and it will log an error message for invalid JSON string', function () {
    // Unquoted values and a closing "]" instead of "}" make this content undecodable.
    $scriptBlockContent = <<<INVALIDJSON
        {
        "@context": "https:\/\/schema.org",
        "@type": "Article",
        name: Some Article,
        url: https://de.example.org/articles/some,
        ]
        INVALIDJSON;

    $html = <<<HTML
        <!DOCTYPE html>
        <html lang="de-AT">
        <head>
        <title>Some Article</title>
        </head>
        <body>
        <h1>Some Article</h1>
        <h2>This is some article about something.</h2>
        <script type="application/ld+json">{$scriptBlockContent}</script>
        </body>
        </html>
        HTML;

    // AbstractLogger implements LoggerInterface and forwards every level-specific method
    // (warning(), error(), ...) to log(), so only log() has to be written here.
    $logger = new class () extends \Psr\Log\AbstractLogger {
        /**
         * @var array<int, array{level: mixed, message: string|\Stringable, context: array<mixed>}>
         */
        public array $messages = [];

        public function log($level, \Stringable|string $message, array $context = []): void
        {
            $this->messages[] = ['level' => $level, 'message' => $message, 'context' => $context];
        }
    };

    SchemaOrg::fromHtml($html, $logger);

    // Fail with a clear expectation instead of an undefined offset when nothing was logged.
    expect($logger->messages)->not->toBeEmpty();

    // The logged snippet is the script content with collapsed whitespace, cut after 100 characters.
    expect($logger->messages[0])->toBe([
        'level' => 'warning',
        'message' => 'Failed to parse content of JSON-LD script block as JSON: { "@context": "https:\/\/schema.org", ' .
            '"@type": "Article", name: Some Article, url: https://de.exampl',
        'context' => [],
    ]);
});

0 comments on commit eecf44b

Please sign in to comment.