Skip to content

Commit

Permalink
Added natural language date parser for published date scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
cjmalloy committed Jul 26, 2023
1 parent 989309e commit c008703
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 5 deletions.
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,11 @@
<artifactId>service</artifactId>
<version>${openapi-gpt.version}</version>
</dependency>
<!-- jchronic: natural language date parser, used by WebScraper when scraping published dates. -->
<!-- NOTE(review): WebScraper imports these classes from the com.mdimension.jchronic package, -->
<!-- which does not match this groupId; presumably the artifact keeps the original package name. -->
<dependency>
<groupId>com.rubiconproject.oss</groupId>
<artifactId>jchronic</artifactId>
<version>0.2.8</version>
</dependency>

<!-- Spring Cloud -->
<dependency>
Expand Down
21 changes: 16 additions & 5 deletions src/main/java/jasper/component/WebScraper.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.mdimension.jchronic.Chronic;
import com.mdimension.jchronic.Options;
import com.mdimension.jchronic.tags.Pointer;
import io.micrometer.core.annotation.Timed;
import jasper.component.dto.JsonLd;
import jasper.domain.Ref;
Expand Down Expand Up @@ -188,16 +191,24 @@ private void parseThumbnails(Ref result, Document doc, Scrape config) {
}
}

private static Options opts = new Options(Pointer.PointerType.PAST);;
private void parsePublished(Ref result, Document doc, Scrape config) {
if (config.getPublishedSelectors() == null) return;
for (var s : config.getPublishedSelectors()) {
var published = doc.select(s).first();
if (published != null) {
try {
result.setPublished(Instant.parse(published.attr("datetime")));
break;
} catch (DateTimeParseException ignored) {}
if (published == null) continue;
String date = "";
if (published.tagName().equals("time")) {
date = published.attr("datetime");
} else {
result.setPublished(Instant.ofEpochSecond(Chronic.parse(published.text(), opts).getBegin()));
return;
}
if (isBlank(date)) continue;
try {
result.setPublished(Instant.parse(date));
return;
} catch (DateTimeParseException ignored) {}
}
}

Expand Down

0 comments on commit c008703

Please sign in to comment.