Skip to content

Commit

Permalink
Bugfix for RSS scraper populating cache
Browse files Browse the repository at this point in the history
Also fixed relative RSS urls
  • Loading branch information
cjmalloy committed Mar 22, 2024
1 parent 01d246e commit 00c75a3
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 17 deletions.
26 changes: 14 additions & 12 deletions src/main/java/jasper/component/WebScraper.java
Original file line number Diff line number Diff line change
Expand Up @@ -525,18 +525,20 @@ private String getThumbnail(String src) {
return src;
}

@Timed(value = "jasper.webscrape")
public String rss(String url, String origin) {
var cache = fetch(url, origin);
if (isBlank(cache.getId())) return null;
var strData = new String(storage.get(origin, CACHE, cache.getId()));
if (!strData.trim().startsWith("<")) return null;
var doc = Jsoup.parse(strData);
return doc.getElementsByTag("link").stream()
.filter(t -> t.attr("type").equals("application/rss+xml"))
.filter(t -> t.hasAttr("href"))
.map(t -> t.attr("href"))
.findFirst().orElse(null);
/**
 * Looks for an RSS feed advertised in the markup returned for {@code url}.
 *
 * <p>Calls {@code doScrape(url)}, parses the response body with Jsoup using {@code url} as
 * the base URI, and returns the first {@code <link type="application/rss+xml">} whose
 * {@code href} resolves to an absolute URL.
 *
 * @param url address to scrape; also the base URI used to resolve relative {@code href}s
 * @return the absolute feed URL, or {@code null} when there is no response, the response
 *         has no body, the body does not start with {@code <}, or no resolvable RSS link
 *         is present
 * @throws IOException if reading the response body fails
 */
@Timed(value = "jasper.rssscrape")
public String rss(String url) throws IOException {
    try (var res = doScrape(url)) {
        if (res == null) return null;
        var entity = res.getEntity();
        // A response without a body has no entity; bail out instead of hitting an NPE.
        if (entity == null) return null;
        // Decode explicitly as UTF-8 rather than with the platform default charset, so the
        // result does not depend on the JVM's locale settings. Fully qualified to avoid
        // needing a new import.
        var strData = new String(entity.getContent().readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
        EntityUtils.consumeQuietly(entity);
        // Cheap sanity check: anything that does not start with a tag is not markup.
        if (!strData.trim().startsWith("<")) return null;
        var doc = Jsoup.parse(strData, url);
        return doc.getElementsByTag("link").stream()
            .filter(t -> t.attr("type").equals("application/rss+xml"))
            .filter(t -> t.hasAttr("href"))
            .map(t -> t.absUrl("href"))
            // absUrl yields "" when the href cannot be made absolute; skip those links
            // so callers never receive an empty string as the feed URL.
            .filter(href -> !href.isEmpty())
            .findFirst().orElse(null);
    }
}

public Cache scrape(String url, String origin) {
Expand Down
7 changes: 3 additions & 4 deletions src/main/java/jasper/service/ScrapeService.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,10 @@ public Cache fetch(String url, boolean thumbnail, OutputStream os) throws IOExce
return webScraper.fetch(url, thumbnail, auth.getOrigin(), os);
}

@PreAuthorize( "@auth.hasRole('USER')")
@Timed(value = "jasper.service", extraTags = {"service", "scrape"}, histogram = true)
public String rss(String url) {
// Only require role for new scrapes
if (!refRepository.existsByUrlAndOrigin(url, auth.getOrigin()) && !auth.hasRole(USER)) throw new AccessDeniedException("Requires USER role to scrape.");
return webScraper.rss(url, auth.getOrigin());
public String rss(String url) throws IOException {
    // Pure delegation: the scraper's result (possibly null) is passed through untouched.
    final String feedUrl = webScraper.rss(url);
    return feedUrl;
}

@PreAuthorize( "@auth.hasRole('USER')")
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/jasper/web/rest/ScrapeController.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void fetch(
@ApiResponse(responseCode = "500", content = @Content(schema = @Schema(ref = "https://opensource.zalando.com/problem/schema.yaml#/Problem"))),
})
@GetMapping("rss")
ResponseEntity<String> rss(@RequestParam @Length(max = URL_LEN) String url) {
ResponseEntity<String> rss(@RequestParam @Length(max = URL_LEN) String url) throws IOException {
return ResponseEntity.ok()
.cacheControl(CacheControl.maxAge(100, TimeUnit.DAYS).cachePrivate())
.body(scrapeService.rss(url));
Expand Down

0 comments on commit 00c75a3

Please sign in to comment.