From e91dacdcc78650c2abbde7ee58e8d187952f297b Mon Sep 17 00:00:00 2001 From: CodingPF Date: Sun, 12 Nov 2023 15:37:01 +0100 Subject: [PATCH 1/6] Move ratelimiter to crawler --- .../base/webaccess/JsoupConnection.java | 4 ++- .../crawler/arte/tasks/ArteTaskBase.java | 7 ---- .../crawler/basic/AbstractCrawler.java | 18 ++++++++-- .../crawler/basic/AbstractJsonRestTask.java | 6 ---- .../crawler/basic/AbstractRestTask.java | 1 + .../mserver/crawler/dw/DWTaskBase.java | 9 ----- .../crawler/sr/tasks/SrFilmDetailTask.java | 3 +- .../sr/tasks/SrRateLimitedDocumentTask.java | 33 ------------------- .../sr/tasks/SrTopicArchivePageTask.java | 3 +- .../crawler/zdf/tasks/ZdfTaskBase.java | 10 ------ 10 files changed, 24 insertions(+), 70 deletions(-) delete mode 100644 src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java diff --git a/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java b/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java index 8c03195f4..6ff257725 100644 --- a/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java +++ b/src/main/java/de/mediathekview/mserver/base/webaccess/JsoupConnection.java @@ -1,5 +1,6 @@ package de.mediathekview.mserver.base.webaccess; +import okhttp3.ConnectionPool; import okhttp3.OkHttpClient; import okhttp3.Request; import okhttp3.Response; @@ -22,12 +23,13 @@ public class JsoupConnection { private static final String FILE_TYPE_M3U8 = "m3u8"; protected OkHttpClient client; - public JsoupConnection(final int timeout) { + public JsoupConnection(final int timeout, final int threadPoolSize) { client = new OkHttpClient.Builder() .connectTimeout(timeout, TimeUnit.SECONDS) .readTimeout(timeout, TimeUnit.SECONDS) .callTimeout(timeout, TimeUnit.SECONDS) + .connectionPool(new ConnectionPool(threadPoolSize, 5L, TimeUnit.MINUTES)) .build(); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java index fc9705df5..ffdd4d853 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java +++ b/src/main/java/de/mediathekview/mserver/crawler/arte/tasks/ArteTaskBase.java @@ -1,6 +1,5 @@ package de.mediathekview.mserver.crawler.arte.tasks; -import com.google.common.util.concurrent.RateLimiter; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; @@ -19,7 +18,6 @@ public abstract class ArteTaskBase extends AbstractRestTask { private static final Logger LOG = LogManager.getLogger(ArteTaskBase.class); - private static RateLimiter limiter = null; private final transient GsonBuilder gsonBuilder; protected ArteTaskBase( @@ -106,11 +104,6 @@ private Response executeRequest(final WebTarget aTarget) { if (authKey.isPresent()) { request = request.header(HEADER_AUTHORIZATION, authKey.get()); } - - if (limiter == null) { - limiter = RateLimiter.create(crawler.getCrawlerConfig().getMaximumRequestsPerSecond()); - } - limiter.acquire(); return request .header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP) .header(HEADER_ACCEPT, APPLICATION_JSON) diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java index 226a122ad..87483f71e 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java @@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger; import org.jsoup.nodes.Document; +import com.google.common.util.concurrent.RateLimiter; + import java.io.IOException; import java.time.Duration; import java.time.LocalDateTime; @@ -41,6 +43,7 @@ public abstract class AbstractCrawler implements Callable> { protected Set films; private LocalDateTime startTime; protected JsoupConnection jsoupConnection; + protected RateLimiter rateLimiter; protected AbstractCrawler( final ForkJoinPool aForkJoinPool, @@ -58,8 +61,11 @@ protected AbstractCrawler( runtimeConfig = rootConfig.getConfig(); crawlerConfig = rootConfig.getSenderConfig(getSender()); - jsoupConnection = new JsoupConnection(crawlerConfig.getSocketTimeoutInSeconds()); - + jsoupConnection = new JsoupConnection( + rootConfig.getSenderConfig(getSender()).getSocketTimeoutInSeconds(), + runtimeConfig.getMaximumCpuThreads()); + rateLimiter = RateLimiter.create(rootConfig.getSenderConfig(getSender()).getMaximumRequestsPerSecond()); + films = ConcurrentHashMap.newKeySet(); } @@ -137,6 +143,14 @@ public JsoupConnection getConnection() { public void setConnection(JsoupConnection connection) { jsoupConnection = connection; } + + public RateLimiter getRateLimiter() { + return rateLimiter; + } + + public void setRateLimiter(RateLimiter rateLimiter) { + this.rateLimiter = rateLimiter; + } /** * Request an url and receive the body as String diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java index 80cd14fa0..327e25fab 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractJsonRestTask.java @@ -1,6 +1,5 @@ package de.mediathekview.mserver.crawler.basic; -import com.google.common.util.concurrent.RateLimiter; import com.google.gson.Gson; import com.google.gson.GsonBuilder; @@ -23,7 +22,6 @@ public abstract class AbstractJsonRestTask protected static final String ENCODING_GZIP = "gzip"; private static final long serialVersionUID = -1090560363478964885L; protected final transient GsonBuilder gsonBuilder; - private static RateLimiter limiter = null; protected AbstractJsonRestTask( final AbstractCrawler crawler, @@ -63,10 +61,6 @@ protected void processRestTarget(final D aDTO, final WebTarget aTarget) { } protected Response createResponse(final Builder request, final D aDTO) { - if (limiter == null) { - limiter = RateLimiter.create(crawler.getCrawlerConfig().getMaximumRequestsPerSecond()); - } - limiter.acquire(); request.header(ACCEPT_CHARSET, StandardCharsets.UTF_8); return request.header(ACCEPT_ENCODING, ENCODING_GZIP).header("User-Agent", "Mozilla").get(); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java index 6c3ac4248..c48ca5664 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractRestTask.java @@ -72,6 +72,7 @@ protected void processElement(final D aDTO) { * @return the {@link WebTarget} to access the url. */ protected WebTarget createWebTarget(final String aUrl) { + crawler.getRateLimiter().acquire(); return client.target(aUrl); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java index 43b40f0fb..7cc07c7c7 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java +++ b/src/main/java/de/mediathekview/mserver/crawler/dw/DWTaskBase.java @@ -1,10 +1,8 @@ package de.mediathekview.mserver.crawler.dw; -import com.google.common.util.concurrent.RateLimiter; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import de.mediathekview.mlib.daten.Sender; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.AbstractRestTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; @@ -23,9 +21,6 @@ @SuppressWarnings("serial") public abstract class DWTaskBase extends AbstractRestTask { private static final Logger LOG = LogManager.getLogger(DWTaskBase.class); - - private static RateLimiter limiter = null; - private final transient GsonBuilder gsonBuilder; protected DWTaskBase( @@ -78,10 +73,6 @@ private Response executeRequest(final WebTarget aTarget) { request.header( ZdfConstants.HEADER_AUTHENTIFICATION, AUTHORIZATION_BEARER + authKey.get()); } - if (limiter == null) { - limiter = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.DW).getMaximumRequestsPerSecond()); - } - limiter.acquire(); return request.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP).get(); } } diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java index baca014cf..25ac3044e 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java @@ -10,6 +10,7 @@ import de.mediathekview.mserver.crawler.ard.json.ArdVideoInfoDto; import de.mediathekview.mserver.crawler.ard.json.ArdVideoInfoJsonDeserializer; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; +import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; import de.mediathekview.mserver.crawler.basic.AbstractUrlTask; import de.mediathekview.mserver.crawler.sr.SrTopicUrlDTO; import org.apache.commons.lang3.StringUtils; @@ -27,7 +28,7 @@ import java.time.format.DateTimeParseException; import java.util.*; -public class SrFilmDetailTask extends SrRateLimitedDocumentTask { +public class SrFilmDetailTask extends AbstractDocumentTask { private static final org.apache.logging.log4j.Logger LOG = LogManager.getLogger(SrFilmDetailTask.class); diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java deleted file mode 100644 index 7ef0414e3..000000000 --- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrRateLimitedDocumentTask.java +++ /dev/null @@ -1,33 +0,0 @@ -package de.mediathekview.mserver.crawler.sr.tasks; - -import com.google.common.util.concurrent.RateLimiter; -import de.mediathekview.mlib.daten.Sender; -import de.mediathekview.mserver.base.config.MServerConfigManager; -import de.mediathekview.mserver.crawler.basic.AbstractCrawler; -import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; -import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; - -import java.util.Queue; - -public abstract class SrRateLimitedDocumentTask - extends AbstractDocumentTask { - - private static final long serialVersionUID = -4077182368484515410L; - - private static RateLimiter LIMITER = null; - - SrRateLimitedDocumentTask( - final AbstractCrawler crawler, - final Queue urlToCrawlDTOs) { - super(crawler, urlToCrawlDTOs); - } - - @Override - protected void processElement(final D urlDTO) { - if (LIMITER== null) { - LIMITER = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.SR).getMaximumRequestsPerSecond()); - } - LIMITER.acquire(); - super.processElement(urlDTO); - } -} diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java index afaf0fbac..4ab22a597 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrTopicArchivePageTask.java @@ -2,6 +2,7 @@ import de.mediathekview.mserver.base.HtmlConsts; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; +import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; import de.mediathekview.mserver.crawler.basic.AbstractUrlTask; import de.mediathekview.mserver.crawler.sr.SrConstants; import de.mediathekview.mserver.crawler.sr.SrTopicUrlDTO; @@ -15,7 +16,7 @@ import java.util.concurrent.ConcurrentLinkedQueue; public class SrTopicArchivePageTask - extends SrRateLimitedDocumentTask { + extends AbstractDocumentTask { private static final String NEXT_PAGE_SELECTOR = "div.pagination__item > a[title*=weiter]"; private static final String SHOW_SELECTOR = "h3.teaser__text__header"; diff --git a/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java b/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java index e2318e46b..9b8b021a8 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java +++ b/src/main/java/de/mediathekview/mserver/crawler/zdf/tasks/ZdfTaskBase.java @@ -1,9 +1,7 @@ package de.mediathekview.mserver.crawler.zdf.tasks; -import com.google.common.util.concurrent.RateLimiter; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import de.mediathekview.mlib.daten.Sender; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.AbstractRestTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; @@ -20,9 +18,6 @@ public abstract class ZdfTaskBase extends AbstractRestTask { private static final Logger LOG = LogManager.getLogger(ZdfTaskBase.class); - - private static RateLimiter limiter = null; - private final GsonBuilder gsonBuilder; protected ZdfTaskBase( @@ -73,11 +68,6 @@ private Response executeRequest(final WebTarget aTarget) { request.header( ZdfConstants.HEADER_AUTHENTIFICATION, AUTHORIZATION_BEARER + authKey.get()); } - if (limiter == null) { - limiter = RateLimiter.create(crawler.getRuntimeConfig().getSenderConfig(Sender.ZDF).getMaximumRequestsPerSecond()); - } - - limiter.acquire(); return request.header(HEADER_ACCEPT_ENCODING, ENCODING_GZIP).get(); } } From a9db3351f90502b83991647647fa8659adce41d4 Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 15 Nov 2023 19:55:02 +0100 Subject: [PATCH 2/6] ratelimiter for crawler calls, added exception for duplicates --- MServer-Config.yaml | 26 +++++++++++++------ .../crawler/basic/AbstractCrawler.java | 5 ++++ .../crawler/kika/tasks/KikaApiFilmTask.java | 10 +++++-- .../crawler/sr/tasks/SrFilmDetailTask.java | 8 ++++-- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/MServer-Config.yaml b/MServer-Config.yaml index d6bc5192d..85cb9940f 100644 --- a/MServer-Config.yaml +++ b/MServer-Config.yaml @@ -17,12 +17,22 @@ maximumRequestsPerSecond: 999.0 # If set only these Sender will be crawled all other will be ignored. senderIncluded: - #- MDR - #- NDR + #- ARD + #- ARTE_DE + #- ARGE_FR + #- ARTE_EN + #- ARTE_PL + #- ARTE_IT + #- ARTE_ES + #- 3SAT + #- FUNK #- KIKA - - DW - #- BR + #- DW + #- ORF #- PHOENIX + #- SRF + - SR + #- ZDF # If set the server will be awake after the crawler run and restarts the run after the given amount. #schedules: @@ -111,14 +121,14 @@ checkImportListUrlTimeoutInSec: 1800 #### Default crawler configurations #### # The maximum amount of URLs to be processed per task. -maximumUrlsPerTask: 50 +maximumUrlsPerTask: 10 # The maximum duration in minutes a crawler may run. maximumCrawlDurationInMinutes: 120 # Enables the topics search # maximumSubpages limits the depth of the topics search -topicsSearchEnabled: false +topicsSearchEnabled: true # The maximum amount of sub pages to be crawled.
# Example: If a Sendung overview side has 10 pages with videos for this Sendung and @@ -164,14 +174,14 @@ senderConfigurations: KIKA: maximumSubpages: 2 maximumRequestsPerSecond: 8.0 - SR: - maximumRequestsPerSecond: 2.0 ZDF: maximumRequestsPerSecond: 10.0 FUNK: maximumUrlsPerTask: 99 DW: maximumSubpages: 0 + SR: + maximumSubpages: 5 # configure string variables crawlerApiParams: diff --git a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java index 87483f71e..b9faa3455 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/basic/AbstractCrawler.java @@ -159,6 +159,7 @@ public void setRateLimiter(RateLimiter rateLimiter) { * @throws IOException */ public String requestBodyAsString(String url) throws IOException { + getRateLimiter().acquire(); return getConnection().requestBodyAsString(url); } @@ -169,6 +170,7 @@ public String requestBodyAsString(String url) throws IOException { * @throws IOException */ public Document requestBodyAsHtmlDocument(String url) throws IOException { + getRateLimiter().acquire(); return getConnection().requestBodyAsHtmlDocument(url); } @@ -179,6 +181,7 @@ public Document requestBodyAsHtmlDocument(String url) throws IOException { * @throws IOException */ public Document requestBodyAsXmlDocument(String url) throws IOException { + getRateLimiter().acquire(); return getConnection().requestBodyAsXmlDocument(url); } @@ -190,6 +193,7 @@ public Document requestBodyAsXmlDocument(String url) throws IOException { * @return size of the response in KB or -1 in case we could not determine the size. */ public long determineFileSizeInKB(String url) { + getRateLimiter().acquire(); return getConnection().determineFileSize(url) / 1024; } @@ -199,6 +203,7 @@ public long determineFileSizeInKB(String url) { * @return return true if the request was successfully processed by the server */ public boolean requestUrlExists(String url) { + getRateLimiter().acquire(); return getConnection().requestUrlExists(url); } /** diff --git a/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java b/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java index 4d68062cc..5e0c09ac6 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java @@ -110,8 +110,14 @@ protected void postProcessing(KikaApiVideoInfoDto aResponseObj, KikaApiFilmDto a aFilm.setUrls(getVideoUrls(aResponseObj, aDTO)); aFilm.addAllSubtitleUrls(getSubtitle(aResponseObj, aDTO)); // - taskResults.add(aFilm); - crawler.incrementAndGetActualCount(); + + + if (!taskResults.add(aFilm)) { + LOG.debug("Rejected duplicate {}",aFilm); + crawler.incrementAndGetErrorCount(); + } else { + crawler.incrementAndGetActualCount(); + } crawler.updateProgress(); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java index 25ac3044e..3181c0d74 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/sr/tasks/SrFilmDetailTask.java @@ -157,8 +157,12 @@ protected void processDocument(final SrTopicUrlDTO aUrlDTO, final Document aDocu addUrls(film, videoInfo.getVideoUrls()); - taskResults.add(film); - crawler.incrementAndGetActualCount(); + if (taskResults.add(film)) { + crawler.incrementAndGetActualCount(); + } else { + crawler.incrementAndGetErrorCount(); + LOG.error("Rejected duplicate {}", film); + } crawler.updateProgress(); } else { LOG.error("SrFilmDetailTask: no title or video found for url {}", aUrlDTO.getUrl()); From 5fb75476d162c025a02f12e663426055cd052a9a Mon Sep 17 00:00:00 2001 From: CodingPF Date: Wed, 15 Nov 2023 23:54:03 +0100 Subject: [PATCH 3/6] threadsafe --- .../mserver/testhelper/WireMockTestBase.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java index ad6f7388a..2ea2bd934 100644 --- a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java +++ b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java @@ -23,7 +23,7 @@ public abstract class WireMockTestBase { private boolean wireMockStarted = false; @Before - public void setUpClass() { + public synchronized void setUpClass() { LOG.info("Setting up WireMock test class"); startWireMock(); } @@ -73,7 +73,7 @@ protected Queue createCrawlerUrlDto(final String requestUrl) { return input; } - protected void setupSuccessfulJsonResponse(final String requestUrl, final String aResponseFile) { + protected synchronized void setupSuccessfulJsonResponse(final String requestUrl, final String aResponseFile) { final String jsonBody = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful JSON response stub for {}", requestUrl); wireMockServer.stubFor( @@ -95,7 +95,7 @@ protected void setupSuccessfulJsonPostResponse( setupSuccessfulJsonPostResponse(requestUrl, aResponseFile, null); } - protected void setupSuccessfulJsonPostResponse( + protected synchronized void setupSuccessfulJsonPostResponse( final String requestUrl, final String aResponseFile, @Nullable final Integer status) { final String jsonBody = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful JSON post response stub for {}", requestUrl); @@ -108,7 +108,7 @@ protected void setupSuccessfulJsonPostResponse( .withBody(jsonBody))); } - protected void setupSuccessfulJsonPostResponse( + protected synchronized void setupSuccessfulJsonPostResponse( final String requestUrl, final String responsefile, final String requestBodyPart, @Nullable final Integer status) { final String jsonBody = FileReader.readFile(responsefile); wireMockServer.stubFor( @@ -121,7 +121,7 @@ protected void setupSuccessfulJsonPostResponse( .withBody(jsonBody))); } - protected void setupSuccessfulXmlResponse(final String requestUrl, final String aResponseFile) { + protected synchronized void setupSuccessfulXmlResponse(final String requestUrl, final String aResponseFile) { final String xmlBody = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful XML response stub for {}", requestUrl); wireMockServer.stubFor( @@ -133,31 +133,31 @@ protected void setupSuccessfulXmlResponse(final String requestUrl, final String .withBody(xmlBody))); } - protected void setupSuccessfulResponse(final String requestUrl, final String aResponseFile) { + protected synchronized void setupSuccessfulResponse(final String requestUrl, final String aResponseFile) { final String body = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful response stub for {}", requestUrl); wireMockServer.stubFor( get(urlEqualTo(requestUrl)).willReturn(aResponse().withStatus(200).withBody(body))); } - protected void setupHeadResponse(final String requestUrl, final int aHttpCode) { + protected synchronized void setupHeadResponse(final String requestUrl, final int aHttpCode) { LOG.info("Adding successful HEAD response stub for {}", requestUrl); wireMockServer.stubFor( head(urlEqualTo(requestUrl)).willReturn(aResponse().withStatus(aHttpCode))); } - protected void setupHeadResponse(final int aHttpCode) { + protected synchronized void setupHeadResponse(final int aHttpCode) { LOG.info("Adding {} HEAD response stub for any URL.", aHttpCode); wireMockServer.stubFor(head(anyUrl()).willReturn(aResponse().withStatus(aHttpCode))); } - protected void setupResponseWithoutBody(final String requestUrl, final int aHttpCode) { + protected synchronized void setupResponseWithoutBody(final String requestUrl, final int aHttpCode) { LOG.info("Adding {} stub for {}.", aHttpCode, requestUrl); wireMockServer.stubFor( get(urlEqualTo(requestUrl)).willReturn(aResponse().withStatus(aHttpCode))); } - protected void setupHeadRequestForFileSize() { + protected synchronized void setupHeadRequestForFileSize() { LOG.info("Adding file size HEAD request stub for any url."); wireMockServer.stubFor( head(urlMatching(".*")) From 1913d72bb5fc0095586388df9a0419ebbad46f2d Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 12 Nov 2023 21:40:45 +0100 Subject: [PATCH 4/6] #938 use url optimizers in kika crawler --- .../mserver/crawler/ard/ArdUrlOptimizer.java | 41 +++++++++++++++---- .../crawler/kika/tasks/KikaApiFilmTask.java | 27 ++++++++---- .../crawler/zdf/ZdfVideoUrlOptimizer.java | 2 + .../ArdVideoInfoJsonDeserializerTest.java | 4 +- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/ard/ArdUrlOptimizer.java b/src/main/java/de/mediathekview/mserver/crawler/ard/ArdUrlOptimizer.java index c9b3bdb84..e573021d7 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/ard/ArdUrlOptimizer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/ard/ArdUrlOptimizer.java @@ -2,11 +2,34 @@ import de.mediathekview.mserver.crawler.basic.AbstractCrawler; -public class ArdUrlOptimizer { +import java.util.HashMap; +import java.util.Map; - public static final String ARD_URL_1280 = ".xl.mp4"; - public static final String ARD_URL_1920 = ".xxl.mp4"; +public class ArdUrlOptimizer { + private static final String BR_URL_1280 = "_X.mp4"; + private static final String BR_URL_1920 = "_HD.mp4"; + private static final String HR_URL_1280 = "1280x720-50p-3200kbit.mp4"; + private static final String HR_URL_1920 = "1920x1080-50p-5000kbit.mp4"; + private static final String NDR_URL_1280 = ".hd.mp4"; + private static final String NDR_URL_1920 = ".1080.mp4"; + private static final String RBB_URL_1280 = "hd1080-avc720.mp4"; + private static final String RBB_URL_1920 = "hd1080-avc1080.mp4"; + private static final String SR_URL_1280 = "_P.mp4"; + private static final String SR_URL_1920 = "_H.mp4"; + private static final String SWR_URL_1280 = ".xl.mp4"; + private static final String SWR_URL_1920 = ".xxl.mp4"; + + private static final Map HD_OPTIMIZE = new HashMap<>(); + + static { + HD_OPTIMIZE.put(BR_URL_1280, new String[] {BR_URL_1920}); + HD_OPTIMIZE.put(HR_URL_1280, new String[] {HR_URL_1920}); + HD_OPTIMIZE.put(NDR_URL_1280, new String[] {NDR_URL_1920}); + HD_OPTIMIZE.put(RBB_URL_1280, new String[] {RBB_URL_1920}); + HD_OPTIMIZE.put(SR_URL_1280, new String[] {SR_URL_1920}); + HD_OPTIMIZE.put(SWR_URL_1280, new String[] {SWR_URL_1920}); + } protected AbstractCrawler crawler; @@ -15,10 +38,14 @@ public ArdUrlOptimizer(AbstractCrawler aCrawler) { } public String optimizeHdUrl(final String url) { - if (url.contains(ARD_URL_1280)) { - final String optimizedUrl = url.replace(ARD_URL_1280, ARD_URL_1920); - if (crawler.requestUrlExists(optimizedUrl)) { - return optimizedUrl; + for (Map.Entry entry : HD_OPTIMIZE.entrySet()) { + if (url.contains(entry.getKey())) { + for (String optimizeFragment : entry.getValue()) { + final String optimizedUrl = url.replace(entry.getKey(), optimizeFragment); + if (crawler.requestUrlExists(optimizedUrl)) { + return optimizedUrl; + } + } } } diff --git a/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java b/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java index 5e0c09ac6..6aef6089b 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/kika/tasks/KikaApiFilmTask.java @@ -17,6 +17,8 @@ import java.util.Set; import java.util.UUID; +import de.mediathekview.mserver.crawler.ard.ArdUrlOptimizer; +import de.mediathekview.mserver.crawler.zdf.ZdfVideoUrlOptimizer; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -41,8 +43,13 @@ public class KikaApiFilmTask extends AbstractJsonRestTask urlToCrawlDTOs) { super(crawler, urlToCrawlDTOs, null); + ardUrlOptimizer = new ArdUrlOptimizer(crawler); + zdfVideoUrlOptimizer = new ZdfVideoUrlOptimizer(crawler); } @Override @@ -128,7 +135,7 @@ protected AbstractRecursiveConverterTask createNewOwnInsta } protected Optional getAiredDateTime(KikaApiFilmDto aDTO) { - Optional airedDate = null; + Optional airedDate; if (aDTO.getDate().isPresent()) { airedDate = parseLocalDateTime(aDTO, aDTO.getDate()); } else { @@ -157,12 +164,18 @@ protected Set getSubtitle(KikaApiVideoInfoDto aResponseObj, KikaApiFilmDto protected Map getVideoUrls(KikaApiVideoInfoDto aResponseObj, KikaApiFilmDto aDTO) { Map urls = new EnumMap<>(Resolution.class); for (Map.Entry element : aResponseObj.getVideoUrls().entrySet()) { - try { - final FilmUrl filmUrl = new FilmUrl(element.getValue(), crawler.determineFileSizeInKB(element.getValue())); - urls.put(element.getKey(), filmUrl); - } catch (MalformedURLException e) { - LOG.error("Invalid video url {} for {} error {}", element.getValue(), aDTO.getUrl(), e); - } + try { + String url = element.getValue(); + if (Resolution.HD.equals(element.getKey())) { + url = ardUrlOptimizer.optimizeHdUrl(url); + url = zdfVideoUrlOptimizer.getOptimizedUrlHd(url); + } + + final FilmUrl filmUrl = new FilmUrl(url, crawler.determineFileSizeInKB(url)); + urls.put(element.getKey(), filmUrl); + } catch (MalformedURLException e) { + LOG.error("Invalid video url {} for {} error {}", element.getValue(), aDTO.getUrl(), e); + } } return urls; } diff --git a/src/main/java/de/mediathekview/mserver/crawler/zdf/ZdfVideoUrlOptimizer.java b/src/main/java/de/mediathekview/mserver/crawler/zdf/ZdfVideoUrlOptimizer.java index 50a71acd1..bf07661ca 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/zdf/ZdfVideoUrlOptimizer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/zdf/ZdfVideoUrlOptimizer.java @@ -35,6 +35,7 @@ public class ZdfVideoUrlOptimizer { private static final String HD_3296_15_13 = "3296k_p15v13.mp4"; private static final String HD_3296_15_14 = "3296k_p15v14.mp4"; private static final String HD_3328_15_15 = "3328k_p15v15.mp4"; + private static final String HD_3328_15_17 = "3328k_p15v17.mp4"; private static final String HD_3328_12 = "3328k_p36v12.mp4"; private static final String HD_3328_13 = "3328k_p36v13.mp4"; private static final String HD_3328_14 = "3328k_p36v14.mp4"; @@ -78,6 +79,7 @@ public class ZdfVideoUrlOptimizer { HD_OPTIMIZE.put(HD_3360_36_17, new String[] {HD_6660_37_17, HD_6628_61_17}); HD_OPTIMIZE.put(HD_6628_61_17, new String[] {HD_6660_37_17}); + HD_OPTIMIZE.put(HD_3328_15_17, new String[] {HD_6660_37_17, HD_6628_61_17, HD_3360_36_17}); HD_OPTIMIZE.put(HD_3328_15_15, new String[] {HD_3360_36_15}); HD_OPTIMIZE.put(HD_3256, new String[] {HD_3328_12}); HD_OPTIMIZE.put(HD_3296_15_14, new String[] {HD_3328_36_14}); diff --git a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java index caf8df929..4ef1bd46d 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java @@ -72,7 +72,7 @@ public static Collection data() { "", "https://mediastorage01.sr-online.de/Video/UD/DOKU/1505155201_20170911_KANDIDATENCHECK_LUKSIC_M.mp4", "https://srstorage01-a.akamaihd.net/Video/UD/DOKU/1505155201_20170911_KANDIDATENCHECK_LUKSIC_L.mp4", - "https://srstorage01-a.akamaihd.net/Video/UD/DOKU/1505155201_20170911_KANDIDATENCHECK_LUKSIC_P.mp4" + "https://srstorage01-a.akamaihd.net/Video/UD/DOKU/1505155201_20170911_KANDIDATENCHECK_LUKSIC_H.mp4" }, { "/ard/ard_video_use_http_url.json", @@ -144,7 +144,7 @@ public static Collection data() { "", "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_M.mp4", "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_L.mp4", - "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_P.mp4" + "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_H .mp4" }, { "/ndr/ndr_film_detail_m3u8.json", From 74ae034758b3ff572d092ddcc1e8ed34fe81a9fa Mon Sep 17 00:00:00 2001 From: CodingPF Date: Thu, 16 Nov 2023 08:24:27 +0100 Subject: [PATCH 5/6] undo --- .../mserver/testhelper/WireMockTestBase.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java index 2ea2bd934..ad6f7388a 100644 --- a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java +++ b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java @@ -23,7 +23,7 @@ public abstract class WireMockTestBase { private boolean wireMockStarted = false; @Before - public synchronized void setUpClass() { + public void setUpClass() { LOG.info("Setting up WireMock test class"); startWireMock(); } @@ -73,7 +73,7 @@ protected Queue createCrawlerUrlDto(final String requestUrl) { return input; } - protected synchronized void setupSuccessfulJsonResponse(final String requestUrl, final String aResponseFile) { + protected void setupSuccessfulJsonResponse(final String requestUrl, final String aResponseFile) { final String jsonBody = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful JSON response stub for {}", requestUrl); wireMockServer.stubFor( @@ -95,7 +95,7 @@ protected void setupSuccessfulJsonPostResponse( setupSuccessfulJsonPostResponse(requestUrl, aResponseFile, null); } - protected synchronized void setupSuccessfulJsonPostResponse( + protected void setupSuccessfulJsonPostResponse( final String requestUrl, final String aResponseFile, @Nullable final Integer status) { final String jsonBody = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful JSON post response stub for {}", requestUrl); @@ -108,7 +108,7 @@ protected synchronized void setupSuccessfulJsonPostResponse( .withBody(jsonBody))); } - protected synchronized void setupSuccessfulJsonPostResponse( + protected void setupSuccessfulJsonPostResponse( final String requestUrl, final String responsefile, final String requestBodyPart, @Nullable final Integer status) { final String jsonBody = FileReader.readFile(responsefile); wireMockServer.stubFor( @@ -121,7 +121,7 @@ protected synchronized void setupSuccessfulJsonPostResponse( .withBody(jsonBody))); } - protected synchronized void setupSuccessfulXmlResponse(final String requestUrl, final String aResponseFile) { + protected void setupSuccessfulXmlResponse(final String requestUrl, final String aResponseFile) { final String xmlBody = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful XML response stub for {}", requestUrl); wireMockServer.stubFor( @@ -133,31 +133,31 @@ protected synchronized void setupSuccessfulXmlResponse(final String requestUrl, .withBody(xmlBody))); } - protected synchronized void setupSuccessfulResponse(final String requestUrl, final String aResponseFile) { + protected void setupSuccessfulResponse(final String requestUrl, final String aResponseFile) { final String body = FileReader.readFile(aResponseFile, getWireMockHostPort()); LOG.info("Adding successful response stub for {}", requestUrl); wireMockServer.stubFor( get(urlEqualTo(requestUrl)).willReturn(aResponse().withStatus(200).withBody(body))); } - protected synchronized void setupHeadResponse(final String requestUrl, final int aHttpCode) { + protected void setupHeadResponse(final String requestUrl, final int aHttpCode) { LOG.info("Adding successful HEAD response stub for {}", requestUrl); wireMockServer.stubFor( head(urlEqualTo(requestUrl)).willReturn(aResponse().withStatus(aHttpCode))); } - protected synchronized void setupHeadResponse(final int aHttpCode) { + protected void setupHeadResponse(final int aHttpCode) { LOG.info("Adding {} HEAD response stub for any URL.", aHttpCode); wireMockServer.stubFor(head(anyUrl()).willReturn(aResponse().withStatus(aHttpCode))); } - protected synchronized void setupResponseWithoutBody(final String requestUrl, final int aHttpCode) { + protected void setupResponseWithoutBody(final String requestUrl, final int aHttpCode) { LOG.info("Adding {} stub for {}.", aHttpCode, requestUrl); wireMockServer.stubFor( get(urlEqualTo(requestUrl)).willReturn(aResponse().withStatus(aHttpCode))); } - protected synchronized void setupHeadRequestForFileSize() { + protected void setupHeadRequestForFileSize() { LOG.info("Adding file size HEAD request stub for any url."); wireMockServer.stubFor( head(urlMatching(".*")) From 5c46be841b4a9c2fbefb90a42e620686593d07b1 Mon Sep 17 00:00:00 2001 From: CodingPF Date: Thu, 16 Nov 2023 08:32:25 +0100 Subject: [PATCH 6/6] typo --- .../crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java index 4ef1bd46d..837306d25 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/ard/json/ArdVideoInfoJsonDeserializerTest.java @@ -144,7 +144,7 @@ public static Collection data() { "", "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_M.mp4", "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_L.mp4", - "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_H .mp4" + "https://srstorage01-a.akamaihd.net/Video/FS/SA/sportarena_20190815_184401_H.mp4" }, { "/ndr/ndr_film_detail_m3u8.json",