From 1cea11881efeff0b8df4f0c7911c1f80bee06ca5 Mon Sep 17 00:00:00 2001 From: Grzegorz Piwowarek Date: Sun, 10 Nov 2024 08:14:06 +0100 Subject: [PATCH] Expand Article 1 examples --- .gitignore | 4 +- pom.xml | 78 ++++++++----- src/main/java/blog/article1/Item.java | 30 ----- src/main/java/blog/article1/WebScraper.java | 56 --------- .../java/blog/article1/e1_plain/E1_Plain.java | 39 +++++++ .../article1/e2_json/E2_ResultsAsJson.java | 52 +++++++++ .../e3_multiple_cities/E3_MultipleCities.java | 57 +++++++++ .../E4_MultipleOutputTypes.java | 108 +++++++++++++++++ .../e5_parallel/E5_ParallelExecution.java | 109 ++++++++++++++++++ .../java/blog/article3/BillDownloader.java | 9 +- .../java/blog/article6/SchemaScraper.java | 9 +- 11 files changed, 422 insertions(+), 129 deletions(-) delete mode 100644 src/main/java/blog/article1/Item.java delete mode 100644 src/main/java/blog/article1/WebScraper.java create mode 100644 src/main/java/blog/article1/e1_plain/E1_Plain.java create mode 100644 src/main/java/blog/article1/e2_json/E2_ResultsAsJson.java create mode 100644 src/main/java/blog/article1/e3_multiple_cities/E3_MultipleCities.java create mode 100644 src/main/java/blog/article1/e4_multiple_outputs/E4_MultipleOutputTypes.java create mode 100644 src/main/java/blog/article1/e5_parallel/E5_ParallelExecution.java diff --git a/.gitignore b/.gitignore index bdbf179..0b24114 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ target target/* *.pdf -*.DS_STORE \ No newline at end of file +*.DS_STORE + +.idea diff --git a/pom.xml b/pom.xml index 04150f1..3c23455 100644 --- a/pom.xml +++ b/pom.xml @@ -1,33 +1,47 @@ - - 4.0.0 - fr.ksahin - blog - 0.0.1-SNAPSHOT - Blog - - - - - net.sourceforge.htmlunit - htmlunit - 2.19 - - - com.fasterxml.jackson.core - jackson-databind - 2.9.8 - - - - org.seleniumhq.selenium - selenium-java - 3.8.1 - + + 4.0.0 + fr.ksahin + blog + 0.0.1-SNAPSHOT + Blog - - com.github.detro - phantomjsdriver - 1.2.0 - - - \ No newline at end of file + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 23 + + + + + + + + net.sourceforge.htmlunit + htmlunit + 2.70.0 + + + + com.fasterxml.jackson.core + jackson-databind + 2.18.0 + + + + org.seleniumhq.selenium + selenium-java + 3.8.1 + + + + com.github.detro + phantomjsdriver + 1.2.0 + + + diff --git a/src/main/java/blog/article1/Item.java b/src/main/java/blog/article1/Item.java deleted file mode 100644 index 661edfb..0000000 --- a/src/main/java/blog/article1/Item.java +++ /dev/null @@ -1,30 +0,0 @@ -package blog.article1; - -import java.math.BigDecimal; - -public class Item { - private String title ; - private BigDecimal price ; - private String url ; - - public String getTitle() { - return title; - } - public void setTitle(String title) { - this.title = title; - } - public BigDecimal getPrice() { - return price; - } - public void setPrice(BigDecimal price) { - this.price = price; - } - public String getUrl() { - return url; - } - public void setUrl(String url) { - this.url = url; - } - - -} diff --git a/src/main/java/blog/article1/WebScraper.java b/src/main/java/blog/article1/WebScraper.java deleted file mode 100644 index 226bd48..0000000 --- a/src/main/java/blog/article1/WebScraper.java +++ /dev/null @@ -1,56 +0,0 @@ -package blog.article1; - -import java.math.BigDecimal; -import java.net.URLEncoder; -import java.util.List; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.gargoylesoftware.htmlunit.WebClient; -import com.gargoylesoftware.htmlunit.html.HtmlAnchor; -import com.gargoylesoftware.htmlunit.html.HtmlElement; -import com.gargoylesoftware.htmlunit.html.HtmlPage; - -public class WebScraper { - - public static void main(String[] args) { - - String searchQuery = "iphone 6s" ; - String baseUrl = "https://newyork.craigslist.org/" ; - WebClient client = new WebClient(); - client.getOptions().setCssEnabled(false); - client.getOptions().setJavaScriptEnabled(false); - try { - String searchUrl = baseUrl + "search/sss?sort=rel&query=" + URLEncoder.encode(searchQuery, "UTF-8"); - HtmlPage page = client.getPage(searchUrl); - - List items = (List) page.getByXPath("//li[@class='result-row']") ; - if(items.isEmpty()){ - System.out.println("No items found !"); - }else{ - for(HtmlElement htmlItem : items){ - HtmlAnchor itemAnchor = ((HtmlAnchor) htmlItem.getFirstByXPath(".//p[@class='result-info']/a")); - HtmlElement spanPrice = ((HtmlElement) htmlItem.getFirstByXPath(".//a/span[@class='result-price']")) ; - - // It is possible that an item doesn't have any price, we set the price to 0.0 in this case - String itemPrice = spanPrice == null ? "0.0" : spanPrice.asText() ; - - Item item = new Item(); - item.setTitle(itemAnchor.asText()); - item.setUrl( baseUrl + itemAnchor.getHrefAttribute()); - - item.setPrice(new BigDecimal(itemPrice.replace("$", ""))); - - - ObjectMapper mapper = new ObjectMapper(); - String jsonString = mapper.writeValueAsString(item) ; - - System.out.println(jsonString); - } - } - } catch(Exception e){ - e.printStackTrace(); - } - - } - -} diff --git a/src/main/java/blog/article1/e1_plain/E1_Plain.java b/src/main/java/blog/article1/e1_plain/E1_Plain.java new file mode 100644 index 0000000..4574b03 --- /dev/null +++ b/src/main/java/blog/article1/e1_plain/E1_Plain.java @@ -0,0 +1,39 @@ +package blog.article1.e1_plain; + +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.HtmlAnchor; +import com.gargoylesoftware.htmlunit.html.HtmlElement; +import com.gargoylesoftware.htmlunit.html.HtmlPage; + +import java.io.IOException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + +class E1_Plain { + + public static void main(String[] args) throws IOException { + var searchQuery = "iphone 13"; + var searchUrl = "https://newyork.craigslist.org/search/moa?query=%s".formatted(URLEncoder.encode(searchQuery, StandardCharsets.UTF_8)); + + System.out.println("searchUrl = " + searchUrl); + + try (var client = new WebClient()) { + client.getOptions().setCssEnabled(false); + client.getOptions().setJavaScriptEnabled(false); + client.getOptions().setThrowExceptionOnFailingStatusCode(false); + client.getOptions().setThrowExceptionOnScriptError(false); + + HtmlPage page = client.getPage(searchUrl); + for (var htmlItem : page.getByXPath("//li[contains(@class,'cl-static-search-result')]")) { + HtmlAnchor itemAnchor = htmlItem.getFirstByXPath(".//a"); + HtmlElement itemTitle = htmlItem.getFirstByXPath(".//div[@class='title']"); + HtmlElement itemPrice = htmlItem.getFirstByXPath(".//div[@class='price']"); + HtmlElement itemLocation = htmlItem.getFirstByXPath(".//div[@class='location']"); + + if (itemAnchor != null && itemTitle != null) { + System.out.printf("Name: %s, Price: %s, Location: %s, URL: %s%n", itemTitle.asNormalizedText(), itemPrice.asNormalizedText(), (itemLocation == null) ? "N/A" : itemLocation.asNormalizedText(), itemAnchor.getHrefAttribute()); + } + } + } + } +} diff --git a/src/main/java/blog/article1/e2_json/E2_ResultsAsJson.java b/src/main/java/blog/article1/e2_json/E2_ResultsAsJson.java new file mode 100644 index 0000000..174849d --- /dev/null +++ b/src/main/java/blog/article1/e2_json/E2_ResultsAsJson.java @@ -0,0 +1,52 @@ +package blog.article1.e2_json; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.HtmlAnchor; +import com.gargoylesoftware.htmlunit.html.HtmlElement; +import com.gargoylesoftware.htmlunit.html.HtmlPage; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + +class E2_ResultsAsJson { + + private final static ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws IOException { + var searchQuery = "iphone 13"; + var searchUrl = "https://newyork.craigslist.org/search/moa?query=%s".formatted(URLEncoder.encode(searchQuery, StandardCharsets.UTF_8)); + + System.out.println("searchUrl = " + searchUrl); + + try (var client = new WebClient()) { + client.getOptions().setCssEnabled(false); + client.getOptions().setJavaScriptEnabled(false); + client.getOptions().setThrowExceptionOnFailingStatusCode(false); + client.getOptions().setThrowExceptionOnScriptError(false); + + HtmlPage page = client.getPage(searchUrl); + for (var htmlItem : page.getByXPath("//li[contains(@class,'cl-static-search-result')]")) { + HtmlAnchor itemAnchor = htmlItem.getFirstByXPath(".//a"); + HtmlElement itemTitle = htmlItem.getFirstByXPath(".//div[@class='title']"); + HtmlElement itemPrice = htmlItem.getFirstByXPath(".//div[@class='price']"); + HtmlElement itemLocation = htmlItem.getFirstByXPath(".//div[@class='location']"); + + if (itemAnchor != null && itemTitle != null) { + var itemName = itemTitle.asNormalizedText(); + var itemUrl = itemAnchor.getHrefAttribute(); + var itemPriceText = itemPrice.asNormalizedText(); + var itemLocationText = (itemLocation == null) ? "N/A" : itemLocation.asNormalizedText(); + + var item = new Item(itemName, new BigDecimal(itemPriceText.replace("$", "").replace(",", ".")), itemLocationText, itemUrl); + System.out.println("item = " + OBJECT_MAPPER.writeValueAsString(item)); + } + } + } + } + + record Item(String title, BigDecimal price, String location, String url) { + } +} diff --git a/src/main/java/blog/article1/e3_multiple_cities/E3_MultipleCities.java b/src/main/java/blog/article1/e3_multiple_cities/E3_MultipleCities.java new file mode 100644 index 0000000..34bc2ee --- /dev/null +++ b/src/main/java/blog/article1/e3_multiple_cities/E3_MultipleCities.java @@ -0,0 +1,57 @@ +package blog.article1.e3_multiple_cities; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.HtmlAnchor; +import com.gargoylesoftware.htmlunit.html.HtmlElement; +import com.gargoylesoftware.htmlunit.html.HtmlPage; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.List; + +class E3_MultipleCities { + + private final static ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws IOException { + var searchQuery = "iphone 13"; + var cities = List.of("newyork", "boston", "washingtondc"); + + try (var client = new WebClient()) { + client.getOptions().setCssEnabled(false); + client.getOptions().setJavaScriptEnabled(false); + client.getOptions().setThrowExceptionOnFailingStatusCode(false); + client.getOptions().setThrowExceptionOnScriptError(false); + + for (String city : cities) { + var searchUrl = "https://%s.craigslist.org/search/moa?query=%s".formatted(city, URLEncoder.encode(searchQuery, StandardCharsets.UTF_8)); + + System.out.println("searchUrl = " + searchUrl); + + HtmlPage page = client.getPage(searchUrl); + for (var htmlItem : page.getByXPath("//li[contains(@class,'cl-static-search-result')]")) { + HtmlAnchor itemAnchor = htmlItem.getFirstByXPath(".//a"); + HtmlElement itemTitle = htmlItem.getFirstByXPath(".//div[@class='title']"); + HtmlElement itemPrice = htmlItem.getFirstByXPath(".//div[@class='price']"); + HtmlElement itemLocation = htmlItem.getFirstByXPath(".//div[@class='location']"); + + if (itemAnchor != null && itemTitle != null) { + var itemName = itemTitle.asNormalizedText(); + var itemUrl = itemAnchor.getHrefAttribute(); + var itemPriceText = itemPrice.asNormalizedText(); + var itemLocationText = (itemLocation == null) ? "N/A" : itemLocation.asNormalizedText(); + + var item = new Item(itemName, new BigDecimal(itemPriceText.replace("$", "").replace(",", ".")), itemLocationText, itemUrl); + System.out.println("item = " + OBJECT_MAPPER.writeValueAsString(item)); + } + } + } + } + } + + record Item(String title, BigDecimal price, String location, String url) { + } +} diff --git a/src/main/java/blog/article1/e4_multiple_outputs/E4_MultipleOutputTypes.java b/src/main/java/blog/article1/e4_multiple_outputs/E4_MultipleOutputTypes.java new file mode 100644 index 0000000..ba4105f --- /dev/null +++ b/src/main/java/blog/article1/e4_multiple_outputs/E4_MultipleOutputTypes.java @@ -0,0 +1,108 @@ +package blog.article1.e4_multiple_outputs; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.HtmlAnchor; +import com.gargoylesoftware.htmlunit.html.HtmlElement; +import com.gargoylesoftware.htmlunit.html.HtmlPage; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +class E4_MultipleOutputTypes { + + public static void main(String[] args) { + timed(() -> { + var outputType = args.length == 1 ? args[0].toLowerCase() : ""; + var searchQuery = "iphone 13"; + var cities = List.of("newyork", "boston", "washingtondc", "losangeles", "chicago", "sanfrancisco", "seattle", "miami", "dallas", "denver"); + + var results = fetchCities(cities, searchQuery); + + switch (outputType) { + case "json" -> asJson(results); + case "csv" -> asCsv(results); + default -> System.out.println("unknown output type"); + } + }); + } + + private static void timed(Runnable action) { + var start = System.currentTimeMillis(); + action.run(); + var end = System.currentTimeMillis(); + System.out.printf("time = %dms%n", end - start); + } + + private static void asJson(Map> results) { + var objectMapper = new ObjectMapper(); + try { + System.out.println(objectMapper.writeValueAsString(results)); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + private static void asCsv(Map> results) { + System.out.println("city,title,price,location,url"); + for (Map.Entry> entry : results.entrySet()) { + for (Item item : entry.getValue()) { + System.out.printf("%s,%s,%s,%s,%s%n", entry.getKey(), item.title, item.price, item.location, item.url); + } + } + } + + private static Map> fetchCities(List cities, String searchQuery) { + try (var client = new WebClient()) { + client.getOptions().setCssEnabled(false); + client.getOptions().setJavaScriptEnabled(false); + client.getOptions().setThrowExceptionOnFailingStatusCode(false); + client.getOptions().setThrowExceptionOnScriptError(false); + + Map> items = new HashMap<>(); + + for (String city : cities) { + var searchUrl = "https://%s.craigslist.org/search/moa?query=%s".formatted(city, URLEncoder.encode(searchQuery, StandardCharsets.UTF_8)); + + System.out.println("searchUrl = " + searchUrl); + + try { + HtmlPage page = client.getPage(searchUrl); + for (var htmlItem : page.getByXPath("//li[contains(@class,'cl-static-search-result')]")) { + HtmlAnchor itemAnchor = htmlItem.getFirstByXPath(".//a"); + HtmlElement itemTitle = htmlItem.getFirstByXPath(".//div[@class='title']"); + HtmlElement itemPrice = htmlItem.getFirstByXPath(".//div[@class='price']"); + HtmlElement itemLocation = htmlItem.getFirstByXPath(".//div[@class='location']"); + + if (itemAnchor != null && itemTitle != null) { + var itemName = itemTitle.asNormalizedText(); + var itemUrl = itemAnchor.getHrefAttribute(); + var itemPriceText = itemPrice.asNormalizedText(); + var itemLocationText = (itemLocation == null) ? "N/A" : itemLocation.asNormalizedText(); + + items.computeIfAbsent(city, _ -> new ArrayList<>()) + .add(new Item(itemName, new BigDecimal(itemPriceText.replace("$", "") + .replace(",", ".")), itemLocationText, itemUrl)); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + return items; + } + } + + record Item(String title, BigDecimal price, String location, String url) { + } + + +} diff --git a/src/main/java/blog/article1/e5_parallel/E5_ParallelExecution.java b/src/main/java/blog/article1/e5_parallel/E5_ParallelExecution.java new file mode 100644 index 0000000..92876d4 --- /dev/null +++ b/src/main/java/blog/article1/e5_parallel/E5_ParallelExecution.java @@ -0,0 +1,109 @@ +package blog.article1.e5_parallel; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.HtmlAnchor; +import com.gargoylesoftware.htmlunit.html.HtmlElement; +import com.gargoylesoftware.htmlunit.html.HtmlPage; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executors; +import java.util.stream.Collectors; + +class E5_ParallelExecution { + + public static void main(String[] args) { + timed(() -> { + var outputType = args.length == 1 ? args[0].toLowerCase() : ""; + var searchQuery = "iphone 13"; + var cities = List.of("newyork", "boston", "washingtondc", "losangeles", "chicago", "sanfrancisco", "seattle", "miami", "dallas", "denver"); + + var results = fetchCities(cities, searchQuery); + + switch (outputType) { + case "json" -> asJson(results); + case "csv" -> asCsv(results); + default -> System.out.println("unknown output type"); + } + }); + } + + private static void asJson(Map> results) { + var objectMapper = new ObjectMapper(); + try { + System.out.println(objectMapper.writeValueAsString(results)); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + private static void asCsv(Map> results) { + System.out.println("city,title,price,location,url"); + for (Map.Entry> entry : results.entrySet()) { + for (Item item : entry.getValue()) { + System.out.printf("%s,%s,%s,%s,%s%n", entry.getKey(), item.title, item.price, item.location, item.url); + } + } + } + + private static Map> fetchCities(List cities, String searchQuery) { + try (var client = new WebClient()) { + client.getOptions().setCssEnabled(false); + client.getOptions().setJavaScriptEnabled(false); + client.getOptions().setThrowExceptionOnFailingStatusCode(false); + client.getOptions().setThrowExceptionOnScriptError(false); + + return cities.stream() + .map(city -> Map.entry(city, CompletableFuture.supplyAsync(() -> { + var searchUrl = "https://%s.craigslist.org/search/moa?query=%s".formatted(city, URLEncoder.encode(searchQuery, StandardCharsets.UTF_8)); + + System.out.println("fetching: " + searchUrl); + + try { + var results = new ArrayList(); + HtmlPage page = client.getPage(searchUrl); + for (var htmlItem : page.getByXPath("//li[contains(@class,'cl-static-search-result')]")) { + HtmlAnchor itemAnchor = htmlItem.getFirstByXPath(".//a"); + HtmlElement itemTitle = htmlItem.getFirstByXPath(".//div[@class='title']"); + HtmlElement itemPrice = htmlItem.getFirstByXPath(".//div[@class='price']"); + HtmlElement itemLocation = htmlItem.getFirstByXPath(".//div[@class='location']"); + + if (itemAnchor != null && itemTitle != null) { + var itemName = itemTitle.asNormalizedText(); + var itemUrl = itemAnchor.getHrefAttribute(); + var itemPriceText = itemPrice.asNormalizedText(); + var itemLocationText = (itemLocation == null) ? "N/A" : itemLocation.asNormalizedText(); + + var item = new Item(itemName, new BigDecimal(itemPriceText.replace("$", "").replace(",", ".")), itemLocationText, itemUrl); + results.add(item); + } + } + return results; + } catch (IOException e) { + throw new RuntimeException(e); + } + }, Executors.newVirtualThreadPerTaskExecutor()))) + .toList() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().join())); + } + } + + record Item(String title, BigDecimal price, String location, String url) { + } + + private static void timed(Runnable action) { + var start = System.currentTimeMillis(); + action.run(); + var end = System.currentTimeMillis(); + System.out.printf("time: %dms%n", end - start); + } +} diff --git a/src/main/java/blog/article3/BillDownloader.java b/src/main/java/blog/article3/BillDownloader.java index 7bdb091..d05845f 100644 --- a/src/main/java/blog/article3/BillDownloader.java +++ b/src/main/java/blog/article3/BillDownloader.java @@ -17,7 +17,6 @@ import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlTable; import com.gargoylesoftware.htmlunit.html.HtmlTableRow; -import com.gargoylesoftware.htmlunit.javascript.host.URL; import blog.article2.Authenticator; @@ -31,21 +30,21 @@ public static void main(String[] args) { WebClient client = Authenticator.autoLogin(baseUrl + "/login", login, password); HtmlPage page = client.getPage("https://cloud.digitalocean.com/settings/billing"); - if(page.asText().contains("You need to sign in for access to this page")){ + if(page.asNormalizedText().contains("You need to sign in for access to this page")){ throw new Exception(String.format("Error during login on %s , check your credentials", baseUrl)); } List bills = new ArrayList(); HtmlTable billsTable = (HtmlTable) page.getFirstByXPath("//table[@class='listing Billing--history']"); for(HtmlTableRow row : billsTable.getBodies().get(0).getRows()){ - String label = row.getCell(1).asText(); + String label = row.getCell(1).asNormalizedText(); // We only want the invoice row, not the payment one if(!label.contains("Invoice")){ continue ; } - Date date = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH).parse(row.getCell(0).asText()); - BigDecimal amount =new BigDecimal(row.getCell(2).asText().replace("$", "")); + Date date = new SimpleDateFormat("MMMM d, yyyy", Locale.ENGLISH).parse(row.getCell(0).asNormalizedText()); + BigDecimal amount =new BigDecimal(row.getCell(2).asNormalizedText().replace("$", "")); String url = ((HtmlAnchor) row.getCell(3).getFirstChild()).getHrefAttribute(); Bill bill = new Bill(label, amount, date, url); diff --git a/src/main/java/blog/article6/SchemaScraper.java b/src/main/java/blog/article6/SchemaScraper.java index b04e258..e1b9dec 100644 --- a/src/main/java/blog/article6/SchemaScraper.java +++ b/src/main/java/blog/article6/SchemaScraper.java @@ -5,7 +5,6 @@ import java.net.MalformedURLException; import java.net.URL; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.WebClient; @@ -14,7 +13,7 @@ public class SchemaScraper { - public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { + public static void main(String[] args) throws FailingHttpStatusCodeException, IOException { WebClient client = new WebClient(); client.getOptions().setCssEnabled(false); client.getOptions().setJavaScriptEnabled(false); @@ -27,10 +26,10 @@ public static void main(String[] args) throws FailingHttpStatusCodeException, Ma .getAttribute("src")); HtmlElement offers = ((HtmlElement) productNode.getFirstByXPath("./span[@itemprop='offers']")); - BigDecimal price = new BigDecimal(((HtmlElement) offers.getFirstByXPath("./span[@itemprop='price']")).asText()); - String productName = (((HtmlElement) productNode.getFirstByXPath("./span[@itemprop='name']")).asText()); + BigDecimal price = new BigDecimal(((HtmlElement) offers.getFirstByXPath("./span[@itemprop='price']")).asNormalizedText()); + String productName = (((HtmlElement) productNode.getFirstByXPath("./span[@itemprop='name']")).asNormalizedText()); String currency = (((HtmlElement) offers.getFirstByXPath("./*[@itemprop='priceCurrency']")).getAttribute("content")); - String productSKU = (((HtmlElement) productNode.getFirstByXPath("./span[@itemprop='sku']")).asText()); + String productSKU = (((HtmlElement) productNode.getFirstByXPath("./span[@itemprop='sku']")).asNormalizedText()); Product product = new Product(price, productName, productSKU, imageUrl, currency); ObjectMapper mapper = new ObjectMapper();