From 1e88dde725329029038becf0126a06e4fbc65036 Mon Sep 17 00:00:00 2001 From: d4n Date: Tue, 21 Sep 2021 18:28:10 -0500 Subject: [PATCH] v1.0 --- .../GScraperExample/GScraperExample.csproj | 2 +- examples/GScraperExample/Program.cs | 64 ++-- src/GScraper/Brave/BraveCountries.cs | 187 ++++++++++ src/GScraper/Brave/BraveImageColor.cs | 69 ++++ src/GScraper/Brave/BraveImageLayout.cs | 25 ++ src/GScraper/Brave/BraveImageLicense.cs | 33 ++ src/GScraper/Brave/BraveImageResult.cs | 76 +++++ src/GScraper/Brave/BraveImageSize.cs | 29 ++ src/GScraper/Brave/BraveImageType.cs | 29 ++ src/GScraper/Brave/BraveScraper.cs | 172 ++++++++++ .../DuckDuckGo/DuckDuckGoImageColor.cs | 69 ++++ .../DuckDuckGo/DuckDuckGoImageLayout.cs | 25 ++ .../DuckDuckGo/DuckDuckGoImageLicense.cs | 37 ++ .../DuckDuckGo/DuckDuckGoImageResult.cs | 56 +++ .../DuckDuckGo/DuckDuckGoImageSize.cs | 29 ++ .../DuckDuckGo/DuckDuckGoImageTime.cs | 25 ++ .../DuckDuckGo/DuckDuckGoImageType.cs | 33 ++ src/GScraper/DuckDuckGo/DuckDuckGoRegions.cs | 323 ++++++++++++++++++ src/GScraper/DuckDuckGo/DuckDuckGoScraper.cs | 197 +++++++++++ src/GScraper/GScraper.csproj | 31 +- src/GScraper/GScraperException.cs | 29 ++ src/GScraper/GScraperExtensions.cs | 45 +-- src/GScraper/GScraperGuards.cs | 31 ++ src/GScraper/Google/GoogleImageColors.cs | 78 +++++ src/GScraper/Google/GoogleImageLicenses.cs | 18 + src/GScraper/Google/GoogleImageResult.cs | 56 +++ src/GScraper/Google/GoogleImageSize.cs | 25 ++ src/GScraper/Google/GoogleImageTime.cs | 29 ++ src/GScraper/Google/GoogleImageType.cs | 33 ++ src/GScraper/Google/GoogleLanguages.cs | 148 ++++++++ src/GScraper/Google/GoogleScraper.cs | 229 +++++++++++++ src/GScraper/GoogleScraper.cs | 181 ---------- src/GScraper/IImageResult.cs | 28 ++ src/GScraper/ImageResult.cs | 54 --- src/GScraper/SafeSearchLevel.cs | 23 ++ 35 files changed, 2216 insertions(+), 302 deletions(-) create mode 100644 src/GScraper/Brave/BraveCountries.cs create mode 100644 src/GScraper/Brave/BraveImageColor.cs create 
mode 100644 src/GScraper/Brave/BraveImageLayout.cs create mode 100644 src/GScraper/Brave/BraveImageLicense.cs create mode 100644 src/GScraper/Brave/BraveImageResult.cs create mode 100644 src/GScraper/Brave/BraveImageSize.cs create mode 100644 src/GScraper/Brave/BraveImageType.cs create mode 100644 src/GScraper/Brave/BraveScraper.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageColor.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageLayout.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageLicense.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageResult.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageSize.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageTime.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoImageType.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoRegions.cs create mode 100644 src/GScraper/DuckDuckGo/DuckDuckGoScraper.cs create mode 100644 src/GScraper/GScraperGuards.cs create mode 100644 src/GScraper/Google/GoogleImageColors.cs create mode 100644 src/GScraper/Google/GoogleImageLicenses.cs create mode 100644 src/GScraper/Google/GoogleImageResult.cs create mode 100644 src/GScraper/Google/GoogleImageSize.cs create mode 100644 src/GScraper/Google/GoogleImageTime.cs create mode 100644 src/GScraper/Google/GoogleImageType.cs create mode 100644 src/GScraper/Google/GoogleLanguages.cs create mode 100644 src/GScraper/Google/GoogleScraper.cs delete mode 100644 src/GScraper/GoogleScraper.cs create mode 100644 src/GScraper/IImageResult.cs delete mode 100644 src/GScraper/ImageResult.cs create mode 100644 src/GScraper/SafeSearchLevel.cs diff --git a/examples/GScraperExample/GScraperExample.csproj b/examples/GScraperExample/GScraperExample.csproj index d3a8bc8..7c305ab 100644 --- a/examples/GScraperExample/GScraperExample.csproj +++ b/examples/GScraperExample/GScraperExample.csproj @@ -2,7 +2,7 @@ Exe - netcoreapp2.1 + net5.0 diff --git a/examples/GScraperExample/Program.cs 
b/examples/GScraperExample/Program.cs index b32d5ca..27932fc 100644 --- a/examples/GScraperExample/Program.cs +++ b/examples/GScraperExample/Program.cs @@ -1,9 +1,10 @@ using System; using System.Collections.Generic; -using System.Globalization; using System.Net.Http; +using System.Text.Json; using System.Threading.Tasks; using GScraper; +using GScraper.Google; namespace GScraperExample { @@ -12,11 +13,14 @@ internal static class Program private static async Task Main() { Console.WriteLine("GScraper Example Program"); - var scraper = new GoogleScraper(); + using var scraper = new GoogleScraper(); + // Other scrapers: + // using var scraper = new GScraper.DuckDuckGo.DuckDuckGoScraper(); + // using var scraper = new GScraper.Brave.BraveScraper(); while (true) { - Console.Write("Query (enter \'e\' to exit): "); + Console.Write("Query (enter 'e' to exit): "); string text = Console.ReadLine(); if (string.IsNullOrEmpty(text)) @@ -25,40 +29,52 @@ private static async Task Main() if (text == "e") break; - Console.Write("Limit?: "); - if (!int.TryParse(Console.ReadLine(), NumberStyles.Integer, CultureInfo.InvariantCulture, out int limit)) - continue; - - IReadOnlyList images; + IEnumerable images; try { - images = await scraper.GetImagesAsync(text, limit).ConfigureAwait(false); + images = await scraper.GetImagesAsync(text); } - catch (HttpRequestException e) - { - Console.WriteLine(e); - continue; - } - catch (GScraperException e) + catch (Exception e) when (e is HttpRequestException or GScraperException) { Console.WriteLine(e); continue; } + bool enumerateAll = false; + bool stop = false; foreach (var image in images) { - Console.WriteLine($"Title: {image.Title}"); - Console.WriteLine($"Link: {image.Link}"); - Console.WriteLine($"ThumbnailLink: {image.ThumbnailLink}"); - Console.WriteLine($"ContextLink: {image.ContextLink}"); - Console.WriteLine($"DisplayLink: {image.DisplayLink}"); - Console.WriteLine($"Width: {image.Width}"); - Console.WriteLine($"Height: 
{image.Height}"); Console.WriteLine(); + Console.WriteLine(JsonSerializer.Serialize(image, image.GetType(), new JsonSerializerOptions { WriteIndented = true })); + Console.WriteLine(); + + if (!enumerateAll) + { + Console.Write("Press 'n' to send the next image, 'a' to enumerate all images and 's' to stop: "); + var key = Console.ReadKey().Key; + Console.WriteLine(); + + switch (key) + { + case ConsoleKey.A: + enumerateAll = true; + break; + + case ConsoleKey.S: + stop = true; + break; + + default: + break; + } + } + + if (stop) + { + break; + } } } - - scraper.Dispose(); } } } \ No newline at end of file diff --git a/src/GScraper/Brave/BraveCountries.cs b/src/GScraper/Brave/BraveCountries.cs new file mode 100644 index 0000000..cc401a9 --- /dev/null +++ b/src/GScraper/Brave/BraveCountries.cs @@ -0,0 +1,187 @@ +namespace GScraper.Brave +{ + /// + /// Contains the possible countries in Brave search. + /// + public static class BraveCountries + { + /// + /// All regions. + /// + public const string All = "all"; + + /// + /// Argentina. + /// + public const string Argentina = "ar"; + + /// + /// Australia. + /// + public const string Australia = "au"; + + /// + /// Belgium. + /// + public const string Belgium = "be"; + + /// + /// Brazil. + /// + public const string Brazil = "br"; + + /// + /// Canada. + /// + public const string Canada = "ca"; + + /// + /// Chile. + /// + public const string Chile = "cl"; + + /// + /// Denmark. + /// + public const string Denmark = "dk"; + + /// + /// Finland. + /// + public const string Finland = "fi"; + + /// + /// France. + /// + public const string France = "fr"; + /// + /// Germany. + /// + public const string Germany = "de"; + + /// + /// Hong Kong. + /// + public const string HongKong = "hk"; + + /// + /// India. + /// + public const string India = "in"; + + /// + /// Indonesia. + /// + public const string Indonesia = "id"; + + /// + /// Italy. + /// + public const string Italy = "it"; + + /// + /// Japan. 
+ /// + public const string Japan = "jp"; + + /// + /// Korea. + /// + public const string Korea = "kr"; + + /// + /// Malaysia. + /// + public const string Malaysia = "my"; + + /// + /// Mexico. + /// + public const string Mexico = "mx"; + + /// + /// Netherlands. + /// + public const string Netherlands = "nl"; + + /// + /// New Zealand. + /// + public const string NewZealand = "nz"; + + /// + /// Norway. + /// + public const string Norway = "no"; + + /// + /// China. + /// + public const string China = "cn"; + + /// + /// Poland. + /// + public const string Poland = "pl"; + + /// + /// Portugal. + /// + public const string Portugal = "pt"; + + /// + /// Philippines. + /// + public const string Philippines = "ph"; + + /// + /// Russia. + /// + public const string Russia = "ru"; + + /// + /// Saudi Arabia. + /// + public const string SaudiArabia = "sa"; + + /// + /// South Africa. + /// + public const string SouthAfrica = "za"; + + /// + /// Spain. + /// + public const string Spain = "es"; + + /// + /// Sweden. + /// + public const string Sweden = "se"; + + /// + /// Switzerland. + /// + public const string Switzerland = "ch"; + + /// + /// Taiwan. + /// + public const string Taiwan = "tw"; + + /// + /// Turkey. + /// + public const string Turkey = "tr"; + + /// + /// United Kingdom. + /// + public const string UnitedKingdom = "gb"; + + /// + /// United States. + /// + public const string UnitedStates = "us"; + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveImageColor.cs b/src/GScraper/Brave/BraveImageColor.cs new file mode 100644 index 0000000..d98a520 --- /dev/null +++ b/src/GScraper/Brave/BraveImageColor.cs @@ -0,0 +1,69 @@ +namespace GScraper.Brave +{ + /// + /// Specifies the image colors in Brave search. + /// + public enum BraveImageColor + { + /// + /// All colors. + /// + All, + /// + /// Black and white. + /// + Monochrome, + /// + /// Only colors. + /// + ColorOnly, + /// + /// Red color. + /// + Red, + /// + /// Orange color. 
+ /// + Orange, + /// + /// Yellow color. + /// + Yellow, + /// + /// Green color. + /// + Green, + /// + /// Blue color. + /// + Blue, + /// + /// Purple color. + /// + Purple, + /// + /// Pink color. + /// + Pink, + /// + /// Brown color. + /// + Brown, + /// + /// Black color. + /// + Black, + /// + /// Gray color. + /// + Gray, + /// + /// Teal color. + /// + Teal, + /// + /// White color. + /// + White + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveImageLayout.cs b/src/GScraper/Brave/BraveImageLayout.cs new file mode 100644 index 0000000..48c5559 --- /dev/null +++ b/src/GScraper/Brave/BraveImageLayout.cs @@ -0,0 +1,25 @@ +namespace GScraper.Brave +{ + /// + /// Specifies the image layouts in Brave search. + /// + public enum BraveImageLayout + { + /// + /// All layouts + /// + All, + /// + /// Square layout. + /// + Square, + /// + /// Tall layout. + /// + Tall, + /// + /// Wide layout. + /// + Wide + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveImageLicense.cs b/src/GScraper/Brave/BraveImageLicense.cs new file mode 100644 index 0000000..84ae6f7 --- /dev/null +++ b/src/GScraper/Brave/BraveImageLicense.cs @@ -0,0 +1,33 @@ +namespace GScraper.Brave +{ + /// + /// Specifies the image licenses in Brave search. + /// + public enum BraveImageLicense + { + /// + /// All licenses. + /// + All, + /// + /// Public Domain. + /// + Public, + /// + /// Free to Share and Use. + /// + Share, + /// + /// Free to Share and Use Commercially. + /// + ShareCommercially, + /// + /// Free to Modify, Share and Use. + /// + Modify, + /// + /// Free to Modify, Share and Use Commercially. 
+ /// + ModifyCommercially + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveImageResult.cs b/src/GScraper/Brave/BraveImageResult.cs new file mode 100644 index 0000000..1c54324 --- /dev/null +++ b/src/GScraper/Brave/BraveImageResult.cs @@ -0,0 +1,76 @@ +using System; +using System.Diagnostics; + +namespace GScraper.Brave +{ + /// + /// Represents an image result from Brave. + /// + [DebuggerDisplay("Title: {Title}, Url: {Url}")] + public class BraveImageResult : IImageResult + { + internal BraveImageResult(string url, string title, int width, int height, string sourceUrl, + DateTimeOffset pageAge, string source, string thumbnailUrl, string resizedUrl, string format) + { + Url = url; + Title = title; + Width = width; + Height = height; + SourceUrl = sourceUrl; + PageAge = pageAge; + Source = source; + ThumbnailUrl = thumbnailUrl; + ResizedUrl = resizedUrl; + Format = format; + } + + + /// + public string Url { get; } + + /// + public string Title { get; } + + /// + public int Width { get; } + + /// + public int Height { get; } + + /// + /// Gets a URL pointing to the webpage hosting the image. + /// + public string SourceUrl { get; } + + /// + /// Gets the page age. + /// + public DateTimeOffset PageAge { get; } + + /// + /// Gets the name or the root URL of the website this image comes from. + /// + public string Source { get; } + + /// + /// Gets a URL pointing to the thumbnail image. + /// + public string ThumbnailUrl { get; } + + /// + /// Gets a URL pointing to the resized image. + /// + public string ResizedUrl { get; } + + /// + /// Gets the format of the image. + /// + public string Format { get; } + + /// + /// Returns the URL of this result. + /// + /// The URL of this result. 
+ public override string ToString() => Url; + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveImageSize.cs b/src/GScraper/Brave/BraveImageSize.cs new file mode 100644 index 0000000..9224c54 --- /dev/null +++ b/src/GScraper/Brave/BraveImageSize.cs @@ -0,0 +1,29 @@ +namespace GScraper.Brave +{ + /// + /// Specifies the image sizes in Brave search. + /// + public enum BraveImageSize + { + /// + /// Any sizes. + /// + All, + /// + /// Small sizes. + /// + Small, + /// + /// Medium sizes. + /// + Medium, + /// + /// Large sizes. + /// + Large, + /// + /// Wallpaper sizes. + /// + Wallpaper + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveImageType.cs b/src/GScraper/Brave/BraveImageType.cs new file mode 100644 index 0000000..d343a29 --- /dev/null +++ b/src/GScraper/Brave/BraveImageType.cs @@ -0,0 +1,29 @@ +namespace GScraper.Brave +{ + /// + /// Specifies the image types in Brave search. + /// + public enum BraveImageType + { + /// + /// All types. + /// + All, + /// + /// Photograph. + /// + Photo, + /// + /// Clip Art. + /// + Cliparts, + /// + /// Animated GIF. + /// + AnimatedGifHttps, + /// + /// Transparent. + /// + Transparent + } +} \ No newline at end of file diff --git a/src/GScraper/Brave/BraveScraper.cs b/src/GScraper/Brave/BraveScraper.cs new file mode 100644 index 0000000..eb0f3a6 --- /dev/null +++ b/src/GScraper/Brave/BraveScraper.cs @@ -0,0 +1,172 @@ +using System; +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace GScraper.Brave +{ + /// + /// Represents a Brave Search scraper. + /// + public class BraveScraper : IDisposable + { + /// + /// Returns the default API endpoint. 
+ /// + public const string DefaultApiEndpoint = "https://search.brave.com/api/"; + + private readonly HttpClient _httpClient; + private const string _defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"; + private bool _disposed; + + /// + /// Initializes a new instance of the class. + /// + public BraveScraper() : this(new HttpClient()) + { + } + + /// + /// Initializes a new instance of the class using the provided . + /// + public BraveScraper(HttpClient client) : this(client, DefaultApiEndpoint) + { + } + + /// + /// Initializes a new instance of the class using the provided and API endpoint. + /// + public BraveScraper(HttpClient client, string apiEndpoint) + { + GScraperGuards.NotNull(client, nameof(client)); + GScraperGuards.NotNullOrEmpty(apiEndpoint, nameof(apiEndpoint)); + _httpClient = client; + _httpClient.BaseAddress = new Uri(apiEndpoint); + if (_httpClient.DefaultRequestHeaders.UserAgent.Count == 0) + { + _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd(_defaultUserAgent); + } + } + + /// + /// Gets images from Brave Search. + /// + /// This method returns at most 150 image results (unless Brave changes something in their API). + /// The search query. + /// The safe search level. + /// The country. contains the countries that can be used here. + /// The image size. + /// The image type. + /// The image layout. + /// The image color. + /// The image license. + /// A task representing the asynchronous operation. The result contains an of . + /// is null or empty. + /// An error occurred during the scraping process. + public async Task> GetImagesAsync(string query, SafeSearchLevel safeSearch = SafeSearchLevel.Moderate, + string? 
country = null, BraveImageSize size = BraveImageSize.All, BraveImageType type = BraveImageType.All, + BraveImageLayout layout = BraveImageLayout.All, BraveImageColor color = BraveImageColor.All, BraveImageLicense license = BraveImageLicense.All) + { + GScraperGuards.NotNull(query, nameof(query)); + + byte[] bytes; + using (var request = new HttpRequestMessage()) + { + string cookie = $"safesearch={safeSearch.ToString().ToLowerInvariant()}"; + if (!string.IsNullOrEmpty(country)) + { + cookie += $"; country={country}"; + } + + request.Method = HttpMethod.Get; + request.RequestUri = new Uri(BuildImageQuery(query, size, type, layout, color, license), UriKind.Relative); + request.Headers.Add("cookie", cookie); + + var response = await _httpClient.SendAsync(request).ConfigureAwait(false); + response.EnsureSuccessStatusCode(); + + bytes = await response.Content.ReadAsByteArrayAsync().ConfigureAwait(false); + } + + var document = JsonDocument.Parse(bytes); + var results = document.RootElement.GetPropertyOrDefault("results"); + + return EnumerateResults(results); + } + + private static string BuildImageQuery(string query, BraveImageSize size, BraveImageType type, + BraveImageLayout layout, BraveImageColor color, BraveImageLicense license) + { + string url = $"images?q={Uri.EscapeDataString(query)}"; + + // Doesn't seem to work via query strings + /* + if (level != SafeSearchLevel.Moderate) + url += $"&safesearch={level.ToString().ToLowerInvariant()}"; + + if (!string.IsNullOrEmpty(country)) + url += $"&country={country}"; + */ + + if (size != BraveImageSize.All) + url += $"&size={size}"; + + if (type != BraveImageType.All) + url += $"&_type={type}"; + + if (layout != BraveImageLayout.All) + url += $"&layout={layout}"; + + if (color != BraveImageColor.All) + url += $"&color={color}"; + + if (license != BraveImageLicense.All) + url += $"&license={license}"; + + return url; + } + + private static IEnumerable EnumerateResults(JsonElement results) + { + if (results.ValueKind != 
JsonValueKind.Array) + { + yield break; + } + + foreach (var result in results.EnumerateArray()) + { + var properties = result.GetPropertyOrDefault("properties"); + string url = properties.GetPropertyOrDefault("url").GetStringOrDefault(); + string title = result.GetPropertyOrDefault("title").GetStringOrDefault(); + int width = properties.GetPropertyOrDefault("width").GetInt32OrDefault(); + int height = properties.GetPropertyOrDefault("height").GetInt32OrDefault(); + string sourceUrl = result.GetPropertyOrDefault("url").GetStringOrDefault(); + var pageAge = result.GetPropertyOrDefault("page_age").GetDateTimeOffsetOrDefault(); + string source = result.GetPropertyOrDefault("source").GetStringOrDefault(); + string thumbnailUrl = result.GetPropertyOrDefault("thumbnail").GetPropertyOrDefault("src").GetStringOrDefault(); + string resizedUrl = properties.GetPropertyOrDefault("resized").GetStringOrDefault(); + string format = properties.GetPropertyOrDefault("format").GetStringOrDefault(); + + yield return new BraveImageResult(url, title, width, height, sourceUrl, pageAge, source, thumbnailUrl, resizedUrl, format); + } + } + + /// + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + /// + protected virtual void Dispose(bool disposing) + { + if (_disposed) return; + if (disposing) + _httpClient.Dispose(); + + _disposed = true; + } + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageColor.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageColor.cs new file mode 100644 index 0000000..53d0ab4 --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageColor.cs @@ -0,0 +1,69 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Specifies the image colors in DuckDuckGo. + /// + public enum DuckDuckGoImageColor + { + /// + /// All colors. + /// + All, + /// + /// Only colors. + /// + Color, + /// + /// Black and white. + /// + Monochrome, + /// + /// Red color. + /// + Red, + /// + /// Orange color. 
+ /// + Orange, + /// + /// Yellow color. + /// + Yellow, + /// + /// Green color. + /// + Green, + /// + /// Blue color. + /// + Blue, + /// + /// Purple color. + /// + Purple, + /// + /// Pink color. + /// + Pink, + /// + /// Brown color. + /// + Brown, + /// + /// Black color. + /// + Black, + /// + /// Gray color. + /// + Gray, + /// + /// Teal color. + /// + Teal, + /// + /// White color. + /// + White + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageLayout.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageLayout.cs new file mode 100644 index 0000000..16a38a5 --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageLayout.cs @@ -0,0 +1,25 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Specifies the image layouts in DuckDuckGo. + /// + public enum DuckDuckGoImageLayout + { + /// + /// All layouts + /// + All, + /// + /// Square layout. + /// + Square, + /// + /// Tall layout. + /// + Tall, + /// + /// Wide layout. + /// + Wide + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageLicense.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageLicense.cs new file mode 100644 index 0000000..8965c44 --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageLicense.cs @@ -0,0 +1,37 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Specifies the image licenses in DuckDuckGo. + /// + public enum DuckDuckGoImageLicense + { + /// + /// All Licenses. + /// + All, + /// + /// All Creative Commons. + /// + Any, + /// + /// Public Domain. + /// + Public, + /// + /// Free to Share and Use. + /// + Share, + /// + /// Free to Share and Use Commercially. + /// + ShareCommercially, + /// + /// Free to Modify, Share and Use. + /// + Modify, + /// + /// Free to Modify, Share and Use Commercially. 
+ /// + ModifyCommercially + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageResult.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageResult.cs new file mode 100644 index 0000000..45b0b8f --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageResult.cs @@ -0,0 +1,56 @@ +using System.Diagnostics; + +namespace GScraper.DuckDuckGo +{ + /// + /// Represents an image result from DuckDuckGo. + /// + [DebuggerDisplay("Title: {Title}, Url: {Url}")] + public class DuckDuckGoImageResult : IImageResult + { + internal DuckDuckGoImageResult(string url, string title, int width, int height, string sourceUrl, + string thumbnailUrl, string source) + { + Url = url; + Title = title; + Width = width; + Height = height; + SourceUrl = sourceUrl; + ThumbnailUrl = thumbnailUrl; + Source = source; + } + + /// + public string Url { get; } + + /// + public string Title { get; } + + /// + public int Width { get; } + + /// + public int Height { get; } + + /// + /// Gets a URL pointing to the webpage hosting the image. + /// + public string SourceUrl { get; } + + /// + /// Gets a URL pointing to the thumbnail image. + /// + public string ThumbnailUrl { get; } + + /// + /// Gets the search engine this result comes from. + /// + public string Source { get; } + + /// + /// Returns the URL of this result. + /// + /// The URL of this result. + public override string ToString() => Url; + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageSize.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageSize.cs new file mode 100644 index 0000000..fc788ef --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageSize.cs @@ -0,0 +1,29 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Specifies the image sizes in DuckDuckGo. + /// + public enum DuckDuckGoImageSize + { + /// + /// All sizes. + /// + All, + /// + /// Small sizes. + /// + Small, + /// + /// Medium sizes. + /// + Medium, + /// + /// Large sizes. + /// + Large, + /// + /// Wallpaper sizes. 
+ /// + Wallpaper + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageTime.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageTime.cs new file mode 100644 index 0000000..d38c16c --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageTime.cs @@ -0,0 +1,25 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Specifies the possible times in DuckDuckGo. + /// + public enum DuckDuckGoImageTime + { + /// + /// Any time. + /// + Any, + /// + /// Past day. + /// + Day, + /// + /// Past week. + /// + Week, + /// + /// Past month. + /// + Month + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoImageType.cs b/src/GScraper/DuckDuckGo/DuckDuckGoImageType.cs new file mode 100644 index 0000000..2a3b94f --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoImageType.cs @@ -0,0 +1,33 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Specifies the image types in DuckDuckGo. + /// + public enum DuckDuckGoImageType + { + /// + /// All types. + /// + All, + /// + /// Photograph. + /// + Photo, + /// + /// Clip Art. + /// + Clipart, + /// + /// Animated GIF. + /// + Gif, + /// + /// Transparent. + /// + Transparent, + /// + /// Line drawing. + /// + Line + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoRegions.cs b/src/GScraper/DuckDuckGo/DuckDuckGoRegions.cs new file mode 100644 index 0000000..4d688a5 --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoRegions.cs @@ -0,0 +1,323 @@ +namespace GScraper.DuckDuckGo +{ + /// + /// Contains the possible regions in DuckDuckGo. + /// + public static class DuckDuckGoRegions + { + /// + /// All regions. + /// + public const string All = "wt-wt"; + + /// + /// Argentina. + /// + public const string Argentina = "ar-es"; + + /// + /// Australia. + /// + public const string Australia = "au-en"; + + /// + /// Austria. + /// + public const string Austria = "at-de"; + + /// + /// Belgium (fr). 
+ /// + public const string BelgiumFr = "be-fr"; + + /// + /// Belgium (nl). + /// + public const string BelgiumNl = "be-nl"; + + /// + /// Brazil. + /// + public const string Brazil = "br-pt"; + + /// + /// Bulgaria. + /// + public const string Bulgaria = "bg-bg"; + + /// + /// Canada (en). + /// + public const string CanadaEn = "ca-en"; + + /// + /// Canada (fr). + /// + public const string CanadaFr = "ca-fr"; + + /// + /// Catalonia. + /// + public const string Catalonia = "ct-ca"; + + /// + /// Chile. + /// + public const string Chile = "cl-es"; + + /// + /// China. + /// + public const string China = "cn-zh"; + + /// + /// Colombia. + /// + public const string Colombia = "co-es"; + + /// + /// Croatia. + /// + public const string Croatia = "hr-hr"; + + /// + /// Czech Republic. + /// + public const string CzechRepublic = "cz-cs"; + + /// + /// Denmark. + /// + public const string Denmark = "dk-da"; + + /// + /// Estonia. + /// + public const string Estonia = "ee-et"; + + /// + /// Finland. + /// + public const string Finland = "fi-fi"; + + /// + /// France. + /// + public const string France = "fr-fr"; + + /// + /// Germany. + /// + public const string Germany = "de-de"; + + /// + /// Greece. + /// + public const string Greece = "gr-el"; + + /// + /// Hong Kong. + /// + public const string HongKong = "hk-tzh"; + + /// + /// Hungary. + /// + public const string Hungary = "hu-hu"; + + /// + /// India (en). + /// + public const string IndiaEn = "in-en"; + + /// + /// Indonesia (en). + /// + public const string IndonesiaEn = "id-en"; + + /// + /// Ireland. + /// + public const string Ireland = "ie-en"; + + /// + /// Israel (en). + /// + public const string IsraelEn = "il-en"; + + /// + /// Italy. + /// + public const string Italy = "it-it"; + + /// + /// Japan. + /// + public const string Japan = "jp-jp"; + + /// + /// Korea. + /// + public const string Korea = "kr-kr"; + + /// + /// Latvia. + /// + public const string Latvia = "lv-lv"; + + /// + /// Lithuania. 
+ /// + public const string Lithuania = "lt-lt"; + + /// + /// Malaysia (en). + /// + public const string MalaysiaEn = "my-en"; + + /// + /// Mexico. + /// + public const string Mexico = "mx-es"; + + /// + /// Netherlands. + /// + public const string Netherlands = "nl-nl"; + + /// + /// New Zealand. + /// + public const string NewZealand = "nz-en"; + + /// + /// Norway. + /// + public const string Norway = "no-no"; + + /// + /// Pakistan (en). + /// + public const string PakistanEn = "pk-en"; + + /// + /// Peru. + /// + public const string Peru = "pe-es"; + + /// + /// Philippines (en). + /// + public const string PhilippinesEn = "ph-en"; + + /// + /// Poland. + /// + public const string Poland = "pl-pl"; + + /// + /// Portugal. + /// + public const string Portugal = "pt-pt"; + + /// + /// Romania. + /// + public const string Romania = "ro-ro"; + + /// + /// Russia. + /// + public const string Russia = "ru-ru"; + + /// + /// Saudi Arabia. + /// + public const string SaudiArabia = "xa-ar"; + + /// + /// Singapore. + /// + public const string Singapore = "sg-en"; + + /// + /// Slovakia. + /// + public const string Slovakia = "sk-sk"; + + /// + /// Slovenia. + /// + public const string Slovenia = "sl-sl"; + + /// + /// South Africa. + /// + public const string SouthAfrica = "za-en"; + + /// + /// Spain (ca). + /// + public const string SpainCa = "es-ca"; + + /// + /// Spain (es). + /// + public const string SpainEs = "es-es"; + + /// + /// Sweden. + /// + public const string Sweden = "se-sv"; + + /// + /// Switzerland (de). + /// + public const string SwitzerlandDe = "ch-de"; + + /// + /// Switzerland (fr). + /// + public const string SwitzerlandFr = "ch-fr"; + + /// + /// Taiwan. + /// + public const string Taiwan = "tw-tzh"; + + /// + /// Thailand (en). + /// + public const string ThailandEn = "th-en"; + + /// + /// Turkey. + /// + public const string Turkey = "tr-tr"; + + /// + /// US (English). 
+ /// + public const string UsEnglish = "us-en"; + + /// + /// US (Spanish). + /// + public const string UsSpanish = "us-es"; + + /// + /// Ukraine. + /// + public const string Ukraine = "ua-uk"; + + /// + /// United Kingdom. + /// + public const string UnitedKingdom = "uk-en"; + + /// + /// Vietnam (en). + /// + public const string VietnamEn = "vn-en"; + } +} \ No newline at end of file diff --git a/src/GScraper/DuckDuckGo/DuckDuckGoScraper.cs b/src/GScraper/DuckDuckGo/DuckDuckGoScraper.cs new file mode 100644 index 0000000..8cf594f --- /dev/null +++ b/src/GScraper/DuckDuckGo/DuckDuckGoScraper.cs @@ -0,0 +1,197 @@ +using System; +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace GScraper.DuckDuckGo +{ + /// + /// Represents a DuckDuckGo scraper. + /// + public class DuckDuckGoScraper : IDisposable + { + /// + /// Returns the default API endpoint. + /// + public const string DefaultApiEndpoint = "https://duckduckgo.com"; + + /// + /// Returns the maximum query length. + /// + public const int MaxQueryLength = 500; + + private readonly HttpClient _httpClient; + private const string _defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"; + private bool _disposed; + + /// + /// Initializes a new instance of the class. + /// + public DuckDuckGoScraper() : this(new HttpClient()) + { + } + + /// + /// Initializes a new instance of the class using the provided . + /// + public DuckDuckGoScraper(HttpClient client) : this(client, DefaultApiEndpoint) + { + } + + /// + /// Initializes a new instance of the class using the provided and API endpoint. 
+ /// + public DuckDuckGoScraper(HttpClient client, string apiEndpoint) + { + GScraperGuards.NotNull(client, nameof(client)); + GScraperGuards.NotNullOrEmpty(apiEndpoint, nameof(apiEndpoint)); + _httpClient = client; + _httpClient.BaseAddress = new Uri(apiEndpoint); + if (_httpClient.DefaultRequestHeaders.UserAgent.Count == 0) + { + _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd(_defaultUserAgent); + } + } + + /// + /// Gets images from DuckDuckGo. + /// + /// This method returns at most 100 image results. + /// The search query. + /// The safe search level. + /// The image time. + /// The image size. + /// The image color. + /// The image type. + /// The image layout. + /// The image license. + /// The region. contains the regions that can be used here. + /// A task representing the asynchronous operation. The result contains an of . + /// is null or empty. + /// is larger than . + /// An error occurred during the scraping process. + public async Task> GetImagesAsync(string query, SafeSearchLevel safeSearch = SafeSearchLevel.Moderate, + DuckDuckGoImageTime time = DuckDuckGoImageTime.Any, DuckDuckGoImageSize size = DuckDuckGoImageSize.Small, DuckDuckGoImageColor color = DuckDuckGoImageColor.All, + DuckDuckGoImageType type = DuckDuckGoImageType.All, DuckDuckGoImageLayout layout = DuckDuckGoImageLayout.All, DuckDuckGoImageLicense license = DuckDuckGoImageLicense.All, + string region = DuckDuckGoRegions.UsEnglish) + { + GScraperGuards.NotNull(query, nameof(query)); + GScraperGuards.NotNullOrEmpty(region, nameof(region)); + GScraperGuards.ArgumentInRange(query.Length, MaxQueryLength, nameof(query), $"The query cannot be larger than {MaxQueryLength}."); + + string token = await GetTokenAsync(query).ConfigureAwait(false); + var uri = new Uri($"i.js{BuildImageQuery(token, query, safeSearch, time, size, color, type, layout, license, region)}", UriKind.Relative); + + byte[] bytes; + using (var request = new HttpRequestMessage()) + { + request.Method = HttpMethod.Get; + request.RequestUri = 
uri; + + if (safeSearch == SafeSearchLevel.Strict) + { + request.Headers.Add("cookie", "p=1"); + } + else if (safeSearch == SafeSearchLevel.Off) + { + request.Headers.Add("cookie", "p=-2"); + } + var response = await _httpClient.SendAsync(request).ConfigureAwait(false); + response.EnsureSuccessStatusCode(); + + bytes = await response.Content.ReadAsByteArrayAsync().ConfigureAwait(false); + } + + var document = JsonDocument.Parse(bytes); + var results = document.RootElement.GetPropertyOrDefault("results"); + + return EnumerateResults(results); + } + + private static IEnumerable EnumerateResults(JsonElement results) + { + if (results.ValueKind != JsonValueKind.Array) + { + yield break; + } + + foreach (var result in results.EnumerateArray()) + { + string url = result.GetPropertyOrDefault("image").GetStringOrDefault(); + string title = result.GetPropertyOrDefault("title").GetStringOrDefault(); + int width = result.GetPropertyOrDefault("width").GetInt32OrDefault(); + int height = result.GetPropertyOrDefault("height").GetInt32OrDefault(); + string sourceUrl = result.GetPropertyOrDefault("url").GetStringOrDefault(); + string thumbnailUrl = result.GetPropertyOrDefault("thumbnail").GetStringOrDefault(); + string source = result.GetPropertyOrDefault("source").GetStringOrDefault(); + + yield return new DuckDuckGoImageResult(url, title, width, height, sourceUrl, thumbnailUrl, source); + } + } + + private static string BuildImageQuery(string token, string query, SafeSearchLevel safeSearch, DuckDuckGoImageTime time, DuckDuckGoImageSize size, + DuckDuckGoImageColor color, DuckDuckGoImageType type, DuckDuckGoImageLayout layout, DuckDuckGoImageLicense license, string region) + { + string url = $"?l={region}" + + "&o=json" + + $"&q={Uri.EscapeDataString(query)}" + + $"&vqd={token}" + + "&f="; + + url += time == DuckDuckGoImageTime.Any ? ',' : $"time:{time},"; + url += size == DuckDuckGoImageSize.All ? ',' : $"size:{size},"; + url += color == DuckDuckGoImageColor.All ? 
',' : $"color:{color.ToString().ToLowerInvariant()},"; + url += type == DuckDuckGoImageType.All ? ',' : $"type:{type},"; + url += layout == DuckDuckGoImageLayout.All ? ',' : $"layout:{layout},"; + url += license == DuckDuckGoImageLicense.All ? "" : $"license:{license}"; + url += $"&p={(safeSearch == SafeSearchLevel.Off ? "-1" : "1")}"; + + return url; + } + + private async Task GetTokenAsync(string keywords) + { + string html = await _httpClient.GetStringAsync(new Uri($"?q={Uri.EscapeDataString(keywords)}", UriKind.Relative)).ConfigureAwait(false); + return GetToken(html.AsSpan()); + } + + private static string GetToken(ReadOnlySpan rawHtml) + { + const string start = "vqd='"; + int startIndex = rawHtml.IndexOf(start.AsSpan()); /* -1 when the marker is absent; the marker length is added only after this is checked */ + + if (startIndex == -1) + { + throw new GScraperException("Failed to get the DuckDuckGo token.", "DuckDuckGo"); + } + + var sliced = rawHtml.Slice(startIndex + start.Length); + int endIndex = sliced.IndexOf('\''); + + if (endIndex == -1) + { + throw new GScraperException("Failed to get the DuckDuckGo token.", "DuckDuckGo"); + } + + return sliced.Slice(0, endIndex).ToString(); + } + + /// + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + /// + protected virtual void Dispose(bool disposing) + { + if (_disposed) return; + if (disposing) + _httpClient.Dispose(); + + _disposed = true; + } + } +} \ No newline at end of file diff --git a/src/GScraper/GScraper.csproj b/src/GScraper/GScraper.csproj index b6ec1cd..74da89f 100644 --- a/src/GScraper/GScraper.csproj +++ b/src/GScraper/GScraper.csproj @@ -1,31 +1,32 @@ - + - netstandard1.1 - 0.5.3 - A scraper for Google Images. 
+ netstandard2.0 + 1.0 + 9 + enable d4n3436 + netstandard2.0 en https://github.com/d4n3436/GScraper - LICENSE - true - true https://github.com/d4n3436/GScraper + true true snupkg true gscraper;scraping;web-scraping;google;google-images + MIT + A collection of search engine image scrapers (Google Images, DuckDuckGo and Brave) + +- Added 2 new scrapers (DuckDuckGo and Brave). +- Now targeting .NET Standard 2.0. +- Use System.Text.Json instead of Newtonsoft.Json, improving the performance and memory usage. +- Added more customization options when requesting images, like image size, type, color, etc. + - - - - - - True - - + diff --git a/src/GScraper/GScraperException.cs b/src/GScraper/GScraperException.cs index 2973fa9..ddcb87b 100644 --- a/src/GScraper/GScraperException.cs +++ b/src/GScraper/GScraperException.cs @@ -7,6 +7,11 @@ namespace GScraper /// public class GScraperException : Exception { + /// + /// Gets the search engine that caused this exception. + /// + public string Engine { get; } = "Unknown"; + /// /// Initializes a new instance of the class. /// @@ -17,15 +22,39 @@ public GScraperException() /// /// Initializes a new instance of the class with a specified error message. /// + /// The message that describes the error. public GScraperException(string message) : base(message) { } + /// + /// Initializes a new instance of the class with a specified error message. + /// + /// The message that describes the error. + /// The search engine that caused this exception. + public GScraperException(string message, string engine) : this(message) + { + Engine = engine; + } + /// /// Initializes a new instance of the class with a specified error message and a reference to the inner exception that is the cause of this exception. /// + /// The message that describes the error. + /// The exception that is the cause of the current exception, or a null reference (Nothing in Visual Basic) if no inner exception is specified. 
public GScraperException(string message, Exception innerException) : base(message, innerException) { } + + /// + /// Initializes a new instance of the class with a specified error message and a reference to the inner exception that is the cause of this exception. + /// + /// The message that describes the error. + /// The search engine that caused this exception. + /// The exception that is the cause of the current exception, or a null reference (Nothing in Visual Basic) if no inner exception is specified. + public GScraperException(string message, string engine, Exception innerException) : this(message, innerException) + { + Engine = engine; + } } } \ No newline at end of file diff --git a/src/GScraper/GScraperExtensions.cs b/src/GScraper/GScraperExtensions.cs index 2bcbb3e..5ec73cd 100644 --- a/src/GScraper/GScraperExtensions.cs +++ b/src/GScraper/GScraperExtensions.cs @@ -1,36 +1,27 @@ -using System.Collections.Generic; -using Newtonsoft.Json.Linq; +using System; +using System.Linq; +using System.Text.Json; namespace GScraper { - /// - /// Represents the extension methods that uses. - /// internal static class GScraperExtensions { - /// - /// Gets the value of converted to the specified type or gets the default value if is not a . - /// - /// The type to convert the value to. - /// A cast as a of . - /// default() if is not a ; otherwise, the converted value. - public static T ValueOrDefault(this IEnumerable token) - => token is JValue ? token.Value() : default; + public static JsonElement FirstOrDefault(this JsonElement element) + => element.ValueKind == JsonValueKind.Array ? element.EnumerateArray().FirstOrDefault() : default; - /// - /// Gets the with the specified key converted to the specified type or the default value. - /// - /// The type to convert the token to. - /// The token. - /// The token key. - /// - /// The converted token value if is an and is a , - /// or if is a and is a or a ; - /// otherwise, default(). 
- public static T ValueOrDefault(this JToken token, object key) - => (key is int && token is JArray) || (key is string && token is JValue) || (key is string && token is JObject) ? token.Value(key) : default; + public static JsonElement ElementAtOrDefault(this JsonElement element, int index) + => element.ValueKind == JsonValueKind.Array ? element.EnumerateArray().ElementAtOrDefault(index) : default; - /// - public static JToken ValueOrDefault(this JToken token, object key) => ValueOrDefault(token, key); + public static JsonElement GetPropertyOrDefault(this JsonElement element, string propertyName) + => element.ValueKind == JsonValueKind.Object && element.TryGetProperty(propertyName, out var value) ? value : default; + + public static string GetStringOrDefault(this JsonElement element, string defaultValue = "") + => element.ValueKind is JsonValueKind.String or JsonValueKind.Null ? element.GetString() ?? defaultValue : defaultValue; + + public static int GetInt32OrDefault(this JsonElement element) + => element.ValueKind == JsonValueKind.Number && element.TryGetInt32(out int value) ? value : default; + + public static DateTimeOffset GetDateTimeOffsetOrDefault(this JsonElement element) + => element.ValueKind == JsonValueKind.String && element.TryGetDateTimeOffset(out var value) ? value : default; } } \ No newline at end of file diff --git a/src/GScraper/GScraperGuards.cs b/src/GScraper/GScraperGuards.cs new file mode 100644 index 0000000..bdebedd --- /dev/null +++ b/src/GScraper/GScraperGuards.cs @@ -0,0 +1,31 @@ +using System; + +namespace GScraper +{ + internal static class GScraperGuards + { + public static void NotNull(T? obj, string parameterName) where T : class + { + if (obj is null) + { + throw new ArgumentNullException(parameterName); + } + } + + public static void NotNullOrEmpty(string? 
str, string parameterName) + { + if (string.IsNullOrEmpty(str)) + { + throw new ArgumentNullException(parameterName); + } + } + + public static void ArgumentInRange(int length, int max, string parameterName, string message) + { + if (length > max) + { + throw new ArgumentOutOfRangeException(parameterName, message); + } + } + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleImageColors.cs b/src/GScraper/Google/GoogleImageColors.cs new file mode 100644 index 0000000..4b57933 --- /dev/null +++ b/src/GScraper/Google/GoogleImageColors.cs @@ -0,0 +1,78 @@ +namespace GScraper.Google +{ + /// + /// Contains the possible colors in Google Images. + /// + public static class GoogleImageColors + { + /// + /// Black and white. + /// + public const string BlackAndWhite = "gray"; + + /// + /// Transparent. + /// + public const string Transparent = "trans"; + + /// + /// Red. + /// + public const string Red = "specific,isc:red"; + + /// + /// Orange. + /// + public const string Orange = "specific,isc:orange"; + + /// + /// Yellow. + /// + public const string Yellow = "specific,isc:yellow"; + + /// + /// Green. + /// + public const string Green = "specific,isc:green"; + + /// + /// Teal. + /// + public const string Teal = "specific,isc:teal"; + + /// + /// Blue. + /// + public const string Blue = "specific,isc:blue"; + + /// + /// Purple. + /// + public const string Purple = "specific,isc:purple"; + + /// + /// Pink. + /// + public const string Pink = "specific,isc:pink"; + + /// + /// White. + /// + public const string White = "specific,isc:white"; + + /// + /// Gray. + /// + public const string Gray = "specific,isc:gray"; + + /// + /// Black. + /// + public const string Black = "specific,isc:black"; + + /// + /// Brown. 
+ /// + public const string Brown = "specific,isc:brown"; + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleImageLicenses.cs b/src/GScraper/Google/GoogleImageLicenses.cs new file mode 100644 index 0000000..48c363b --- /dev/null +++ b/src/GScraper/Google/GoogleImageLicenses.cs @@ -0,0 +1,18 @@ +namespace GScraper.Google +{ + /// + /// Contains the possible licenses (usage rights) in Google Images. + /// + public static class GoogleImageLicenses + { + /// + /// Returns the code for Creative Commons licenses. + /// + public const string CreativeCommons = "cl"; + + /// + /// Returns the code for Commercial and other licenses. + /// + public const string Commercial = "ol"; + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleImageResult.cs b/src/GScraper/Google/GoogleImageResult.cs new file mode 100644 index 0000000..2566314 --- /dev/null +++ b/src/GScraper/Google/GoogleImageResult.cs @@ -0,0 +1,56 @@ +using System.Diagnostics; + +namespace GScraper +{ + /// + /// Represents an image result from Google Images. + /// + [DebuggerDisplay("Title: {Title}, Url: {Url}")] + public class GoogleImageResult : IImageResult + { + internal GoogleImageResult(string url, string title, int width, int height, string displayUrl, + string sourceUrl, string thumbnailUrl) + { + Url = url; + Title = title; + Width = width; + Height = height; + DisplayUrl = displayUrl; + SourceUrl = sourceUrl; + ThumbnailUrl = thumbnailUrl; + } + + /// + public string Url { get; } + + /// + public string Title { get; } + + /// + public int Width { get; } + + /// + public int Height { get; } + + /// + /// Gets an abridged version of , e.g. www.example.com. + /// + public string DisplayUrl { get; } + + /// + /// Gets a URL pointing to the webpage hosting the image. + /// + public string SourceUrl { get; } + + /// + /// Gets a URL pointing to the thumbnail image. + /// + public string ThumbnailUrl { get; } + + /// + /// Returns the URL of this result. 
+ /// + /// The URL of this result. + public override string ToString() => Url; + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleImageSize.cs b/src/GScraper/Google/GoogleImageSize.cs new file mode 100644 index 0000000..7e20165 --- /dev/null +++ b/src/GScraper/Google/GoogleImageSize.cs @@ -0,0 +1,25 @@ +namespace GScraper.Google +{ + /// + /// Specifies the image sizes in Google Images. + /// + public enum GoogleImageSize + { + /// + /// Any size. + /// + Any, + /// + /// Large size. + /// + Large = 'l', + /// + /// Medium size. + /// + Medium = 'm', + /// + /// Icon size. + /// + Icon = 'i' + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleImageTime.cs b/src/GScraper/Google/GoogleImageTime.cs new file mode 100644 index 0000000..a18ce25 --- /dev/null +++ b/src/GScraper/Google/GoogleImageTime.cs @@ -0,0 +1,29 @@ +namespace GScraper.Google +{ + /// + /// Specifies the possible times in Google Images. + /// + public enum GoogleImageTime + { + /// + /// Any time. + /// + Any, + /// + /// Past 24 hours. + /// + Day = 'd', + /// + /// Past week. + /// + Week = 'w', + /// + /// Past month. + /// + Month = 'm', + /// + /// Past year. + /// + Year = 'y' + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleImageType.cs b/src/GScraper/Google/GoogleImageType.cs new file mode 100644 index 0000000..7e37b16 --- /dev/null +++ b/src/GScraper/Google/GoogleImageType.cs @@ -0,0 +1,33 @@ +namespace GScraper.Google +{ + /// + /// Specifies the image types in Google Images. + /// + public enum GoogleImageType + { + /// + /// Any image type. + /// + Any, + /// + /// Face. + /// + Face, + /// + /// Photograph. + /// + Photo, + /// + /// Clip Art. + /// + ClipArt, + /// + /// Line Art. + /// + LineArt, + /// + /// GIF. 
+ /// + Animated + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleLanguages.cs b/src/GScraper/Google/GoogleLanguages.cs new file mode 100644 index 0000000..dedf0d6 --- /dev/null +++ b/src/GScraper/Google/GoogleLanguages.cs @@ -0,0 +1,148 @@ +namespace GScraper.Google +{ + /// + /// Contains the languages used in Google Search/Images. + /// + public static class GoogleLanguages + { + /// + /// Returns the language code for: Arabic. + /// + public const string Arabic = "ar"; + + /// + /// Returns the language code for: Chinese (Simplified). + /// + public const string ChineseSimplified = "zh-CN"; + + /// + /// Returns the language code for: Chinese (Traditional). + /// + public const string ChineseTraditional = "zh-TW"; + + /// + /// Returns the language code for: Czech. + /// + public const string Czech = "cs"; + + /// + /// Returns the language code for: Danish. + /// + public const string Danish = "da"; + + /// + /// Returns the language code for: Dutch. + /// + public const string Dutch = "nl"; + + /// + /// Returns the language code for: English. + /// + public const string English = "en"; + + /// + /// Returns the language code for: Estonian. + /// + public const string Estonian = "et"; + + /// + /// Returns the language code for: Finnish. + /// + public const string Finnish = "fi"; + + /// + /// Returns the language code for: French. + /// + public const string French = "fr"; + + /// + /// Returns the language code for: German. + /// + public const string German = "de"; + + /// + /// Returns the language code for: Greek. + /// + public const string Greek = "el"; + + /// + /// Returns the language code for: Hebrew. + /// + public const string Hebrew = "iw"; + + /// + /// Returns the language code for: Hungarian. + /// + public const string Hungarian = "hu"; + + /// + /// Returns the language code for: Icelandic. + /// + public const string Icelandic = "is"; + + /// + /// Returns the language code for: Italian. 
+ /// + public const string Italian = "it"; + + /// + /// Returns the language code for: Japanese. + /// + public const string Japanese = "ja"; + + /// + /// Returns the language code for: Korean. + /// + public const string Korean = "ko"; + + /// + /// Returns the language code for: Latvian. + /// + public const string Latvian = "lv"; + + /// + /// Returns the language code for: Lithuanian. + /// + public const string Lithuanian = "lt"; + + /// + /// Returns the language code for: Norwegian. + /// + public const string Norwegian = "no"; + + /// + /// Returns the language code for: Portuguese. + /// + public const string Portuguese = "pt"; + + /// + /// Returns the language code for: Polish. + /// + public const string Polish = "pl"; + + /// + /// Returns the language code for: Romanian. + /// + public const string Romanian = "ro"; + + /// + /// Returns the language code for: Russian. + /// + public const string Russian = "ru"; + + /// + /// Returns the language code for: Spanish. + /// + public const string Spanish = "es"; + + /// + /// Returns the language code for: Swedish. + /// + public const string Swedish = "sv"; + + /// + /// Returns the language code for: Turkish. + /// + public const string Turkish = "tr"; + } +} \ No newline at end of file diff --git a/src/GScraper/Google/GoogleScraper.cs b/src/GScraper/Google/GoogleScraper.cs new file mode 100644 index 0000000..2b3bb28 --- /dev/null +++ b/src/GScraper/Google/GoogleScraper.cs @@ -0,0 +1,229 @@ +using System; +using System.Collections.Generic; +using System.Net.Http; +using System.Text.Json; +using System.Threading.Tasks; + +namespace GScraper.Google +{ + /// + /// Represents a Google Search scraper. + /// + public class GoogleScraper : IDisposable + { + /// + /// Returns the default API endpoint. 
+ /// + public const string DefaultApiEndpoint = "https://www.google.com/search"; + + private readonly HttpClient _httpClient; + private const string _defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"; + private bool _disposed; + + /// + /// Initializes a new instance of the class. + /// + public GoogleScraper() : this(new HttpClient()) + { + } + + /// + /// Initializes a new instance of the class using the provided . + /// + public GoogleScraper(HttpClient client) : this(client, DefaultApiEndpoint) + { + } + + /// + /// Initializes a new instance of the class using the provided and API endpoint. + /// + public GoogleScraper(HttpClient client, string apiEndpoint) + { + GScraperGuards.NotNull(client, nameof(client)); + GScraperGuards.NotNullOrEmpty(apiEndpoint, nameof(apiEndpoint)); + _httpClient = client; + _httpClient.BaseAddress = new Uri(apiEndpoint); + if (_httpClient.DefaultRequestHeaders.UserAgent.Count == 0) + { + _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd(_defaultUserAgent); + } + } + + /// + /// Gets images from Google Images. + /// + /// This method returns at most 100 image results. + /// The search query. + /// The safe search level. + /// The image size. + /// The image color. contains the colors that can be used here. + /// The image type. + /// The image time. + /// The image license. contains the licenses that can be used here. + /// The language code to use. contains the language codes that can be used here. + /// A task representing the asynchronous operation. The result contains an of . + /// is null or empty. + /// An error occurred during the scraping process. + public async Task> GetImagesAsync(string query, SafeSearchLevel safeSearch = SafeSearchLevel.Off, GoogleImageSize size = GoogleImageSize.Any, + string? color = null, GoogleImageType type = GoogleImageType.Any, GoogleImageTime time = GoogleImageTime.Any, + string? license = null, string? 
language = null) + { + GScraperGuards.NotNull(query, nameof(query)); + + var uri = new Uri(BuildImageQuery(query, safeSearch, size, color, type, time, license, language), UriKind.Relative); + + string page = await _httpClient.GetStringAsync(uri).ConfigureAwait(false); + + JsonElement rawImages; + try + { + rawImages = ExtractDataPack(page); + } + catch (Exception e) when (e is ArgumentOutOfRangeException or JsonException) + { + throw new GScraperException("Failed to unpack the image object data.", "Google", e); + } + + return EnumerateResults(rawImages); + } + + private static IEnumerable EnumerateResults(JsonElement rawImages) + { + if (rawImages.ValueKind != JsonValueKind.Array) + { + yield break; + } + + foreach (var rawImage in rawImages.EnumerateArray()) + { + if (rawImage.FirstOrDefault().GetInt32OrDefault() != 1) + { + continue; + } + + var image = FormatImageObject(rawImage); + + if (image != null) + { + yield return image; + } + } + } + + private static JsonElement ExtractDataPack(string page) + { + // Extract the JSON data pack from the page. + int startLine = page.IndexOf("AF_initDataCallback({key: 'ds:1'", StringComparison.Ordinal) - 10; + int startObject = page.IndexOf('[', startLine + 1); + int endObject = page.LastIndexOf(']', page.IndexOf("", startObject + 1, StringComparison.Ordinal)) + 1; + var rawObject = page.AsMemory().Slice(startObject, endObject - startObject); + + var document = JsonDocument.Parse(rawObject); + + return document.RootElement + .ElementAtOrDefault(31) + .FirstOrDefault() + .ElementAtOrDefault(12) + .ElementAtOrDefault(2); + } + + private static GoogleImageResult? 
FormatImageObject(JsonElement element) + { + var data = element.ElementAtOrDefault(1); + if (data.ValueKind != JsonValueKind.Array) + return null; + + var main = data.ElementAtOrDefault(3); + var info = data.ElementAtOrDefault(9); + + if (info.ValueKind != JsonValueKind.Object) + info = data.ElementAtOrDefault(11); + + string url = main + .FirstOrDefault() + .GetStringOrDefault(); + + string title = info + .GetPropertyOrDefault("2003") + .ElementAtOrDefault(3) + .GetStringOrDefault(); + + int width = main + .ElementAtOrDefault(2) + .GetInt32OrDefault(); + + int height = main + .ElementAtOrDefault(1) + .GetInt32OrDefault(); + + string displayUrl = info + .GetPropertyOrDefault("183836587") + .FirstOrDefault() + .GetStringOrDefault(); + + string sourceUrl = info + .GetPropertyOrDefault("2003") + .ElementAtOrDefault(2) + .GetStringOrDefault(); + + string thumbnailUrl = data + .ElementAtOrDefault(2) + .FirstOrDefault() + .GetStringOrDefault(); + + return new GoogleImageResult(url, title, width, height, displayUrl, sourceUrl, thumbnailUrl); + } + + private static string BuildImageQuery(string query, SafeSearchLevel safeSearch, GoogleImageSize size, string? color, + GoogleImageType type, GoogleImageTime time, string? license, string? language) + { + string url = $"?q={Uri.EscapeDataString(query)}&tbs="; + + url += size == GoogleImageSize.Any ? ',' : $"isz:{(char)size},"; + url += string.IsNullOrEmpty(color) ? ',' : $"ic:{color},"; + url += type == GoogleImageType.Any ? ',' : $"itp:{type.ToString().ToLowerInvariant()},"; + url += time == GoogleImageTime.Any ? ',' : $"qdr:{(char)time},"; + url += string.IsNullOrEmpty(license) ? 
"" : $"il:{license}"; + + url += "&espv=2" + + "&biw=1366" + + "&bih=667" + + "&site=webhp" + + "&source=lnms" + + "&tbm=isch" + + "&sa=X" + + "&ei=XosDVaCXD8TasATItgE" + + "&ved=0CAcQ_AUoAg"; + + url += "&safe=" + safeSearch switch + { + SafeSearchLevel.Off => "off", + SafeSearchLevel.Moderate => "medium", + SafeSearchLevel.Strict => "high", + _ => throw new ArgumentException("Invalid safe search level.", nameof(safeSearch)) + }; + + if (!string.IsNullOrEmpty(language)) + url += $"&lr=lang_{language}&hl={language}"; + + return url; + } + + /// + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + /// + protected virtual void Dispose(bool disposing) + { + if (_disposed) return; + if (disposing) + _httpClient.Dispose(); + + _disposed = true; + } + } +} \ No newline at end of file diff --git a/src/GScraper/GoogleScraper.cs b/src/GScraper/GoogleScraper.cs deleted file mode 100644 index 7cb54ba..0000000 --- a/src/GScraper/GoogleScraper.cs +++ /dev/null @@ -1,181 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Collections.ObjectModel; -using System.Diagnostics; -using System.Linq; -using System.Net.Http; -using System.Text.RegularExpressions; -using System.Threading.Tasks; -using Newtonsoft.Json; -using Newtonsoft.Json.Linq; - -namespace GScraper -{ - /// - /// Represents a simple Google Images scraper. - /// - public class GoogleScraper : IDisposable - { - /// - /// Returns the maximum number of images that can be returned per request. - /// - public const int ImageLimit = 100; - - private readonly HttpClient _httpClient = new HttpClient(); - private const string _defaultUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"; - private bool _disposed; - - /// - /// Initializes a new instance of the class. 
- /// - public GoogleScraper() - { - _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd(_defaultUserAgent); - } - - /// - /// Initializes a new instance of the class with the provided User-Agent. - /// - /// The User-Agent to use in the requests. - public GoogleScraper(string userAgent) - { - _httpClient.DefaultRequestHeaders.UserAgent.ParseAdd(userAgent); - } - - /// - /// Gets images from Google Images. - /// - /// The keywords. - /// The results limit. - /// Whether to use safe search filter. - /// A task representing the asynchronous operation. The result contains a read-only list of . - /// Thrown when is null or empty. - /// - /// Thrown when an error occurs during the scraping process. - public async Task> GetImagesAsync(string query, int limit = ImageLimit, bool safeSearch = false) - { - if (string.IsNullOrEmpty(query)) - { - throw new ArgumentNullException(nameof(query)); - } - - string url = BuildSearchUrl(query, safeSearch); - Debug.WriteLine($"[GScraper] Obtaining image objects from: {url}"); - - string page = await _httpClient.GetStringAsync(new Uri(url)).ConfigureAwait(false); - IEnumerable rawImages; - try - { - rawImages = GetImageObjectsFromPack(ExtractDataPack(page)); - } - catch (Exception e) when (e is ArgumentOutOfRangeException || e is JsonReaderException) - { - throw new GScraperException("Failed to unpack the image object data.", e); - } - - limit = Math.Max(limit, 1); - limit = Math.Min(limit, ImageLimit); - - var formattedImages = new List(); - - foreach (var rawImage in rawImages) - { - if (formattedImages.Count == limit) - break; - - var image = FormatImageObject(rawImage); - - if (image != null) - formattedImages.Add(image); - } - - Debug.WriteLine($"[GScraper] {formattedImages.Count}/{limit} image objects."); - - return new ReadOnlyCollection(formattedImages); - } - - private static string ExtractDataPack(string page) - { - // Extract the JSON data pack from the page. 
- int startLine = page.IndexOf("AF_initDataCallback({key: 'ds:1'", StringComparison.OrdinalIgnoreCase) - 10; - int startObject = page.IndexOf('[', startLine + 1); - int endObject = page.LastIndexOf(']', page.IndexOf("", startObject + 1, StringComparison.OrdinalIgnoreCase)) + 1; - string rawObject = page.Substring(startObject, endObject - startObject); - - // This will prevent Regex.Unescape() to unescape escaped backlashes (\\) - rawObject = rawObject.Replace("\\", "\\\\"); - - return Regex.Unescape(rawObject); - } - - private static IEnumerable GetImageObjectsFromPack(string data) - { - // Extract the raw image objects from the JSON data pack. - return JToken.Parse(data) - .ElementAtOrDefault(31)? - .FirstOrDefault()? - .ElementAtOrDefault(12)? - .ElementAtOrDefault(2)? - .Where(x => x?.FirstOrDefault()?.ValueOrDefault() == 1) ?? Enumerable.Empty(); - } - - private static ImageResult FormatImageObject(JToken obj) - { - var data = obj?.ElementAtOrDefault(1); - var main = data?.ElementAtOrDefault(3); - var info = data?.ElementAtOrDefault(9); - - if (data == null) - return null; - - if (string.IsNullOrEmpty(info?.ToString())) - info = data.ElementAtOrDefault(11); - - return new ImageResult( - main?.ElementAtOrDefault(2)?.ValueOrDefault(), - main?.ElementAtOrDefault(1)?.ValueOrDefault(), - main?.FirstOrDefault()?.ValueOrDefault(), - info?.ValueOrDefault("2003")?.ElementAtOrDefault(3)?.ValueOrDefault(), - info?.ValueOrDefault("183836587")?.FirstOrDefault()?.ValueOrDefault(), - info?.ValueOrDefault("2003")?.ElementAtOrDefault(2)?.ValueOrDefault(), - data.ElementAtOrDefault(2)?.FirstOrDefault()?.ValueOrDefault()); - } - - private static string BuildSearchUrl(string query, bool safeSearch) - { - string url = "https://www.google.com/search" + - $"?q={Uri.EscapeDataString(query)}" + - "&espv=2" + - "&biw=1366" + - "&bih=667" + - "&site=webhp" + - "&source=lnms" + - "&tbm=isch" + - "&sa=X" + - "&ei=XosDVaCXD8TasATItgE" + - "&ved=0CAcQ_AUoAg"; - - if (safeSearch) - url += 
"&safe=active"; - - return url; - } - - /// - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } - - /// - protected virtual void Dispose(bool disposing) - { - if (_disposed) return; - if (disposing) - _httpClient.Dispose(); - - _disposed = true; - } - } -} \ No newline at end of file diff --git a/src/GScraper/IImageResult.cs b/src/GScraper/IImageResult.cs new file mode 100644 index 0000000..1621941 --- /dev/null +++ b/src/GScraper/IImageResult.cs @@ -0,0 +1,28 @@ +namespace GScraper +{ + /// + /// Represents an image result. + /// + public interface IImageResult + { + /// + /// Gets a URL pointing to the image. + /// + string Url { get; } + + /// + /// Gets the title of the image result. + /// + string Title { get; } + + /// + /// Gets the width of the image, in pixels. + /// + int Width { get; } + + /// + /// Gets the height of the image, in pixels. + /// + int Height { get; } + } +} \ No newline at end of file diff --git a/src/GScraper/ImageResult.cs b/src/GScraper/ImageResult.cs deleted file mode 100644 index 5614ef3..0000000 --- a/src/GScraper/ImageResult.cs +++ /dev/null @@ -1,54 +0,0 @@ -namespace GScraper -{ - /// - /// Represents a Google Images result. - /// - public class ImageResult - { - internal ImageResult(int? height, int? width, string link, string title, string displayLink, string contextLink, string thumbnailLink) - { - Height = height; - Width = width; - Link = link; - Title = title; - DisplayLink = displayLink; - ContextLink = contextLink; - ThumbnailLink = thumbnailLink; - } - - /// - /// Gets the height of the image, in pixels. - /// - public int? Height { get; } - - /// - /// Gets the width of the image, in pixels. - /// - public int? Width { get; } - - /// - /// Gets a URL pointing to the image. - /// - public string Link { get; } - - /// - /// Gets the title of the image result. - /// - public string Title { get; } - - /// - /// Gets an abridged version of , e.g. www.example.com. 
- /// - public string DisplayLink { get; } - - /// - /// Gets a URL pointing to the webpage hosting the image. - /// - public string ContextLink { get; } - - /// - /// Gets a URL pointing to the thumbnail image. - /// - public string ThumbnailLink { get; } - } -} \ No newline at end of file diff --git a/src/GScraper/SafeSearchLevel.cs b/src/GScraper/SafeSearchLevel.cs new file mode 100644 index 0000000..5a43d1c --- /dev/null +++ b/src/GScraper/SafeSearchLevel.cs @@ -0,0 +1,23 @@ +namespace GScraper +{ + /// + /// Specifies the Safe Search levels. + /// + public enum SafeSearchLevel + { + /// + /// Safe Search is off. + /// + Off, + + /// + /// Don't show explicit results. + /// + Moderate, + + /// + /// Strict safe search. + /// + Strict + } +} \ No newline at end of file