From 11ec0ac8102e33c2f50f1b557b2d67dd4486e16d Mon Sep 17 00:00:00 2001 From: LuanRT Date: Tue, 9 Nov 2021 23:12:55 -0300 Subject: [PATCH] fix: returning no organic results in some countries --- lib/constants.js | 2 +- lib/google_this.js | 348 ++++----------------------------------------- lib/parser.js | 300 ++++++++++++++++++++++++++++++++++++++ lib/utils.js | 18 ++- 4 files changed, 341 insertions(+), 327 deletions(-) create mode 100644 lib/parser.js diff --git a/lib/constants.js b/lib/constants.js index 03185b1..dc7b384 100644 --- a/lib/constants.js +++ b/lib/constants.js @@ -24,7 +24,7 @@ module.exports = { KNO_PANEL_TITLE: ['div.BkwXh > div', 'div > span.u9DLmf'], KNO_PANEL_DESCRIPTION: 'div[class="kno-rdesc"] > span', KNO_PANEL_URL: 'div[class="kno-rdesc"] > span > a', - KNO_PANEL_METADATA: 'div.rVusze > span.GRkHZd.w8qArf', + KNO_PANEL_METADATA: 'div.rVusze > span', KNO_PANEL_TYPE: 'div.BkwXh > div', KNO_PANEL_SONG_LYRICS: 'div.ujudUb', KNO_PANEL_AVAILABLE_ON: 'div[class="ellip bclEt"]', diff --git a/lib/google_this.js b/lib/google_this.js index 25e09e3..b5396ac 100644 --- a/lib/google_this.js +++ b/lib/google_this.js @@ -2,25 +2,23 @@ const Axios = require('axios'); const Utils = require('./utils'); +const Parser = require('./parser'); const Cheerio = require('cheerio'); const Constants = require('./constants'); -const NormalizeText = require('replace-special-characters'); -let debugging = false; let filtered_domains = ['gstatic.com']; -async function search (search_query, options = { ris: false, page: 0, match_all_images: false, debugging: false, additional_params: null }) { - debugging = options.debugging; - +async function search(search_query, options = { ris: false, page: 0, match_all_images: false, additional_params: null }) { const page = options.page * 10; const query = search_query.trim().split(/ +/).join('+').toLowerCase(); - const url = encodeURI(options.ris ? `${Constants.URLS.W_GOOGLE}searchbyimage?image_url=${search_query}`: `${Constants.URLS.GOOGLE}search?q=${query}&aqs=chrome..69i57.1685j0j4&client=ms-android-motorola-rev2&sourceid=chrome-mobile&ie=UTF-8&aomd=1${options.safe ? '&safe=active': ''}&start=${page || 0}`); + const url = encodeURI(options.ris ? `${Constants.URLS.W_GOOGLE}searchbyimage?image_url=${search_query}` : `${Constants.URLS.GOOGLE}search?q=${query}&aqs=chrome..69i57.1685j0j4&client=ms-android-motorola-rev2&sourceid=chrome-mobile&ie=UTF-8&aomd=1${options.safe ? '&safe=active': ''}&start=${page || 0}`); const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(true) }).catch((error) => error); if (response instanceof Error) throw new Error(`Could not search on Google: ${response.message}`); - + const $ = Cheerio.load(Utils.formatHtml(response.data)); - + const parser = new Parser($, response.data); + const final_data = { results: [], did_you_mean: '', @@ -31,345 +29,57 @@ async function search (search_query, options = { ris: false, page: 0, match_all_ people_also_search_for: [] }; - final_data.results = getOrganicResults($); - final_data.knowledge_panel = getKnowledgeGraph($, response.data); - final_data.featured_snippet = getFeaturedSnippet($); + final_data.results = parser.getOrganicResults(); + final_data.knowledge_panel = parser.getKnowledgeGraph($, response.data); + final_data.featured_snippet = parser.getFeaturedSnippet($); const did_you_mean = $(Constants.SELECTORS.DID_YOU_MEAN).text(); did_you_mean && (final_data.did_you_mean = did_you_mean) || (delete final_data.did_you_mean); - const unit_converter = getConverters($); + const unit_converter = parser.getConverters($); unit_converter && (final_data.unit_converter = unit_converter); - const weather_forecast = getWeather($); + const weather_forecast = parser.getWeather($); weather_forecast && (final_data.weather = weather_forecast); - const time = getTime($); + const time = parser.getTime($); time && (final_data.current_time = time); - const location = getLocation($, time); + const location = parser.getLocation($, time); location && (final_data.location = location); - const dictionary = getDictionary($); + const dictionary = parser.getDictionary($); dictionary && (final_data.dictionary = dictionary); - const translation = getTranslation($); + const translation = parser.getTranslation($); translation && (final_data.translation = translation); - const top_stories = getTopStories($); + const top_stories = parser.getTopStories($); final_data.top_stories = top_stories; - const people_also_ask = getPaa($); + const people_also_ask = parser.getPaa($); final_data.people_also_ask = people_also_ask; - const people_also_search_for = getPas($); + const people_also_search_for = parser.getPas($); final_data.people_also_search_for = people_also_search_for; - - return final_data; -} - -function getPaa($) { - let people_also_ask = []; - Constants.SELECTORS.PAA.forEach((item) => $(item).each((i, el) => people_also_ask.push($(el).text()))); - people_also_ask.shift(); - return people_also_ask; -} - -function getPas($) { - return $(Constants.SELECTORS.PASF).map((i, el) => { - if (!$(el).attr('data-src')) return; - return { - title: $(el).attr('alt'), - thumbnail: `https:${$(el).attr('data-src')}` - }; - }).get(); -} - -function getTime($) { - const hours = $(Constants.SELECTORS.CURRENT_TIME_HOUR).text(); - const date = $(Constants.SELECTORS.CURRENT_TIME_DATE).map((i, el) => $(el).text()).get()[1]; - - if (date) { - return { - hours: hours.trim(), - date: date.trim() - }; - } -} - -function getWeather($) { - const weather_location = $(Constants.SELECTORS.WEATHER_LOCATION).text(); - const weather_forecast = $(Constants.SELECTORS.WEATHER_FORECAST).text(); - const precipitation = $(Constants.SELECTORS.PRECIPITATION).text(); - const air_humidity = $(Constants.SELECTORS.AIR_HUMIDITY).text(); - const temperature = $(Constants.SELECTORS.TEMPERATURE).text(); - const wind_speed = $(Constants.SELECTORS.WIND_SPEED).text(); - - if (weather_location && weather_forecast) { - return { - location: weather_location, - forecast: weather_forecast, - precipitation, - humidity: air_humidity, - temperature: temperature, - wind: wind_speed - }; - } -} - -function getLocation($, date) { - const location_title = $(Constants.SELECTORS.LOCATION_TITLE).text(); - const location_distance = $(Constants.SELECTORS.LOCATION_DISTANCE).text(); - const location_image = $(Constants.SELECTORS.LOCATION_IMAGE).attr('src'); - - if (location_title && location_distance && !date) { - return { - title: location_title, - distance: location_distance, - map: 'https://google.com'+location_image - }; - } -} - -function getTranslation($) { - const source_language = $(Constants.SELECTORS.TR_SOURCE_LANGUAGE).text(); - const target_language = $(Constants.SELECTORS.TR_TARGET_LANGUAGE).text(); - - const source_text = $(Constants.SELECTORS.TR_SOURCE_TEXT).text(); - const target_text = $(Constants.SELECTORS.TR_TARGET_TEXT).text(); - - if (source_text.length > 0) { - return { - source_language, - target_language, - source_text, - target_text - }; - } -} - -function getDictionary($) { - const word = $(Constants.SELECTORS.GD_WORD).text(); - const phonetic = $(Constants.SELECTORS.GD_PHONETIC).text(); - const audio = $(Constants.SELECTORS.GD_AUDIO).attr('src'); - - if (word) { - return { - word: word || 'N/A', - phonetic: phonetic || 'N/A', - audio: audio ? `https:${audio}`: 'N/A', - definitions: $(Constants.SELECTORS.GD_DEFINITIONS).map((i, el) => $(el).text()).get(), - examples: $(Constants.SELECTORS.GD_EXAMPLES).map((i, el) => $(el).text()).get() - }; - } -} - -function getConverters($) { - const unit_converter_input = $(Constants.SELECTORS.UNIT_CONVERTER_INPUT).attr('value'); - const unit_converter_output = $(Constants.SELECTORS.UNIT_CONVERTER_OUTPUT).attr('value'); - const unit_converter_formula = $(Constants.SELECTORS.UNIT_CONVERTER_FORMULA).text(); - - const input_currency_name = $(Constants.SELECTORS.INPUT_CURRENCY_NAME).attr('data-name'); - const output_currency_name = $(Constants.SELECTORS.OUTPUT_CURRENCY_NAME).attr('data-name'); - const currency_converter_input = $(Constants.SELECTORS.CURRENCY_CONVERTER_INPUT).text(); - const currency_converter_output = $(Constants.SELECTORS.CURRENCY_CONVERTER_OUTPUT).text(); - - if (unit_converter_input && unit_converter_output) { - return { - input: unit_converter_input, - output: unit_converter_output, - formula: unit_converter_formula - }; - } else if (currency_converter_input && currency_converter_output) { - return { - input: { - name: input_currency_name, - value: currency_converter_input - }, - output: { - name: output_currency_name, - value: currency_converter_output - } - }; - } -} - -function getTopStories($) { - // Removes unnecessary texts from the short description - $(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((i, el) => $(el).remove()); - $(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((i, el) => $(el).remove()); - - const top_stories_descriptions = Constants.SELECTORS.TOP_STORIES_DESCRIPTION.map((selector) => $(selector).map((i, el) => $(el).text().slice(1)).get()).filter((descs) => descs.length > 0)[0]; - const top_stories_urls = $(Constants.SELECTORS.TOP_STORIES_URL).map((i, el) => $(el).attr('href')).get(); - - return top_stories_urls.map((item, i) => { - if (!top_stories_descriptions) return; - return { - description: top_stories_descriptions[i], - url: item, - }; - }).filter((story) => story); -} - -function getFeaturedSnippet($) { - const featured_snippet_title = $(Constants.SELECTORS.FEATURED_SNIPPET_TITLE[0]).text() || $(Constants.SELECTORS.FEATURED_SNIPPET_TITLE[1]).text() || $(Constants.SELECTORS.FEATURED_SNIPPET_TITLE[2]).text() || undefined; - const featured_snippet_url = $(Constants.SELECTORS.FEATURED_SNIPPET_URL).map((i, el) => $(el).attr('href')).get()[0]; - const featured_snippet = Constants.SELECTORS.FEATURED_SNIPPET_DESC.map((selector) => { - if ($(selector)[0] && selector != Constants.SELECTORS.FEATURED_SNIPPET_DESC[2]) { - const text = $(selector).html().replace(/<\/li>|<\/b>|/g, '').replace(/&/g, '&').split('
  • ').join('\n').trim(); - return text; - } else if (selector == Constants.SELECTORS.FEATURED_SNIPPET_DESC[2]) { - const text = $(selector).text(); - return text; - } else { - return undefined; - } - }).filter(text => text != undefined && text.length != 0)[0]; - - return { - title: featured_snippet_title || 'N/A', - description: featured_snippet || 'N/A', - url: featured_snippet_url || 'N/A' - }; -} - -function getKnowledgeGraph($, data) { - const knowledge_panel = {}; - - knowledge_panel.title = $(Constants.SELECTORS.KNO_PANEL_TITLE[0]).first().text() || $(Constants.SELECTORS.KNO_PANEL_TITLE[1]).text() || 'N/A'; - knowledge_panel.description = $(Constants.SELECTORS.KNO_PANEL_DESCRIPTION).first().text() || 'N/A'; - knowledge_panel.url = $(Constants.SELECTORS.KNO_PANEL_URL).attr('href') || 'N/A'; - - // Extracts metadata from the knowledge graph - $(Constants.SELECTORS.KNO_PANEL_METADATA).each((i, el) => { - const key = $(el).text().trim().slice(0, -1); - const value = $(el).next().text().trim(); - knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_'))] = value.trim(); - }); - - const knowledge_panel_type = $(Constants.SELECTORS.KNO_PANEL_TYPE).last().text(); - if (knowledge_panel_type && knowledge_panel_type !== knowledge_panel.title) { - knowledge_panel.type = knowledge_panel_type; - } - - knowledge_panel.books = $(Constants.SELECTORS.KNO_PANEL_BOOKS).map((i, el) => { - if ($(el).next().text().trim() == '') return; - return { - title: $(el).first().text().trim(), - year: $(el).next().text().trim() - }; - }).get(); - knowledge_panel.books.length == 0 && delete knowledge_panel.books; - - knowledge_panel.tv_shows_and_movies = $(Constants.SELECTORS.KNO_PANEL_TV_SHOWS_AND_MOVIES).map((i, el) => { - if ($(el).next().text().trim() == '') return; - return { - title: $(el).first().text().trim(), - year: $(el).next().text().trim() - }; - }).get(); - knowledge_panel.tv_shows_and_movies.length == 0 && delete knowledge_panel.tv_shows_and_movies; - - const song_lyrics = $(Constants.SELECTORS.KNO_PANEL_SONG_LYRICS).map((i, el) => $($(el).html().replace(/<\/span><\/div>
    /g, '\n\n').replace(/
    /g, '\n')).text()).get(); - song_lyrics.length > 0 && (knowledge_panel.lyrics = song_lyrics.join('\n\n')); - - const google_users_rating = $(Constants.SELECTORS.KNO_PANEL_FILM_GOOGLEUSERS_RATING)[0]; - if (google_users_rating) { - const rating = $(google_users_rating.children[0].children[0]).text() || 'N/A'; - knowledge_panel.ratings = []; - knowledge_panel.ratings.push({ - name: 'Google Users', rating: rating - }); - } - - $(Constants.SELECTORS.KNO_PANEL_FILM_RATINGS[0]).each((i, el) => { - knowledge_panel.ratings = knowledge_panel.ratings || []; - const name = $($(Constants.SELECTORS.KNO_PANEL_FILM_RATINGS[1])[i]).attr('title'); - const rating = $(el).text(); - knowledge_panel.ratings.push({ - name: name, rating: rating - }); - }); - knowledge_panel.available_on = $(Constants.SELECTORS.KNO_PANEL_AVAILABLE_ON).map((i, el) => $(el).text()); - knowledge_panel.available_on.length == 0 && delete knowledge_panel.available_on; - - knowledge_panel.images = $(Constants.SELECTORS.KNO_PANEL_IMAGES).map((i, elem) => { - return { - url: $(elem).attr('data-src'), - source: $(elem).parent().parent().parent().attr('data-lpage'), - }; - }).get().filter((img) => img.url !== undefined); - knowledge_panel.images.length == 0 && delete knowledge_panel.images; - - const animal_preview = Utils.getStringBetweenStrings(data, 'source src\\x3d\\x22', '.mp4'); - animal_preview && (knowledge_panel.demonstration = animal_preview +'.mp4'); - - return knowledge_panel; -} - -function getOrganicResults($) { - const titles = $(Constants.SELECTORS.TITLE).map((i, el) => { - if (el.parent.attribs.style != '-webkit-line-clamp:2') // ignores ad titles - return $(el.children).text().trim(); - }).get(); - const descriptions = $(Constants.SELECTORS.DESCRIPTION).map((i, el) => { - if (el.parent.attribs.class != 'w1C3Le') // ignores ad descriptions - return $(el).text().trim(); - }).get(); - const urls = $(Constants.SELECTORS.URL).map((i, el) => $(el).attr('href')).get(); - - correctFuzzyData(titles, descriptions, urls); - - return titles.map((title, index) => { - return { - title: title || 'N/A', - description: descriptions[index] || 'N/A', - url: urls[index] || 'N/A', - favicons: { - high_res: `https://api.faviconkit.com/${new URL(urls[index] || 'https://google.com').hostname}/192`, - low_res: `https://www.google.com/s2/favicons?sz=64&domain_url=${new URL(urls[index] || 'https://google.com').hostname}` - } - }; - }); -} - -function correctFuzzyData (titles, descriptions, urls) { - // Correcting wrongly parsed data, this is quite rare tho. - if (titles.length < Constants.URLS.length && titles.length < descriptions.length) { - urls.shift(); - } else if (Constants.URLS.length > titles.length) { - urls.shift(); - } - - const innacurate_data = descriptions.length > urls.slice(1).length ? false: true; - - urls.forEach((item, index) => { - // Why YouTube? Because video results usually don't have a description. - if (item.includes('m.youtube.com') && innacurate_data && Constants.URLS.length > 1) { - debug('Removing malformed block containing the link: ' + item); - - urls.splice(index, 1); - titles.splice(index, 1); - index--; - } - }); + return final_data; } -async function image (query, options = { safe: false, exclude_domains: [], additional_params: null }) { +async function image(query, options = { safe: false, exclude_domains: [], additional_params: null }) { let search_query = query.trim().split(/ +/).join('+').toLowerCase(); let formatted_search_url = `${Constants.URLS.GIS}search?tbm=isch${options.safe ? '&safe=active': ''}&q=${search_query}`; filtered_domains = filtered_domains.concat(options.exclude_domains); if (options.exclude_domains) { - formatted_search_url += ' ' + filtered_domains.map((site) => '-site:'+site).join(''); + formatted_search_url += ' ' + filtered_domains.map((site) => '-site:' + site).join(''); } const response = await Axios.get(encodeURI(formatted_search_url), { params: options.additional_params, headers: Utils.getHeaders(false) }).catch((error) => error); if (response instanceof Error) throw new Error(`Could not search on Google: ${response.message}`); const image_search_regex = Constants.REGEX.IMAGE_SEARCH; - + let index = 0; let final_data = []; let origin = parseImageOriginData(response.data); @@ -387,11 +97,11 @@ async function image (query, options = { safe: false, exclude_domains: [], addit } parsed_data = image_search_regex.exec(response.data); } - + return final_data; } -function parseImageOriginData (data) { +function parseImageOriginData(data) { const image_origin_regex = Constants.REGEX.IMAGE_ORIGIN; let parsed_data = image_origin_regex.exec(data); @@ -408,9 +118,9 @@ function parseImageOriginData (data) { return processed_data; } -async function getTopNews () { +async function getTopNews() { const formatted_url = `${Constants.URLS.GOOGLE_NEWS}topstories?tab=in&hl=en-US&gl=US&ceid=US:en`; - + const response = await Axios.get(formatted_url, { headers: Utils.getHeaders(true) }).catch((error) => error); if (response instanceof Error) throw new Error(`Could not get top news: ${response.message}`); @@ -434,12 +144,8 @@ async function getTopNews () { by: headline_stories_publishers[i] }); }); - - return final_data; -} -function debug (text) { - if (debugging) return typeof text === 'object' ? console.table(text): console.debug('[INFO]:', text); + return final_data; } module.exports = { diff --git a/lib/parser.js b/lib/parser.js new file mode 100644 index 0000000..c97575f --- /dev/null +++ b/lib/parser.js @@ -0,0 +1,300 @@ +'use strict'; + +const Utils = require('./utils'); +const Constants = require('./constants'); +const NormalizeText = require('replace-special-characters'); + +class Parser { + constructor($, data) { + this.$ = $; + this.data = data; + } + + getOrganicResults() { + const titles = this.$(Constants.SELECTORS.TITLE).map((i, el) => { + if (el.parent.attribs.style != '-webkit-line-clamp:2') // ignores ad titles + return this.$(el.children).text().trim(); + }).get(); + const descriptions = this.$(Constants.SELECTORS.DESCRIPTION).map((i, el) => { + if (el.parent.attribs.class != 'w1C3Le') // ignores ad descriptions + return this.$(el).text().trim(); + }).get(); + const urls = this.$(Constants.SELECTORS.URL).map((i, el) => this.$(el).attr('href')).get(); + + this.correctFuzzyData(titles, descriptions, urls); + + return titles.map((title, index) => { + return { + title: title || 'N/A', + description: descriptions[index] || 'N/A', + url: urls[index] || 'N/A', + favicons: { + high_res: `https://api.faviconkit.com/${new URL(urls[index] || 'https://google.com').hostname}/192`, + low_res: `https://www.google.com/s2/favicons?sz=64&domain_url=${new URL(urls[index] || 'https://google.com').hostname}` + } + }; + }); + } + + getKnowledgeGraph(data) { + const knowledge_panel = {}; + + knowledge_panel.title = this.$(Constants.SELECTORS.KNO_PANEL_TITLE[0]).first().text() || this.$(Constants.SELECTORS.KNO_PANEL_TITLE[1]).text() || 'N/A'; + knowledge_panel.description = this.$(Constants.SELECTORS.KNO_PANEL_DESCRIPTION).first().text() || 'N/A'; + knowledge_panel.url = this.$(Constants.SELECTORS.KNO_PANEL_URL).attr('href') || 'N/A'; + + // Extracts metadata from the knowledge graph + this.$(Constants.SELECTORS.KNO_PANEL_METADATA).each((i, el) => { + const key = this.$(el).first().text().trim().slice(0, -1); + const value = this.$(el).next().text().trim(); + value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_'))] = value.trim()); + }); + + const knowledge_panel_type = this.$(Constants.SELECTORS.KNO_PANEL_TYPE).last().text(); + if (knowledge_panel_type && knowledge_panel_type !== knowledge_panel.title) { + knowledge_panel.type = knowledge_panel_type; + } + + knowledge_panel.books = this.$(Constants.SELECTORS.KNO_PANEL_BOOKS).map((i, el) => { + if (this.$(el).next().text().trim() == '') return; + return { + title: this.$(el).first().text().trim(), + year: this.$(el).next().text().trim() + }; + }).get(); + knowledge_panel.books.length == 0 && delete knowledge_panel.books; + + knowledge_panel.tv_shows_and_movies = this.$(Constants.SELECTORS.KNO_PANEL_TV_SHOWS_AND_MOVIES).map((i, el) => { + if (this.$(el).next().text().trim() == '') return; + return { + title: this.$(el).first().text().trim(), + year: this.$(el).next().text().trim() + }; + }).get(); + knowledge_panel.tv_shows_and_movies.length == 0 && delete knowledge_panel.tv_shows_and_movies; + + const song_lyrics = this.$(Constants.SELECTORS.KNO_PANEL_SONG_LYRICS).map((i, el) => this.$(this.$(el).html().replace(/<\/span><\/div>
    /g, '\n\n').replace(/
    /g, '\n')).text()).get(); + song_lyrics.length > 0 && (knowledge_panel.lyrics = song_lyrics.join('\n\n')); + + const google_users_rating = this.$(Constants.SELECTORS.KNO_PANEL_FILM_GOOGLEUSERS_RATING)[0]; + if (google_users_rating) { + const rating = this.$(google_users_rating.children[0].children[0]).text() || 'N/A'; + knowledge_panel.ratings = []; + knowledge_panel.ratings.push({ + name: 'Google Users', + rating: rating + }); + } + + this.$(Constants.SELECTORS.KNO_PANEL_FILM_RATINGS[0]).each((i, el) => { + knowledge_panel.ratings = knowledge_panel.ratings || []; + const name = this.$(this.$(Constants.SELECTORS.KNO_PANEL_FILM_RATINGS[1])[i]).attr('title'); + const rating = this.$(el).text(); + knowledge_panel.ratings.push({ + name: name, + rating: rating + }); + }); + + knowledge_panel.available_on = this.$(Constants.SELECTORS.KNO_PANEL_AVAILABLE_ON).map((i, el) => this.$(el).text()).get(); + knowledge_panel.available_on.length == 0 && delete knowledge_panel.available_on; + + knowledge_panel.images = this.$(Constants.SELECTORS.KNO_PANEL_IMAGES).map((i, elem) => { + return { + url: this.$(elem).attr('data-src'), + source: this.$(elem).parent().parent().parent().attr('data-lpage'), + }; + }).get().filter((img) => img.url !== undefined); + knowledge_panel.images.length == 0 && delete knowledge_panel.images; + + const animal_preview = Utils.getStringBetweenStrings(this.data, 'source src\\x3d\\x22', '.mp4'); + animal_preview && (knowledge_panel.demonstration = animal_preview + '.mp4'); + + return knowledge_panel; + } + + getFeaturedSnippet() { + const featured_snippet_title = this.$(Constants.SELECTORS.FEATURED_SNIPPET_TITLE[0]).text() || this.$(Constants.SELECTORS.FEATURED_SNIPPET_TITLE[1]).text() || this.$(Constants.SELECTORS.FEATURED_SNIPPET_TITLE[2]).text() || undefined; + const featured_snippet_url = this.$(Constants.SELECTORS.FEATURED_SNIPPET_URL).map((i, el) => this.$(el).attr('href')).get()[0]; + const featured_snippet = Constants.SELECTORS.FEATURED_SNIPPET_DESC.map((selector) => { + if (this.$(selector)[0] && selector != Constants.SELECTORS.FEATURED_SNIPPET_DESC[2]) { + const text = this.$(selector).html().replace(/<\/li>|<\/b>|/g, '').replace(/&/g, '&').split('
  • ').join('\n').trim(); + return text; + } else if (selector == Constants.SELECTORS.FEATURED_SNIPPET_DESC[2]) { + const text = this.$(selector).text(); + return text; + } else { + return undefined; + } + }).filter(text => text != undefined && text.length != 0)[0]; + + return { + title: featured_snippet_title || 'N/A', + description: featured_snippet || 'N/A', + url: featured_snippet_url || 'N/A' + }; + } + + getTopStories() { + // Removes unnecessary text from the short description + this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((i, el) => this.$(el).remove()); + this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((i, el) => this.$(el).remove()); + + const top_stories_descriptions = Constants.SELECTORS.TOP_STORIES_DESCRIPTION.map((selector) => this.$(selector).map((i, el) => this.$(el).text().slice(1)).get()).filter((descs) => descs.length > 0)[0]; + const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((i, el) => this.$(el).attr('href')).get(); + + return top_stories_urls.map((item, i) => { + if (!top_stories_descriptions) return; + return { + description: top_stories_descriptions[i], + url: item, + }; + }).filter((story) => story); + } + + getPaa() { + let people_also_ask = []; + Constants.SELECTORS.PAA.forEach((item) => this.$(item).each((i, el) => people_also_ask.push(this.$(el).text()))); + people_also_ask.shift(); + return people_also_ask; + } + + getPas() { + return this.$(Constants.SELECTORS.PASF).map((i, el) => { + if (!this.$(el).attr('data-src')) return; + return { + title: this.$(el).attr('alt'), + thumbnail: `https:${this.$(el).attr('data-src')}` + }; + }).get(); + } + + getTime() { + const hours = this.$(Constants.SELECTORS.CURRENT_TIME_HOUR).text(); + const date = this.$(Constants.SELECTORS.CURRENT_TIME_DATE).map((i, el) => this.$(el).text()).get()[1]; + + if (date) { + return { + hours: hours.trim(), + date: date.trim() + }; + } + } + + getWeather() { + const weather_location = this.$(Constants.SELECTORS.WEATHER_LOCATION).text(); + const weather_forecast = this.$(Constants.SELECTORS.WEATHER_FORECAST).text(); + const precipitation = this.$(Constants.SELECTORS.PRECIPITATION).text(); + const air_humidity = this.$(Constants.SELECTORS.AIR_HUMIDITY).text(); + const temperature = this.$(Constants.SELECTORS.TEMPERATURE).text(); + const wind_speed = this.$(Constants.SELECTORS.WIND_SPEED).text(); + + if (weather_location && weather_forecast) { + return { + location: weather_location, + forecast: weather_forecast, + precipitation, + humidity: air_humidity, + temperature: temperature, + wind: wind_speed + }; + } + } + + getLocation(date) { + const location_title = this.$(Constants.SELECTORS.LOCATION_TITLE).text(); + const location_distance = this.$(Constants.SELECTORS.LOCATION_DISTANCE).text(); + const location_image = this.$(Constants.SELECTORS.LOCATION_IMAGE).attr('src'); + + if (location_title && location_distance && !date) { + return { + title: location_title, + distance: location_distance, + map: 'https://google.com' + location_image + }; + } + } + + getTranslation() { + const source_language = this.$(Constants.SELECTORS.TR_SOURCE_LANGUAGE).text(); + const target_language = this.$(Constants.SELECTORS.TR_TARGET_LANGUAGE).text(); + + const source_text = this.$(Constants.SELECTORS.TR_SOURCE_TEXT).text(); + const target_text = this.$(Constants.SELECTORS.TR_TARGET_TEXT).text(); + + if (source_text.length > 0) { + return { + source_language, + target_language, + source_text, + target_text + }; + } + } + + getDictionary() { + const word = this.$(Constants.SELECTORS.GD_WORD).text(); + const phonetic = this.$(Constants.SELECTORS.GD_PHONETIC).text(); + const audio = this.$(Constants.SELECTORS.GD_AUDIO).attr('src'); + + if (word) { + return { + word: word || 'N/A', + phonetic: phonetic || 'N/A', + audio: audio ? `https:${audio}` : 'N/A', + definitions: this.$(Constants.SELECTORS.GD_DEFINITIONS).map((i, el) => this.$(el).text()).get(), + examples: this.$(Constants.SELECTORS.GD_EXAMPLES).map((i, el) => this.$(el).text()).get() + }; + } + } + + getConverters() { + const unit_converter_input = this.$(Constants.SELECTORS.UNIT_CONVERTER_INPUT).attr('value'); + const unit_converter_output = this.$(Constants.SELECTORS.UNIT_CONVERTER_OUTPUT).attr('value'); + const unit_converter_formula = this.$(Constants.SELECTORS.UNIT_CONVERTER_FORMULA).text(); + + const input_currency_name = this.$(Constants.SELECTORS.INPUT_CURRENCY_NAME).attr('data-name'); + const output_currency_name = this.$(Constants.SELECTORS.OUTPUT_CURRENCY_NAME).attr('data-name'); + const currency_converter_input = this.$(Constants.SELECTORS.CURRENCY_CONVERTER_INPUT).text(); + const currency_converter_output = this.$(Constants.SELECTORS.CURRENCY_CONVERTER_OUTPUT).text(); + + if (unit_converter_input && unit_converter_output) { + return { + input: unit_converter_input, + output: unit_converter_output, + formula: unit_converter_formula + }; + } else if (currency_converter_input && currency_converter_output) { + return { + input: { + name: input_currency_name, + value: currency_converter_input + }, + output: { + name: output_currency_name, + value: currency_converter_output + } + }; + } + } + + correctFuzzyData(titles, descriptions, urls) { + // Corrects wrongly parsed data, this doesn't often happen though. + + titles.length < urls.length && titles.length < descriptions.length && urls.shift(); + urls.length > titles.length && urls.shift(); + + const innacurate_data = descriptions.length > urls.slice(1).length ? false : true; + + urls.forEach((item, index) => { + // Why YouTube? Because video results usually don't have a description. + if (item.includes('m.youtube.com') && innacurate_data && Constants.URLS.length > 1) { + urls.splice(index, 1); + titles.splice(index, 1); + index--; + } + }); + } +} + +module.exports = Parser; \ No newline at end of file diff --git a/lib/utils.js b/lib/utils.js index a30b6bb..25d61d8 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -10,26 +10,34 @@ function getHeaders(mobile) { 'dnt': 1, 'referer': 'https://www.google.com/', 'upgrade-insecure-requests': 1, - 'user-agent': mobile ? new UserAgent(/Android/).toString() : new UserAgent({ deviceCategory: 'desktop' }).toString() + 'user-agent': mobile && new UserAgent(/Android/).toString() || new UserAgent({ deviceCategory: 'desktop' }).toString() }; } function formatHtml(data) { return data - // Some garbage we don't need + // Gets rid of the data we don't need .replace(/N6jJud MUxGbd lyLwlc/g, '') .replace(/YjtGef ExmHv MUxGbd/g, '') .replace(/MUxGbd lyLwlc aLF0Z/g, '') - // Transforms all possible description classes in “MUxGbd yDYNvb” + /* + * Transforms all possible variations of some classes into a + * fixed string so it's easier to get consistent results. + **/ + + // Descriptions: -> MUxGbd yDYNvb .replace(/yDYNvb lEBKkf/g, 'yDYNvb') .replace(/VwiC3b MUxGbd yDYNvb/g, 'MUxGbd yDYNvb') - // Transforms all possible title classes in “yUTMj MBeuO ynAwRc gsrt PpBGzd YcUVQe” + // Urls: -> C8nzq BmP5tf + .replace(/cz3goc BmP5tf/g, 'C8nzq BmP5tf') + + // Titles: -> yUTMj MBeuO ynAwRc gsrt PpBGzd YcUVQe .replace(/yUTMj MBeuO ynAwRc PpBGzd YcUVQe/g, 'yUTMj MBeuO ynAwRc gsrt PpBGzd YcUVQe') + .replace(/oewGkc LeUQr/g, 'PpBGzd YcUVQe') .replace(/q8U8x MBeuO/g, 'yUTMj MBeuO') .replace(/ynAwRc PpBGzd/g, 'ynAwRc gsrt PpBGzd'); - } function getStringBetweenStrings(data, start_string, end_string) {