fix: parsing problems and other major bugs

LuanRT · Apr 8, 2022 · bc96493 · bc96493
1 parent 809d49e
commit bc96493
Show file tree

Hide file tree

Showing 9 changed files with 210 additions and 162 deletions.
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ async function start() {
     }
   }
 
-  const response = await google.search("TWDG", options);
+  const response = await google.search('TWDG', options);
   console.log(response); 
 }
 
@@ -215,7 +215,7 @@ start();
 
 ## What else can it do?
 
-As you may have noticed, the library returns a lot of data. Currently it can parse everything from the knowledge graph, featured snippets and much more such as Google Dictionary, Google Translator and song lyrics. 
+As you can see, the library returns a lot of data. Currently it can parse everything from the knowledge graph, featured snippets and much more such as Google Dictionary, Google Translate and song lyrics. 
 All you have to do is search something along the lines of ```“define xyz”```, ```“translate x to y”``` or ```“xyz song lyrics”``` and the appropriate fields will appear in the response.
 
 #### Examples:

diff --git a/examples/index.js b/examples/index.js
@@ -18,8 +18,7 @@ async function start() {
   // Reverse Image Search
   const reverse = await google.search("https://i.pinimg.com/236x/92/16/d9/9216d9a222ef65eb6eabfff1970180d1.jpg", { ris: true });
   console.info('Reverse Image Search:', reverse.results);
-
-  // Top news
+
   const news = await google.getTopNews();
   console.info('Google Top News:', news);
 }

diff --git a/lib/constants.js b/lib/constants.js
@@ -29,7 +29,7 @@ module.exports = {
     KNO_PANEL_TYPE: 'div.BkwXh > div',
     KNO_PANEL_SONG_LYRICS: 'div.ujudUb',
     KNO_PANEL_AVAILABLE_ON: 'div[class="ellip bclEt"]',
-    KNO_PANEL_IMAGES: 'div > g-scrolling-carousel > div > div > div > g-inner-card > g-img > img',
+    KNO_PANEL_IMAGES: 'div > g-inner-card.xIfh4d > div > img',
     KNO_PANEL_BOOKS: 'div[data-attrid="kc:/book/author:books only"] > a > div > div > div.Bo9xMe > div',
     KNO_PANEL_TV_SHOWS_AND_MOVIES: 'div[data-attrid="kc:/people/person:tv-shows-and-movies"] > a > div > div > div.Bo9xMe > div',
     KNO_PANEL_FILM_GOOGLEUSERS_RATING: 'div[data-attrid="kc:/ugc:thumbs_up"] > div > div > div',
@@ -68,7 +68,7 @@ module.exports = {
 
     // Google Dictionary 
     GD_WORD: 'span[data-dobid="hdw"]',
-    GD_PHONETIC: 'div[class="S23sjd"]',
+    GD_PHONETIC: 'div.qexShd',
     GD_AUDIO: 'audio > source',
     GD_DEFINITIONS: 'div[data-dobid="dfn"]',
     GD_EXAMPLES: 'div[class="ubHt5c"]',
@@ -87,7 +87,7 @@ module.exports = {
     TOP_STORIES_WEBSITE: 'div[class="g5wfEd"] > div > g-img > img',
 
     // “People also ask” 
-    PAA: [ 'div.s75CSd.u60jwe.gduDCb > span', 'div.wWOJcd > div > span', 'div.SC9Vz > div.zd9Fwc' ],
+    PAA: [ 'div.s75CSd.u60jwe.gduDCb > span', 'div.gbCQS.u60jwe.gduDCb > div > span', 'div.JlqpRe > span' ],
 
     // “People also search for” 
     PASF: 'div[class="IHdOHf"] > img',

diff --git a/lib/googlethis.js b/lib/googlethis.js
@@ -7,10 +7,14 @@ const Cheerio = require('cheerio');
 const Constants = require('./constants');
 
 /**
- * Searches a given query on Google.
+ * Search a given query on Google.
  *
- * @param {string} query Query.
- * @param {object} options Search options.
+ * @param {string} query - Search query
+ * @param {object} [options] Search options
+ * @param {boolean} [options.ris] - Use reverse image search
+ * @param {boolean} [options.safe] - Safe search
+ * @param {number} [options.page] - Pagination
+ * @param {object} [options.additional_params] - Parameters that will be passed to Google
  */
 async function search(query, options = {}) {
   query = query.trim().split(/ +/).join('+');
@@ -27,10 +31,7 @@ async function search(query, options = {}) {
     '&start=' + options.page);
 
   const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(true) }).catch((err) => err);
-  if (response instanceof Error) throw new Error('Could not search on Google: ' + response.message);
-
-  const $ = Cheerio.load(Utils.refineData(response.data));
-  const parser = new Parser($, response.data);
+  if (response instanceof Error) throw new Utils.SearchError('Could not execute search', { status_code: response?.status || 0, message: response?.message });
 
   const results = {
     results: [],
@@ -41,12 +42,14 @@ async function search(query, options = {}) {
     people_also_ask: [],
     people_also_search_for: []
   };
+
+  const parser = new Parser(response.data);
 
   results.results = parser.getOrganicResults();
   results.knowledge_panel = parser.getKnowledgeGraph();
   results.featured_snippet = parser.getFeaturedSnippet();
 
-  const did_you_mean = $(Constants.SELECTORS.DID_YOU_MEAN).text();
+  const did_you_mean = parser.getDidYouMean();
   did_you_mean && (results.did_you_mean = did_you_mean) || (delete results.did_you_mean);
 
   const unit_converter = parser.getConverters();
@@ -80,10 +83,13 @@ async function search(query, options = {}) {
 }
 
 /**
- * Searches images on Google.
+ * Google image search.
  *
- * @param {string} query Search query.
- * @param {object} options Search options.
+ * @param {string} query - Search query
+ * @param {object} [options] - Search options
+ * @param {boolean} [options.safe] - Safe search
+ * @param {object} [options.additional_params] - Parameters that will be passed to Google
+ * @param {Array.<string>} [options.exclude_domains] - Domains that should be blocked
  */
 async function image(query, options = {}) {
   query = query.trim().split(/ +/).join('+');
@@ -97,7 +103,7 @@ async function image(query, options = {}) {
     ' ' + options.exclude_domains.map((site) => '-site:' + site);
 
   const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(false) }).catch((err) => err);
-  if (response instanceof Error) throw new Error('Could not search on Google: ' + response.message);
+  if (response instanceof Error) throw new Utils.SearchError('Could not execute search', { status_code: response?.status || 0, message: response?.message });
 
   const results = [];
   const origin = parseImageOriginData(response.data);
@@ -123,7 +129,7 @@ async function image(query, options = {}) {
 /**
  * Gets image origin data
  *
- * @param {string} data Raw html.
+ * @param {string} data - Raw html.
  */
 function parseImageOriginData(data) {
   let results = [];
@@ -132,7 +138,7 @@ function parseImageOriginData(data) {
   while (parsed_results != null) {
     results.push({
       title: parsed_results[4],
-      website: parsed_results[3],
+      source: parsed_results[3],
     });
     parsed_results = Constants.REGEX.IMAGE_ORIGIN.exec(data);
   }

diff --git a/lib/parser.js b/lib/parser.js
@@ -1,13 +1,15 @@
 'use strict';
 
 const Utils = require('./utils');
+const Unraw = require('unraw').default;
+const Cheerio = require('cheerio');
 const Constants = require('./constants');
 const NormalizeText = require('replace-special-characters');
-
+  
 class Parser {
-  constructor($, raw_data) {
-    this.$ = $;
-    this.raw_data = raw_data;
+  constructor(data) {
+    this.data = data;
+    this.$ = Cheerio.load(Utils.refineData(data));
   }
 
   getOrganicResults() {
@@ -23,7 +25,8 @@ class Parser {
           return this.$(el).text().trim();
       }).get();
 
-    const urls = this.$(Constants.SELECTORS.URL).map((i, el) => this.$(el).attr('href')).get();
+    const urls = this.$(Constants.SELECTORS.URL)
+      .map((i, el) => this.$(el).attr('href')).get();
 
     this.#correctFuzzyData(titles, descriptions, urls);
 
@@ -49,7 +52,7 @@ class Parser {
     this.$(Constants.SELECTORS.KNO_PANEL_METADATA).each((i, el) => {
       const key = this.$(el).first().text().trim().slice(0, -1);
       const value = this.$(el).next().text().trim();
-      value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_'))] = value.trim());
+      value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_').replace(/\(|\)/g, ''))] = value.trim());
     });
 
     const knowledge_panel_type = this.$(Constants.SELECTORS.KNO_PANEL_TYPE).last().text();
@@ -79,7 +82,7 @@ class Parser {
           .replace(/<\/span><\/div><div jsname="u8s5sf" class="ujudub"><span jsname="ys01ge">/g, '\n\n')
           .replace(/<br>/g, '\n')).text()).get();
 
-    song_lyrics.length > 0 && (knowledge_panel.lyrics = song_lyrics.join('\n\n'));
+    song_lyrics.length && (knowledge_panel.lyrics = song_lyrics.join('\n\n'));
 
     const google_users_rating = this.$(Constants.SELECTORS.KNO_PANEL_FILM_GOOGLEUSERS_RATING)[0];
     if (google_users_rating) {
@@ -106,20 +109,20 @@ class Parser {
     knowledge_panel.images = this.$(Constants.SELECTORS.KNO_PANEL_IMAGES).map((i, elem) => {
       return {
         url: this.$(elem).attr('data-src'),
-        source: this.$(elem).parent().parent().parent().attr('data-lpage'),
+        source: this.$(elem).parent().parent().parent().parent().attr('data-lpage'),
       };
-    }).get().filter((img) => img.url !== undefined);
+    }).get().filter((img) => img.url);
 
-    const demo = Utils.getStringBetweenStrings(this.raw_data, 'source src\\x3d\\x22', '.mp4');
+    const demo = Utils.getStringBetweenStrings(this.data, 'source src\\x3d\\x22', '.mp4');
     demo && (knowledge_panel.demonstration = demo + '.mp4');
 
-    knowledge_panel.books.length == 0 &&
+    !knowledge_panel.books.length &&
       delete knowledge_panel.books;
-    knowledge_panel.tv_shows_and_movies.length == 0 &&
+    !knowledge_panel.tv_shows_and_movies.length &&
       delete knowledge_panel.tv_shows_and_movies;
-    knowledge_panel.available_on.length == 0 &&
+    !knowledge_panel.available_on.length &&
       delete knowledge_panel.available_on;
-    knowledge_panel.images.length == 0 &&
+    !knowledge_panel.images.length &&
       delete knowledge_panel.images;
 
     return knowledge_panel;
@@ -149,7 +152,7 @@ class Parser {
       } else {
         return undefined;
       }
-    }).filter(text => text != undefined && text.length != 0)[0];
+    }).filter(text => text && text.length)[0];
 
     return {
       title: featured_snippet_title || 'N/A',
@@ -158,42 +161,28 @@ class Parser {
     };
   }
 
+  /**
+   * Reads the text of the element(s) matched by the DID_YOU_MEAN selector.
+   *
+   * @returns {string} Combined text content of the matched elements.
+   */
+  getDidYouMean() {
+    const suggestion = this.$(Constants.SELECTORS.DID_YOU_MEAN);
+    return suggestion.text();
+  }
+
   getTopStories() {
+    // Builds the list of top stories as { description, url } objects; the
+    // description at index i is paired with the URL at index i.
+    // Cheerio invokes .each()/.map() callbacks as (index, element), so the
+    // element has to be read from the second argument, not the first.
     // Removes unnecessary text from the description
-    this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((i, el) => this.$(el).remove());
-    this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((i, el) => this.$(el).remove());
+    this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((_i, el) => this.$(el).remove());
+    this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((_i, el) => this.$(el).remove());
 
     const top_stories_descriptions = Constants.SELECTORS.TOP_STORIES_DESCRIPTION.map((selector) =>
-      this.$(selector).map((i, el) => this.$(el).text().slice(1)).get()).filter((descs) => descs.length > 0)[0];
-    const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((i, el) => this.$(el).attr('href')).get();
+      this.$(selector).map((_i, el) => this.$(el).text()).get()).filter((descs) => descs.length > 0)[0];
+    const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((_i, el) => this.$(el).attr('href')).get();
 
     return top_stories_urls.map((item, i) => {
       if (!top_stories_descriptions) return;
       return {
         description: top_stories_descriptions[i],
-        url: item,
+        url: item
       };
     }).filter((story) => story);
   }
 
-  getPaa() {
-    let people_also_ask = [];
-    Constants.SELECTORS.PAA.forEach((item) =>
-      this.$(item).each((i, el) => people_also_ask.push(this.$(el).text())));
-    people_also_ask.shift();
-    return people_also_ask;
-  }
-
-  getPas() {
-    return this.$(Constants.SELECTORS.PASF).map((i, el) => {
-      if (!this.$(el).attr('data-src')) return;
-      return {
-        title: this.$(el).attr('alt'),
-        thumbnail: `https:${this.$(el).attr('data-src')}`
-      };
-    }).get();
-  }
-
   getTime() {
     const hours = this.$(Constants.SELECTORS.CURRENT_TIME_HOUR).text();
     const date = this.$(Constants.SELECTORS.CURRENT_TIME_DATE).map((i, el) => this.$(el).text()).get()[1];
@@ -303,15 +292,44 @@ class Parser {
     }
   }
 
+  /**
+   * Collects "People also ask" entries from the page markup and, when
+   * available, from the `sb_wiz.rfs` list embedded in an inline script.
+   *
+   * @returns {string[]} Question texts; the inline-script entries are appended
+   * after the ones found in the markup.
+   */
+  getPaa() {
+    const people_also_ask = [];
+
+    Constants.SELECTORS.PAA.forEach((item) =>
+      this.$(item).each((i, el) => people_also_ask.push(this.$(el).text())));
+
+    // The first collected entry is always discarded
+    // (presumably it is not an actual question — TODO confirm).
+    people_also_ask.shift();
+
+    // The inline payload is sliced out of raw page text, so it may be missing
+    // or malformed. Treat it as optional rather than failing the whole search.
+    let extra_data = {};
+    try {
+      extra_data = JSON.parse(Unraw(Utils.getStringBetweenStrings(this.data, 'var c=\'', '\';google') || '{}'));
+    } catch {
+      extra_data = {};
+    }
+
+    const rfs = extra_data?.sb_wiz?.rfs;
+
+    if (Array.isArray(rfs)) {
+      rfs.forEach((el) => {
+        if (typeof el !== 'string') return;
+        // Entries may contain <b> highlighting tags; strip them.
+        people_also_ask.push(el.replace(/<b>|<\/b>/g, ''));
+      });
+    }
+
+    return people_also_ask;
+  }
+
+  /**
+   * Lists the entries matched by the PASF selector as title/thumbnail pairs.
+   * Elements without a `data-src` attribute are left out.
+   *
+   * @returns {Array.<{title: string, thumbnail: string}>}
+   */
+  getPas() {
+    const entries = this.$(Constants.SELECTORS.PASF).map((i, el) => {
+      const image = this.$(el);
+
+      // No lazy-load source means there is nothing usable for this entry.
+      if (!image.attr('data-src')) return;
+
+      const title = image.attr('alt');
+      const thumbnail = `https:${image.attr('data-src')}`;
+      return { title, thumbnail };
+    });
+
+    return entries.get();
+  }
+
   #correctFuzzyData(titles, descriptions, urls) {
     titles.length < urls.length && titles.length < descriptions.length && urls.shift();
     urls.length > titles.length && urls.shift();
 
-    const innacurate_data = descriptions.length > urls.slice(1).length ? false : true;
-
+    const is_innacurate_data = descriptions.length < urls.slice(1).length;
+    
     urls.forEach((item, index) => {
       // Why YouTube? Because video results usually don't have a description.
-      if (item.includes('m.youtube.com') && innacurate_data && Constants.URLS.length > 1) {
+      if (item.includes('m.youtube.com') && is_innacurate_data) {
         urls.splice(index, 1);
         titles.splice(index, 1);
         index--;

diff --git a/lib/utils.js b/lib/utils.js
@@ -2,6 +2,14 @@
 
 const UserAgent = require('user-agents');
 
+/**
+ * Error thrown when a search request could not be completed.
+ *
+ * @param {string} message - Human-readable description of the failure.
+ * @param {object} [info] - Extra details about the failure; callers in this
+ * library pass `status_code` and `message`.
+ */
+class SearchError extends Error {
+  constructor(message, info) {
+    // Passing the message to Error populates both `message` and `stack`.
+    super(message);
+    this.name = 'SearchError';
+    this.info = info;
+  }
+}
+
 /**
  * Returns headers with a random user agent.
  *
@@ -22,7 +30,7 @@ function getHeaders (is_mobile) {
 /**
  * Refines the html.
  *
- * @param {string} data Raw html data.
+ * @param {string} data - Raw html data.
  * @returns {string} Refined data.
  */
 function refineData (data) {
@@ -54,9 +62,9 @@ function refineData (data) {
 /**
  * Gets a string between two delimiters.
  *
- * @param {string} data The data.
- * @param {string} start_string Start string.
- * @param {string} end_string End string.
+ * @param {string} data - The data.
+ * @param {string} start_string - Start string.
+ * @param {string} end_string - End string.
  */
 function getStringBetweenStrings (data, start_string, end_string) {
   const regex = new RegExp(`${escapeStringRegexp(start_string)}(.*?)${escapeStringRegexp(end_string)}`, "s");
@@ -68,4 +76,4 @@ function escapeStringRegexp (string) {
   return string.replace(/[|\\{}()[\]^$+*?.]/g, '\\$&').replace(/-/g, '\\x2d');
 }
 
-module.exports = { getHeaders, getStringBetweenStrings, refineData };
+module.exports = { SearchError, getHeaders, getStringBetweenStrings, refineData };