Skip to content

Commit

Permalink
fix: parsing problems and other major bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
LuanRT committed Apr 8, 2022
1 parent 809d49e commit bc96493
Show file tree
Hide file tree
Showing 9 changed files with 210 additions and 162 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ async function start() {
}
}

const response = await google.search("TWDG", options);
const response = await google.search('TWDG', options);
console.log(response);
}

Expand Down Expand Up @@ -215,7 +215,7 @@ start();

## What else can it do?

As you may have noticed, the library returns a lot of data. Currently it can parse everything from the knowledge graph, featured snippets and much more such as Google Dictionary, Google Translator and song lyrics.
As you can see, the library returns a lot of data. Currently it can parse everything from the knowledge graph, featured snippets and much more such as Google Dictionary, Google Translate and song lyrics.
All you have to do is search for something along the lines of ```“define xyz”```, ```“translate x to y”``` or ```“xyz song lyrics”```, and the appropriate fields will appear in the response.

#### Examples:
Expand Down
3 changes: 1 addition & 2 deletions examples/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ async function start() {
// Reverse Image Search
const reverse = await google.search("https://i.pinimg.com/236x/92/16/d9/9216d9a222ef65eb6eabfff1970180d1.jpg", { ris: true });
console.info('Reverse Image Search:', reverse.results);

// Top news

const news = await google.getTopNews();
console.info('Google Top News:', news);
}
Expand Down
6 changes: 3 additions & 3 deletions lib/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ module.exports = {
KNO_PANEL_TYPE: 'div.BkwXh > div',
KNO_PANEL_SONG_LYRICS: 'div.ujudUb',
KNO_PANEL_AVAILABLE_ON: 'div[class="ellip bclEt"]',
KNO_PANEL_IMAGES: 'div > g-scrolling-carousel > div > div > div > g-inner-card > g-img > img',
KNO_PANEL_IMAGES: 'div > g-inner-card.xIfh4d > div > img',
KNO_PANEL_BOOKS: 'div[data-attrid="kc:/book/author:books only"] > a > div > div > div.Bo9xMe > div',
KNO_PANEL_TV_SHOWS_AND_MOVIES: 'div[data-attrid="kc:/people/person:tv-shows-and-movies"] > a > div > div > div.Bo9xMe > div',
KNO_PANEL_FILM_GOOGLEUSERS_RATING: 'div[data-attrid="kc:/ugc:thumbs_up"] > div > div > div',
Expand Down Expand Up @@ -68,7 +68,7 @@ module.exports = {

// Google Dictionary
GD_WORD: 'span[data-dobid="hdw"]',
GD_PHONETIC: 'div[class="S23sjd"]',
GD_PHONETIC: 'div.qexShd',
GD_AUDIO: 'audio > source',
GD_DEFINITIONS: 'div[data-dobid="dfn"]',
GD_EXAMPLES: 'div[class="ubHt5c"]',
Expand All @@ -87,7 +87,7 @@ module.exports = {
TOP_STORIES_WEBSITE: 'div[class="g5wfEd"] > div > g-img > img',

// “People also ask”
PAA: [ 'div.s75CSd.u60jwe.gduDCb > span', 'div.wWOJcd > div > span', 'div.SC9Vz > div.zd9Fwc' ],
PAA: [ 'div.s75CSd.u60jwe.gduDCb > span', 'div.gbCQS.u60jwe.gduDCb > div > span', 'div.JlqpRe > span' ],

// “People also search for”
PASF: 'div[class="IHdOHf"] > img',
Expand Down
34 changes: 20 additions & 14 deletions lib/googlethis.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ const Cheerio = require('cheerio');
const Constants = require('./constants');

/**
* Searches a given query on Google.
* Search a given query on Google.
*
* @param {string} query Query.
* @param {object} options Search options.
* @param {string} query - Search query
* @param {object} [options] Search options
* @param {boolean} [options.ris] - Use reverse image search
* @param {boolean} [options.safe] - Safe search
* @param {number} [options.page] - Pagination
* @param {object} [options.additional_params] - Parameters that will be passed to Google
*/
async function search(query, options = {}) {
query = query.trim().split(/ +/).join('+');
Expand All @@ -27,10 +31,7 @@ async function search(query, options = {}) {
'&start=' + options.page);

const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(true) }).catch((err) => err);
if (response instanceof Error) throw new Error('Could not search on Google: ' + response.message);

const $ = Cheerio.load(Utils.refineData(response.data));
const parser = new Parser($, response.data);
if (response instanceof Error) throw new Utils.SearchError('Could not execute search', { status_code: response?.status || 0, message: response?.message });

const results = {
results: [],
Expand All @@ -41,12 +42,14 @@ async function search(query, options = {}) {
people_also_ask: [],
people_also_search_for: []
};

const parser = new Parser(response.data);

results.results = parser.getOrganicResults();
results.knowledge_panel = parser.getKnowledgeGraph();
results.featured_snippet = parser.getFeaturedSnippet();

const did_you_mean = $(Constants.SELECTORS.DID_YOU_MEAN).text();
const did_you_mean = parser.getDidYouMean();
did_you_mean && (results.did_you_mean = did_you_mean) || (delete results.did_you_mean);

const unit_converter = parser.getConverters();
Expand Down Expand Up @@ -80,10 +83,13 @@ async function search(query, options = {}) {
}

/**
* Searches images on Google.
* Google image search.
*
* @param {string} query Search query.
* @param {object} options Search options.
* @param {string} query - Search query
* @param {object} [options] - Search options
* @param {boolean} [options.safe] - Safe search
* @param {object} [options.additional_params] - Parameters that will be passed to Google
* @param {Array.<string>} [options.exclude_domains] - Domains that should be blocked
*/
async function image(query, options = {}) {
query = query.trim().split(/ +/).join('+');
Expand All @@ -97,7 +103,7 @@ async function image(query, options = {}) {
' ' + options.exclude_domains.map((site) => '-site:' + site);

const response = await Axios.get(url, { params: options.additional_params, headers: Utils.getHeaders(false) }).catch((err) => err);
if (response instanceof Error) throw new Error('Could not search on Google: ' + response.message);
if (response instanceof Error) throw new Utils.SearchError('Could not execute search', { status_code: response?.status || 0, message: response?.message });

const results = [];
const origin = parseImageOriginData(response.data);
Expand All @@ -123,7 +129,7 @@ async function image(query, options = {}) {
/**
* Gets image origin data
*
* @param {string} data Raw html.
* @param {string} data - Raw html.
*/
function parseImageOriginData(data) {
let results = [];
Expand All @@ -132,7 +138,7 @@ function parseImageOriginData(data) {
while (parsed_results != null) {
results.push({
title: parsed_results[4],
website: parsed_results[3],
source: parsed_results[3],
});
parsed_results = Constants.REGEX.IMAGE_ORIGIN.exec(data);
}
Expand Down
100 changes: 59 additions & 41 deletions lib/parser.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
'use strict';

const Utils = require('./utils');
const Unraw = require('unraw').default;
const Cheerio = require('cheerio');
const Constants = require('./constants');
const NormalizeText = require('replace-special-characters');

class Parser {
constructor($, raw_data) {
this.$ = $;
this.raw_data = raw_data;
constructor(data) {
this.data = data;
this.$ = Cheerio.load(Utils.refineData(data));
}

getOrganicResults() {
Expand All @@ -23,7 +25,8 @@ class Parser {
return this.$(el).text().trim();
}).get();

const urls = this.$(Constants.SELECTORS.URL).map((i, el) => this.$(el).attr('href')).get();
const urls = this.$(Constants.SELECTORS.URL)
.map((i, el) => this.$(el).attr('href')).get();

this.#correctFuzzyData(titles, descriptions, urls);

Expand All @@ -49,7 +52,7 @@ class Parser {
this.$(Constants.SELECTORS.KNO_PANEL_METADATA).each((i, el) => {
const key = this.$(el).first().text().trim().slice(0, -1);
const value = this.$(el).next().text().trim();
value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_'))] = value.trim());
value.length && (knowledge_panel[NormalizeText(key.toLowerCase().replace(/ /g, '_').replace(/\(|\)/g, ''))] = value.trim());
});

const knowledge_panel_type = this.$(Constants.SELECTORS.KNO_PANEL_TYPE).last().text();
Expand Down Expand Up @@ -79,7 +82,7 @@ class Parser {
.replace(/<\/span><\/div><div jsname="u8s5sf" class="ujudub"><span jsname="ys01ge">/g, '\n\n')
.replace(/<br>/g, '\n')).text()).get();

song_lyrics.length > 0 && (knowledge_panel.lyrics = song_lyrics.join('\n\n'));
song_lyrics.length && (knowledge_panel.lyrics = song_lyrics.join('\n\n'));

const google_users_rating = this.$(Constants.SELECTORS.KNO_PANEL_FILM_GOOGLEUSERS_RATING)[0];
if (google_users_rating) {
Expand All @@ -106,20 +109,20 @@ class Parser {
knowledge_panel.images = this.$(Constants.SELECTORS.KNO_PANEL_IMAGES).map((i, elem) => {
return {
url: this.$(elem).attr('data-src'),
source: this.$(elem).parent().parent().parent().attr('data-lpage'),
source: this.$(elem).parent().parent().parent().parent().attr('data-lpage'),
};
}).get().filter((img) => img.url !== undefined);
}).get().filter((img) => img.url);

const demo = Utils.getStringBetweenStrings(this.raw_data, 'source src\\x3d\\x22', '.mp4');
const demo = Utils.getStringBetweenStrings(this.data, 'source src\\x3d\\x22', '.mp4');
demo && (knowledge_panel.demonstration = demo + '.mp4');

knowledge_panel.books.length == 0 &&
!knowledge_panel.books.length &&
delete knowledge_panel.books;
knowledge_panel.tv_shows_and_movies.length == 0 &&
!knowledge_panel.tv_shows_and_movies.length &&
delete knowledge_panel.tv_shows_and_movies;
knowledge_panel.available_on.length == 0 &&
!knowledge_panel.available_on.length &&
delete knowledge_panel.available_on;
knowledge_panel.images.length == 0 &&
!knowledge_panel.images.length &&
delete knowledge_panel.images;

return knowledge_panel;
Expand Down Expand Up @@ -149,7 +152,7 @@ class Parser {
} else {
return undefined;
}
}).filter(text => text != undefined && text.length != 0)[0];
}).filter(text => text && text.length)[0];

return {
title: featured_snippet_title || 'N/A',
Expand All @@ -158,42 +161,28 @@ class Parser {
};
}

getDidYouMean() {
return this.$(Constants.SELECTORS.DID_YOU_MEAN).text();
}

getTopStories() {
// Removes unnecessary text from the description
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((i, el) => this.$(el).remove());
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((i, el) => this.$(el).remove());
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div.CEMjEf`).each((el) => this.$(el).remove());
this.$(`${Constants.SELECTORS.TOP_STORIES_DESCRIPTION[0]} > div > p`).each((el) => this.$(el).remove());

const top_stories_descriptions = Constants.SELECTORS.TOP_STORIES_DESCRIPTION.map((selector) =>
this.$(selector).map((i, el) => this.$(el).text().slice(1)).get()).filter((descs) => descs.length > 0)[0];
const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((i, el) => this.$(el).attr('href')).get();
this.$(selector).map((el) => this.$(el).text()).get()).filter((descs) => descs.length > 0)[0];
const top_stories_urls = this.$(Constants.SELECTORS.TOP_STORIES_URL).map((el) => this.$(el).attr('href')).get();

return top_stories_urls.map((item, i) => {
if (!top_stories_descriptions) return;
return {
description: top_stories_descriptions[i],
url: item,
url: item
};
}).filter((story) => story);
}

getPaa() {
let people_also_ask = [];
Constants.SELECTORS.PAA.forEach((item) =>
this.$(item).each((i, el) => people_also_ask.push(this.$(el).text())));
people_also_ask.shift();
return people_also_ask;
}

getPas() {
return this.$(Constants.SELECTORS.PASF).map((i, el) => {
if (!this.$(el).attr('data-src')) return;
return {
title: this.$(el).attr('alt'),
thumbnail: `https:${this.$(el).attr('data-src')}`
};
}).get();
}

getTime() {
const hours = this.$(Constants.SELECTORS.CURRENT_TIME_HOUR).text();
const date = this.$(Constants.SELECTORS.CURRENT_TIME_DATE).map((i, el) => this.$(el).text()).get()[1];
Expand Down Expand Up @@ -303,15 +292,44 @@ class Parser {
}
}

getPaa() {
const people_also_ask = [];

Constants.SELECTORS.PAA.forEach((item) =>
this.$(item).each((i, el) => people_also_ask.push(this.$(el).text())));

people_also_ask.shift();

const extra_data = JSON.parse(Unraw(Utils.getStringBetweenStrings(this.data, 'var c=\'', '\';google') || '{}'));
const rfs = extra_data?.sb_wiz?.rfs;

rfs && rfs.forEach((el) => {
const item = el.replace(/<b>|<\/b>/g, '');
people_also_ask.push(item);
});

return people_also_ask;
}

getPas() {
return this.$(Constants.SELECTORS.PASF).map((i, el) => {
if (!this.$(el).attr('data-src')) return;
return {
title: this.$(el).attr('alt'),
thumbnail: `https:${this.$(el).attr('data-src')}`
};
}).get();
}

#correctFuzzyData(titles, descriptions, urls) {
titles.length < urls.length && titles.length < descriptions.length && urls.shift();
urls.length > titles.length && urls.shift();

const innacurate_data = descriptions.length > urls.slice(1).length ? false : true;

const is_innacurate_data = descriptions.length < urls.slice(1).length;
urls.forEach((item, index) => {
// Why YouTube? Because video results usually don't have a description.
if (item.includes('m.youtube.com') && innacurate_data && Constants.URLS.length > 1) {
if (item.includes('m.youtube.com') && is_innacurate_data) {
urls.splice(index, 1);
titles.splice(index, 1);
index--;
Expand Down
18 changes: 13 additions & 5 deletions lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

const UserAgent = require('user-agents');

/**
 * Error type carrying extra details about a failed operation.
 *
 * Unlike a hand-wired prototype chain, extending Error guarantees that
 * `message`, `name` and `stack` are all populated on the instance.
 *
 * @param {string} message - Human readable description of the failure.
 * @param {object} [info] - Arbitrary additional details, exposed as `error.info`.
 */
class SearchError extends Error {
  constructor(message, info) {
    super(message);
    this.name = 'SearchError';
    this.info = info;
  }
}

/**
* Returns headers with a random user agent.
*
Expand All @@ -22,7 +30,7 @@ function getHeaders (is_mobile) {
/**
* Refines the html.
*
* @param {string} data Raw html data.
* @param {string} data - Raw html data.
* @returns {string} Refined data.
*/
function refineData (data) {
Expand Down Expand Up @@ -54,9 +62,9 @@ function refineData (data) {
/**
* Gets a string between two delimiters.
*
* @param {string} data The data.
* @param {string} start_string Start string.
* @param {string} end_string End string.
* @param {string} data - The data.
* @param {string} start_string - Start string.
* @param {string} end_string - End string.
*/
function getStringBetweenStrings (data, start_string, end_string) {
const regex = new RegExp(`${escapeStringRegexp(start_string)}(.*?)${escapeStringRegexp(end_string)}`, "s");
Expand All @@ -68,4 +76,4 @@ function escapeStringRegexp (string) {
return string.replace(/[|\\{}()[\]^$+*?.]/g, '\\$&').replace(/-/g, '\\x2d');
}

module.exports = { getHeaders, getStringBetweenStrings, refineData };
module.exports = { SearchError, getHeaders, getStringBetweenStrings, refineData };
Loading

0 comments on commit bc96493

Please sign in to comment.