Skip to content

Commit

Permalink
Upgrade async library
Browse files — browse the repository at this point in the history
  • Loading branch information
muodov committed Sep 4, 2024
1 parent 8cf124f commit 4bd1822
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 73 deletions.
71 changes: 33 additions & 38 deletions cli/crawl-cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,44 +46,39 @@ function createOutputPath(outputPath, url, fileType = 'json') {
* @param {string} outputPath
*/
/**
 * Filters the list of crawl targets, removing entries that cannot or should
 * not be crawled. An entry is dropped when its URL fails to parse, or — if
 * `outputPath` is given — when a result file for it already exists on disk.
 *
 * NOTE(review): relies on async v3 behavior where `asyncLib.filter` returns a
 * Promise when no final callback is supplied, even though the iteratee itself
 * is callback-style — confirm the installed `async` is >= 3.x (see package.json).
 *
 * @param {Array<string|{url:string}>} inputUrls - URL strings, or objects carrying a `url` field
 * @param {function(...any): void} logFunction - sink for warning/progress messages
 * @param {string} outputPath - directory holding previous results; falsy skips the existence check
 * @returns {Promise<Array<string|{url:string}>>} the entries that should be crawled
 */
function filterUrls(inputUrls, logFunction, outputPath) {
    return asyncLib.filter(inputUrls, (item, filterCallback) => {
        const urlString = (typeof item === 'string') ? item : item.url;

        /**
         * @type {URL}
         */
        let url;

        try {
            url = new URL(urlString);
        } catch {
            // Unparseable URL — warn and drop the entry.
            logFunction(chalk.yellow('Invalid URL:'), urlString);
            filterCallback(null, false);
            return;
        }

        if (outputPath) {
            // filter out entries for which result file already exists
            const outputFile = createOutputPath(outputPath, url);
            fs.access(outputFile, err => {
                // fs.access errors when the file does NOT exist — keep the entry then.
                if (err) {
                    filterCallback(null, true);
                } else {
                    logFunction(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
                    filterCallback(null, false);
                }
            });
            return;
        }
        filterCallback(null, true);
    }).catch(err => {
        // Log, then rethrow so callers still observe the rejection.
        logFunction(chalk.red(`Could not filter URL list: ${err}`));
        throw err;
    });
}

Expand Down
16 changes: 3 additions & 13 deletions crawlerConductor.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
const os = require('os');
const cores = os.cpus().length;
const chalk = require('chalk');
const async = require('async');
const asyncLib = require('async');
const crawl = require('./crawler');
const {createTimer} = require('./helpers/timer');
const createDeferred = require('./helpers/deferred');
const {downloadCustomChromium} = require('./helpers/chromiumDownload');
const notABot = require('./helpers/notABot');

Expand Down Expand Up @@ -52,7 +51,6 @@ async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstPa
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>}} options
*/
module.exports = async options => {
const deferred = createDeferred();
const log = options.logFunction || (() => {});
const failureCallback = options.failureCallback || (() => {});

Expand All @@ -73,7 +71,7 @@ module.exports = async options => {
executablePath = await downloadCustomChromium(log, options.chromiumVersion);
}

async.eachOfLimit(options.urls, numberOfCrawlers, (urlItem, idx, callback) => {
await asyncLib.eachOfLimit(options.urls, numberOfCrawlers, (urlItem, idx, callback) => {
const urlString = (typeof urlItem === 'string') ? urlItem : urlItem.url;
let dataCollectors = options.dataCollectors;

Expand All @@ -87,7 +85,7 @@ module.exports = async options => {

const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags);

async.retry(MAX_NUMBER_OF_RETRIES, task, err => {
asyncLib.retry(MAX_NUMBER_OF_RETRIES, task, err => {
if (err) {
console.log(err);
log(chalk.red(`Max number of retries (${MAX_NUMBER_OF_RETRIES}) exceeded for "${urlString}".`));
Expand All @@ -98,15 +96,7 @@ module.exports = async options => {

callback();
});
}, err => {
if (err) {
deferred.reject(err);
} else {
deferred.resolve();
}
});

await deferred.promise;
};

/**
Expand Down
34 changes: 14 additions & 20 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"node": ">=16.0.0"
},
"devDependencies": {
"@types/async": "^2.4.1",
"@types/async": "^3.2.24",
"@types/mocha": "^9.1.1",
"@types/mockery": "^1.4.30",
"@types/node": "^20.12.1",
Expand All @@ -37,7 +37,7 @@
},
"dependencies": {
"@duckduckgo/autoconsent": "^10.15.0",
"async": "^2.6.1",
"async": "^3.2.6",
"chalk": "^4.1.2",
"clickhouse": "^2.6.0",
"commander": "^12.1.0",
Expand Down

0 comments on commit 4bd1822

Please sign in to comment.