Added concurrency option for processing compressed sitemaps. Updated documentation and added a demo.
mdrideout committed Sep 10, 2018
1 parent e6b389a commit 5201c8d
Showing 6 changed files with 131 additions and 85 deletions.
2 changes: 1 addition & 1 deletion .eslintrc
@@ -1,6 +1,6 @@
{
"parserOptions": {
"ecmaVersion": 6
"ecmaVersion": 8
},
"env": {
"browser": false,
33 changes: 25 additions & 8 deletions README.md
@@ -1,9 +1,9 @@
# xml-sitemap-url-scraper
Call the function with an array of one or more XML sitemap URLs. Sitemap URLs provided in this array must not be compressed; compressed sitemaps are only supported when nested under `<sitemapindex>` tags.

-_GOOD:_ https://www.example.com/sitemap.xml
+_Normal:_ https://www.example.com/sitemap.xml

-_BAD:_ https://www.example.com/sitemap.xml.gz
+_Compressed:_ https://www.example.com/sitemap.xml.gz

**Returns** a promise that resolves with an array of all URLs from those sitemaps.

@@ -13,6 +13,19 @@ _BAD:_ https://www.example.com/sitemap.xml.gz
npm install --save xml-sitemap-url-scraper
```

**Requirements**

Needed for the `async`/`await` syntax the library uses internally:

- [ECMAScript 2017 (version 8)](https://www.w3schools.com/js/js_versions.asp)
- [Node version > 8.2.1](https://node.green/)
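
For instance, on a compatible Node version the returned promise can also be consumed with `await` (a minimal sketch; the `main` wrapper is illustrative and not part of the library):

```
const { sitemapUrlScraper } = require("xml-sitemap-url-scraper");

async function main() {
    // The second argument caps how many compressed sitemaps are processed at once
    const urls = await sitemapUrlScraper(["https://www.example.com/sitemap.xml"], 5);
    console.log("Returned URLs: ", urls);
}

main().catch(err => console.log(err));
```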

**Demo**

```
npm start
```

**Example**

```
@@ -22,7 +35,11 @@ let sitemapUrls = [
"https://www.example.com/sitemap.xml"
]
-let urls = sitemapUrlScraper(sitemapUrls);
+// Define how many compressed sitemaps we want to decompress and process at once (if any are found)
+let concurrency = 5;
+// Function's concurrency defaults to 1 if no param is provided
+let urls = sitemapUrlScraper(sitemapUrls, concurrency);
urls.then(result => {
console.log("Returned URLs: ", result);
@@ -47,26 +64,26 @@ Nested sitemaps will automatically be traversed, and their urls will be included
```
<sitemapindex xmlns="https://www.example.com/schemas/sitemap/0.84">
<sitemap>
<loc>https://www.example.com/edu/sitemap.xml</loc>
</sitemap>
<sitemap>
<loc>https://www.example.com/gmail/sitemap.xml</loc>
</sitemap>
</sitemapindex>
```
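
As a rough sketch of what the library sees internally, such an index parses with `xml2js` (one of this package's dependencies) into objects whose `<loc>` values can be mapped out; the `childSitemaps` variable here is illustrative:

```
const parseString = require('xml2js').parseString;

const xml = `<sitemapindex>
  <sitemap><loc>https://www.example.com/edu/sitemap.xml</loc></sitemap>
  <sitemap><loc>https://www.example.com/gmail/sitemap.xml</loc></sitemap>
</sitemapindex>`;

parseString(xml, (err, result) => {
    if (err) throw err;
    // Each <sitemap> entry parses to an object with a "loc" array
    let childSitemaps = result.sitemapindex.sitemap.map(entry => entry.loc[0]);
    console.log(childSitemaps);
});
```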

## Compressed Sitemaps
-Child sitemaps that are nested inside `<sitemapindex>` tags will be decompressed, and their urls will be included in the final output array. Compressed sitemaps are processed sequentially to avoid memory and CPU load issues. This function may take a long time to execute if there is a significant number of compressed sitemaps being scraped.
+Child sitemaps that are nested inside `<sitemapindex>` tags will be decompressed, and their URLs will be included in the final output array. Compressed sitemaps are processed with limited concurrency, set by the function's second parameter; keeping that number low helps avoid memory and CPU load issues when processing a large number of compressed sitemaps. This function may take a long time to execute if a significant number of compressed sitemaps is being scraped.

**Example of compressed sitemaps**

```
<sitemapindex xmlns="https://www.example.com/schemas/sitemap/0.84">
<sitemap>
<loc>https://www.example.com/edu/sitemap.xml.gz</loc>
</sitemap>
<sitemap>
<loc>https://www.example.com/gmail/sitemap.xml.gz</loc>
</sitemap>
</sitemapindex>
```
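
The concurrency cap is implemented with `p-limit` (see the index.js changes below). A standalone sketch of the same pattern, with a stub standing in for the library's download-and-decompress step:

```
const pLimit = require('p-limit');

// Stand-in for the library's processCompressedXmlFile (download + gunzip + parse)
const processCompressedXmlFile = url => Promise.resolve({ url });

const compressedSitemapUrls = [
    "https://www.example.com/a/sitemap.xml.gz",
    "https://www.example.com/b/sitemap.xml.gz",
    "https://www.example.com/c/sitemap.xml.gz"
];

// At most 2 sitemaps are downloaded and decompressed at any one time
const limit = pLimit(2);
const tasks = compressedSitemapUrls.map(url => limit(() => processCompressedXmlFile(url)));

Promise.all(tasks).then(results => {
    console.log("Parsed " + results.length + " compressed sitemaps");
});
```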
16 changes: 16 additions & 0 deletions demo.js
@@ -0,0 +1,16 @@
const { sitemapUrlScraper } = require('./index.js');

let sitemapArray = [
"https://www.theguardian.com/sitemaps/news.xml", // XML sitemap with <urlset> parent
"https://www.whitehouse.gov/sitemap_index.xml", // XML sitemap with <sitemapindex> parent, and no compressed sitemaps
"https://www.delish.com/sitemap_index.xml", // XML sitemap with <sitemapindex> parent, compressed child sitemaps
];

let urls = sitemapUrlScraper(sitemapArray, 5);

urls.then(result => {
console.log("Returned URLs: ", result);
})
.catch(err => {
console.log(err);
})
149 changes: 74 additions & 75 deletions index.js
@@ -2,13 +2,15 @@ const zlib = require('zlib');
const parseString = require('xml2js').parseString;
const axios = require('axios');
const util = require('util')
const pLimit = require('p-limit');

/**
* XML Sitemap URL Scraper
* @param {array} sitemapArray is an array of xml sitemap urls, ex: "https://www.example.com/sitemap.xml"
* @param {number} compressedConcurrent is the number of compressed XML sitemaps to process at once (lower numbers save on CPU and memory resources)
* @returns {array} of urls from all sitemaps provided
*/
-const sitemapUrlScraper = (sitemapArray) => {
+const sitemapUrlScraper = (sitemapArray, compressedConcurrent = 1) => {
return new Promise((resolve, reject) => {
// Array to hold all URLs parsed out of all sitemaps
let allUrls = [];
@@ -54,7 +56,7 @@ const sitemapUrlScraper = (sitemapArray) => {
// console.log("Completed parsing non-compressed child site maps: ", util.inspect(allUrls, { maxArrayLength: null }))

// Parse child sitemaps that ARE compressed, as JSON
-            return handleCompressedXmlSync(childSitemaps.filter(currentSitemap => {
+            return handleCompressedXml(childSitemaps.filter(currentSitemap => {
if (/\.gz$/i.test(currentSitemap)) {
return true;
}
@@ -174,96 +176,93 @@ const sitemapUrlScraper = (sitemapArray) => {


/**
- * Handle Compressed XML Synchronously
- * Synchronously unzips and parses as json compressed XML sitemaps. This is done synchronously to avoid memory or resource issues
- * that may come if trying to unzip and read thousands of these at once.
+ * Handle Compressed XML
+ * Unzips compressed XML sitemaps and parses them as JSON. This is done with limited concurrency to avoid memory and CPU resource issues.
 * @param {array} compressedSitemapArray is an array of compressed XML sitemap URLS
 * @returns {*} promise resolving with an array of sitemaps parsed as JSON
 */
-function handleCompressedXmlSync(compressedSitemapArray) {
+function handleCompressedXml(compressedSitemapArray) {
    return new Promise((resolve, reject) => {
-        // Array to store parsed XML JSON
-        let parsedXmlArray = [];
-
-        // Use reduce to synchronously process each zipped XML file
-        let promises = compressedSitemapArray.reduce((promise, sitemapUrl) => {
-            return promise.then(() => {
-                return processCompressedXmlFile(sitemapUrl);
-            });
-        }, Promise.resolve());
-
-        // Retrieve a stream of the zip file, pipe it to gunzip, then parse the XML file as json - push the JSON to the parsedXmlArray - then resolve the promise to move to the next item
-        let processCompressedXmlFile = (sitemapUrl) => {
-            return new Promise((resolve, reject) => {
-                console.log("Processing Compressed XML Sitemap: ", sitemapUrl);
-
-                // Configure axios to receive a response type of stream
-                axios({
-                    method:'get',
-                    url: sitemapUrl,
-                    responseType:'stream'
-                })
-                .then((response) => {
-                    // Buffer to hold file download stream chunks
-                    let buffer = [];
-
-                    // Instantiate Gunzip
-                    let gunzip = zlib.createGunzip();
-
-                    // Pipe response stream data to gunzip instance
-                    response.data.pipe(gunzip);
-
-                    // Handle Data / End / Error events
-                    gunzip
-                    .on('data', function(data) {
-                        // decompression chunk ready, add it to the buffer
-                        buffer.push(data.toString())
-                    })
-                    .on("end", function() {
-                        // response and decompression complete, join the buffer
-                        let fullResponse = buffer.join("");
-
-                        // Parse the xml string into JSON
-                        parseString(fullResponse, (err, result) => {
-                            if(err) {
-                                console.log("Compressed sitemap error: ", err);
-                                reject(err);
-                            } else {
-                                // Push the JSON to our array
-                                parsedXmlArray.push(result);
-
-                                // Resolve the promise to move onto the next item
-                                setTimeout(() => {
-                                    resolve();
-                                }, 1);
-                            }
-                        });
-                    })
-                    .on("error", function(e) {
-                        reject(console.log("Gunzip Error: ", e));
-                    })
-                })
-                .catch(err => {
-                    reject("Axios gzip stream get error. ", err);
-                });
-            });
-        }
-
-        promises.then(result => {
-            // console.log("Done getting batch of " + parsedXmlArray.length + " compressed XML sitemaps.");
-            resolve(parsedXmlArray);
-        })
-        .catch(err => {
-            console.log("Promises error: ", err);
-            reject(err);
-        })
+        // Define our promise limiter with desired concurrency (taken from main function params)
+        const promiseLimit = pLimit(compressedConcurrent);
+
+        // Create an array of our promise returning functions
+        let promises = compressedSitemapArray.map(sitemapUrl => {
+            return promiseLimit(() => processCompressedXmlFile(sitemapUrl));
+        });
+
+        (async () => {
+            // p-limit caps how many of these promises run at once
+            const result = await Promise.all(promises);
+            return resolve(result);
+        })().catch(reject);
    });
}

+// Retrieve a stream of the zip file, pipe it to gunzip, then parse the XML file as json - then resolve the promise with the result
+function processCompressedXmlFile(sitemapUrl) {
+    return new Promise((resolve, reject) => {
+        console.log("Processing Compressed XML Sitemap: ", sitemapUrl);
+
+        // Configure axios to receive a response type of stream
+        axios({
+            method:'get',
+            url: sitemapUrl,
+            responseType:'stream'
+        })
+        .then((response) => {
+            // Buffer to hold file download stream chunks
+            let buffer = [];
+
+            // Instantiate Gunzip
+            let gunzip = zlib.createGunzip();
+
+            // Pipe response stream data to gunzip instance
+            response.data.pipe(gunzip);
+
+            // Handle Data / End / Error events
+            gunzip
+            .on('data', function(data) {
+                // decompression chunk ready, add it to the buffer
+                buffer.push(data.toString())
+            })
+            .on("end", function() {
+                // response and decompression complete, join the buffer
+                let fullResponse = buffer.join("");
+
+                // Parse the xml string into JSON
+                parseString(fullResponse, (err, result) => {
+                    if(err) {
+                        console.log("Compressed sitemap error: ", err);
+                        reject(err);
+                    } else {
+                        // Resolve with the parsed JSON
+                        resolve(result);
+                    }
+                });
+            })
+            .on("error", function(e) {
+                reject(new Error("Gunzip Error: " + e));
+            })
+        })
+        .catch(err => {
+            reject(new Error("Axios gzip stream get error: " + err));
+        });
+    });
+}



module.exports = {
sitemapUrlScraper
}
13 changes: 13 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion package.json
@@ -5,7 +5,7 @@
"main": "index.js",
"scripts": {
"test": "mocha",
"start": "node index.js"
"start": "node demo.js"
},
"repository": {
"type": "git",
@@ -31,6 +31,7 @@
},
"dependencies": {
"axios": "^0.18.0",
"p-limit": "^2.0.0",
"xml2js": "^0.4.19",
"zlib": "^1.0.5"
}
