From 06c058e7def3604db607b378f81905cd66631945 Mon Sep 17 00:00:00 2001 From: Matthew Rideout Date: Wed, 5 Sep 2018 12:18:54 -0400 Subject: [PATCH] Successfully retrieving and reading compressed XML sitemap files. --- .eslintrc | 14 +++++ index.js | 152 ++++++++++++++++++++++++++++++++++++++++------ package-lock.json | 151 +++++++++++++++++++++++++++++++++++++++++++++ package.json | 8 ++- 4 files changed, 305 insertions(+), 20 deletions(-) create mode 100644 .eslintrc diff --git a/.eslintrc b/.eslintrc new file mode 100644 index 0000000..ec41034 --- /dev/null +++ b/.eslintrc @@ -0,0 +1,14 @@ +{ + "parserOptions": { + "ecmaVersion": 6 + }, + "env": { + "browser": false, + "node": true + }, + "rules": { + "valid-jsdoc": 2, + "no-console": 0, + "strict": 0 + } +} \ No newline at end of file diff --git a/index.js b/index.js index ec79361..8fe4e1c 100644 --- a/index.js +++ b/index.js @@ -1,6 +1,8 @@ const zlib = require('zlib'); +const fs = require('fs'); const parseString = require('xml2js').parseString; const axios = require('axios'); +const util = require('util') let sitemapArray = [ @@ -19,7 +21,7 @@ let allUrls = []; let parentSiteMaps = getSitemapsAsJson(sitemapArray); parentSiteMaps .then((result) => { - console.log("Parent Site Maps: ", result); + console.log("Completed parsing parent Site Maps: ", result); // Add any parsed urlset(s) to the allUrls array handleUrlsets(result.filter(item => { @@ -28,22 +30,44 @@ parentSiteMaps } })); - - // Handle sitemapindex objects (under which more sitemaps are nested, potentially compressed) - handleSitemapindex(result.filter(item => { + // Get all child sitemap urls + let childSitemaps = getSitemapUrlsFromIndexes(result.filter(item => { if (typeof item["sitemapindex"] !== "undefined") { return true; } })); + console.log("Child Sitemaps", childSitemaps); + + // Parse child sitemaps that are not compressed as JSON + return getSitemapsAsJson(childSitemaps.filter(currentSitemap => { + if (/\.gz$/i.test(currentSitemap)) { + return false; + } else { + return true; + } + })) +}) +.then((result) => { + // Add any parsed urlset(s) to the allUrls array + handleUrlsets(result.filter(item => { + if(typeof item["urlset"] !== "undefined") { + return true; + } + })); + console.log("Completed parsing non-compressed child site maps: ", util.inspect(allUrls, { maxArrayLength: null })) }) +.catch(err => { + console.log("Error: ", err); +}) /** * Handle urlsets * @param {array} urlsetArray is an array of urlset objects parsed from xml + * @returns {*} nothing, just pushes urls into the allUrls array */ function handleUrlsets(urlsetArray) { // Push urls to allUrls array @@ -58,13 +82,27 @@ function handleUrlsets(urlsetArray) { /** - * Handle sitemapindex Objects - * Gets the URL's of all sitemaps listed in the sitemapindex, and proceeds to get them as JSON using getSitemapsAsJson + * Get Sitemap Urls From Indexes + * Gets the URL's of all sitemaps listed in the all of the provided sitemapindex(s) * @param {array} sitemapindexArray is an array of sitemapindex objects parsed from xml + * @returns {array} of child sitemaps */ -function handleSitemapindex(sitemapindexArray) { - // console.log("Site Map Indexes: ", sitemapindexArray); - console.log("Site Map Index Example: ", sitemapindexArray[0]["sitemapindex"]["sitemap"]); +function getSitemapUrlsFromIndexes(sitemapindexArray) { + // Create an array of all sitemap urls + let allChildSitemaps = []; + + // For each sitemapindex + sitemapindexArray.forEach(sitemapindex => { + + // for each sitemap object + sitemapindex["sitemapindex"]["sitemap"].forEach(sitemapObject => { + + // Add each sitemap url to our allChildSitemaps object (trim any trailing "/" to make consistent) + allChildSitemaps.push(sitemapObject["loc"][0].replace(/\/$/, "")); + }) + }) + + return allChildSitemaps; } @@ -72,15 +110,16 @@ function handleSitemapindex(sitemapindexArray) { * Get Sitemaps As JSON * Inputs an array of XML sitemap URLS * Outputs an array of JSON objects, converted from the XML - * @param {*} sitemapArray is an array of URL's to xml sitemaps + * @param {array} sitemapArray is an array of URL's to xml sitemaps + * @returns {array} a promise resolving with array of parsed xml sitemaps as JSON */ function getSitemapsAsJson(sitemapArray) { return new Promise ((resolve, reject) => { // Create an array of promises for each sitemap request we send const promises = sitemapArray.reduce((accumulator, currentSitemap) => { accumulator.push(new Promise((resolve, reject) => { - - // If sitemap is a normal URL + console.log("Retrieving data from xml sitemap URL...", currentSitemap); + // Else - if sitemap is a real URL axios.get(currentSitemap) .then((response) => { // Parse XML into JSON @@ -97,12 +136,6 @@ function getSitemapsAsJson(sitemapArray) { .catch(err => { reject(err); }); - - - // If sitemap is a compressed file - - - })); return accumulator; }, []); @@ -120,3 +153,86 @@ function getSitemapsAsJson(sitemapArray) { }); } + +/** + * Handle Compressed XML Synchronously + * Synchronously unzips and parses as json compressed XML sitemaps. This is done synchronously to avoid memory or resource issues + * that may come if trying to unzip and read thousands of these at once. + * @param {array} compressedSitemapArray is an array of compressed XML sitemap URLS + * @returns {*} promise resolving with an array of sitemaps parsed as JSON + */ +function handleCompressedXmlSync(compressedSitemapArray) { + + // Array to store parsed XML JSON + let parsedXmlArray = []; + + // Use reduce to synchronously process each zipped XML file + let promises = compressedSitemapArray.reduce((promise, sitemapUrl) => { + return promise.then(() => { + return processCompressedXmlFile(sitemapUrl); + }); + }, Promise.resolve()); + + // Retrieve a stream of the zip file, pipe it to gunzip, then parse the XML file as json - push the JSON to the parsedXmlArray - then resolve the promise to move to the next item + let processCompressedXmlFile = (sitemapUrl) => { + return new Promise((resolve, reject) => { + // Configure axios to receive a response type of stream + axios({ + method:'get', + url: sitemapUrl, + responseType:'stream' + }) + .then((response) => { + // Buffer to hold file download stream chunks + let buffer = []; + + // Instantiate Gunzip + let gunzip = zlib.createGunzip(); + + // Pipe response stream data to gunzip instance + response.data.pipe(gunzip); + + // Handle Data / End / Error events + gunzip + .on('data', function(data) { + // decompression chunk ready, add it to the buffer + buffer.push(data.toString()) + }) + .on("end", function() { + // response and decompression complete, join the buffer + let fullResponse = buffer.join(""); + + // Parse the xml string into JSON + parseString(fullResponse, (err, result) => { + if(err) { + console.log("Compressed sitemap error: ", err); + reject(err); + } else { + // Push the JSON to our array + parsedXmlArray.push(result); + + // Resolve the promise to move onto the next item + resolve(); + } + }); + + }) + .on("error", function(e) { + console.log("Gunzip Error: ", e); + }) + }) + .catch(err => { + reject("Axios gzip stream get error. ", err); + }); + }); + } + + promises.then(result => { + console.log("All Done!: ", parsedXmlArray); + }) + .catch(err => { + console.log("Promise error: ", err); + }) + +} + diff --git a/package-lock.json b/package-lock.json index a01cf4e..3a21fb7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,6 +13,48 @@ "@babel/highlight": "^7.0.0" } }, + "@babel/generator": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.0.0.tgz", + "integrity": "sha512-/BM2vupkpbZXq22l1ALO7MqXJZH2k8bKVv8Y+pABFnzWdztDB/ZLveP5At21vLz5c2YtSE6p7j2FZEsqafMz5Q==", + "dev": true, + "requires": { + "@babel/types": "^7.0.0", + "jsesc": "^2.5.1", + "lodash": "^4.17.10", + "source-map": "^0.5.0", + "trim-right": "^1.0.1" + } + }, + "@babel/helper-function-name": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.0.0.tgz", + "integrity": "sha512-Zo+LGvfYp4rMtz84BLF3bavFTdf8y4rJtMPTe2J+rxYmnDOIeH8le++VFI/pRJU+rQhjqiXxE4LMaIau28Tv1Q==", + "dev": true, + "requires": { + "@babel/helper-get-function-arity": "^7.0.0", + "@babel/template": "^7.0.0", + "@babel/types": "^7.0.0" + } + }, + "@babel/helper-get-function-arity": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/helper-get-function-arity/-/helper-get-function-arity-7.0.0.tgz", + "integrity": "sha512-r2DbJeg4svYvt3HOS74U4eWKsUAMRH01Z1ds1zx8KNTPtpTL5JAsdFv8BNyOpVqdFhHkkRDIg5B4AsxmkjAlmQ==", + "dev": true, + "requires": { + "@babel/types": "^7.0.0" + } + }, + "@babel/helper-split-export-declaration": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.0.0.tgz", + "integrity": "sha512-MXkOJqva62dfC0w85mEf/LucPPS/1+04nmmRMPEBUB++hiiThQ2zPtX/mEWQ3mtzCEjIJvPY8nuwxXtQeQwUag==", + "dev": true, + "requires": { + "@babel/types": "^7.0.0" + } + }, "@babel/highlight": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.0.0.tgz", @@ -24,6 +66,51 @@ "js-tokens": "^4.0.0" } }, + "@babel/parser": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.0.0.tgz", + "integrity": "sha512-RgJhNdRinpO8zibnoHbzTTexNs4c8ROkXFBanNDZTLHjwbdLk8J5cJSKulx/bycWTLYmKVNCkxRtVCoJnqPk+g==", + "dev": true + }, + "@babel/template": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.0.0.tgz", + "integrity": "sha512-VLQZik/G5mjYJ6u19U3W2u7eM+rA/NGzH+GtHDFFkLTKLW66OasFrxZ/yK7hkyQcswrmvugFyZpDFRW0DjcjCw==", + "dev": true, + "requires": { + "@babel/code-frame": "^7.0.0", + "@babel/parser": "^7.0.0", + "@babel/types": "^7.0.0" + } + }, + "@babel/traverse": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.0.0.tgz", + "integrity": "sha512-ka/lwaonJZTlJyn97C4g5FYjPOx+Oxd3ab05hbDr1Mx9aP1FclJ+SUHyLx3Tx40sGmOVJApDxE6puJhd3ld2kw==", + "dev": true, + "requires": { + "@babel/code-frame": "^7.0.0", + "@babel/generator": "^7.0.0", + "@babel/helper-function-name": "^7.0.0", + "@babel/helper-split-export-declaration": "^7.0.0", + "@babel/parser": "^7.0.0", + "@babel/types": "^7.0.0", + "debug": "^3.1.0", + "globals": "^11.1.0", + "lodash": "^4.17.10" + } + }, + "@babel/types": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.0.0.tgz", + "integrity": "sha512-5tPDap4bGKTLPtci2SUl/B7Gv8RnuJFuQoWx26RJobS0fFrz4reUA3JnwIM+HVHEmWE0C1mzKhDtTp8NsWY02Q==", + "dev": true, + "requires": { + "esutils": "^2.0.2", + "lodash": "^4.17.10", + "to-fast-properties": "^2.0.0" + } + }, "acorn": { "version": "5.7.2", "resolved": "https://registry.npmjs.org/acorn/-/acorn-5.7.2.tgz", @@ -123,6 +210,32 @@ "is-buffer": "^1.1.5" } }, + "babel-eslint": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/babel-eslint/-/babel-eslint-9.0.0.tgz", + "integrity": "sha512-itv1MwE3TMbY0QtNfeL7wzak1mV47Uy+n6HtSOO4Xd7rvmO+tsGQSgyOEEgo6Y2vHZKZphaoelNeSVj4vkLA1g==", + "dev": true, + "requires": { + "@babel/code-frame": "^7.0.0", + "@babel/parser": "^7.0.0", + "@babel/traverse": "^7.0.0", + "@babel/types": "^7.0.0", + "eslint-scope": "3.7.1", + "eslint-visitor-keys": "^1.0.0" + }, + "dependencies": { + "eslint-scope": { + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-3.7.1.tgz", + "integrity": "sha1-PWPD7f2gLgbgGkUq2IyqzHzctug=", + "dev": true, + "requires": { + "esrecurse": "^4.1.0", + "estraverse": "^4.1.1" + } + } + } + }, "balanced-match": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", @@ -363,6 +476,15 @@ "text-table": "^0.2.0" } }, + "eslint-plugin-flowtype": { + "version": "2.50.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-flowtype/-/eslint-plugin-flowtype-2.50.0.tgz", + "integrity": "sha512-10FnBXCp8odYcpUFXGAh+Zko7py0hUWutTd3BN/R9riukH360qNPLYPR3/xV9eu9K7OJDjJrsflBnL6RwxFnlw==", + "dev": true, + "requires": { + "lodash": "^4.17.10" + } + }, "eslint-scope": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-4.0.0.tgz", @@ -702,6 +824,12 @@ "esprima": "^4.0.0" } }, + "jsesc": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.1.tgz", + "integrity": "sha1-5CGiqOINawgZ3yiQj3glJrlt0f4=", + "dev": true + }, "json-schema-traverse": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", @@ -1046,6 +1174,12 @@ "is-fullwidth-code-point": "^2.0.0" } }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + }, "sprintf-js": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", @@ -1121,6 +1255,18 @@ "os-tmpdir": "~1.0.2" } }, + "to-fast-properties": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz", + "integrity": "sha1-3F5pjL0HkmW8c+A3doGk5Og/YW4=", + "dev": true + }, + "trim-right": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/trim-right/-/trim-right-1.0.1.tgz", + "integrity": "sha1-yy4SAwZ+DI3h9hQJS5/kVwTqYAM=", + "dev": true + }, "tslib": { "version": "1.9.3", "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.9.3.tgz", @@ -1194,6 +1340,11 @@ "version": "9.0.7", "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz", "integrity": "sha1-Ey7mPS7FVlxVfiD0wi35rKaGsQ0=" + }, + "zlib": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/zlib/-/zlib-1.0.5.tgz", + "integrity": "sha1-bnyXL8NxxkWmr7A6sUdp3vEU/MA=" } } } diff --git a/package.json b/package.json index c5eb8e5..9318a5a 100644 --- a/package.json +++ b/package.json @@ -4,7 +4,8 @@ "description": "Scrapes all urls from an xml sitemap, or an array of xml sitemaps, including nested xml sitemaps.", "main": "index.js", "scripts": { - "test": "mocha" + "test": "mocha", + "start": "node index.js" }, "repository": { "type": "git", @@ -22,12 +23,15 @@ }, "homepage": "https://github.com/boon4376/xml-sitemap-url-scraper#readme", "devDependencies": { + "babel-eslint": "^9.0.0", "chai": "^4.1.2", "eslint": "^5.5.0", + "eslint-plugin-flowtype": "^2.50.0", "mocha": "^5.2.0" }, "dependencies": { "axios": "^0.18.0", - "xml2js": "^0.4.19" + "xml2js": "^0.4.19", + "zlib": "^1.0.5" } }