From 4e98aa11598ff4506874e54703fc89bef1d68312 Mon Sep 17 00:00:00 2001 From: segment-srl Date: Wed, 14 Nov 2018 21:24:43 +0100 Subject: [PATCH] moved to headless chrome --- .gitignore | 5 +- README.md | 12 +- core/crawl/crawler.py | 43 +- core/crawl/crawler_thread.py | 13 +- core/crawl/lib/utils.py | 4 +- core/crawl/probe/chrome-probe/analyze.js | 167 +++ core/crawl/probe/chrome-probe/ckdeps.js | 5 + core/crawl/probe/chrome-probe/htcap/htcap.js | 217 ++++ core/crawl/probe/chrome-probe/htcap/main.js | 328 +++++ .../crawl/probe/chrome-probe/htcap/options.js | 78 ++ .../probe/chrome-probe/htcap/package.json | 11 + core/crawl/probe/chrome-probe/htcap/probe.js | 1077 +++++++++++++++++ .../probe/chrome-probe/htcap/shingleprint.js | 473 ++++++++ core/crawl/probe/chrome-probe/htcap/utils.js | 337 ++++++ core/crawl/probe/chrome-probe/package.json | 9 + core/crawl/probe/chrome-probe/utils.js | 346 ++++++ core/crawl/probe/options.js | 3 +- core/crawl/probe/probe.js | 1 + core/lib/texthash.py | 3 +- core/lib/utils.py | 55 +- requirements.txt | 1 - 21 files changed, 3154 insertions(+), 34 deletions(-) create mode 100644 core/crawl/probe/chrome-probe/analyze.js create mode 100644 core/crawl/probe/chrome-probe/ckdeps.js create mode 100644 core/crawl/probe/chrome-probe/htcap/htcap.js create mode 100644 core/crawl/probe/chrome-probe/htcap/main.js create mode 100644 core/crawl/probe/chrome-probe/htcap/options.js create mode 100644 core/crawl/probe/chrome-probe/htcap/package.json create mode 100644 core/crawl/probe/chrome-probe/htcap/probe.js create mode 100644 core/crawl/probe/chrome-probe/htcap/shingleprint.js create mode 100644 core/crawl/probe/chrome-probe/htcap/utils.js create mode 100644 core/crawl/probe/chrome-probe/package.json create mode 100644 core/crawl/probe/chrome-probe/utils.js delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 6974d27..8c8cc7f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ *.py[cod] tmp.py -.idea *.db tmp 
+node_modules/ +.* +!/.gitignore +package-lock.json \ No newline at end of file diff --git a/README.md b/README.md index feefa33..2b571c0 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,9 @@ Htcap is a web application scanner able to crawl single page application (SPA) in a recursive manner by intercepting ajax calls and DOM changes. Htcap is not just another vulnerability scanner since it's focused mainly on the crawling process and uses external tools to discover vulnerabilities. It's designed to be a tool for both manual and automated penetration test of modern web applications. +This is the very first release that uses headless Chrome instead of PhantomJS. +Htcap’s JavaScript crawling engine has been rewritten to take advantage of the new async/await features of ECMAScript and has been converted to a Node.js module built on top of [Puppeteer](https://github.com/GoogleChrome/puppeteer). + More infos at [htcap.org](http://htcap.org). ## SETUP @@ -10,23 +13,20 @@ More infos at [htcap.org](http://htcap.org). ### Requirements 1. Python 2.7 - 2. PhantomJS v2 + 2. Node.js and npm 3. Sqlmap (for sqlmap scanner module) 4. Arachni (for arachni scanner module) ### Download and Run ```console -$ git clone https://github.com/segment-srl/htcap.git htcap +$ git clone https://github.com/fcavallarin/htcap.git htcap $ htcap/htcap.py ``` -PhantomJs can be downloaded [here](http://phantomjs.org//download.html). It comes as a self-contained executable with all libraries linked statically, so there is no need to install or compile anything else. - - ## DOCUMENTATION -Documentation, examples and demos can be found at the official website [http://htcap.org](http://htcap.org). +Documentation, examples and demos can be found at the official website [https://htcap.org](https://htcap.org). 
## LICENSE diff --git a/core/crawl/crawler.py b/core/crawl/crawler.py index 6b37612..df6aae1 100644 --- a/core/crawl/crawler.py +++ b/core/crawl/crawler.py @@ -59,7 +59,7 @@ def __init__(self, argv): self.request_patterns = [] self.defaults = { - "useragent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36', + "useragent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3582.0 Safari/537.36', "num_threads": 10, "max_redirects": 10, "out_file_overwrite": False, @@ -75,7 +75,9 @@ def __init__(self, argv): "max_post_depth": 10, "override_timeout_functions": True, 'crawl_forms': True, # only if mode == CRAWLMODE_AGGRESSIVE - 'deduplicate_pages': True + 'deduplicate_pages': True, + 'use_legacy_browser': False, + 'headless_chrome': True } @@ -123,6 +125,8 @@ def usage(self): " -O dont't override timeout functions (setTimeout, setInterval)\n" " -K keep elements in the DOM (prevent removal)\n" " -e disable hEuristic page deduplication\n" + " -L use Legacy browser (phantomjs) instead of chrome\n" + " -l do not run chrome in headless mode\n" ) @@ -373,10 +377,11 @@ def main(self, argv): Shared.th_condition = threading.Condition() Shared.main_condition = threading.Condition() - - probe_cmd = get_phantomjs_cmd() - if not probe_cmd: - print "Error: unable to find phantomjs executable" + deps_errors = check_dependences(self.base_dir) + if len(deps_errors) > 0: + print "Dependences errors: " + for err in deps_errors: + print " %s" % err sys.exit(1) start_cookies = [] @@ -398,7 +403,7 @@ def main(self, argv): user_script = None try: - opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:e') + opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:eLl') except getopt.GetoptError as err: print str(err) sys.exit(1) @@ -491,6 +496,15 @@ def main(self, argv): sys.exit(1) elif o == "-e": Shared.options['deduplicate_pages'] = False + elif 
o == "-L": + Shared.options['use_legacy_browser'] = True + elif o == "-l": + Shared.options['headless_chrome'] = False + + probe_cmd = get_phantomjs_cmd() if Shared.options['use_legacy_browser'] else get_node_cmd() + if not probe_cmd: # maybe useless + print "Error: unable to find node (or phantomjs) executable" + sys.exit(1) if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0: @@ -508,11 +522,18 @@ def main(self, argv): if Shared.options['mode'] == CRAWLMODE_PASSIVE: probe_options.append("-t") # dont trigger events - if Shared.options['proxy']: - probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto']) - probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port'])) - probe_cmd.append(self.base_dir + 'probe/analyze.js') + if Shared.options['use_legacy_browser']: + if Shared.options['proxy']: + probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto']) + probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port'])) + probe_cmd.append(self.base_dir + 'probe/analyze.js') + else: + if Shared.options['proxy']: + probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'], Shared.options['proxy']['host'], Shared.options['proxy']['port'])]) + if not Shared.options['headless_chrome']: + probe_options.append("-l") + probe_cmd.append(self.base_dir + 'probe/chrome-probe/analyze.js') if len(Shared.excluded_urls) > 0: diff --git a/core/crawl/crawler_thread.py b/core/crawl/crawler_thread.py index a30898a..a3b1ff2 100644 --- a/core/crawl/crawler_thread.py +++ b/core/crawl/crawler_thread.py @@ -98,7 +98,7 @@ def load_probe_json(self, jsn): try: return json.loads(jsn) except Exception: - #print "-- JSON DECODE ERROR %s" % jsn + print "-- JSON DECODE ERROR %s" % jsn raise @@ -146,15 +146,16 @@ def send_probe(self, request, errors): # print cmd_to_str(Shared.probe_cmd + params) # print "" + + cmd = CommandExecutor(Shared.probe_cmd 
+ params, True) + jsn, err = cmd.execute(Shared.options['process_timeout'] + 2) - cmd = CommandExecutor(Shared.probe_cmd + params) - jsn = cmd.execute(Shared.options['process_timeout'] + 2) + if err: + print err + jsn = None if jsn == None: errors.append(ERROR_PROBEKILLED) - # time.sleep(self.process_retries_interval) # ... ??? - # retries -= 1 - # continue break diff --git a/core/crawl/lib/utils.py b/core/crawl/lib/utils.py index c197ae9..6158f4e 100644 --- a/core/crawl/lib/utils.py +++ b/core/crawl/lib/utils.py @@ -97,8 +97,8 @@ def request_is_crawlable(request): return False types = [REQTYPE_LINK, REQTYPE_REDIRECT] - if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']: - types.append(REQTYPE_FORM) + # if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']: + # types.append(REQTYPE_FORM) return request.type in types and re.match("^https?://", request.url, re.I) diff --git a/core/crawl/probe/chrome-probe/analyze.js b/core/crawl/probe/chrome-probe/analyze.js new file mode 100644 index 0000000..ff6cb98 --- /dev/null +++ b/core/crawl/probe/chrome-probe/analyze.js @@ -0,0 +1,167 @@ +/* +HTCAP - 1.2 +http://htcap.org +Author: filippo.cavallarin@wearesegment.com + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. 
+*/ + + +"use strict"; + +const htcap = require("./htcap"); +const utils = require('./utils'); +const process = require('process'); + + +var sleep = function(n){ + return new Promise(resolve => { + setTimeout(resolve, n); + }); +}; + + + +var argv = utils.parseArgs(process.argv, "hVaftUJdICc:MSEp:Tsx:A:r:mHX:PD:R:Oi:u:vy:l", {}); +var options = argv.opts + +var targetUrl = argv.args[0]; + + + +if(!targetUrl){ + utils.usage(); + process.exit(-1); +} + +targetUrl = targetUrl.trim(); +if(targetUrl.length < 4 || targetUrl.substring(0,4).toLowerCase() != "http"){ + targetUrl = "http://" + targetUrl; +} + + + +htcap.launch(targetUrl, options).then( crawler => { + const page = crawler.page(); + var execTO = null; + + console.log("["); + + function exit(){ + clearTimeout(execTO); + crawler.browser().close(); + } + + crawler.on("redirect", async function(e, crawler){ + // console.log(crawler.redirect()); + // console.log(e.params.url); + // utils.printCookies(crawler); + // utils.printRequest({type:'link',method:"GET",url:e.params.url}); + // exit(); + }); + + + crawler.on("domcontentloaded", async function(e, crawler){ + //utils.printCookies(crawler); + await utils.printLinks("html", crawler.page()) + await utils.printForms("html", crawler.page()) + + //await sleep(4000) + }); + + crawler.on("start", function(e, crawler){ + //console.log("--->Start"); + }) + + + crawler.on("newdom", async function(e, crawler){ + //console.log(e.params) + }) + + crawler.on("xhr", async function(e, crawler){ + utils.printRequest(e.params.request) + + //return false + }); + + crawler.on("xhrCompleted", function(e, crawler){ + //console.log("XHR completed") + }); + + + crawler.on("jsonp", function(e, crawler){ + utils.printRequest(e.params.request) + }); + + crawler.on("jsonpCompleted", function(e, crawler){ + + }); + + crawler.on("websocket", function(e, crawler){ + utils.printRequest(e.params.request) + }); + + crawler.on("websocketMessage", function(e, crawler){ + + }); + + 
crawler.on("websocketSend", function(e, crawler){ + + }); + + crawler.on("formSubmit", function(e, crawler){ + utils.printRequest(e.params.request) + }); + + crawler.on("navigation", function(e, crawler){ + e.params.request.type="link"; + utils.printRequest(e.params.request) + }); + + crawler.on("eventtriggered", function(e, crawler){ + //console.log(e.params) + }); + + crawler.on("triggerevent", function(e, crawler){ + //console.log(e.params) + }); + + crawler.on("earlydetach", function(e, crawler){ + //console.log('["warning","earlydetach of element ' + e.params.node + '],') + //crawler.browser().close(); + }); + + + async function end(){ + if(!crawler.redirect()){ + const el = await crawler.page().$("html"); + const v = await el.getProperty('innerText'); + const hash = await v.jsonValue(); + var json = '["page_hash",' + JSON.stringify(hash) + '],'; + console.log(json); + + if(options.returnHtml){ + json = '["html",' + JSON.stringify(hash) + '],'; + console.log(json); + } + } + + utils.printStatus(crawler); + exit(); + } + + crawler.on("end", end); + + execTO = setTimeout(function(){ // (very dirty solution) + crawler.on("end", function(){}); + crawler.errors().push(["probe_timeout", "maximum execution time reached"]); + end(); + }, options.maxExecTime); + + + crawler.start() + +}) \ No newline at end of file diff --git a/core/crawl/probe/chrome-probe/ckdeps.js b/core/crawl/probe/chrome-probe/ckdeps.js new file mode 100644 index 0000000..6438522 --- /dev/null +++ b/core/crawl/probe/chrome-probe/ckdeps.js @@ -0,0 +1,5 @@ +try{ + require("puppeteer") +} catch(e) { + console.log("puppeteer") +} \ No newline at end of file diff --git a/core/crawl/probe/chrome-probe/htcap/htcap.js b/core/crawl/probe/chrome-probe/htcap/htcap.js new file mode 100644 index 0000000..362c55b --- /dev/null +++ b/core/crawl/probe/chrome-probe/htcap/htcap.js @@ -0,0 +1,217 @@ +/* +HTCAP - 1.2 +http://htcap.org +Author: filippo.cavallarin@wearesegment.com + +This program is free software; you 
can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. +*/ + +"use strict"; +const probe = require("./probe"); + +module.exports = { + loadProbe: loadProbe +}; + + + +function loadProbe(page, options) { + // generate a static map of random values using a "static" seed for input fields + // the same seed generates the same values + // generated values MUST be the same for all analyze.js call othewise the same form will look different + // for example if a page sends a form to itself with input=random1, + // the same form on the same page (after first post) will became input=random2 + // => form.data1 != form.data2 => form.data2 is considered a different request and it'll be crawled. + // this process will lead to and infinite loop! + var inputValues = generateRandomValues(options.randomSeed); + + page.evaluateOnNewDocument(probe.initProbe, options, inputValues); + page.evaluateOnNewDocument(function(options) { + //alert(window.__PROBE__) + + if(options.mapEvents){ + + Node.prototype.originaladdEventListener = Node.prototype.addEventListener; + Node.prototype.addEventListener = function(event, func, useCapture){ + if(event != "DOMContentLoaded"){ // is this ok??? + window.__PROBE__.addEventToMap(this, event); + } + this.originaladdEventListener(event, func, useCapture); + }; + + window.addEventListener = (function(originalAddEventListener){ + return function(event, func, useCapture){ + if(event != "load"){ // is this ok??? 
+ window.__PROBE__.addEventToMap(this, event); + } + originalAddEventListener.apply(this,[event, func, useCapture]); + } + })(window.addEventListener); + } + + if(options.checkAjax){ + XMLHttpRequest.prototype.originalOpen = XMLHttpRequest.prototype.open; + XMLHttpRequest.prototype.open = function(method, url, async, user, password){ + + var _url = window.__PROBE__.removeUrlParameter(url, "_"); + this.__request = new window.__PROBE__.Request("xhr", method, _url, null, null); + + return this.originalOpen(method, url, async, user, password); + } + + + + XMLHttpRequest.prototype.originalSend = XMLHttpRequest.prototype.send; + + XMLHttpRequest.prototype.send = async function(data){ + var _this = this; + this.__request.data = data; + + var absurl = window.__PROBE__.getAbsoluteUrl(this.__request.url); + for(var a = 0; a < options.excludedUrls.length; a++){ + if(absurl.match(options.excludedUrls[a])){ + this.__skipped = true; + } + } + + + this.__request.trigger = window.__PROBE__.getTrigger(); + + + // check if request has already been sent + var rk = this.__request.key(); + if(window.__PROBE__.sentAjax.indexOf(rk) != -1){ + return; + } + + + //var ueRet = window.__PROBE__.triggerUserEvent("onXhr",[this.__request]); + var uRet = await window.__PROBE__.dispatchProbeEvent("xhr", { + request: window.__PROBE__.requestToObject(this.__request) + }); + // console.log("----------adsdsd_---------------") + // console.log(uRet) + if(uRet){ + this.addEventListener("readystatechange", function ev(e){ + if(_this.readyState != 4) return; + window.__PROBE__.dispatchProbeEvent("xhrCompleted", { + request: window.__PROBE__.requestToObject(this.__request), + response: _this.responseText + }); + _this.removeEventListener("readystatechange", ev); + //_this.originalRemoveEventListener("readystatechange", ev); + }); + + // pending ajax + window.__PROBE__.pendingAjax.push(this); + window.__PROBE__.sentAjax.push(rk); + window.__PROBE__.addRequestToPrintQueue(this.__request); + + + 
if(!this.__skipped) + return this.originalSend(data); + } + + return; + } + + } + + + if(options.checkScriptInsertion){ + + Node.prototype.originalappendChild = Node.prototype.appendChild; + Node.prototype.appendChild = function(node){ + window.__PROBE__.printJSONP(node); + + window.__PROBE__.triggerJsonpEvent(node); + return this.originalappendChild(node); + } + + Node.prototype.originalinsertBefore = Node.prototype.insertBefore; + Node.prototype.insertBefore = function(node, element){ + window.__PROBE__.printJSONP(node); + window.__PROBE__.triggerJsonpEvent(node); + return this.originalinsertBefore(node, element); + } + + Node.prototype.originalreplaceChild = Node.prototype.replaceChild; + Node.prototype.replaceChild = function(node, oldNode){ + window.__PROBE__.printJSONP(node); + window.__PROBE__.triggerJsonpEvent(node); + return this.originalreplaceChild(node, oldNode); + } + } + + if(options.checkWebsockets){ + window.WebSocket = (function(WebSocket){ + return function(url, protocols){ + //window.__PROBE__.printWebsocket(url); //websockets.push(url); + window.__PROBE__.triggerWebsocketEvent(url); + //return WebSocket.prototype; + var ws = new WebSocket(url); + ws.__originalSend = ws.send; + ws.send = function(message){ + window.__PROBE__.triggerWebsocketSendEvent(url, message); + return ws.__originalSend(message); + } + ws.addEventListener("message", function(message){ + window.__PROBE__.triggerWebsocketMessageEvent(url, message.data); + }); + return ws; + } + })(window.WebSocket); + } + + + if(options.overrideTimeoutFunctions){ + window.setTimeout = (function(setTimeout){ + return function(func, time, setTime){ + var t = setTime ? time : 0; + return setTimeout(func, t); + } + })(window.setTimeout); + + window.setInterval = (function(setInterval){ + return function(func, time, setTime){ + var t = setTime ? 
time : 0; + return setInterval(func, t); + } + })(window.setInterval); + + } + + if(options.preventElementRemoval){ + //Node.prototype.originalremoveChild = Node.prototype.removeChild; + Node.prototype.removeChild = function(node){ + //console.log(node); + return node; + } + } + + HTMLFormElement.prototype.originalSubmit = HTMLFormElement.prototype.submit; + HTMLFormElement.prototype.submit = function(){ + //console.log("=-->"+this.action) + var req = window.__PROBE__.getFormAsRequest(this); + window.__PROBE__.printRequest(req); + window.__PROBE__.triggerFormSubmitEvent(this); + return this.originalSubmit(); + } + + // prevent window.close + window.close = function(){ return } + window.print = function(){ return } + + window.open = function(url, name, specs, replace){ + window.__PROBE__.printLink(url); + } + + window.__PROBE__.triggerUserEvent("onInit"); + }, options); +}; + + + diff --git a/core/crawl/probe/chrome-probe/htcap/main.js b/core/crawl/probe/chrome-probe/htcap/main.js new file mode 100644 index 0000000..a1e5d8b --- /dev/null +++ b/core/crawl/probe/chrome-probe/htcap/main.js @@ -0,0 +1,328 @@ +/* +HTCAP - 1.2 +http://htcap.org +Author: filippo.cavallarin@wearesegment.com + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. 
+*/ + +"use strict"; + +const puppeteer = require('puppeteer'); +const defaults = require('./options').options; +const probe = require("./probe"); +const probeTextComparator = require("./shingleprint"); + +const utils = require('./utils'); +const process = require('process'); + +exports.launch = async function(url, options){ + const chromeArgs = [ + //'--proxy-server=127.0.0.1:8080', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-gpu', + '--hide-scrollbars', + '--mute-audio', + '--ignore-certificate-errors', + '--ignore-certificate-errors-spki-list', + '--ssl-version-max=tls1.3', + '--ssl-version-min=tls1', + '--disable-web-security', + '--allow-running-insecure-content', + + //'--auto-open-devtools-for-tabs', + //`--load-extension=${__dirname}/chrome_extension/`, + //`--disable-extensions-except=${__dirname}/chrome_extension/` + ]; + for(let a in defaults){ + if(!(a in options)) options[a] = defaults[a]; + } + if(options.proxy){ + chromeArgs.push("--proxy-server=" + options.proxy); + } + var browser = await puppeteer.launch({headless: options.headlessChrome, ignoreHTTPSErrors: true, args:chromeArgs}); + var c = new Crawler(url, options, browser); + await c.loadPage(browser); + return c; +}; + + + + + + +function Crawler(targetUrl, options, browser){ + + targetUrl = targetUrl.trim(); + if(targetUrl.length < 4 || targetUrl.substring(0,4).toLowerCase() != "http"){ + targetUrl = "http://" + targetUrl; + } + this.targetUrl = targetUrl; + + this.publicProbeMethods = ['']; + this._cookies = []; + this._redirect = null; + this._errors = []; + + this.error_codes = ["contentType","navigation","response"]; + + this.probeEvents = { + start: function(){}, + xhr: function(){}, + xhrcompleted: function(){}, + jsonp: function(){}, + jsonpcompleted: function(){}, + websocket: function(){}, + websocketmessage: function(){}, + websocketsend: function(){}, + formsubmit: function(){}, + //requestscompleted: function(){}, + //dommodified: function(){}, + newdom: 
function(){}, + navigation: function(){}, + domcontentloaded: function(){}, + //blockedrequest: function(){}, + redirect: function(){}, + earlydetach: function(){}, + triggerevent: function(){}, + eventtriggered: function(){}, + end: function(){} + } + + + this.options = options; + + this._browser = browser; + this._page = null; +} + + + +Crawler.prototype.browser = function(){ + return this._browser; +} + +Crawler.prototype.page = function(){ + return this._page; +} + +Crawler.prototype.cookies = function(){ + return this._cookies; +} + +Crawler.prototype.redirect = function(){ + return this._redirect; +} + +Crawler.prototype.errors = function(){ + return this._errors; +} + +Crawler.prototype.start = async function(){ + var _this = this; + + if(this.options.verbose)console.log("LOAD") + + var assertContentType = function(hdrs){ + let ctype = 'content-type' in hdrs ? hdrs['content-type'] : ""; + + if(ctype.toLowerCase().split(";")[0] != "text/html"){ + _this._errors.push(["contentType", `content type is ${ctype}`]); + _this.dispatchProbeEvent("end", {}); + return false; + } + return true; + } + + if(this.options.httpAuth){ + await this._page.authenticate({username:this.options.httpAuth[0], password:this.options.httpAuth[1]}); + } + + if(this.options.userAgent){ + await this._page.setUserAgent(this.options.userAgent); + } + + this._page.goto(this.targetUrl, {waitUntil:'load'}).then(async resp => { + + if(!resp.ok()){ + _this._errors.push(["response", resp.request().url() + " status: " + resp.status()]); + _this.dispatchProbeEvent("end", {}); + //console.log(resp) + return; + } + var hdrs = resp.headers(); + _this._cookies = utils.parseCookiesFromHeaders(hdrs, resp.url()) + + + if(!assertContentType(hdrs)) + return; + + + this._page.evaluate(async function(){ + window.__PROBE__.takeDOMSnapshot(); + }); + + + await _this.dispatchProbeEvent("domcontentloaded", {}); + + _this._page.evaluate(async function(){ + await window.__PROBE__.waitAjax() + await 
window.__PROBE__.waitJsonp() + + window.__PROBE__.dispatchProbeEvent("start"); + console.log("startAnalysis") + window.__PROBE__.startAnalysis(); + }) + + }).catch(e => { + _this._errors.push(["navigation","navigation aborted"]); + _this.dispatchProbeEvent("end", {}); + }); + + + +} + +Crawler.prototype.on = function(eventName, handler){ + eventName = eventName.toLowerCase(); + if(!(eventName in this.probeEvents)){ + throw("unknown event name"); + } + this.probeEvents[eventName] = handler; +}; + + +Crawler.prototype.probe = function(method, args){ + var _this = this; + //if(this.publicProbeMethods.indexOf(method) == -1) + // throw "Probe method not found"; + + return new Promise( (resolve, reject) => { + _this._page.evaluate( async (method, args) => { + var r = await window.__PROBE__[method](...args); + return r; + }, [method, args]).then( ret => resolve(ret)); + }) +} + + +Crawler.prototype.dispatchProbeEvent = async function(name, params) { + name = name.toLowerCase(); + var ret, evt = { + name: name, + params: params || {} + }; + // console.log(name) + // console.log(evt) + + ret = await this.probeEvents[name](evt, this); + if(ret === false){ + return false; + } + + return true; +} + + +Crawler.prototype.loadPage = async function(browser){ + var options = this.options, + targetUrl = this.targetUrl, + pageCookies = this.pageCookies; + + var crawler = this; + + // generate a static map of random values using a "static" seed for input fields + // the same seed generates the same values + // generated values MUST be the same for all analyze.js call othewise the same form will look different + // for example if a page sends a form to itself with input=random1, + // the same form on the same page (after first post) will became input=random2 + // => form.data1 != form.data2 => form.data2 is considered a different request and it'll be crawled. + // this process will lead to and infinite loop! 
+ var inputValues = utils.generateRandomValues(this.options.randomSeed); + + var firstRun = true; + const page = await browser.newPage(); + crawler._page = page; + //if(options.verbose)console.log("new page") + await page.setRequestInterception(true); + + page.on('request', async req => { + const overrides = {}; + if(req.isNavigationRequest()){ + //console.log("NAV REQ " + (req.redirectChain().length ) + req.url()) + if(req.redirectChain().length > 0){ + crawler._redirect = req.url(); + await crawler.dispatchProbeEvent("redirect", {url: crawler._redirect}); + req.abort('failed'); + return; + } + + if(!firstRun){ + page.evaluate(function(r){ + window.__PROBE__.triggerNavigationEvent(r.url, r.method, r.data); + }, {method:req.method(), url:req.url(), data:req.postData()}); + + //console.log(req); + + req.abort('aborted'); + return; + } else { + if(options.loadWithPost){ + overrides.method = 'POST'; + if(options.postData){ + overrides.postData = options.postData; + } + } + } + + firstRun = false; + } + + req.continue(overrides); + }); + + + page.on("dialog", function(dialog){ + dialog.accept(); + }); + + + page.exposeFunction("__htcap_probe_event__", (name, params) => this.dispatchProbeEvent(name, params)); // <- automatically awaited.."If the puppeteerFunction returns a Promise, it will be awaited." 
+ + await page.setViewport({ + width: 1366, + height: 768, + }); + + page.evaluateOnNewDocument(probe.initProbe, this.options, inputValues); + page.evaluateOnNewDocument(probeTextComparator.initTextComparator); + page.evaluateOnNewDocument(utils.hookNativeFunctions, this.options); + + try{ + if(options.referer){ + await page.setExtraHTTPHeaders({ + 'Referer': options.referer + }); + } + for(let i=0; i < options.setCookies.length; i++){ + if(!options.setCookies[i].expires) + options.setCookies[i].expires = parseInt((new Date()).getTime() / 1000) + (60*60*24*365); + //console.log(options.setCookies[i]); + await page.setCookie(options.setCookies[i]); + } + + //if(options.verbose)console.log("goto returned") + + }catch(e) { + // do something . . . + console.log(e) + } + + //}); +}; + + diff --git a/core/crawl/probe/chrome-probe/htcap/options.js b/core/crawl/probe/chrome-probe/htcap/options.js new file mode 100644 index 0000000..957827b --- /dev/null +++ b/core/crawl/probe/chrome-probe/htcap/options.js @@ -0,0 +1,78 @@ +/* +HTCAP - 1.2 +http://htcap.org +Author: filippo.cavallarin@wearesegment.com + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. 
+*/ + +exports.options = { + id: 0, + verbose: false, + checkAjax: true, + fillValues: true, + triggerEvents: true, + checkWebsockets: true, + searchUrls: true, + jsonOutput:true, + maxExecTime: 300000, // 300 seconds + //maxExecTime: 100000, // 100 seconds + ajaxTimeout:5000, + printAjaxPostData: true, + loadImages: false, + getCookies:true, + mapEvents: true, + checkScriptInsertion: true, + httpAuth: false, + triggerAllMappedEvents: true, + outputMappedEvents: false, + overrideTimeoutFunctions: false, + referer: false, + userAgent: null, //'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36', + allEvents: ['abort', 'autocomplete', 'autocompleteerror', 'beforecopy', 'beforecut', 'beforepaste', 'blur', 'cancel', 'canplay', 'canplaythrough', 'change', /*'click',*/ 'close', 'contextmenu', 'copy', 'cuechange', 'cut', 'dblclick', 'drag', 'dragend', 'dragenter', 'dragleave', 'dragover', 'dragstart', 'drop', 'durationchange', 'emptied', 'ended', 'error', 'focus', 'input', 'invalid', 'keydown', 'keypress', 'keyup', 'load', 'loadeddata', 'loadedmetadata', 'loadstart', 'mousedown', 'mouseenter', 'mouseleave', 'mousemove', 'mouseout', 'mouseover', 'mouseup', 'mousewheel', 'paste', 'pause', 'play', 'playing', 'progress', 'ratechange', 'reset', 'resize', 'scroll', 'search', 'seeked', 'seeking', 'select', 'selectstart', 'show', 'stalled', 'submit', 'suspend', 'timeupdate', 'toggle', 'volumechange', 'waiting', 'webkitfullscreenchange', 'webkitfullscreenerror', 'wheel'], + mouseEvents: [],//['click','dblclick','mouseup','mousedown','mousemove','mouseover', 'mouseout'], + keyboardEvents: [], //['keydown', 'keypress', 'keydown', 'keypress', 'keyup'], + returnHtml: false, + setCookies: [], + excludedUrls: [], + maximumRecursion: 50, + printUnknownRequests: false, // unknown requests are for example mailto: and javascript: urls + maximumAjaxChain: 30, + preventElementRemoval: 1, + randomSeed: "IsHOulDb34RaNd0MsTR1ngbUt1mN0t", 
+ // map input names to string generators. see generateRandomValues to see all available generators + inputNameMatchValue:[ // regexps NEED to be string to get passed to phantom page + {name: "mail", value: "email"}, + {name: "((number)|(phone))|(^tel)", value: "number"}, + {name: "(date)|(birth)", value: "humandate"}, + {name: "((month)|(day))|(^mon$)", value: "month"}, + {name: "year", value: "year"}, + {name: "url", value: "url"}, + {name: "firstname", value: "firstname"}, + {name: "(surname)|(lastname)", value: "surname"}, + ], + /* always trigger these events since event delegation mays "confuse" the triggering of mapped events */ + // NOTE: trigger mouseUP FIRST to prevent up and down to be considered a click + eventsMap: { + 'button':['click','dblclick','keydown','keyup','mouseup','mousedown'], + 'select':['change','click','dblclick','keydown','keyup','mouseup','mousedown'], + 'input':['change','click','dblclick','blur','focus','keydown','keyup','mouseup','mousedown'], + 'a':['click','dblclick','keydown','keyup','mouseup','mousedown'], + 'textarea':['change','click','dblclick','blur','focus','keydown','keyup','mouseup','mousedown'], + 'span':['click','dblclick','mouseup','mousedown'], + 'td':['click','dblclick','mouseup','mousedown'], + 'tr':['click','dblclick','mouseup','mousedown'], + 'div':['click','dblclick','mouseup','mousedown'] + }, + staticInputValues:[ + //['form[name="login"] input[name="mail"]', "bm1@rtom.fake"], + //['form[name="login"] input[name="password"]', "bm1"] + ], + proxy: null, + loadWithPost: false, + postData: null, + headlessChrome: true +}; diff --git a/core/crawl/probe/chrome-probe/htcap/package.json b/core/crawl/probe/chrome-probe/htcap/package.json new file mode 100644 index 0000000..4ea673e --- /dev/null +++ b/core/crawl/probe/chrome-probe/htcap/package.json @@ -0,0 +1,11 @@ +{ + "name": "htcap", + "version": "1.0.0", + "description": "crawler for single page applications", + "main": "main.js", + "scripts": { + "test": "echo 
\"Error: no test specified\" && exit 1" + }, + "author": "Filippo Cavallarin", + "license": "GPL-3.0" +} diff --git a/core/crawl/probe/chrome-probe/htcap/probe.js b/core/crawl/probe/chrome-probe/htcap/probe.js new file mode 100644 index 0000000..ab8d39f --- /dev/null +++ b/core/crawl/probe/chrome-probe/htcap/probe.js @@ -0,0 +1,1077 @@ +/* +HTCAP - 1.2 +http://htcap.org +Author: filippo.cavallarin@wearesegment.com + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. +*/ + +"use strict"; + +module.exports = { + initProbe: initProbe +}; + + + +/* + this function is passed to page.evaluate. doing so it is possible to avoid that the Probe object + is inserted into page window scope (only its instance is referred by window.__PROBE__) +*/ +function initProbe(options, inputValues){ + + function Probe(options, inputValues){ + var _this = this; + + this.options = options; + + this.requestsPrintQueue = []; + this.sentAjax = []; + + this.curElement = {}; + this.winOpen = []; + this.resources = []; + this.eventsMap = []; + + this.triggeredEvents = []; + this.websockets = []; + this.html = ""; + this.printedRequests = []; + this.DOMSnapshot = []; + this._pendingAjax = []; + this._pendingJsonp = []; + this.inputValues = inputValues; + this.currentUserScriptParameters = []; + this.domModifications = []; + + this._lastRequestId = 0; + this.started_at = null; + + this.textComparator = null; + + }; + + + Probe.prototype.objectInArray = function(arr, el, ignoreProperties){ + ignoreProperties = ignoreProperties || []; + if(arr.length == 0) return false; + if(typeof arr[0] != 'object') + return arr.indexOf(el) > -1; + for(var a = 0 ;a < arr.length; a++){ + var found = true; + for(var k in arr[a]){ + if(arr[a][k] != el[k] && ignoreProperties.indexOf(k) == -1){ + found = false; + } + } + if(found) 
return true; + } + return false; + }; + + + + Probe.prototype.arrayUnique = function(arr, ignoreProperties){ + var ret = []; + + for(var a = 0; a < arr.length ; a++){ + if(!this.objectInArray(ret, arr[a], ignoreProperties)) + ret.push(arr[a]); + } + return ret; + }; + + Probe.prototype.compareObjects = function(obj1, obj2){ + var p; + for(p in obj1) + if(obj1[p] != obj2[p]) return false; + + for(p in obj2) + if(obj2[p] != obj1[p]) return false; + + return true; + } + + + /* + anchor.protocol; // => "http:" + anchor.host; // => "example.com:3000" + anchor.hostname; // => "example.com" + anchor.port; // => "3000" + anchor.pathname; // => "/pathname/" + anchor.hash; // => "#hash" + anchor.search; // => "?search=test" + */ + + Probe.prototype.replaceUrlQuery = function(url, qs){ + var anchor = document.createElement("a"); + anchor.href = url; + return anchor.protocol + "//" + anchor.host + anchor.pathname + (qs ? "?" + qs : "") + anchor.hash; + }; + + Probe.prototype.removeUrlParameter = function(url , par){ + var anchor = document.createElement("a"); + anchor.href = url; + + var pars = anchor.search.substr(1).split(/(?:&|&)+/); + + for(var a = pars.length - 1; a >= 0; a--){ + if(pars[a].split("=")[0] == par) + pars.splice(a,1); + } + + + return anchor.protocol + "//" + anchor.host + anchor.pathname + (pars.length > 0 ? "?" + pars.join("&") : "") + anchor.hash; + }; + + + Probe.prototype.getAbsoluteUrl = function(url){ + var anchor = document.createElement('a'); + anchor.href = url; + return anchor.href; + }; + + + + + + // do NOT use MutationObserver to get added elements .. 
it is asynchronous and the callback is fired only when DOM is refreshed (graphically) + Probe.prototype.takeDOMSnapshot = function(){ + this.DOMSnapshot = Array.prototype.slice.call( document.getElementsByTagName("*"), 0 ); + return; + var els = document.getElementsByTagName("*"); + for(var a = 0; a < els.length; a++){ + if(!els[a].__snapshot) + els[a].__snapshot = true; + } + } + + + Probe.prototype.getAddedElements = function(){ + var elements = [] + var rootElements = [] + var ueRet = null; + var newDom = Array.prototype.slice.call( document.getElementsByTagName("*"), 0 ); + + //*/console.log('get added elements start dom len: ' + this.DOMSnapshot.length + ' new dom len: ' + newDom.length); + // get all added elements + for(var a = 0;a < newDom.length;a++){ + if(this.DOMSnapshot.indexOf(newDom[a]) == -1) { + //if(!newDom[a].__snapshot){ + // set __new flag on added elements to avoid checking for elments.indexOf + // that is very very slow + newDom[a].__new = true; + elements.push(newDom[a]); + } + } + + ///console.log("elements get... (tot "+elements.length+") searching for root nodes") + + for(var a = 0; a < elements.length; a++){ + var p = elements[a]; + var root = null; + // find the farest parent between added elements + while(p){ + //if(elements.indexOf(p) != -1){ + if(p.__new){ + root = p; + } + p = p.parentNode; + } + if(root && rootElements.indexOf(root) == -1){ + rootElements.push(root); + } + } + + for(var a = 0; a < elements.length; a++){ + delete elements[a].__new; + } + + // if(rootElements.length > 0){ + // this.triggerUserEvent("onDomModified", [rootElements, elements]); + // } + + //*/console.log("root elements found: " + rootElements.length); + return rootElements; + } + + + + + /* DO NOT include node as first element.. 
this is a requirement */ + Probe.prototype.getDOMTreeAsArray = function(node){ + var out = []; + var children = node.querySelectorAll(":scope > *"); + + if(children.length == 0){ + return out; + } + + for(var a = 0; a < children.length; a++){ + out.push(children[a]); + out = out.concat(this.getDOMTreeAsArray(children[a])); + } + + return out; + } + + + + + // class Request + + Probe.prototype.Request = function(type, method, url, data, trigger){ + this.type = type; + this.method = method; + this.url = url; + this.data = data || null; + this.trigger = trigger || null; + + //this.username = null; // todo + //this.password = null; + } + + // returns a unique string represntation of the request. used for comparision + Probe.prototype.Request.prototype.key = function(){ + var key = "" + this.type + this.method + this.url + (this.data ? this.data : "") + (this.trigger ? this.trigger : "") + return key; + }; + + + Probe.prototype.requestToJson = function(req){ + + return JSON.stringify(this.requestToObject(req)); + } + + Probe.prototype.requestToObject = function(req){ + var obj ={ + type: req.type, + method: req.method, + url: req.url, + data: req.data || null + }; + + if(req.trigger) obj.trigger = {element: this.describeElement(req.trigger.element), event:req.trigger.event}; + + return obj; + } + + // END OF class Request.. 
+ + + + + + // returns true if the value has been set + Probe.prototype.setVal = function(el, value){ + var options = this.options; + var _this = this; + + // var ueRet = this.triggerUserEvent("onFillInput", [el]); + // if(ueRet === false) return; + + var getv = function(type){ + if(typeof value != 'undefined' && value !== null && value !== false){ + return value; + } + if(!(type in _this.inputValues)) + type = "string"; + + return _this.inputValues[type]; + } + + var setv = function(name){ + var ret = getv('string'); + for(var a = 0; a < options.inputNameMatchValue.length; a++){ + var regexp = new RegExp(options.inputNameMatchValue[a].name, "gi"); + if(name.match(regexp)){ + ret = getv(options.inputNameMatchValue[a].value); + } + } + return ret; + } + + // needed for example by angularjs + var triggerChange = function(){ + // update angular model + _this.trigger(el, 'input'); + } + + if(el.nodeName.toLowerCase() == 'textarea'){ + el.value = setv(el.name); + triggerChange(); + return true; + } + + if(el.nodeName.toLowerCase() == 'select'){ + var opts = el.getElementsByTagName('option'); + if(opts.length > 1){ // avoid to set the first (already selected) options + // @TODO .. qui seleziono l'ultimo val.. ma devo controllare che non fosse "selected" + el.value = opts[opts.length-1].value; + } else { + el.value = setv(el.name); + } + triggerChange(); + return true; + } + + var type = el.type.toLowerCase(); + + switch(type){ + case 'button': + case 'hidden': + case 'submit': + case 'file': + return false; + case '': + case 'text': + case 'search': + el.value = setv(el.name); + break; + + case 'radio': + case 'checkbox': + el.setAttribute('checked',!(el.getAttribute('checked'))); + break; + case 'range': + case 'number': + + if('min' in el && el.min){ + + el.value = (parseInt(el.min) + parseInt(('step' in el) ? 
el.step : 1)); + } else{ + el.value = parseInt(getv('number')); + } + break; + case 'password': + case 'color': + case 'date': + case 'email': + case 'month': + case 'time': + case 'url': + case 'week': + case 'tel': + el.value = getv(type); + break; + case 'datetime-local': + el.value = getv('datetimeLocal'); + break; + + + default: + return false; + } + + triggerChange(); + return true; + }; + + + // Probe.prototype.getRandomValue = function(type){ + + // if(!(type in this.inputValues)) + // type = "string"; + + // return this.inputValues[type]; + + // }; + + + Probe.prototype.getStaticInputValue = function(input){ + if(!this.options.staticInputValues.length ) + return null; + + for(let val of this.options.staticInputValues){ + if(input.matches(val[0])){ + return val[1]; + } + } + + return null; + + }; + + Probe.prototype.fillInputValues = function(element){ + element = element || document; + var ret = false; + var els = element.querySelectorAll("input, select, textarea"); + + + for(var a = 0; a < els.length; a++){ + let sv = this.getStaticInputValue(els[a]); + + if(this.setVal(els[a], sv)) + ret = true; + } + return ret; + }; + + + Probe.prototype.trigger = function(el,evname){ + /* workaround for a phantomjs bug on linux (so maybe not a phantom bug but some linux libs??). + if you trigger click on input type=color evertything freezes... maybe due to some + color picker that pops up ... 
+ */ + if(el.tagName == "INPUT" && el.type.toLowerCase()=='color' && evname=='click'){ + return; + } + + // var ueRet = this.triggerUserEvent("onTriggerEvent", [el, evname]); + // if(ueRet === false) return; + + var pdh = function(e){ + var el = e.target; + var urlproto; + + if(el.matches("a")){ + urlproto = el.protocol; + if(el.target == "_blank") el.target = "_self"; // @workaround prevent new tabs + } else { // button or submit + let url = document.createElement("a"); + url.href = el.form.action; + urlproto = url.protocol; + if(el.form.target == "_blank") el.form.target = "_self" // @workaround prevent new tabs + } + + if(urlproto.match(/^https?\:/i) == null){ // @workaround prevent malfomed urls and about:blank to lead to about:blank + e.preventDefault(); + } + + e.stopPropagation(); + } + + if ('createEvent' in document) { + + if(this.options.mouseEvents.indexOf(evname) != -1){ + var evt = new MouseEvent(evname, {view: window, bubbles: true, cancelable: true}); + if(evname.toLowerCase() == "click" && el.matches('a, button, input[type="submit"]')){ + el.addEventListener(evname, pdh); + } + /*} else if(this.options.keyboardEvents.indexOf(evname) != -1){*/ + + } else { + var evt = document.createEvent('HTMLEvents'); + evt.initEvent(evname, true, false); + + } + el.dispatchEvent(evt); + } else { + evname = 'on' + evname; + if( evname in el && typeof el[evname] == "function"){ + el[evname](); + } + } + try{ + el.removeEventListener(evname, pdh); + } catch(e){} + //this.triggerUserEvent("onEventTriggered", [el, evname]) + }; + + + Probe.prototype.isEventTriggerable = function(event){ + + return ['load','unload','beforeunload'].indexOf(event) == -1; + + }; + + Probe.prototype.getEventsForElement = function(element){ + var events = []; + var map; + + if(this.options.triggerAllMappedEvents){ + map = this.eventsMap; + for(var a = 0; a < map.length; a++){ + if(map[a].element == element){ + events = map[a].events.slice(); + break; + } + } + } + + map = 
this.options.eventsMap; + for(var selector in map){ + if(element.webkitMatchesSelector(selector)){ + events = events.concat(map[selector]); + } + } + //if(events.length >0 ) return ['click'] + return events; + }; + + + + + + Probe.prototype.triggerElementEvent = function(element, event){ + var teObj = {el: element, ev: event}; + this.curElement = {}; + if(!event)return + if(!this.isEventTriggerable(event) || this.objectInArray(this.triggeredEvents, teObj)) + return + this.curElement.element = element; + this.curElement.event = event; + this.triggeredEvents.push(teObj); + this.trigger(element, event); + } + + Probe.prototype.getTrigger = function(){ + if(!this.curElement || !this.curElement.element) + return null; + + //return {element: this.curElement.element, event: this.curElement.event}; + return { + element: this.describeElement(this.curElement.element), + event: this.curElement.event + }; + }; + + + Probe.prototype.describeElements = function(els){ + var ret = []; + for(el of els){ + ret.push(this.describeElement(el)); + } + return ret; + } + + Probe.prototype.describeElement = function(el){ + //return this.stringifyElement(el); + return this.getElementSelector(el); + }; + + + Probe.prototype.stringifyElement = function(el){ + if(!el) + return "[]"; + var tagName = (el == document ? "DOCUMENT" : (el == window ? "WINDOW" :el.tagName)); + var text = null; + if(el.textContent){ + text = el.textContent.trim().replace(/\s/," ").substring(0,10) + if(text.indexOf(" ") > -1) text = "'" + text + "'"; + } + + + var className = (el.className && typeof el.className == 'string') ? (el.className.indexOf(" ") != -1 ? "'" + el.className + "'" : el.className) : ""; + var descr = "[" + + (tagName ? tagName + " " : "") + + (el.name && typeof el.name == 'string' ? el.name + " " : "") + + (className ? "." + className + " " : "")+ + (el.id ? "#" + el.id + " " : "") + + (el.src ? "src=" + el.src + " " : "") + + (el.action ? "action=" + el.action + " " : "") + + (el.method ? 
"method=" + el.method + " " : "") + + (el.value ? "v=" + el.value + " ": "") + + (text ? "txt=" + text : "") + + "]"; + + return descr; + + }; + + Probe.prototype.getElementSelector = function(element){ + if(!element || !(element instanceof HTMLElement)) + return ""; + var name = element.nodeName.toLowerCase(); + var ret = []; + var selector = "" + //var elid = element.getAttribute('id'); + + + if(element.id){ + selector = "#" + element.id; + } else { + let p = element; + let cnt = 1; + while(p = p.previousSibling){ + if(p instanceof HTMLElement && p.nodeName.toLowerCase() == name){ + cnt++; + } + } + selector = name + (cnt > 1 ? `:nth-of-type(${cnt})` : ""); + if(element != document.documentElement && name != "body" && element.parentNode){ + ret.push(this.getElementSelector(element.parentNode)); + } + } + ret.push(selector); + return ret.join(" > "); + } + + + Probe.prototype.initializeElement = function(element){ + var options = this.options; + + if(options.mapEvents){ + var els = element.getElementsByTagName("*"); + for(var a = 0; a < els.length; a++){ + for(var b = 0; b < options.allEvents.length; b++){ + var evname = "on" + options.allEvents[b]; + if(evname in els[a] && els[a][evname]){ + this.addEventToMap(els[a], options.allEvents[b]); + } + } + } + } + + + if(options.fillValues){ + this.fillInputValues(element); + } + } + + + + + + + Probe.prototype.getFormAsRequest = function(form){ + + var formObj = {}; + var inputs = null; + var par; + + formObj.method = form.getAttribute("method"); + if(!formObj.method){ + formObj.method = "GET"; + } else { + formObj.method = formObj.method.toUpperCase(); + } + + formObj.url = form.getAttribute("action"); + if(!formObj.url) formObj.url = document.location.href; + formObj.data = []; + inputs = form.querySelectorAll("input, select, textarea"); + for(var a = 0; a < inputs.length; a++){ + if(!inputs[a].name) continue; + par = encodeURIComponent(inputs[a].name) + "=" + encodeURIComponent(inputs[a].value); + 
if(inputs[a].tagName == "INPUT" && inputs[a].type != null){ + + switch(inputs[a].type.toLowerCase()){ + case "button": + case "submit": + break; + case "checkbox": + case "radio": + if(inputs[a].checked) + formObj.data.push(par); + break; + default: + formObj.data.push(par); + } + + } else { + formObj.data.push(par); + } + } + + formObj.data = formObj.data.join("&"); + + if(formObj.method == "GET"){ + var url = this.replaceUrlQuery(formObj.url, formObj.data); + req = new this.Request("form", "GET", url); + } else { + var req = new this.Request("form", "POST", formObj.url, formObj.data, this.getTrigger()); + } + + + return req; + + }; + + + + Probe.prototype.addEventToMap = function(element, event){ + + for(var a = 0; a < this.eventsMap.length; a++){ + if(this.eventsMap[a].element == element){ + this.eventsMap[a].events.push(event); + return; + } + } + this.eventsMap.push({ + element: element, + events: [event] + }); + }; + + + + + Probe.prototype.dispatchProbeEvent = async function(name, params){ + return await window.__htcap_probe_event__(name, params); + }; + + + + Probe.prototype.startAnalysis = async function(){ + console.log("page initialized "); + var _this = this; + this.started_at = (new Date()).getTime(); + await this.crawlDOM(document, 0); + console.log("DOM analyzed "); + //window.__htcap_end__(); + this.dispatchProbeEvent("end", {}); + + }; + + + + Probe.prototype.isContentDuplicated = function(cont){ + return this.domModifications.indexOf(cont) != -1; + + + + // for(let m of this.domModifications){ + // if(this.textComparator.compare(cont, m)){ + // return true; + // } + // } + // return false; + + } + + Probe.prototype.simhashDistance = function(s1, s2){ + var x = (s1 ^ s2) & ((1 << 64) - 1); + var ans = 0; + while(x){ + ans += 1; + x &= x - 1; + } + return ans; + } + + + + + + Probe.prototype.jsonpHook = function(node){ + if(!(node instanceof HTMLElement) || !node.matches("script")) return; + var src = node.getAttribute("src"); + if(!src) return; + 
var _this = this; + + + var a = document.createElement("a"); + a.href = src; + + // JSONP must have a querystring... + if(!a.search) return; + + var req = new this.Request("jsonp", "GET", src, null, this.getTrigger()); + node.__request = req; + + // __skipped requests.. todo + + this._pendingJsonp.push(node); + + var ev = function(){ + var i = _this._pendingJsonp.indexOf(node); + if(i == -1){ + // ERROR !! + } else { + _this._pendingJsonp.splice(i, 1); + } + + _this.dispatchProbeEvent("jsonpCompleted", { + request: req, + script: _this.describeElement(node) + }); + node.removeEventListener("load", ev); + node.removeEventListener("error", ev); + } + + node.addEventListener("load", ev); + node.addEventListener("error", ev); + + this.dispatchProbeEvent("jsonp", { + request: req + }); + }; + + + + Probe.prototype.triggerXhrsEvent = function(requests){ + + let reqarr = []; + for(let a of requests){ + //reqarr.push(this.requestToObject(a.__request)); + reqarr.push(a.__request); + } + this.dispatchProbeEvent("requestsCompleted", { + requests: reqarr, + // trigger: { + // element: this.describeElement(_this.curElement.element), + // event: this.curElement.event + // } + }); + + } + + Probe.prototype.triggerWebsocketEvent = function(url){ + + var req = new this.Request("websocket", "GET", url, null, this.getTrigger()); + this.dispatchProbeEvent("websocket", { request: req}); + + } + + Probe.prototype.triggerWebsocketMessageEvent = function(url, message){ + + var req = new this.Request("websocket", "GET", url, null, null); + this.dispatchProbeEvent("websocketMessage", { request: req, message: message}); + + } + + Probe.prototype.triggerWebsocketSendEvent = function(url, message){ + var req = new this.Request("websocket", "GET", url, null, null); + this.dispatchProbeEvent("websocketSend", { request: req, message: message}); + + } + + + Probe.prototype.triggerFormSubmitEvent = function(form){ + + var req = this.getFormAsRequest(form); + this.dispatchProbeEvent("formSubmit", { 
+ request: req, + form: this.describeElement(form) + }); + + } + + + Probe.prototype.triggerNavigationEvent = function(url, method, data){ + var req = null; + method = method || "GET"; + + url = url.split("#")[0]; + +// if(url.match(/^[a-z0-9\-_]+\:/i) && !url.match(/(^https?)|(^ftps?)\:/i)){ + // if(this.options.printUnknownRequests){ + // req = new this.Request("unknown", "GET", url); + // } + // } else { + // req = new this.Request("link", "GET", url); + // } + + req = new this.Request("navigation", method, url, data); + + this.dispatchProbeEvent("navigation", { + request: req + }); + + }; + + + + // returns true if at least one request is performed + Probe.prototype.waitRequests = async function(requests){ + var _this = this; + var reqPerformed = false; + return new Promise( (resolve, reject) => { + var timeout = _this.options.ajaxTimeout; + + var t = setInterval(function(){ + if(timeout <= 0 || requests.length == 0){ + clearInterval(t); + //console.log("waitajax reoslve()") + resolve(reqPerformed); + return; + } + timeout -= 10; + reqPerformed = true; + }, 0); + }); + } + + + Probe.prototype.waitAjax = async function(){ + await this.waitRequests(this._pendingAjax); + } + + Probe.prototype.waitJsonp = async function(){ + await this.waitRequests(this._pendingJsonp); + } + + + Probe.prototype.xhrOpenHook = function(xhr, method, url){ + var _url = this.removeUrlParameter(url, "_"); + xhr.__request = new window.__PROBE__.Request("xhr", method, _url, null, this.getTrigger()); + } + + + Probe.prototype.xhrSendHook = async function(xhr, data){ + var _this = this; + xhr.__request.data = data; + + var absurl = this.getAbsoluteUrl(xhr.__request.url); + for(var a = 0; a < options.excludedUrls.length; a++){ + if(absurl.match(options.excludedUrls[a])){ + xhr.__skipped = true; + } + } + + + //xhr.__request.trigger = this.getTrigger(); + + + // check if request has already been sent + var rk = xhr.__request.key(); + if(this.sentAjax.indexOf(rk) != -1){ + return; + } + + + // 
add to pending ajax before dispatchProbeEvent. Since dispatchProbeEvent can await for something (and take some time) we need to be sure that the current xhr is awaited from the main loop + if(!xhr.__skipped){ + this._pendingAjax.push(xhr); + } + + //var ueRet = this.triggerUserEvent("onXhr",[xhr.__request]); + var uRet = await this.dispatchProbeEvent("xhr", { + request: xhr.__request + }); + //console.log("----------adsdsd_---------------") + // console.log(uRet) + + if(!uRet){ + this._pendingAjax.splice(this._pendingAjax.indexOf(xhr), 1); + return false; + } + xhr.addEventListener("readystatechange", function ev(e){ + if(xhr.readyState != 4) return; + var i = _this._pendingAjax.indexOf(xhr); + if(i == -1){ + //ERROR!!! + } else { + _this._pendingAjax.splice(i, 1); + } + _this.dispatchProbeEvent("xhrCompleted", { + request: xhr.__request, + response: xhr.responseText + }); + xhr.removeEventListener("readystatechange", ev); + }); + + this.sentAjax.push(rk); + + return true; + + } + + + + Probe.prototype.isAttachedToDOM = function(node){ + var p = node; + while(p) { + if(p.nodeName.toLowerCase() == "html") + return true; + p = p.parentNode; + } + return false; + }; + + Probe.prototype.getDetachedRootNode = function(node){ + var p = node; + while(p.parentNode) { + if(p.parentNode.nodeName.toLowerCase() == "html") + return null; + p = p.parentNode; + } + return p; + }; + + + + + Probe.prototype.crawlDOM = async function(node, layer){ // @TODO console.log(">>>>RECURSON LIMIT REACHED :" + counter); + + layer = typeof layer != 'undefined' ? layer : 0; + if(layer == this.options.maximumRecursion){ + console.log(">>>>RECURSON LIMIT REACHED :" + layer) + return; + } + //console.log(">>>>:" + layer) + var dom = [node == document ? 
document.documentElement : node].concat(this.getDOMTreeAsArray(node)), + newEls = [], + uRet; + // map propety events and fill input values + this.initializeElement(node); + + //let analyzed = 0; + for(let el of dom){ + let elsel = this.getElementSelector(el); + if(!this.isAttachedToDOM(el)){ + console.log("!!00>>> " + this.stringifyElement(el) + " detached before analysis !!! results may be incomplete") + uRet = await this.dispatchProbeEvent("earlydetach", { node: elsel }); + if(!uRet) continue; + } + for(let event of this.getEventsForElement(el)){ + //console.log("analyze element " + this.describeElement(el)); + this.takeDOMSnapshot(); + if(options.triggerEvents){ + uRet = await this.dispatchProbeEvent("triggerevent", {node: elsel, event: event}); + if(!uRet) continue; + + this.triggerElementEvent(el, event); + // if click has been trigered stop mousedown /up !!! + + await this.dispatchProbeEvent("eventtriggered", {node: elsel, event: event}); + } + + + let chainLimit = this.options.maximumAjaxChain; + do { + chainLimit--; + if(chainLimit == 0){ + break; + } + await this.sleep(0); + } while(await this.waitAjax()); + + await this.waitJsonp(); + + newEls = this.getAddedElements(); + for(var a = newEls.length - 1; a >= 0; a--){ + if(newEls[a].innerText && this.isContentDuplicated(newEls[a].innerText)) + newEls.splice(a,1); + } + if(newEls.length > 0){ + for(var a = 0; a < newEls.length; a++){ + if(newEls[a].innerText){ + this.domModifications.push(newEls[a].innerText); + //this.domModifications.push(this.textComparator.getValue(newEls[a].innerText)); + console.log(this.textComparator.getValue(newEls[a].innerText)) + } + } + + //console.log("added elements " + newEls.length) + for(let ne of newEls){ + //console.log(ne) + uRet = await this.dispatchProbeEvent("newdom", { + rootNode: this.describeElement(ne), + trigger: this.getTrigger(), + layer: layer + }); + if(uRet) + await this.crawlDOM(ne, layer + 1); + } + + } + } + + } + } + + Probe.prototype.sleep = 
function(n){ + return new Promise(resolve => { + setTimeout(resolve, n, true); + }); + }; + + + + window.__PROBE__ = new Probe(options, inputValues); +}; diff --git a/core/crawl/probe/chrome-probe/htcap/shingleprint.js b/core/crawl/probe/chrome-probe/htcap/shingleprint.js new file mode 100644 index 0000000..6075181 --- /dev/null +++ b/core/crawl/probe/chrome-probe/htcap/shingleprint.js @@ -0,0 +1,473 @@ +/* +HTCAP - 1.2 +http://htcap.org +Author: filippo.cavallarin@wearesegment.com + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. +*/ + +exports.initTextComparator = function(){ + + function ShinglePrint(text){ + //#size = 64 + this.nfeatures = 128; + + this.text = text; + this.features = null; + } + + ShinglePrint.prototype.init = function(){ + this.tokens = this._shingle() + this.heap = new this.HeapMax(this.nfeatures) + this.hash_queue = new this.HashQueue(this.nfeatures) + this.hash_queue.init(); + + this.features = this._hash_tokens() + } + + + ShinglePrint.prototype.compare = function(features){ + var tfeatures = this.getFeatures(); + + return this.score(tfeatures, features); + } + + ShinglePrint.prototype.getFeatures = function(){ + if(!this.features) + this.init(); + return this.features; + } + + + ShinglePrint.prototype._shinglet = function(s){ + var w = 4; + //s = this.text; + if(typeof s == 'string') + s = Array.prototype.slice.call(s, 0); + + var tks = s //.toLowerCase().trim().split(" "); + if (tks.length < w) return [s]; + + var arr = []; + for(let i = 0; i < tks.length - (w-1); i++){ + arr.push(tks.slice(i, i+w).join("")) + } + // for i in range(len(tks)-(w-1)): + // arr.append(" ".join(tks[i:i+w])) + return arr + }; + + + ShinglePrint.prototype._shinglew = function(){ + var w = 4; + s = this.text; + if(typeof s == 'string') + s = 
Array.prototype.slice.call(s, 0); + + var tks = s.toLowerCase().trim().split(" "); + if (tks.length < w) return [s]; + + var arr = []; + for(let i = 0; i < tks.length - (w-1); i++){ + arr.push(tks.slice(i, i+w).join(" ")) + } + // for i in range(len(tks)-(w-1)): + // arr.append(" ".join(tks[i:i+w])) + return arr + }; + + ShinglePrint.prototype._shingle = function(){ + var w = 8; + s = this.text //#.toLowerCase().trim(); + if(typeof s == 'string') + s = Array.prototype.slice.call(s, 0); + var tks = s; + if (tks.length < w) return [s]; + + var arr = []; + for(let i = 0; i < tks.length - (w-1); i++){ + arr.push(tks.slice(i, i+w).join("")) + } + return arr + }; + + ShinglePrint.prototype.crc32 = function(str){ + var a_table = "00000000 77073096 EE0E612C 990951BA 076DC419 706AF48F E963A535 9E6495A3 0EDB8832 79DCB8A4 E0D5E91E 97D2D988 09B64C2B 7EB17CBD E7B82D07 90BF1D91 1DB71064 6AB020F2 F3B97148 84BE41DE 1ADAD47D 6DDDE4EB F4D4B551 83D385C7 136C9856 646BA8C0 FD62F97A 8A65C9EC 14015C4F 63066CD9 FA0F3D63 8D080DF5 3B6E20C8 4C69105E D56041E4 A2677172 3C03E4D1 4B04D447 D20D85FD A50AB56B 35B5A8FA 42B2986C DBBBC9D6 ACBCF940 32D86CE3 45DF5C75 DCD60DCF ABD13D59 26D930AC 51DE003A C8D75180 BFD06116 21B4F4B5 56B3C423 CFBA9599 B8BDA50F 2802B89E 5F058808 C60CD9B2 B10BE924 2F6F7C87 58684C11 C1611DAB B6662D3D 76DC4190 01DB7106 98D220BC EFD5102A 71B18589 06B6B51F 9FBFE4A5 E8B8D433 7807C9A2 0F00F934 9609A88E E10E9818 7F6A0DBB 086D3D2D 91646C97 E6635C01 6B6B51F4 1C6C6162 856530D8 F262004E 6C0695ED 1B01A57B 8208F4C1 F50FC457 65B0D9C6 12B7E950 8BBEB8EA FCB9887C 62DD1DDF 15DA2D49 8CD37CF3 FBD44C65 4DB26158 3AB551CE A3BC0074 D4BB30E2 4ADFA541 3DD895D7 A4D1C46D D3D6F4FB 4369E96A 346ED9FC AD678846 DA60B8D0 44042D73 33031DE5 AA0A4C5F DD0D7CC9 5005713C 270241AA BE0B1010 C90C2086 5768B525 206F85B3 B966D409 CE61E49F 5EDEF90E 29D9C998 B0D09822 C7D7A8B4 59B33D17 2EB40D81 B7BD5C3B C0BA6CAD EDB88320 9ABFB3B6 03B6E20C 74B1D29A EAD54739 9DD277AF 04DB2615 73DC1683 E3630B12 94643B84 0D6D6A3E 7A6A5AA8 
E40ECF0B 9309FF9D 0A00AE27 7D079EB1 F00F9344 8708A3D2 1E01F268 6906C2FE F762575D 806567CB 196C3671 6E6B06E7 FED41B76 89D32BE0 10DA7A5A 67DD4ACC F9B9DF6F 8EBEEFF9 17B7BE43 60B08ED5 D6D6A3E8 A1D1937E 38D8C2C4 4FDFF252 D1BB67F1 A6BC5767 3FB506DD 48B2364B D80D2BDA AF0A1B4C 36034AF6 41047A60 DF60EFC3 A867DF55 316E8EEF 4669BE79 CB61B38C BC66831A 256FD2A0 5268E236 CC0C7795 BB0B4703 220216B9 5505262F C5BA3BBE B2BD0B28 2BB45A92 5CB36A04 C2D7FFA7 B5D0CF31 2CD99E8B 5BDEAE1D 9B64C2B0 EC63F226 756AA39C 026D930A 9C0906A9 EB0E363F 72076785 05005713 95BF4A82 E2B87A14 7BB12BAE 0CB61B38 92D28E9B E5D5BE0D 7CDCEFB7 0BDBDF21 86D3D2D4 F1D4E242 68DDB3F8 1FDA836E 81BE16CD F6B9265B 6FB077E1 18B74777 88085AE6 FF0F6A70 66063BCA 11010B5C 8F659EFF F862AE69 616BFFD3 166CCF45 A00AE278 D70DD2EE 4E048354 3903B3C2 A7672661 D06016F7 4969474D 3E6E77DB AED16A4A D9D65ADC 40DF0B66 37D83BF0 A9BCAE53 DEBB9EC5 47B2CF7F 30B5FFE9 BDBDF21C CABAC28A 53B39330 24B4A3A6 BAD03605 CDD70693 54DE5729 23D967BF B3667A2E C4614AB8 5D681B02 2A6F2B94 B40BBE37 C30C8EA1 5A05DF1B 2D02EF8D"; + var b_table = a_table.split(' ').map(function(s){ return parseInt(s,16) }); + function b_crc32 (str) { + var crc = -1; + for(var i=0, iTop=str.length; i>> 8 ) ^ b_table[( crc ^ str.charCodeAt( i ) ) & 0xFF]; + } + return (crc ^ (-1)) >>> 0; + }; + return b_crc32(str); + } + + ShinglePrint.prototype._hash_tokens = function(){ + //for t in this.tokens: + for(let t of this.tokens){ + let h = this.crc32(t) & 0xffffffff + //print "got %x %d %d" % ((h&0xffffffff), this.heap.nheap, this.nfeatures) + if (this.heap.nheap == this.nfeatures && h >= this.heap.heap[0]){ + continue; + } + //print "0x%x < 0x%x" % (h, this.heap.heap[0]) + if(this.hash_queue.hash_contains(h)){ + //print "dup" + continue; + } + + if(this.heap.nheap == this.nfeatures){ + let m = this.heap.heap_extract_max(); + this.hash_queue.hash_delete(m); + //print "pop %x [%s]" % (m&0xffffffff, " ".join(["%x" % (c & 0xffffffff) for c in this.heap.heap])) + } + + 
this.hash_queue.hash_insert(h);
+			this.heap.heap_insert(h);
+			//print "push %x [%s]" % (h&0xffffffff, " ".join(["%x" % (c & 0xffffffff) for c in this.heap.heap]))
+		}
+		//print "END [%s]" % " ".join(["%x" % (c & 0xffffffff) for c in this.heap.heap])
+		return this.heap.get_features();
+	}
+
+	/**
+	 * Ratio between the number of common values and the estimated union size
+	 * of two feature arrays.
+	 * Both arrays are walked from the end and are expected to be sorted in
+	 * descending order (the order produced by HeapMax.get_features, which
+	 * extracts the maximum first).
+	 * Returns NaN when both arrays are empty (0 / 0).
+	 */
+	//@staticmethod
+	ShinglePrint.prototype.score = function(f1, f2){
+		var unionsize = 0.0,
+			intersectsize = 0.0,
+			i1 = f1.length - 1,
+			i2 = f2.length - 1,
+			count = 0,
+			matchcount = 0;
+
+		while(i1 >= 0 && i2 >= 0){
+			//#print "%x %d" % (f1[i1],i1)
+			if(f1[i1] < f2[i2]){
+				i1 -= 1;
+				continue;
+			}
+			if(f1[i1] > f2[i2]){
+				i2 -= 1;
+				continue;
+			}
+			matchcount += 1;
+			i1 -= 1;
+			i2 -= 1;
+		}
+		count = Math.min(f1.length, f2.length);
+		intersectsize = matchcount;
+		unionsize = 2 * count - matchcount;
+		return intersectsize / unionsize;
+	}
+
+	/**
+	 * Bitwise similarity of two integers: set bits of (x & y) divided by the
+	 * set bits of (x | y). Returns NaN when both values are 0.
+	 */
+	// @staticmethod
+	// def similarity(x, y):
+	ShinglePrint.prototype.similarity = function(x, y){
+		var i = (x & y),
+			u = (x | y);
+		//# print "%x %x" % (x, y)
+		//# print "%f %f" % (ShinglePrint.hammingWeight(i), ShinglePrint.hammingWeight(u))
+		return this.hammingWeight(i) / this.hammingWeight(u);
+	}
+
+	/**
+	 * Counts the bits set in l by repeatedly clearing the lowest set bit.
+	 */
+	// @staticmethod
+	// def hammingWeight(l):
+	ShinglePrint.prototype.hammingWeight = function(l){
+		//#for(c = 0; l; c++) l &= l-1;
+		var c = 0;
+		while(l){
+			l &= l - 1;
+			c += 1;
+		}
+		return c;
+	}
+
+
+
+
+
+	/**
+	 * Array-backed binary max-heap with room for `size` values.
+	 * nheap is the number of slots currently in use.
+	 */
+	//class HeapMax:
+	ShinglePrint.prototype.HeapMax = function(size){
+		this.nheap = 0;
+		this.maxheap = size;
+		this.heap = Array(size).fill(0);
+	}
+
+	/**
+	 * Restores the heap property starting from the root: the root value is
+	 * swapped with its larger child until no child is greater than it.
+	 */
+	ShinglePrint.prototype.HeapMax.prototype.downheap = function(){
+		var tmp = 0,
+			i = 0;
+		while(true){
+			let left = (i << 1) + 1;
+			let right = left + 1;
+			if(left >= this.nheap)
+				return
+			// only the left child exists
+			if(right >= this.nheap){
+				if(this.heap[i] < this.heap[left]){
+					tmp = this.heap[left];
+					this.heap[left] = this.heap[i];
+					this.heap[i] = tmp;
+				}
+				return;
+			}
+
+			if(this.heap[i] >= this.heap[left] && this.heap[i] >= this.heap[right])
+				return
+
+			if(this.heap[left] > this.heap[right]){
+				tmp = this.heap[left];
+				this.heap[left] = this.heap[i];
+				this.heap[i] = tmp;
+				i = left;
+			}
+			else{
+				tmp = this.heap[right];
+				this.heap[right] = this.heap[i];
+				this.heap[i] = tmp;
+				i = right;
+			}
+		}
+	}
+
+	/**
+	 * Empties the heap and returns its values, largest first.
+	 */
+	ShinglePrint.prototype.HeapMax.prototype.get_features = function(){
+		var f = [];
+		while (this.nheap > 0)
+			f.push(this.heap_extract_max())
+		return f;
+	}
+
+	/**
+	 * Removes and returns the largest value. The heap must not be empty.
+	 */
+	ShinglePrint.prototype.HeapMax.prototype.heap_extract_max = function(){
+		//assert(this.nheap > 0)
+		var m = this.heap[0];
+		this.nheap -= 1;
+		this.heap[0] = this.heap[this.nheap];
+		this.downheap();
+		return m;
+	}
+
+
+	/**
+	 * Moves the last inserted value up until its parent is not smaller.
+	 */
+	ShinglePrint.prototype.HeapMax.prototype.upheap = function(){
+
+		var i = this.nheap - 1;
+		//assert(this.nheap > 0)
+		while(i > 0){
+			let parent = (i - 1) >> 1;
+			if(this.heap[parent] >= this.heap[i])
+				return;
+			let tmp = this.heap[parent];
+			this.heap[parent] = this.heap[i];
+			this.heap[i] = tmp;
+			i = parent;
+		}
+	}
+
+	/**
+	 * Appends v and restores the heap property. No capacity check is done.
+	 */
+	ShinglePrint.prototype.HeapMax.prototype.heap_insert = function(v){
+		//assert(this.nheap < this.maxheap)
+		this.heap[this.nheap] = v;
+		this.nheap += 1;
+		this.upheap();
+	}
+
+
+
+
+	/**
+	 * Open-addressing hash set for crc values. Each slot of `hash` has a
+	 * state in `occ`: EMPTY, FULL or DELETED. init() must be called before use.
+	 */
+	//class HashQueue:
+	ShinglePrint.prototype.HashQueue = function(size){
+		this.EMPTY = 0
+		this.FULL = 1
+		this.DELETED = 2
+		this.size = size;
+	}
+
+	/**
+	 * Sizes the table to the smallest power of two greater than 7 * size
+	 * and allocates it.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.init = function(){
+		function next_pow2(n){
+			var m = 1;
+			while(n > 0){
+				n >>= 1;
+				m <<= 1;
+			}
+			return m;
+		}
+
+		this.hash = null;
+		this.occ = null;
+		this.nhash = 7 * this.size;
+		this.nhash = next_pow2(this.nhash);
+		this.hash_alloc();
+	}
+
+	/**
+	 * Allocates fresh `hash` and `occ` arrays of nhash slots, all EMPTY.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.hash_alloc = function(){
+		// this.hash = [0] * this.nhash
+		// this.occ = [0] * this.nhash
+		this.hash = Array(this.nhash).fill(0);
+		this.occ = Array(this.nhash).fill(0);
+		//for i in range(this.nhash):
+		// the bound is the slot count: comparing against the array itself
+		// (this.hash) is always false and the loop would never run
+		for(let i = 0; i < this.nhash; i++)
+			this.occ[i] = this.EMPTY;
+	}
+
+	/*
+	# Since the input values are crc's, we don't
+	# try to hash them at all!
they're plenty random
+	# coming in, in principle.
+	*/
+
+	/**
+	 * Stores crc in the first slot of its probe sequence that is not FULL.
+	 * Returns 1 when crc is stored (or already present), 0 when every probed
+	 * slot is taken by another value.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.do_hash_insert = function(crc){
+		var h = crc;
+		//for count in range(self.nhash):
+		for(let count = 0; count < this.nhash; count++){
+			let i = parseInt(h) & (this.nhash - 1);
+			if(this.occ[i] != this.FULL){
+				this.occ[i] = this.FULL;
+				this.hash[i] = crc;
+				return 1;
+			}
+			if(this.hash[i] == crc)
+				return 1;
+			h += 2 * (this.nhash / 4) + 1;
+		}
+		return 0;
+	}
+
+
+	/**
+	 * Rebuilds the table keeping only the FULL slots, which frees the
+	 * DELETED ones. Throws if a value cannot be re-inserted.
+	 */
+	//# idiot stop-and-copy for deleted references
+	ShinglePrint.prototype.HashQueue.prototype.gc = function(){
+		var oldhash = this.hash,
+			oldocc = this.occ;
+
+		this.hash_alloc();
+
+		//for i in range(this.nhash):
+		// iterate over every slot: the previous bound (this.hash, an array)
+		// made the loop a no-op and gc dropped every stored value
+		for(let i = 0; i < this.nhash; i++){
+			if(oldocc[i] == this.FULL){
+				if(!this.do_hash_insert(oldhash[i])){
+					//print "internal error: gc failed, table full"
+					//sys.exit(1);
+					throw "UNexpected ShinglePrint error";
+				}
+			}
+		}
+	}
+
+	/**
+	 * Inserts crc, running gc() once and retrying if the first attempt fails.
+	 * A second failure is silently ignored.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.hash_insert = function(crc){
+		if(this.do_hash_insert(crc))
+			return;
+
+		this.gc();
+		if(this.do_hash_insert(crc))
+			return;
+		//print "internal error: insert failed, table full"
+	}
+
+
+	/**
+	 * Returns 1 if crc is stored, 0 if an EMPTY slot is reached first,
+	 * -1 if the whole probe sequence was scanned without an answer.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.do_hash_contains = function(crc){
+		var h = crc;
+		//for count in range(self.nhash):
+		for(let count = 0; count < this.nhash; count++){
+			let i = parseInt(h) & (this.nhash - 1);
+			if(this.occ[i] == this.EMPTY)
+				return 0;
+			if(this.occ[i] == this.FULL && this.hash[i] == crc)
+				return 1;
+			h += 2 * (this.nhash / 4) + 1;
+		}
+		return -1;
+	}
+
+
+
+	/**
+	 * Returns 1/0 whether crc is stored, running gc() once if the first
+	 * lookup is inconclusive. Returns undefined if it is still inconclusive.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.hash_contains = function(crc){
+		var result = this.do_hash_contains(crc);
+		if(result >= 0)
+			return result;
+
+		this.gc()
+		result = this.do_hash_contains(crc);
+		if(result >= 0)
+			return result;
+		//print "internal error: can't find value, table full"
+	}
+
+
+	/**
+	 * Marks the slot holding crc as DELETED.
+	 * Returns 1 if deleted, 0 if crc is not stored, -1 if inconclusive.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.do_hash_delete = function(crc){
+		var h = crc
+		//for count in range(self.nhash):
+		for(let count = 0; count < this.nhash; count++){
+			let i = parseInt(h) & (this.nhash - 1);
+			if(this.occ[i] == this.FULL && this.hash[i] == crc){
+				this.occ[i] = this.DELETED;
+				return 1;
+			}
+
+			if(this.occ[i] == this.EMPTY)
+				return 0;
+			h += 2 * (this.nhash / 4) + 1
+		}
+		return -1;
+	}
+
+
+	/**
+	 * Deletes crc, running gc() once and retrying if the first attempt is
+	 * inconclusive. Returns undefined if it is still inconclusive.
+	 */
+	ShinglePrint.prototype.HashQueue.prototype.hash_delete = function(crc){
+		var result = this.do_hash_delete(crc);
+		if(result >= 0)
+			return result;
+		this.gc()
+		result = this.do_hash_delete(crc);
+		//if(this.result >= 0) /// <-?? wrong
+		if(result >= 0)
+			return result;
+		//print "internal error: delete failed, table full"
+	}
+
+
+	/**
+	 * Wraps a text into a Text descriptor that can be compared to another one.
+	 */
+	function TextComparator(text){
+		this.text = new this.Text(text);
+	}
+
+	TextComparator.prototype.getValue = function(){
+		return this.text.value;
+	}
+
+	/**
+	 * Compares the wrapped text with a descriptor previously returned by
+	 * textComparator.getValue(). "textmatch" and "simhash" use strict
+	 * equality of the values, "shingleprint" requires a score of at least 0.95.
+	 */
+	TextComparator.prototype.compare = function(other){
+		switch(this.text.type){
+			case "textmatch":
+				return this.text.value === other.value;
+			case "simhash":
+				return this.text.value === other.value;
+			case "shingleprint":
+				// return 0;
+				//return this.text.value === other.value;
+				return this.text.comparator.compare(other.value) >= 0.95 // !!!!!
+		}
+
+	}
+
+	/**
+	 * Text descriptor. The type depends on the text length: "textmatch"
+	 * below 32 chars, "simhash" from 32 (not implemented yet, the value is
+	 * still the raw text), "shingleprint" from 256 (the value is the
+	 * feature list).
+	 */
+	TextComparator.prototype.Text = function(text){
+		this.type = "textmatch";
+		this.value = text;
+		this.comparator = null;
+		if(text.length >= 32 ){
+			this.type = "simhash";
+			// @ todo
+		}
+		if(text.length >= 256){
+			this.type = "shingleprint";
+			let s = new ShinglePrint(text);
+			this.comparator = s;
+			this.value = s.getFeatures();
+		}
+	}
+
+	window.__PROBE__.textComparator = {
+		getValue: function(text){
+			var tc = new TextComparator(text);
+			return tc.text;
+		},
+		compare(text, ctext){
+			var tc = new TextComparator(text);
+			return tc.compare(ctext);
+		}
+	}
+
+} //init
\ No newline at end of file
diff --git a/core/crawl/probe/chrome-probe/htcap/utils.js b/core/crawl/probe/chrome-probe/htcap/utils.js
new file mode 100644
index 0000000..fc185c5
--- /dev/null
+++ b/core/crawl/probe/chrome-probe/htcap/utils.js
@@ -0,0 +1,337 @@
+/*
+HTCAP - 1.2
+http://htcap.org
+Author: filippo.cavallarin@wearesegment.com
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+*/
+
+//"use strict";
+
+const urlparser = require('url').parse;
+const fs = require('fs');
+
+
+
+exports.parseCookiesFromHeaders = parseCookiesFromHeaders;
+exports.hookNativeFunctions = hookNativeFunctions;
+exports.generateRandomValues = generateRandomValues;
+
+
+
+
+
+
+
+/**
+ * Replaces native browser functions with wrappers that report to
+ * window.__PROBE__ before calling the original implementation.
+ * It references window, Node, XMLHttpRequest, WebSocket and HTMLFormElement,
+ * so it must run inside a page.
+ *
+ * @param {Object} options flags read here: mapEvents, checkAjax,
+ *        checkScriptInsertion, checkWebsockets, overrideTimeoutFunctions
+ */
+function hookNativeFunctions(options) {
+	//alert(window.__PROBE__)
+
+	if(options.mapEvents){
+
+		Node.prototype.originaladdEventListener = Node.prototype.addEventListener;
+		Node.prototype.addEventListener = function(event, func, useCapture){
+			if(event != "DOMContentLoaded"){ // is this ok???
+				window.__PROBE__.addEventToMap(this, event);
+			}
+			this.originaladdEventListener(event, func, useCapture);
+		};
+
+		window.addEventListener = (function(originalAddEventListener){
+			return function(event, func, useCapture){
+				if(event != "load"){ // is this ok???
+					window.__PROBE__.addEventToMap(this, event);
+				}
+				originalAddEventListener.apply(this,[event, func, useCapture]);
+			}
+		})(window.addEventListener);
+	}
+
+	if(options.checkAjax){
+		XMLHttpRequest.prototype.originalOpen = XMLHttpRequest.prototype.open;
+		XMLHttpRequest.prototype.open = function(method, url, async, user, password){
+			window.__PROBE__.xhrOpenHook(this, method, url);
+			// forward exactly the arguments received: passing an explicit
+			// undefined as `async` selects the 5-arguments overload and turns
+			// an open(method, url) call into a synchronous request
+			return this.originalOpen.apply(this, arguments);
+		}
+
+
+
+		XMLHttpRequest.prototype.originalSend = XMLHttpRequest.prototype.send;
+
+		// the request is sent only if the hook returns a truthy value and
+		// the request has not been flagged as __skipped
+		XMLHttpRequest.prototype.send = async function(data){
+			var uRet = await window.__PROBE__.xhrSendHook(this, data);
+			if(!this.__skipped && uRet)
+				return this.originalSend(data);
+
+			return;
+		}
+
+	}
+
+
+	if(options.checkScriptInsertion){
+
+		Node.prototype.originalappendChild = Node.prototype.appendChild;
+		Node.prototype.appendChild = function(node){
+			//window.__PROBE__.printJSONP(node);
+
+			window.__PROBE__.jsonpHook(node);
+			return this.originalappendChild(node);
+		}
+
+		Node.prototype.originalinsertBefore = Node.prototype.insertBefore;
+		Node.prototype.insertBefore = function(node, element){
+			//window.__PROBE__.printJSONP(node);
+			window.__PROBE__.jsonpHook(node);
+			return this.originalinsertBefore(node, element);
+		}
+
+		Node.prototype.originalreplaceChild = Node.prototype.replaceChild;
+		Node.prototype.replaceChild = function(node, oldNode){
+			//window.__PROBE__.printJSONP(node);
+			window.__PROBE__.jsonpHook(node);
+			return this.originalreplaceChild(node, oldNode);
+		}
+	}
+
+	if(options.checkWebsockets){
+		window.WebSocket = (function(WebSocket){
+			return function(url, protocols){
+				//window.__PROBE__.printWebsocket(url); //websockets.push(url);
+				window.__PROBE__.triggerWebsocketEvent(url);
+				//return WebSocket.prototype;
+				// pass the sub-protocols through, they were dropped before
+				var ws = new WebSocket(url, protocols);
+				ws.__originalSend = ws.send;
+				ws.send = function(message){
+					window.__PROBE__.triggerWebsocketSendEvent(url, message);
+					return ws.__originalSend(message);
+				}
+				ws.addEventListener("message", function(message){
+					window.__PROBE__.triggerWebsocketMessageEvent(url, message.data);
+				});
+				return ws;
+			}
+		})(window.WebSocket);
+	}
+
+
+	if(options.overrideTimeoutFunctions){
+		// timers fire with no delay unless the caller passes a truthy third
+		// argument; extra arguments are not forwarded to the callback
+		window.setTimeout = (function(setTimeout){
+			return function(func, time, setTime){
+				var t = setTime ? time : 0;
+				return setTimeout(func, t);
+			}
+		})(window.setTimeout);
+
+		window.setInterval = (function(setInterval){
+			return function(func, time, setTime){
+				var t = setTime ? time : 0;
+				return setInterval(func, t);
+			}
+		})(window.setInterval);
+
+	}
+
+
+
+	// Node.prototype.originalRemoveEventListener = Node.prototype.removeEventListener;
+	// Node.prototype.removeEventListener = function(type, listener, options){
+	// };
+
+	Node.prototype.originalRemoveChild = Node.prototype.removeChild;
+	Node.prototype.removeChild = function(node){
+		if(!node.__analyzed){
+			//console.log("elem not analyzed "+ window.__PROBE__.stringifyElement(node) )
+			//console.log(window.__PROBE__.getTrigger());
+		}
+		// NOTE(review): the flag is set on the parent (this) and on all of its
+		// descendants, not on the removed node only - confirm this is intended
+		this.__removed = true;
+		if(this instanceof HTMLElement){
+			for (let c of this.getElementsByTagName("*"))
+				c.__removed = true;
+		}
+		return this.originalRemoveChild(node);
+	}
+
+
+
+	HTMLFormElement.prototype.originalSubmit = HTMLFormElement.prototype.submit;
+	HTMLFormElement.prototype.submit = function(){
+		//console.log("=-->"+this.action)
+		// var req = window.__PROBE__.getFormAsRequest(this);
+		// window.__PROBE__.printRequest(req);
+		window.__PROBE__.triggerFormSubmitEvent(this);
+		return this.originalSubmit();
+	}
+
+	// prevent window.close
+	window.close = function(){ return };
+	window.print = function(){ return };
+
+	// no window is opened, the url is only reported
+	window.open = function(url, name, specs, replace){
+		//window.__PROBE__.printLink(url);
+		window.__PROBE__.triggerNavigationEvent(url);
+	}
+
+	//window.__PROBE__.triggerUserEvent("onInit");
+}
+
+
+
+/**
+ * Generates PSEUDO random values: the same seed will generate the same values.
+ *
+ * @param {string} seed its char codes are used, cyclically, as the source of
+ *        "randomness"
+ * @returns {Object} one generated value for each key of `generators`
+ */
+function generateRandomValues(seed){
+	var values = {};
+	var letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+	var numbers = "0123456789";
+	var symbols = "!#&^;.,?%$*";
+	var months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"];
+	var years = ["1982", "1989", "1990", "1994", "1995", "1996"];
+	var names = ["james", "john", "robert", "michael", "william", "david", "richard", "charles", "joseph", "thomas", "christopher", "daniel", "paul", "mark", "donald", "george", "kenneth"];
+	var surnames = ["anderson", "thomas", "jackson", "white", "harris", "martin", "thompson", "garcia", "martinez", "robinson", "clark", "rodriguez", "lewis", "lee", "walker", "hall"];
+	var domains = [".com", ".org", ".net", ".it", ".tv", ".de", ".fr"];
+
+	var randoms = [];
+	var randoms_i = 0;
+
+	for(var a = 0; a < seed.length; a++){
+		var i = seed[a].charCodeAt(0);
+		randoms.push(i);
+	}
+
+	// an empty seed would make rand() return NaN and every value would be
+	// built out of "undefined"
+	if(randoms.length == 0)
+		randoms.push(0);
+
+	var rand = function(max){
+		var i = randoms[randoms_i] % max;
+		randoms_i = (randoms_i + 1) % randoms.length;
+		return i;
+	}
+
+	// NOTE(review): rand(arr.length - 1) never selects the last element of
+	// arr; left as is because changing it changes the values generated for
+	// an existing seed
+	var randarr = function(arr, len){
+		var r;
+		var ret = "";
+		for(var a = 0; a < len; a++){
+			r = rand(arr.length - 1) ;
+			ret += arr[r];
+		}
+		return ret;
+	};
+
+	var generators = {
+		string: function(){
+			return randarr(letters, 8);
+		},
+		number: function(){
+			return randarr(numbers, 3);
+		},
+		month: function(){
+			return randarr(months, 1);
+		},
+		year: function(){
+			return randarr(years, 1);
+		},
+		date: function(){
+			return generators.year() + "-" + generators.month() + "-" + generators.month();
+		},
+		color: function(){
+			return "#" + randarr(numbers, 6);
+		},
+		week: function(){
+			return generators.year() + "-W" + randarr(months.slice(0, 6), 1);
+		},
+		time: function(){
+			return generators.month() + ":" + generators.month();
+		},
+		datetimeLocal: function(){
+			return generators.date() + "T" + generators.time();
+		},
+		domain: function(){
+			return randarr(letters, 12).toLowerCase() + randarr(domains ,1);
+		},
+		email: function(){
+			return randarr(names, 1) + "." + generators.surname() + "@" + generators.domain();
+		},
+		url: function(){
+			return "http://www." + generators.domain();
+		},
+		humandate: function(){
+			return generators.month() + "/" + generators.month() + "/" + generators.year();
+		},
+		password: function(){
+			return randarr(letters, 3) + randarr(symbols, 1) + randarr(letters, 2) + randarr(numbers, 3) + randarr(symbols, 2);
+		},
+		surname: function(){
+			return randarr(surnames, 1);
+		},
+		lastname: function(){
+			return generators.surname();
+		},
+		firstname: function(){
+			return randarr(names, 1);
+		},
+		tel: function(){
+			return "+" + randarr(numbers, 1) + " " + randarr(numbers, 10);
+		}
+	};
+
+
+	for(var type in generators){
+		values[type] = generators[type]();
+	}
+
+	return values;
+
+
+};
+
+
+
+
+/**
+ * Builds a list of cookie objects from the Set-Cookie header(s) of a response.
+ *
+ * @param {Object} headers header name -> value; multiple cookies are expected
+ *        to be separated by "\n" in a single value
+ * @param {string} url url of the response, its hostname is the default domain
+ * @returns {Array} objects with the keys name, value, domain, path, secure,
+ *          httponly and expires (seconds; defaults to one year from now)
+ */
+function parseCookiesFromHeaders(headers, url){
+	var b, c, ret = [];
+	var purl = urlparser(url);
+	var domain = purl.hostname;
+
+	// `header` is declared here: it used to leak as an implicit global
+	for(var header in headers){
+		//console.log(JSON.stringify(header))
+		if(header.toLowerCase() != "set-cookie")
+			continue;
+		var cookies = headers[header].split("\n"); // no multiple cookies due to a chrome bug (??)
+		for(b = 0; b < cookies.length; b++){
+			if(!cookies[b])
+				continue;
+			var ck = cookies[b].split(/; */);
+			var cookie = {domain: domain, path: "/", secure: false, httponly:false};
+			for(c = 0; c < ck.length; c++){
+				// split on the first "=" only, the value may contain "="
+				var eq = ck[c].indexOf("=");
+				var key = eq == -1 ? ck[c] : ck[c].slice(0, eq);
+				var val = eq == -1 ? "" : ck[c].slice(eq + 1);
+				if(c == 0){
+					cookie.name = key;
+					try{
+						cookie.value = decodeURIComponent(val);
+					} catch(e){
+						// malformed percent-encoding: keep the raw value
+						cookie.value = val;
+					}
+					continue;
+				}
+				// attribute names are case-insensitive ("Path", "HttpOnly", ..)
+				// and must land on the lowercase keys of the cookie object
+				key = key.trim().toLowerCase();
+				switch(key){
+					case "expires":
+						if(!("expires" in cookie))
+							cookie.expires = parseInt((new Date(val)).getTime() / 1000);
+						break;
+					case "max-age":
+						cookie.expires = parseInt(((new Date()).getTime() / 1000) + parseInt(val));
+						break;
+					case "domain":
+					case "path":
+						cookie[key] = val;
+						break;
+					case "httponly":
+					case "secure":
+						cookie[key] = true;
+						break;
+				}
+			}
+
+			//cookie.url = url;
+			//cookie.sameSite = "Lax";
+			if(!cookie.expires) // expires MUST be in seconds ..
+				cookie.expires = parseInt((new Date()).getTime() / 1000) + (60*60*24*365);
+			ret.push(cookie);
+		}
+	}
+	return ret;
+};
+
+
diff --git a/core/crawl/probe/chrome-probe/package.json b/core/crawl/probe/chrome-probe/package.json
new file mode 100644
index 0000000..d935eb3
--- /dev/null
+++ b/core/crawl/probe/chrome-probe/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "htcap-chrome-probe",
+  "version": "1.0.0",
+  "license": "GPL-2.0",
+  "dependencies": {
+    "puppeteer": "^1.10.0"
+  },
+  "devDependencies": {}
+}
diff --git a/core/crawl/probe/chrome-probe/utils.js b/core/crawl/probe/chrome-probe/utils.js
new file mode 100644
index 0000000..dd9f414
--- /dev/null
+++ b/core/crawl/probe/chrome-probe/utils.js
@@ -0,0 +1,346 @@
+/*
+HTCAP - 1.2
+http://htcap.org
+Author: filippo.cavallarin@wearesegment.com
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 2 of the License, or (at your option) any later
+version.
+*/
+
+//"use strict";
+
+const urlparser = require('url').parse
+const fs = require('fs');
+
+
+/**
+ * Parses the command line.
+ *
+ * @param {Array} args the full argv; the first two items are dropped
+ * @param {string} optstring getopt-like option string (see getopt)
+ * @param {Object} defaults default values of the options
+ * @returns {Object} {opts: the options, args: the remaining arguments}
+ * @throws {Error} if an option that requires an argument has none
+ */
+exports.parseArgs = function(args, optstring, defaults){
+	var g = getopt(args, optstring);
+	// getopt reports a missing argument by returning a message instead of
+	// the parsed object
+	if(typeof g == "string")
+		throw new Error(g);
+	g.args.splice(0,2);
+	return {opts:parseArgsToOptions(g, defaults), args:g.args};
+}
+
+exports.usage = usage;
+exports.printRequest = printRequest;
+exports.printLinks = printLinks;
+exports.printForms = printForms;
+exports.printStatus = printStatus;
+exports.printCookies = printCookies;
+
+
+// serialized requests already printed, used to skip duplicates
+var printedRequests = [];
+
+/**
+ * Prints a request as an item of the JSON output, once.
+ * Sets the defaults for method ("GET") and data (null) on req, and replaces
+ * req.url with the filtered url; nothing is printed if the url is filtered out.
+ */
+function printRequest(req){
+	if(!("method" in req))
+		req.method = "GET";
+	if(!("data" in req))
+		req.data = null;
+	req.url = filterUrl(req.url);
+	if(!req.url)
+		return;
+	let jr = JSON.stringify(req);
+	if(printedRequests.indexOf(jr) != -1)
+		return;
+	printedRequests.push(jr);
+	console.log('["request",' + jr + "],");
+
+}
+
+
+/**
+ * Strips the fragment from url and returns it, or null if the url has a
+ * scheme other than http(s) or ftp(s) (or is not a string).
+ */
+function filterUrl(url){
+	if(typeof url != "string")
+		return null;
+	url = url.split("#")[0];
+	// the alternatives are grouped: without the group "^https?" matched
+	// alone, so schemes like "httpx:" passed the filter
+	if(url.match(/^[a-z0-9\-_]+\:/i) && !url.match(/^(https?|ftps?)\:/i)){
+		return null;
+	}
+
+	return url;
+}
+
+/**
+ * Prints a "link" request for every anchor found under rootNode.
+ *
+ * @param {string} rootNode selector of the root element
+ * @param {Object} page object providing $(selector), which resolves to an
+ *        element handle
+ */
+async function printLinks(rootNode, page){
+	var el = await page.$(rootNode);
+	if(!el)return;
+	var links = await el.$$("a");
+	for(let l of links){
+		var v = await l.getProperty('href');
+		var t = await v.jsonValue();
+		var req = { type: "link", url: t};
+		printRequest(req);
+	}
+}
+
+/**
+ * Prints a "form" request for every form found under rootNode.
+ *
+ * @param {string} rootNode selector of the root element
+ * @param {Object} page object providing $(selector), which resolves to an
+ *        element handle
+ */
+async function printForms(rootNode, page){
+	var el = await page.$(rootNode);
+	if(!el)return;
+	var forms = await el.$$("form");
+	for(let f of forms){
+		var req = await getFormAsRequest(f, page);
+		printRequest(req);
+	}
+}
+
+
+
+/**
+ * Prints the cookies and then the final status object, closing the JSON
+ * array of the output. The status is "error" only if there are errors and
+ * no redirect.
+ */
+function printStatus(crawler){
+	var o = {
+		status: "ok"
+	};
+	if(crawler.errors().length > 0 && !crawler.redirect()){
+		o.errors = JSON.stringify(crawler.errors());
+		o.status = "error";
+		o.code = crawler.errors()[0][0];
+		o.message = crawler.errors()[0][1];
+	}
+	if(crawler.redirect()){
+		o.redirect = crawler.redirect();
+	}
+
+	printCookies(crawler);
+	console.log(JSON.stringify(o) + '\n]');
+}
+
+/**
+ * Converts a form element handle to a request object:
+ * {type: "form", method, url, data} where data is the urlencoded list of the
+ * named input, select and textarea values. Buttons and submits are skipped,
+ * checkboxes and radios are included only when checked.
+ */
+async function getFormAsRequest(frm, page){
+
+	var formObj = {type:"form"};
+	var inputs = null;
+
+
+	formObj.method = await (await frm.getProperty("method")).jsonValue();
+
+	if(!formObj.method){
+		formObj.method = "GET";
+	} else {
+		formObj.method = formObj.method.toUpperCase();
+	}
+
+	formObj.url = await (await frm.getProperty("action")).jsonValue();
+	formObj.data = [];
+	inputs = await frm.$$("input, select, textarea");
+	for(let input of inputs){
+		let name = await (await input.getProperty("name")).jsonValue();
+		if(!name) continue;
+		let value = await (await input.getProperty("value")).jsonValue();
+		let tagName = await (await input.getProperty("tagName")).jsonValue();
+		let type = await (await input.getProperty("type")).jsonValue();
+
+		let par = encodeURIComponent(name) + "=" + encodeURIComponent(value);
+		if(tagName == "INPUT" && type != null){
+
+			switch(type.toLowerCase()){
+				case "button":
+				case "submit":
+					break;
+				case "checkbox":
+				case "radio":
+					let checked = await (await input.getProperty("checked")).jsonValue();
+					if(checked)
+						formObj.data.push(par);
+					break;
+				default:
+					formObj.data.push(par);
+			}
+
+		} else {
+			formObj.data.push(par);
+		}
+	}
+
+	formObj.data = formObj.data.join("&");
+
+	return formObj;
+
+};
+
+
+function printCookies(crawler){
+	console.log('["cookies",' + JSON.stringify(crawler.cookies()) + "],")
+}
+
+
+/**
+ * Converts the options parsed by getopt to the options object.
+ *
+ * @param {Object} args the object returned by getopt; args.opts is a list of
+ *        [letter] or [letter, value]
+ * @param {Object} defaults initial values, copied and not modified
+ * @returns {Object} the options
+ */
+function parseArgsToOptions(args, defaults){
+	let options = {};
+	for(var a in defaults){
+		options[a] = defaults[a];
+	}
+	for(var a = 0; a < args.opts.length; a++){
+		switch(args.opts[a][0]){
+			// case "h":
+			// 	//showHelp = true;
+			// 	usage();
+			// 	phantom.exit(1);
+			// 	break;
+			case "V":
+				options.verbose = true;
+				break;
+			case "a":
+				options.checkAjax = false;
+				break;
+			case "f":
+				options.fillValues = false;
+				break;
+			case "t":
+				options.triggerEvents = false;
+				break;
+			case "d": // unused
+				options.printAjaxPostData = false;
+				// this break was missing: -d fell through and also disabled
+				// checkScriptInsertion
+				break;
+			case "S": // unused
+				options.checkScriptInsertion = false;
+				break;
+			case "I": // unused
+				options.loadImages = true;
+				break;
+			case "C": // unused
+				options.getCookies = false;
+				break;
+
+			case "c":
+				try{
+					var cookie_file = fs.readFileSync(args.opts[a][1]);
+					options.setCookies = JSON.parse(cookie_file);
+				} catch(e){
+					console.log(e);
+					// `phantom` does not exist under node
+					process.exit(1);
+				}
+
+				break;
+			case "p":
+				// user:password, the password may contain ":"
+				var arr = args.opts[a][1].split(":");
+				options.httpAuth = [arr[0], arr.slice(1).join(":")];
+				break;
+			case "M": // unused
+				options.mapEvents = false;
+				break;
+			case "T": // unused
+				options.triggerAllMappedEvents = false;
+				break;
+			case "s": // unused
+				options.checkWebsockets = false;
+				break;
+			case "x":
+				// given in seconds, stored in milliseconds
+				options.maxExecTime = parseInt(args.opts[a][1]) * 1000;
+				break;
+			case "A":
+				options.userAgent = args.opts[a][1];
+				break;
+			case "r":
+				options.referer = args.opts[a][1];
+				break;
+			case "m":
+				options.outputMappedEvents = true;
+				break;
+			case "H":
+				options.returnHtml = true;
+				break;
+			case "X": // @TODO to be reviewed
+				options.excludedUrls = args.opts[a][1].split(",");
+				break;
+			case "O":
+				options.overrideTimeoutFunctions = false;
+				break;
+			case "i":
+				options.id = args.opts[a][1];
+				break;
+			case "K":
+				options.preventElementRemoval = true;
+				break;
+			case "R":
+				options.randomSeed = args.opts[a][1];
+				break;
+			case "P":
+				options.loadWithPost = true;
+				break;
+			case "D":
+				options.postData = args.opts[a][1];
+				break;
+			case "y":
+				// proto:host:port is converted to proto://host:port
+				var tmp = args.opts[a][1].split(":");
+				if(tmp.length > 2){
+					options.proxy = tmp[0] + "://" + tmp[1] + ":" + tmp[2];
+				} else {
+					options.proxy = args.opts[a][1];
+				}
+				break;
+			case "l":
+				options.headlessChrome = false;
+				break;
+
+		}
+	}
+	return options;
+};
+
+
+
+
+
+
+/**
+ * Minimal getopt. Works on a copy of args.
+ *
+ * @param {Array} args command line arguments
+ * @param {string} optstring option letters, a letter followed by ":" takes
+ *        an argument
+ * @returns {Object|string} {opts: [[letter, value?], ..], args: the arguments
+ *          left after removing the options}, or an error message if an
+ *          option is missing its argument
+ */
+// @todo error on Unknown option ds
+function getopt(args, optstring){
+	var args = args.slice();
+	var ret = {
+		opts: [],
+		args: args
+	};
+
+	// match() returns null when optstring contains no option letters
+	var m = optstring.match(/[a-zA-Z]\:*/g) || [];
+	for(var a = 0; a < m.length; a++){
+		var ai = args.indexOf("-" + m[a][0]);
+		if(ai > -1){
+			if(m[a][1] == ":"){
+				if(args[ai+1]){
+					ret.opts.push([m[a][0], args[ai+1]]);
+					args.splice(ai,2);
+				} else {
+					return "missing argumnet for option " + m[a][0];
+				}
+			} else {
+				ret.opts.push([m[a][0]]);
+				args.splice(ai,1);
+			}
+		}
+	}
+
+	return ret;
+}
+
+
+
+
+function usage(){ + var usage = "Usage: analyze.js [options] \n" + " -V verbose\n" + " -a don't check ajax\n" + " -f don't fill values\n" + " -t don't trigger events (onload only)\n" + " -s don't check websockets\n" + " -M dont' map events\n" + " -T don't trigger mapped events\n" + " -S don't check for