diff --git a/README.md b/README.md
index 49ad15f..527e305 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,10 @@ Updated to v.0.3 | Whats new:
 
 **ToDo**
 - [x] Crawl for a specific index
-- [ ] Implementation of multithreading
-- [ ] Allowing a range of years as input
+- [x] Implementation of multithreading
+- [x] Allowing a range of years as input
 - [ ] Implementing `direct-grep`
-- [ ] Temporary file-writing
+- [x] Temporary file-writing
 
 **Usage**
 ```
@@ -44,3 +44,5 @@ cat github_18.txt | grep user
 
 **Dependencies**
 * Python3
+
+> This is a fork of the main repository; I just added some missing features.
diff --git a/cc.py b/cc.py
index 88a4f3b..11993b9 100755
--- a/cc.py
+++ b/cc.py
@@ -1,123 +1,106 @@
 #!/usr/bin/env python3
 import requests, json, os
 import argparse, datetime, pathlib
+from concurrent.futures import ThreadPoolExecutor
 
 parser = argparse.ArgumentParser()
-parser.add_argument('domain', help = 'domain which will be crawled for', type = str)
-parser.add_argument('-y', '--year', help = 'limit the result to a specific year (default: all)', type = str)
-parser.add_argument('-o', '--out', help = 'specify an output file (default: domain.txt)', type = str)
-parser.add_argument('-l', '--list', help = 'Lists all available indexes', action = 'store_true')
-parser.add_argument('-i', '--index', help = 'Crawl for a specific index (this will crawl all pages!)', type = str)
-parser.add_argument('-u', '--update', help = 'Update index file', action = 'store_true')
+parser.add_argument('domain', help='domain which will be crawled for', type=str)
+parser.add_argument('-y', '--year', help='limit the result to a specific year or range of years (default: all)', type=str)
+parser.add_argument('-o', '--out', help='specify an output file (default: domain.txt)', type=str)
+parser.add_argument('-l', '--list', help='Lists all available indexes', action='store_true')
+parser.add_argument('-i', '--index', help='Crawl for a specific index (this will crawl all pages!)', type=str)
+parser.add_argument('-u', '--update', help='Update index file', action='store_true')
 args = parser.parse_args()
 
-links = []
+links = set()
 out = ''
-
 indexes = []
 
 def getData(index, page):
-	global links
-	global out
-
-	data = requests.get('http://index.commoncrawl.org/' + index + '-index?url=*.' + args.domain + '&output=json&page=' + page)
-	data = data.text.split('\n')[:-1]
-
-	for entry in data:
-		link = json.loads(entry)['url']
-
-		if link not in links:
-			out = out + (link + '\n')
-
-
-def crawlAll():
-	#Below code assumes that array is sorted
-	#Reverse sort ensures that most recent years are prioritised, not essential.
-	currentyear=0
-	indexes.sort(reverse=1)
-
-	for index in indexes:
-		if currentyear != index.split("-")[2]:
-			currentyear = index.split("-")[2]
-			print("[!] Processing year: " + currentyear)
-		print('[-] ' + index)
-		getData(index, '')
-
-#
-def crawlSpecific(domain, year):
-	#index = indexes.get('y' + year)
-	print('[!] Processing year: ' + year)
-
-	for index in indexes:
-		if year in index:
-			print('[-] ' + index)
-			crawlIndex(domain, index)
-
-
-def crawlIndex(domain, index):
-	url = 'http://index.commoncrawl.org/' + index + '-index?url=*.' + domain + '&output=json&showNumPages=true'
-	data = requests.get(url).text
-	try:
-		pages = json.loads(data)['pages']
-		print('[-] Collected ' + str(pages) + ' pages')
-
-		for i in range(0, pages):
-			getData(index, str(i))
-			print('[-] Processing page #' + str(i))
-
-	except:
-		print('[!] Error reading index')
-		pass
+    global links
+    global out
+    data = requests.get(f'http://index.commoncrawl.org/{index}-index?url=*.{args.domain}&output=json&page={page}')
+    data = data.text.split('\n')[:-1]
+
+    for entry in data:
+        link = json.loads(entry)['url']
+
+        if link not in links:
+            links.add(link)
+            out = out + (link + '\n')
+            with open('./temp.tmp', 'a') as tmp_file:
+                tmp_file.write(link + '\n')
+
+def threadedCrawlIndex(index):
+    print('[-] ' + index)
+    url = f'http://index.commoncrawl.org/{index}-index?url=*.{args.domain}&output=json&showNumPages=true'
+    data = requests.get(url).text
+    try:
+        pages = json.loads(data)['pages']
+        print(f'[-] Collected {pages} pages')
+
+        with ThreadPoolExecutor() as executor:
+            executor.map(lambda x: getData(index, str(x)), range(pages))
+    except:
+        print('[!] Error reading index')
+        pass
 
 def readIndexFile(index_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt")):
-	global indexes
-	#check when the index file was last updated
-	path = pathlib.Path(index_filename)
-	last_updated=datetime.datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y-%m-%d')
-	print("Index file last updated on:", last_updated, "run with -u to update.")
-
-	with open(index_filename, "r") as f:
-		indexes = f.read().split('\n')[:-1]
+    global indexes
+    path = pathlib.Path(index_filename)
+    last_updated = datetime.datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y-%m-%d')
+    print(f"Index file last updated on: {last_updated}. Run with -u to update.")
+
+    with open(index_filename, "r") as f:
+        indexes = f.read().split('\n')[:-1]
 
 def updateIndexFile():
-	with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt"), "w") as f:
-		url = "https://index.commoncrawl.org/collinfo.json"
-		data = requests.get(url).text
-		raw_indexes = json.loads(data)
-		for index in raw_indexes:
-			indexes.append(index['id'])
-			f.write(index['id']+"\n")
-
-
-#check if we need to update the index file or just read it into the array
+    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt"), "w") as f:
+        url = "https://index.commoncrawl.org/collinfo.json"
+        data = requests.get(url).text
+        raw_indexes = json.loads(data)
+
+        for index in raw_indexes:
+            indexes.append(index['id'])
+            f.write(index['id'] + "\n")
 
 if args.update:
-	updateIndexFile()
+    updateIndexFile()
 else:
-	readIndexFile()
+    readIndexFile()
 
 if args.list:
-	for index in indexes:
-		print('[-] ' + index)
+    for index in indexes:
+        print('[-] ' + index)
 else:
-	if args.index:
-		crawlIndex(args.domain, args.index)
-	elif args.year:
-		crawlSpecific(args.domain, args.year)
-	else:
-		crawlAll()
+    years = args.year.split("-") if args.year else []
+    if len(years) > 0:
+        start_year, end_year = years[0], years[-1] if len(years) > 1 else years[0]
+    else:
+        start_year, end_year = None, None
+
+    if args.index:
+        threadedCrawlIndex(args.index)
+    else:
+        for index in indexes:
+            year = index.split("-")[2]
+            if args.year:
+                if int(year) >= int(start_year) and int(year) <= int(end_year):
+                    threadedCrawlIndex(index)
+            else:
+                threadedCrawlIndex(index)
 
 if out:
-	if args.out:
-		path = os.path.abspath(args.out)
-		result = open(path, 'w')
-		output = str(args.out)
-	else:
-		result = open('./' + args.domain + '.txt', 'w')
-		output = str(args.domain + '.txt')
-
-	print('[-] Writing to file ...')
-	result.write(out)
-
-	print('[!] Done, file written: ./' + output)
+    if args.out:
+        path = os.path.abspath(args.out)
+        result = open(path, 'w')
+        output = str(args.out)
+    else:
+        result = open(f'./{args.domain}.txt', 'w')
+        output = str(f'{args.domain}.txt')
+
+    print('[-] Writing to file ...')
+    result.write(out)
+    os.remove('./temp.tmp')
+    print(f'[!] Done, file written: ./{output}')
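
For readers skimming the patch, here is a minimal standalone sketch of the two patterns it introduces: filtering index IDs by a year range and fanning page requests out over a `ThreadPoolExecutor`. The helper names (`in_year_range`, `fetch_page`, `fetch_index_pages`) and the example index IDs are illustrative only and are not part of cc.py.

```python
#!/usr/bin/env python3
# Illustrative sketch only: isolates the year-range filter and the threaded
# page fetch used in the patch; these helpers do not exist in cc.py.
import json
import requests
from concurrent.futures import ThreadPoolExecutor

BASE = 'http://index.commoncrawl.org'

def in_year_range(index_id, start_year, end_year):
    # Common Crawl index IDs look like 'CC-MAIN-2023-06'; the third field is the year.
    year = int(index_id.split('-')[2])
    return int(start_year) <= year <= int(end_year)

def fetch_page(index_id, domain, page):
    # One result page of newline-delimited JSON records for *.domain.
    resp = requests.get(f'{BASE}/{index_id}-index?url=*.{domain}&output=json&page={page}')
    urls = set()
    for line in resp.text.splitlines():
        try:
            urls.add(json.loads(line)['url'])
        except (ValueError, KeyError):
            continue  # skip error messages and malformed lines
    return urls

def fetch_index_pages(index_id, domain):
    # Ask the index how many pages the query spans, then fetch them in parallel.
    resp = requests.get(f'{BASE}/{index_id}-index?url=*.{domain}&output=json&showNumPages=true')
    pages = json.loads(resp.text)['pages']
    with ThreadPoolExecutor() as executor:
        results = executor.map(lambda p: fetch_page(index_id, domain, p), range(pages))
    return set().union(*results)

if __name__ == '__main__':
    # Example run over a hand-picked index list (cc.py reads these from index.txt).
    indexes = ['CC-MAIN-2023-06', 'CC-MAIN-2022-49']
    links = set()
    for idx in indexes:
        if in_year_range(idx, 2022, 2023):
            links |= fetch_index_pages(idx, 'example.com')
    print('\n'.join(sorted(links)))
```

Unlike the patch, this sketch merges per-thread results at the end rather than appending to shared globals and a temp file, which sidesteps interleaved writes from concurrent threads.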