adding some features #12

Open
wants to merge 2 commits into master
Changes from all commits
README.md: 8 changes (5 additions & 3 deletions)
@@ -12,10 +12,10 @@ Updated to v.0.3 | Whats new:
**ToDo**

- [x] Crawl for a specific index
- [ ] Implementation of multithreading
- [ ] Allowing a range of years as input
- [x] Implementation of multithreading
- [x] Allowing a range of years as input
- [ ] Implementing `direct-grep`
- [ ] Temporary file-writing
- [x] Temporary file-writing

**Usage**
```
@@ -44,3 +44,5 @@ cat github_18.txt | grep user

**Dependencies**
* Python3

> This is a fork of the main repository; I just added some missing features.
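
The updated usage text itself is collapsed out of this hunk, but judging from the argparse options defined in cc.py below, the new year-range input would be exercised roughly like this (`example.com` and `out.txt` are placeholders, not values from the PR):

```
# crawl every index from 2018 through 2020 and write the unique URLs to out.txt
python3 cc.py example.com -y 2018-2020 -o out.txt

# refresh the cached index list (index.txt), then crawl all indexes
python3 cc.py example.com -u
```
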
cc.py: 181 changes (82 additions & 99 deletions)
@@ -1,123 +1,106 @@
#!/usr/bin/env python3
import requests, json, os
import argparse, datetime, pathlib
from concurrent.futures import ThreadPoolExecutor

parser = argparse.ArgumentParser()

parser.add_argument('domain', help = 'domain which will be crawled for', type = str)
parser.add_argument('-y', '--year', help = 'limit the result to a specific year (default: all)', type = str)
parser.add_argument('-o', '--out', help = 'specify an output file (default: domain.txt)', type = str)
parser.add_argument('-l', '--list', help = 'Lists all available indexes', action = 'store_true')
parser.add_argument('-i', '--index', help = 'Crawl for a specific index (this will crawl all pages!)', type = str)
parser.add_argument('-u', '--update', help = 'Update index file', action = 'store_true')
parser.add_argument('domain', help='domain which will be crawled for', type=str)
parser.add_argument('-y', '--year', help='limit the result to a specific year or range of years (default: all)', type=str)
parser.add_argument('-o', '--out', help='specify an output file (default: domain.txt)', type=str)
parser.add_argument('-l', '--list', help='Lists all available indexes', action='store_true')
parser.add_argument('-i', '--index', help='Crawl for a specific index (this will crawl all pages!)', type=str)
parser.add_argument('-u', '--update', help='Update index file', action='store_true')

args = parser.parse_args()
links = []
links = set()
out = ''

indexes = []

def getData(index, page):
    global links
    global out

    data = requests.get('http://index.commoncrawl.org/' + index + '-index?url=*.' + args.domain + '&output=json&page=' + page)
    data = data.text.split('\n')[:-1]

    for entry in data:
        link = json.loads(entry)['url']

        if link not in links:
            out = out + (link + '\n')


def crawlAll():
    #Below code assumes that array is sorted
    #Reverse sort ensures that most recent years are prioritised, not essential.
    currentyear=0
    indexes.sort(reverse=1)

    for index in indexes:
        if currentyear != index.split("-")[2]:
            currentyear = index.split("-")[2]
            print("[!] Processing year: " + currentyear)
        print('[-] ' + index)
        getData(index, '')

#
def crawlSpecific(domain, year):
    #index = indexes.get('y' + year)
    print('[!] Processing year: ' + year)

    for index in indexes:
        if year in index:
            print('[-] ' + index)
            crawlIndex(domain, index)


def crawlIndex(domain, index):
    url = 'http://index.commoncrawl.org/' + index + '-index?url=*.' + domain + '&output=json&showNumPages=true'
    data = requests.get(url).text
    try:
        pages = json.loads(data)['pages']
        print('[-] Collected ' + str(pages) + ' pages')

        for i in range(0, pages):
            getData(index, str(i))
            print('[-] Processing page #' + str(i))

    except:
        print('[!] Error reading index')
        pass
    global links
    global out
    data = requests.get(f'http://index.commoncrawl.org/{index}-index?url=*.{args.domain}&output=json&page={page}')
    data = data.text.split('\n')[:-1]

    for entry in data:
        link = json.loads(entry)['url']

        if link not in links:
            links.add(link)
            out = out + (link + '\n')
            with open('./temp.tmp', 'a') as tmp_file:
                tmp_file.write(link + '\n')

def threadedCrawlIndex(index):
    print('[-] ' + index)
    url = f'http://index.commoncrawl.org/{index}-index?url=*.{args.domain}&output=json&showNumPages=true'
    data = requests.get(url).text
    try:
        pages = json.loads(data)['pages']
        print(f'[-] Collected {pages} pages')

        with ThreadPoolExecutor() as executor:
            executor.map(lambda x: getData(index, str(x)), range(pages))
    except:
        print('[!] Error reading index')
        pass

def readIndexFile(index_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt")):
    global indexes
    #check when the index file was last updated
    path = pathlib.Path(index_filename)
    last_updated=datetime.datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y-%m-%d')
    print("Index file last updated on:", last_updated, "run with -u to update.")

    with open(index_filename, "r") as f:
        indexes = f.read().split('\n')[:-1]
    global indexes
    path = pathlib.Path(index_filename)
    last_updated = datetime.datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y-%m-%d')
    print(f"Index file last updated on: {last_updated}. Run with -u to update.")

    with open(index_filename, "r") as f:
        indexes = f.read().split('\n')[:-1]

def updateIndexFile():
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt"), "w") as f:
        url = "https://index.commoncrawl.org/collinfo.json"
        data = requests.get(url).text
        raw_indexes = json.loads(data)
        for index in raw_indexes:
            indexes.append(index['id'])
            f.write(index['id']+"\n")


#check if we need to update the index file or just read it into the array
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt"), "w") as f:
        url = "https://index.commoncrawl.org/collinfo.json"
        data = requests.get(url).text
        raw_indexes = json.loads(data)

        for index in raw_indexes:
            indexes.append(index['id'])
            f.write(index['id'] + "\n")

if args.update:
    updateIndexFile()
    updateIndexFile()
else:
    readIndexFile()
    readIndexFile()

if args.list:
    for index in indexes:
        print('[-] ' + index)
    for index in indexes:
        print('[-] ' + index)
else:
    if args.index:
        crawlIndex(args.domain, args.index)
    elif args.year:
        crawlSpecific(args.domain, args.year)
    else:
        crawlAll()
    years = args.year.split("-") if args.year else []
    if len(years) > 0:
        start_year, end_year = years[0], years[-1] if len(years) > 1 else years[0]
    else:
        start_year, end_year = None, None

    if args.index:
        threadedCrawlIndex(args.index)
    else:
        for index in indexes:
            year = index.split("-")[2]
            if args.year:
                if int(year) >= int(start_year) and int(year) <= int(end_year):
                    threadedCrawlIndex(index)
            else:
                threadedCrawlIndex(index)

if out:
    if args.out:
        path = os.path.abspath(args.out)
        result = open(path, 'w')
        output = str(args.out)
    else:
        result = open('./' + args.domain + '.txt', 'w')
        output = str(args.domain + '.txt')

    print('[-] Writing to file ...')
    result.write(out)

    print('[!] Done, file written: ./' + output)
    if args.out:
        path = os.path.abspath(args.out)
        result = open(path, 'w')
        output = str(args.out)
    else:
        result = open(f'./{args.domain}.txt', 'w')
        output = str(f'{args.domain}.txt')
    print('[-] Writing to file ...')
    result.write(out)
    os.remove('./temp.tmp')
    print(f'[!] Done, file written: ./{output}')
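
To make the new crawl path easier to follow, here is a minimal, self-contained sketch of the pattern `threadedCrawlIndex` and `getData` implement together: resolve an index to its page count, fan the page requests out over a thread pool, and deduplicate the returned URLs in a set. The names `fetch_page`, `INDEX` and `DOMAIN` are illustrative placeholders rather than identifiers from the script, and the temp-file append and error handling are left out.

```python
# Sketch of the fan-out used by threadedCrawlIndex/getData (placeholder names).
import json
import requests
from concurrent.futures import ThreadPoolExecutor

INDEX = 'CC-MAIN-2018-17'   # an example Common Crawl index id
DOMAIN = 'example.com'      # placeholder domain
BASE = f'http://index.commoncrawl.org/{INDEX}-index?url=*.{DOMAIN}&output=json'

def fetch_page(page):
    # each non-empty line of a result page is one JSON record with a 'url' field
    text = requests.get(f'{BASE}&page={page}').text
    return [json.loads(line)['url'] for line in text.splitlines() if line.strip()]

# ask the index how many result pages exist for this URL pattern
pages = requests.get(f'{BASE}&showNumPages=true').json()['pages']

# fetch every page on the thread pool and collect the unique URLs
with ThreadPoolExecutor() as executor:
    urls = {url for page_urls in executor.map(fetch_page, range(pages)) for url in page_urls}

print(f'[-] Collected {len(urls)} unique URLs from {pages} pages')
```

In the actual script the workers also append each new link to ./temp.tmp as they find it, which is why the final write step now removes that file once the deduplicated output has been written.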