adding some features #12

Open
wants to merge 2 commits into master
Changes from all commits
README.md: 8 changes (5 additions & 3 deletions)
@@ -12,10 +12,10 @@ Updated to v.0.3 | Whats new:
**ToDo**

- [x] Crawl for a specific index
- [ ] Implementation of multithreading
- [ ] Allowing a range of years as input
- [x] Implementation of multithreading
- [x] Allowing a range of years as input
- [ ] Implementing `direct-grep`
- [ ] Temporary file-writing
- [x] Temporary file-writing

**Usage**
```
@@ -44,3 +44,5 @@ cat github_18.txt | grep user

**Dependencies**
* Python3

> This is a fork of the main repository; I just added some missing features.
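
The updated usage text itself is collapsed out of this hunk, but judging from the argparse options defined in cc.py below, the new year-range input would be exercised roughly like this (`example.com` and `out.txt` are placeholders, not values from the PR):

```
# crawl every index from 2018 through 2020 and write the unique URLs to out.txt
python3 cc.py example.com -y 2018-2020 -o out.txt

# refresh the cached index list (index.txt), then crawl all indexes
python3 cc.py example.com -u
```
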
cc.py: 181 changes (82 additions & 99 deletions)
@@ -1,123 +1,106 @@
#!/usr/bin/env python3
import requests, json, os
import argparse, datetime, pathlib
from concurrent.futures import ThreadPoolExecutor

parser = argparse.ArgumentParser()

parser.add_argument('domain', help = 'domain which will be crawled for', type = str)
parser.add_argument('-y', '--year', help = 'limit the result to a specific year (default: all)', type = str)
parser.add_argument('-o', '--out', help = 'specify an output file (default: domain.txt)', type = str)
parser.add_argument('-l', '--list', help = 'Lists all available indexes', action = 'store_true')
parser.add_argument('-i', '--index', help = 'Crawl for a specific index (this will crawl all pages!)', type = str)
parser.add_argument('-u', '--update', help = 'Update index file', action = 'store_true')
parser.add_argument('domain', help='domain which will be crawled for', type=str)
parser.add_argument('-y', '--year', help='limit the result to a specific year or range of years (default: all)', type=str)
parser.add_argument('-o', '--out', help='specify an output file (default: domain.txt)', type=str)
parser.add_argument('-l', '--list', help='Lists all available indexes', action='store_true')
parser.add_argument('-i', '--index', help='Crawl for a specific index (this will crawl all pages!)', type=str)
parser.add_argument('-u', '--update', help='Update index file', action='store_true')

args = parser.parse_args()
links = []
links = set()
out = ''

indexes = []

def getData(index, page):
    global links
    global out

    data = requests.get('http://index.commoncrawl.org/' + index + '-index?url=*.' + args.domain + '&output=json&page=' + page)
    data = data.text.split('\n')[:-1]

    for entry in data:
        link = json.loads(entry)['url']

        if link not in links:
            out = out + (link + '\n')


def crawlAll():
    #Below code assumes that array is sorted
    #Reverse sort ensures that most recent years are prioritised, not essential.
    currentyear=0
    indexes.sort(reverse=1)

    for index in indexes:
        if currentyear != index.split("-")[2]:
            currentyear = index.split("-")[2]
            print("[!] Processing year: " + currentyear)
        print('[-] ' + index)
        getData(index, '')

#
def crawlSpecific(domain, year):
    #index = indexes.get('y' + year)
    print('[!] Processing year: ' + year)

    for index in indexes:
        if year in index:
            print('[-] ' + index)
            crawlIndex(domain, index)


def crawlIndex(domain, index):
    url = 'http://index.commoncrawl.org/' + index + '-index?url=*.' + domain + '&output=json&showNumPages=true'
    data = requests.get(url).text
    try:
        pages = json.loads(data)['pages']
        print('[-] Collected ' + str(pages) + ' pages')

        for i in range(0, pages):
            getData(index, str(i))
            print('[-] Processing page #' + str(i))

    except:
        print('[!] Error reading index')
        pass
    global links
    global out
    data = requests.get(f'http://index.commoncrawl.org/{index}-index?url=*.{args.domain}&output=json&page={page}')
    data = data.text.split('\n')[:-1]

    for entry in data:
        link = json.loads(entry)['url']

        if link not in links:
            links.add(link)
            out = out + (link + '\n')
            with open('./temp.tmp', 'a') as tmp_file:
                tmp_file.write(link + '\n')

def threadedCrawlIndex(index):
    print('[-] ' + index)
    url = f'http://index.commoncrawl.org/{index}-index?url=*.{args.domain}&output=json&showNumPages=true'
    data = requests.get(url).text
    try:
        pages = json.loads(data)['pages']
        print(f'[-] Collected {pages} pages')

        with ThreadPoolExecutor() as executor:
            executor.map(lambda x: getData(index, str(x)), range(pages))
    except:
        print('[!] Error reading index')
        pass

def readIndexFile(index_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt")):
    global indexes
    #check when the index file was last updated
    path = pathlib.Path(index_filename)
    last_updated=datetime.datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y-%m-%d')
    print("Index file last updated on:", last_updated, "run with -u to update.")

    with open(index_filename, "r") as f:
        indexes = f.read().split('\n')[:-1]
    global indexes
    path = pathlib.Path(index_filename)
    last_updated = datetime.datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y-%m-%d')
    print(f"Index file last updated on: {last_updated}. Run with -u to update.")

    with open(index_filename, "r") as f:
        indexes = f.read().split('\n')[:-1]

def updateIndexFile():
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt"), "w") as f:
        url = "https://index.commoncrawl.org/collinfo.json"
        data = requests.get(url).text
        raw_indexes = json.loads(data)
        for index in raw_indexes:
            indexes.append(index['id'])
            f.write(index['id']+"\n")


#check if we need to update the index file or just read it into the array
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.txt"), "w") as f:
        url = "https://index.commoncrawl.org/collinfo.json"
        data = requests.get(url).text
        raw_indexes = json.loads(data)

        for index in raw_indexes:
            indexes.append(index['id'])
            f.write(index['id'] + "\n")

if args.update:
    updateIndexFile()
    updateIndexFile()
else:
    readIndexFile()
    readIndexFile()

if args.list:
    for index in indexes:
        print('[-] ' + index)
    for index in indexes:
        print('[-] ' + index)
else:
    if args.index:
        crawlIndex(args.domain, args.index)
    elif args.year:
        crawlSpecific(args.domain, args.year)
    else:
        crawlAll()
    years = args.year.split("-") if args.year else []
    if len(years) > 0:
        start_year, end_year = years[0], years[-1] if len(years) > 1 else years[0]
    else:
        start_year, end_year = None, None

    if args.index:
        threadedCrawlIndex(args.index)
    else:
        for index in indexes:
            year = index.split("-")[2]
            if args.year:
                if int(year) >= int(start_year) and int(year) <= int(end_year):
                    threadedCrawlIndex(index)
            else:
                threadedCrawlIndex(index)

if out:
    if args.out:
        path = os.path.abspath(args.out)
        result = open(path, 'w')
        output = str(args.out)
    else:
        result = open('./' + args.domain + '.txt', 'w')
        output = str(args.domain + '.txt')

    print('[-] Writing to file ...')
    result.write(out)

    print('[!] Done, file written: ./' + output)
    if args.out:
        path = os.path.abspath(args.out)
        result = open(path, 'w')
        output = str(args.out)
    else:
        result = open(f'./{args.domain}.txt', 'w')
        output = str(f'{args.domain}.txt')
    print('[-] Writing to file ...')
    result.write(out)
    os.remove('./temp.tmp')
    print(f'[!] Done, file written: ./{output}')
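
To make the new crawl path easier to follow, here is a minimal, self-contained sketch of the pattern `threadedCrawlIndex` and `getData` implement together: resolve an index to its page count, fan the page requests out over a thread pool, and deduplicate the returned URLs in a set. The names `fetch_page`, `INDEX` and `DOMAIN` are illustrative placeholders rather than identifiers from the script, and the temp-file append and error handling are left out.

```python
# Sketch of the fan-out used by threadedCrawlIndex/getData (placeholder names).
import json
import requests
from concurrent.futures import ThreadPoolExecutor

INDEX = 'CC-MAIN-2018-17'   # an example Common Crawl index id
DOMAIN = 'example.com'      # placeholder domain
BASE = f'http://index.commoncrawl.org/{INDEX}-index?url=*.{DOMAIN}&output=json'

def fetch_page(page):
    # each non-empty line of a result page is one JSON record with a 'url' field
    text = requests.get(f'{BASE}&page={page}').text
    return [json.loads(line)['url'] for line in text.splitlines() if line.strip()]

# ask the index how many result pages exist for this URL pattern
pages = requests.get(f'{BASE}&showNumPages=true').json()['pages']

# fetch every page on the thread pool and collect the unique URLs
with ThreadPoolExecutor() as executor:
    urls = {url for page_urls in executor.map(fetch_page, range(pages)) for url in page_urls}

print(f'[-] Collected {len(urls)} unique URLs from {pages} pages')
```

In the actual script the workers also append each new link to ./temp.tmp as they find it, which is why the final write step now removes that file once the deduplicated output has been written.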