Add e-mail index
helviojunior committed Jan 15, 2025
1 parent 57ddbc7 commit 2c52275
Showing 2 changed files with 88 additions and 8 deletions.
95 changes: 87 additions & 8 deletions filecrawler/cmd/elastic.py
@@ -23,12 +23,15 @@ class Elastic(CrawlerBase):
_CREDS_WHITE_LIST = []
_CONTROL_KEYS = ["indexing_date", "fingerprint", "filename", "extension",
"mime_type", "file_size", "path_virtual", "path_real"]
_regex = None
_regex_url = None
_regex_email = None

def __init__(self):
super().__init__('elastic', 'Integrate to elasticsearch')
self._regex = re.compile(
self._regex_url = re.compile(
r"(?i)(https?:\/\/[^ '\"<>,;?&\(\)\{\}]*)")
self._regex_email = re.compile(
r"(?i)(?i)(?:[a-z0-9+!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])")

def add_flags(self, flags: _ArgumentGroup):
pass
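
For context, a minimal standalone sketch (not part of the commit) of how the two patterns above are meant to be used. To keep the example short, the committed RFC-style e-mail expression is replaced by a simplified stand-in; only the URL pattern is copied verbatim.

import re

# URL pattern as committed above
regex_url = re.compile(r"(?i)(https?:\/\/[^ '\"<>,;?&\(\)\{\}]*)")
# Simplified stand-in for the committed e-mail pattern (illustration only)
regex_email = re.compile(r"(?i)[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}")

sample = "Url: https://example.com/login USER: someone@example.com"
print(regex_url.findall(sample))    # ['https://example.com/login']
print(regex_email.findall(sample))  # ['someone@example.com']
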
@@ -173,7 +176,6 @@ def pre_run(self, **data):
'mappings': {
'properties': {
'indexing_date': {'type': 'date'},
'fingerprint': {'type': 'keyword'},
'filename': {'type': 'text'},
'scheme': {'type': 'keyword'},
'host': {'type': 'keyword'},
@@ -189,6 +191,28 @@ def pre_run(self, **data):
body=request_body
)

if not es.indices.exists(index=Configuration.index_name + '_emails'):
request_body = {
"settings": {
"number_of_replicas": 1,
"index": {"highlight.max_analyzed_offset": 10000000}
},

'mappings': {
'properties': {
'indexing_date': {'type': 'date'},
'filename': {'type': 'text'},
'email': {'type': 'keyword'},
'host': {'type': 'keyword'}
}
}
}

es.indices.create(
index=Configuration.index_name + '_emails',
body=request_body
)

if not es.indices.exists(index='.ctrl_' + Configuration.index_name):
request_body = {
"settings": {
@@ -285,10 +309,34 @@ def integrate(self, **data):
raise Exception(f'Cannot insert elasticsearch data: {res}')

for url in self.get_urliter(data.get('content', '')):
es.index(index=Configuration.index_name + '_urls',
id=url['fingerprint'],
document={**url, **{'indexing_date': data['indexing_date']}}
)
if url.get('url', None) is not None:
es.index(index=Configuration.index_name + '_urls',
id=url['url'],
document={
**url,
**{
'indexing_date': data['indexing_date'],
'filename': data.get('filename', ''),
'path_real': data.get('path_real', ''),
'path_virtual': data.get('path_virtual', '')
}
}
)

for email in self.get_emailiter(data.get('content', '')):
if email.get('email', None) is not None:
es.index(index=Configuration.index_name + '_emails',
id=email['email'],
document={
**email,
**{
'indexing_date': data['indexing_date'],
'filename': data.get('filename', ''),
'path_real': data.get('path_real', ''),
'path_virtual': data.get('path_virtual', '')
}
}
)

es.index(index='.ctrl_' + Configuration.index_name, id=id, document={
k: v
@@ -299,14 +347,45 @@ def integrate(self, **data):
except (elastic_transport.ConnectionError, requests.exceptions.ConnectionError) as e:
raise IntegrationError(e)

def get_emailiter(self, text):
text = text.encode('utf-8', 'ignore').decode('unicode-escape')
text = text.replace("\"", "\n").replace("'", "\n")
pos = 0
while m := self._regex_email.search(text, pos):
pos = m.end()
yield from Elastic._text_to_emailobj(m[0])

def get_urliter(self, text):
text = text.encode('utf-8', 'ignore').decode('unicode-escape')
text = text.replace("\"", "\n").replace("'", "\n")
pos = 0
while m := self._regex.search(text, pos):
while m := self._regex_url.search(text, pos):
pos = m.end()
yield from Elastic._text_to_urlobj(m[0])

@staticmethod
def _text_to_emailobj(email_text):
if email_text is None:
yield {}

email_text = email_text.lower().strip('"\' ()[]{}\r\n\t')
data = {
'email': email_text,
'host': '',
}

        try:
            if (idx := email_text.rfind('@')) != -1:
                h = email_text[idx + 1:]
                if len(h.strip()) > 0:
                    data['host'] = h

        except Exception as e:
            print(e)

yield data

@staticmethod
def _text_to_urlobj(url_text):
try:
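
As a usage illustration (not part of this commit), here is one way the new *_emails index could be queried once populated. This is a sketch assuming elasticsearch-py 8.x (consistent with the document= keyword used above), an unauthenticated local cluster, and an index prefix of 'filecrawler'; all three are assumptions, not values taken from the repository.

from elasticsearch import Elasticsearch

es = Elasticsearch('http://127.0.0.1:9200')  # assumed local endpoint, no auth

# Top e-mail domains seen across crawled files
res = es.search(
    index='filecrawler_emails',  # hypothetical '<index_name>_emails' name
    size=0,
    aggs={'domains': {'terms': {'field': 'host', 'size': 10}}},
)
for bucket in res['aggregations']['domains']['buckets']:
    print(bucket['key'], bucket['doc_count'])
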
1 change: 1 addition & 0 deletions filecrawler/rules/leaked1.py
@@ -22,6 +22,7 @@ def __init__(self):
]

self._fps = [
"Url: http://domain.com.br/login\nUSER: \nPASS: "
]

def post_processor(self, original_data: str, found: str) -> dict:
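
For reference, a hypothetical document as the integrate() loop in elastic.py would write it to the *_emails index (the e-mail address itself is used as the document id); every value below is illustrative, not taken from the repository:

{
    'email': 'someone@example.com',
    'host': 'example.com',
    'indexing_date': '2025-01-15T10:00:00',
    'filename': 'dump.txt',
    'path_real': '/data/dump.txt',
    'path_virtual': '/dump.txt'
}

Note that path_real and path_virtual are merged in by integrate() even though the explicit mapping for *_emails only declares indexing_date, filename, email and host, so those two fields rely on Elasticsearch's default dynamic mapping.
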
