diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..f2fd5c8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug report +about: Create a report to help us improve + +--- + +**Describe the bug** +A clear and concise description of what the bug is: + - which bolt/spout is in error + - Apache Storm error log + - ... + +**To Reproduce** +Steps to reproduce the behavior: +1. ... +2. ... + +Attach the main configuration file of `SpamScope`. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Raw mail** +The raw mail to reproduce the behavior. +You can use a `gist` like [this](https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e). +Issues without a raw mail will be closed. + +**Environment:** + - OS: [e.g. Debian, CentOS] + - Docker: [yes or no] + - `SpamScope` version [e.g. 2.7.0] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..066b2d9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,17 @@ +--- +name: Feature request +about: Suggest an idea for this project + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
diff --git a/.gitignore b/.gitignore index 110be8f..8113e42 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,12 @@ _resources .env .idea/ .ropeproject +.tox/ .vscode/ *.pyc build/ dist/ logs/ +report/ SpamScope.egg-info/ venv/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 00c8e4d..15e2e5a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,6 @@ python: env: - TIKA_APP_JAR=/tmp/tika-app-${TIKA_VER}.jar FAUP_PATH=/tmp/faup - ZEMANA_PATH=/tmp/zemana DOCKER_ELASTICSEARCH_PATH=/tmp/docker-elasticsearch before_install: @@ -48,7 +47,6 @@ install: - pip install --upgrade pip setuptools - python setup.py install - pip install -r requirements_optional.txt - - git clone https://$BITBUCKET_USER:$BITBUCKET_ROBOT_KEY@bitbucket.org/$BITBUCKET_USER/zemana-api.git $ZEMANA_PATH && cd $ZEMANA_PATH && python setup.py install && cd - - src/cli/faup.sh - pip install coveralls diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..71bbea6 --- /dev/null +++ b/Makefile @@ -0,0 +1,82 @@ +.PHONY: clean clean-test clean-pyc clean-build docs help +.DEFAULT_GOAL := help + +define BROWSER_PYSCRIPT +import os, webbrowser, sys + +try: + from urllib import pathname2url +except: + from urllib.request import pathname2url + +webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) +endef +export BROWSER_PYSCRIPT + +define PRINT_HELP_PYSCRIPT +import re, sys + +for line in sys.stdin: + match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) + if match: + target, help = match.groups() + print("%-20s %s" % (target, help)) +endef +export PRINT_HELP_PYSCRIPT + +BROWSER := python -c "$$BROWSER_PYSCRIPT" + +help: + @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) + +clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts + +clean-build: ## remove build artifacts + rm -fr build/ + rm -fr dist/ + rm -fr .eggs/ + find . -name '*.egg-info' -exec rm -fr {} + + find . 
-name '*.egg' -exec rm -f {} + + +clean-pyc: ## remove Python file artifacts + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +clean-test: ## remove test and coverage artifacts + rm -fr .tox/ + rm -f .coverage + rm -fr htmlcov/ + rm -fr .pytest_cache + +lint: ## check style with flake8 + flake8 src tests + +test: ## run tests quickly with the default Python + python -m unittest discover -s tests -f -v + +test-all: ## run tests on every Python version with tox + tox + +# docs: ## generate Sphinx HTML documentation, including API docs +# rm -f docs/mailparser.rst +# rm -f docs/modules.rst +# sphinx-apidoc -o docs/ mailparser +# $(MAKE) -C docs clean +# $(MAKE) -C docs html +# $(BROWSER) docs/_build/html/index.html + +# servedocs: docs ## compile the docs watching for changes +# watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . + +release: dist ## package and upload a release + twine upload dist/* + +dist: clean ## builds source and wheel package + python setup.py sdist + python setup.py bdist_wheel + ls -l dist + +install: clean ## install the package to the active Python's site-packages + python setup.py install diff --git a/ansible/requirements.txt b/ansible/requirements.txt index 19df0e6..cabb1f5 100644 --- a/ansible/requirements.txt +++ b/ansible/requirements.txt @@ -1 +1 @@ -ansible==2.5.0 +ansible \ No newline at end of file diff --git a/ansible/templates/spamscope.yml.j2 b/ansible/templates/spamscope.yml.j2 index 1bc35e7..c3d3561 100644 --- a/ansible/templates/spamscope.yml.j2 +++ b/ansible/templates/spamscope.yml.j2 @@ -51,6 +51,9 @@ phishing: tokenizer: + # Persistent where store dumps of hashes. + persistent_path: /tmp + # If true mails with same hash are filtered and not analyzed. 
# Only the body will not saved filter_mails: true @@ -84,19 +87,37 @@ network: enabled: false api_key: xxxxxxxxxxxxxxxxxxxxxxxxxx - # RawMail bolt configuration raw_mail: # SpamAssassin analysis: https://spamassassin.apache.org/ spamassassin: enabled: false - + # Dialect analysis: https://sissden.eu/blog/analysis-of-smtp-dialects + dialect: + enabled: false + + # elasticsearch instance where are postfix logs + elasticsearch: + hosts: + - "node1:9200" + - "node2" + + # Prefix with dash '-' of Postfix index in Elasticsearch + # The format of indices should be postfix-2018.12.30 + index.prefix.postfix: postfix- + # Attachments bolt configuration attachments: - # The lists of all components must be under lists keyword to load them - # automatically commons: + # enable or disable filter on size + size.filter.enabled: false + + # max size to analyze in bytes + max.size: 3145728 + + # The lists of all components must be under lists keyword to load them + # automatically lists: blacklist_content_types: # All content types to remove from results diff --git a/conf/spamscope.example.yml b/conf/spamscope.example.yml index 6e72726..4c5ba0a 100644 --- a/conf/spamscope.example.yml +++ b/conf/spamscope.example.yml @@ -113,7 +113,7 @@ raw_mail: # Dialect analysis: https://sissden.eu/blog/analysis-of-smtp-dialects dialect: enabled: false - + # elasticsearch instance where are postfix logs elasticsearch: hosts: @@ -127,14 +127,15 @@ raw_mail: # Attachments bolt configuration attachments: - # The lists of all components must be under lists keyword to load them - # automatically commons: # enable or disable filter on size size.filter.enabled: false + # max size to analyze in bytes max.size: 3145728 + # The lists of all components must be under lists keyword to load them + # automatically lists: blacklist_content_types: # All content types to remove from results diff --git a/conf/templates/commons.json b/conf/templates/commons.json index a62d292..619df8f 100644 --- 
a/conf/templates/commons.json +++ b/conf/templates/commons.json @@ -4,6 +4,7 @@ "settings": { "index.codec": "best_compression", "index.number_of_replicas": 0, + "index.number_of_shards": 1, "index.refresh_interval": "5s", "index.mapping.total_fields.limit": 100000, "index.mapping.ignore_malformed": true diff --git a/conf/templates/spamscope_attachments.json b/conf/templates/spamscope_attachments.json index 47d6eeb..2517e6c 100644 --- a/conf/templates/spamscope_attachments.json +++ b/conf/templates/spamscope_attachments.json @@ -1,38 +1,44 @@ { "order": 0, - "version": 2, - "index_patterns": "spamscope_attachments-*", + "version": 3, + "index_patterns": [ + "spamscope_attachments-*" + ], "settings": { - "analysis": { - "analyzer": { - "header": { - "tokenizer": "uax_url_email", - "filter": [ - "lowercase" - ] - }, - "html_body": { - "char_filter": [ - "html_strip" - ], - "tokenizer": "uax_url_email", - "filter": [ - "lowercase" - ] - }, - "path_pattern": { - "tokenizer": "path_hierarchy", - "filter": [ - "lowercase" - ] + "index": { + "codec": "best_compression", + "mapping": { + "ignore_malformed": "true" + }, + "refresh_interval": "5s", + "analysis": { + "analyzer": { + "header": { + "filter": [ + "lowercase" + ], + "tokenizer": "uax_url_email" + }, + "html_body": { + "filter": [ + "lowercase" + ], + "char_filter": [ + "html_strip" + ], + "tokenizer": "uax_url_email" + }, + "path_pattern": { + "filter": [ + "lowercase" + ], + "tokenizer": "path_hierarchy" + } } - } - }, - "index.codec": "best_compression", - "index.number_of_shards": 1, - "index.number_of_replicas": 0, - "index.refresh_interval": "5s", - "index.mapping.ignore_malformed": true + }, + "number_of_shards": "1", + "number_of_replicas": "0" + } }, "mappings": { "_doc": { @@ -86,5 +92,29 @@ } } } + }, + "aliases": { + "attachments": {}, + "attachments_thug": { + "filter": { + "exists": { + "field": "thug" + } + } + }, + "attachments_tika": { + "filter": { + "exists": { + "field": "tika" + } + } + }, + 
"attachments_virustotal": { + "filter": { + "exists": { + "field": "virustotal" + } + } + } } -} +} \ No newline at end of file diff --git a/conf/templates/spamscope_mails.json b/conf/templates/spamscope_mails.json index fac8c3d..1f8c78a 100644 --- a/conf/templates/spamscope_mails.json +++ b/conf/templates/spamscope_mails.json @@ -1,38 +1,44 @@ { "order": 0, - "version": 8, - "index_patterns": "spamscope_mails-*", + "version": 9, + "index_patterns": [ + "spamscope_mails-*" + ], "settings": { - "analysis": { - "analyzer": { - "header": { - "tokenizer": "uax_url_email", - "filter": [ - "lowercase" - ] - }, - "html_body": { - "char_filter": [ - "html_strip" - ], - "tokenizer": "uax_url_email", - "filter": [ - "lowercase" - ] - }, - "path_pattern": { - "tokenizer": "path_hierarchy", - "filter": [ - "lowercase" - ] + "index": { + "codec": "best_compression", + "mapping": { + "ignore_malformed": "true" + }, + "refresh_interval": "5s", + "analysis": { + "analyzer": { + "header": { + "filter": [ + "lowercase" + ], + "tokenizer": "uax_url_email" + }, + "html_body": { + "filter": [ + "lowercase" + ], + "char_filter": [ + "html_strip" + ], + "tokenizer": "uax_url_email" + }, + "path_pattern": { + "filter": [ + "lowercase" + ], + "tokenizer": "path_hierarchy" + } } - } - }, - "index.codec": "best_compression", - "index.number_of_shards": 1, - "index.number_of_replicas": 0, - "index.refresh_interval": "5s", - "index.mapping.ignore_malformed": true + }, + "number_of_shards": "1", + "number_of_replicas": "0" + } }, "mappings": { "_doc": { @@ -159,5 +165,24 @@ } } } + }, + "aliases": { + "mails": {}, + "mails_attachments": { + "filter": { + "term": { + "with_attachments": { + "value": "true" + } + } + } + }, + "mails_dialect": { + "filter": { + "exists": { + "field": "raw_mail.dialect" + } + } + } } -} +} \ No newline at end of file diff --git a/project.clj b/project.clj index e4b69d0..c4df0b0 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject spamscope 
"2.6.0-SNAPSHOT" +(defproject spamscope "2.7.0-SNAPSHOT" :resource-paths ["_resources"] :target-path "_build" :min-lein-version "2.0.0" diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..12fb2b8 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,28 @@ +# always +PyYAML +astropy==1.3.3 +backports.functools-lru-cache>=1.3 +chainmap +lxml +mail-parser>=3.9.0 +patool +pyparsing +python-magic +simplejson +six +ssdeep +streamparse==3.15.1 + +# optional +elasticsearch>=6.0.0, <7 +redis>=2.10.5, <3 +shodan +tika-app>=1.4.0 +virustotal-api + +# editable +git+https://github.com/stricaud/faup.git#egg=pyfaup&subdirectory=src/lib/bindings/python + +# dev +flake8 +coverage \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ad611fa..0463e72 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,11 +3,11 @@ astropy==1.3.3 backports.functools-lru-cache>=1.3 chainmap lxml -mail-parser>=3.4.1 +mail-parser>=3.9.0 patool pyparsing python-magic simplejson six ssdeep -streamparse==3.13.1 +streamparse==3.15.1 diff --git a/src/bolts/tokenizer.py b/src/bolts/tokenizer.py index 84070cd..7c4d096 100644 --- a/src/bolts/tokenizer.py +++ b/src/bolts/tokenizer.py @@ -24,6 +24,7 @@ import random import six from collections import deque +from cPickle import BadPickleGet from streamparse import Stream import mailparser @@ -84,7 +85,7 @@ def load_filters(self): try: obj = load_obj(path) setattr(self, "analyzed_" + i, obj) - except (IOError, EOFError, ValueError): + except (IOError, EOFError, ValueError, BadPickleGet): setattr(self, "analyzed_" + i, deque( maxlen=getattr(self, "maxlen_" + i))) @@ -100,7 +101,11 @@ def _make_mail(self, tup): mail_type = tup.values[5] rand = '_' + ''.join(random.choice('0123456789') for i in range(10)) self.parser = self.mailparser[mail_type](raw_mail) - mail = self.parser.mail + + # get only the mains headers because this number can explode + # Elastic can't manage all possible headers + 
mail = self.parser.mail_partial + mail["headers"] = self.parser.headers_json # Data mail sources mail["mail_server"] = tup.values[1] diff --git a/src/modules/attachments/attachments.py b/src/modules/attachments/attachments.py index e6d57e0..63198d2 100644 --- a/src/modules/attachments/attachments.py +++ b/src/modules/attachments/attachments.py @@ -141,7 +141,10 @@ def popcontenttype(self, content_type): remove = [] for i in self: - if not i.get("is_filtered", False): + filtered = i.get("is_filtered", False) + m_content_type = i["mail_content_type"].lower() + + if not filtered: try: if i["Content-Type"].lower() == content_type: remove.append(i) @@ -164,8 +167,7 @@ def popcontenttype(self, content_type): # you should remove sample from results. # You can't use Content-Type because we don't have payload, so # we use mail_content_type - elif (i.get("is_filtered") and - i["mail_content_type"].lower() == content_type): + elif (filtered and m_content_type == content_type): remove.append(i) else: diff --git a/src/modules/attachments/post_processing.py b/src/modules/attachments/post_processing.py index b7fbdcd..b31fea8 100644 --- a/src/modules/attachments/post_processing.py +++ b/src/modules/attachments/post_processing.py @@ -192,8 +192,8 @@ def thug(conf, attachments): i["thug"] = thug.run(i, **conf) -@register(processors, active=True) -def zemana(conf, attachments): +@register(processors, active=False) +def zemana(conf, attachments): # pragma: no cover """This method updates the attachments results with Zemana AntiMalware reports. 
@@ -224,10 +224,10 @@ def zemana(conf, attachments): log.exception( "HTTPError in Zemana query for md5 {!r}".format( a["md5"])) - - if result: - a["zemana"] = result.json - a["zemana"]["type"] = result.type + else: + if result: + a["zemana"] = result.json + a["zemana"]["type"] = result.type for i in a.get("files", []): try: @@ -236,10 +236,10 @@ def zemana(conf, attachments): log.exception( "HTTPError in Zemana query for md5 {!r}".format( i["md5"])) - - if i_result: - i["zemana"] = i_result.json - i["zemana"]["type"] = i_result.type + else: + if i_result: + i["zemana"] = i_result.json + i["zemana"]["type"] = i_result.type @register(processors, priority=999, active=True) diff --git a/src/modules/mails/dialects.py b/src/modules/mails/dialects.py index 1bd0100..50aef49 100644 --- a/src/modules/mails/dialects.py +++ b/src/modules/mails/dialects.py @@ -189,7 +189,9 @@ def get_messages(message_id, elastic_server, index_prefix, max_size=100): # From message_id get code of comunication from client and server r = es.search( - index=indices, body=query_code % {"message_id": message_id}) + index=indices, + body=query_code % {"message_id": message_id}, + ignore_unavailable=True) code = r["hits"]["hits"][0]["_source"]["code"] timestamp = r["hits"]["hits"][0]["_source"]["@timestamp"] @@ -197,7 +199,9 @@ def get_messages(message_id, elastic_server, index_prefix, max_size=100): # From code get client (ip and name) r = es.search( - index=indices, body=query_client % {"code": code}) + index=indices, + body=query_client % {"code": code}, + ignore_unavailable=True) client_ip = r["hits"]["hits"][0]["_source"]["client_ip"] client_name = r["hits"]["hits"][0]["_source"]["client_name"] @@ -208,7 +212,8 @@ def get_messages(message_id, elastic_server, index_prefix, max_size=100): "timestamp": timestamp, "client_ip": client_ip, "client_name": client_name}, - size=max_size) + size=max_size, + ignore_unavailable=True) messages = [(i["_source"]["actor"], i["_source"]["dialect"]) for i in 
r["hits"]["hits"]] diff --git a/src/options.py b/src/options.py index 288fddf..31ef55e 100644 --- a/src/options.py +++ b/src/options.py @@ -19,7 +19,7 @@ from os.path import join -__version__ = "2.6.0" +__version__ = "2.7.0" __configuration_path__ = "/etc/spamscope" __defaults__ = { diff --git a/tests/test_attachments.py b/tests/test_attachments.py index 0c2ab2d..c705042 100644 --- a/tests/test_attachments.py +++ b/tests/test_attachments.py @@ -305,12 +305,12 @@ def test_filtercontenttypes(self): self.assertEqual(len(t), 1) self.assertEqual(len(t[0]["files"]), 0) - @unittest.skipIf(OPTIONS["THUG_ENABLED"].capitalize() == "False" or - OPTIONS["VIRUSTOTAL_ENABLED"].capitalize() == "False" or - OPTIONS["ZEMANA_ENABLED"].capitalize() == "False", - "Complete post processing test skipped: " - "set env variables 'THUG_ENABLED', " - "'VIRUSTOTAL_ENABLED' and 'ZEMANA_ENABLED' to True") + @unittest.skipIf( + OPTIONS["THUG_ENABLED"].capitalize() == "False" or OPTIONS[ + "VIRUSTOTAL_ENABLED"].capitalize() == "False", + "Complete post processing test skipped: " + "set env variables 'THUG_ENABLED', " + "'VIRUSTOTAL_ENABLED' and 'ZEMANA_ENABLED' to True") def test_post_processing(self): t = MailAttachments.withhashes(self.attachments_thug) parameters = { @@ -325,7 +325,7 @@ def test_post_processing(self): "user_agents": ["win7ie90", "winxpie80"], "referer": "http://www.google.com/", "timeout": 300}, - "zemana": {"enabled": True, + "zemana": {"enabled": False, "PartnerId": OPTIONS["ZEMANA_PARTNERID"], "UserId": OPTIONS["ZEMANA_USERID"], "ApiKey": OPTIONS["ZEMANA_APIKEY"], @@ -336,12 +336,12 @@ def test_post_processing(self): for i in t: self.assertIn("tika", i) self.assertIn("virustotal", i) - self.assertIn("zemana", i) + self.assertNotIn("zemana", i) self.assertNotIn("thug", i) for j in i.get("files", []): self.assertIn("virustotal", j) - self.assertIn("zemana", j) + self.assertNotIn("zemana", j) self.assertIn("thug", j) def test_incorrect_padding(self): diff --git 
a/tests/test_attachments_utils.py b/tests/test_attachments_utils.py index 58ef03c..8257d90 100644 --- a/tests/test_attachments_utils.py +++ b/tests/test_attachments_utils.py @@ -19,7 +19,6 @@ import logging import os -import sys import unittest import simplejson as json diff --git a/tests/test_bitmap.py b/tests/test_bitmap.py index d805f6c..414d3b4 100644 --- a/tests/test_bitmap.py +++ b/tests/test_bitmap.py @@ -18,8 +18,6 @@ """ import logging -import os -import sys import unittest from context import bitmap diff --git a/tests/test_dialect.py b/tests/test_dialect.py index 1f73efc..f81550f 100644 --- a/tests/test_dialect.py +++ b/tests/test_dialect.py @@ -18,9 +18,7 @@ """ -import datetime import logging -import os import unittest from context import mails diff --git a/tests/test_network_post_processing.py b/tests/test_network_post_processing.py index 6b3a1a5..7cc0f66 100644 --- a/tests/test_network_post_processing.py +++ b/tests/test_network_post_processing.py @@ -90,11 +90,12 @@ def test_shodan(self): shodan(conf, "8.8.8", results) self.assertFalse(results) - @unittest.skipIf(OPTIONS["SHODAN_ENABLED"].capitalize() == "False" or - OPTIONS["VIRUSTOTAL_ENABLED"].capitalize() == "False", - "Complete post processing test skipped: " - "set env variables 'SHODAN_ENABLED' and " - "'VIRUSTOTAL_ENABLED' to True") + @unittest.skipIf( + OPTIONS["SHODAN_ENABLED"].capitalize() == "False" or OPTIONS[ + "VIRUSTOTAL_ENABLED"].capitalize() == "False", + "Complete post processing test skipped: " + "set env variables 'SHODAN_ENABLED' and " + "'VIRUSTOTAL_ENABLED' to True") def test_processors(self): """Test all post processing.""" diff --git a/tests/test_phishing.py b/tests/test_phishing.py index c153580..0db54f3 100644 --- a/tests/test_phishing.py +++ b/tests/test_phishing.py @@ -20,7 +20,6 @@ import logging import copy import os -import sys import unittest import mailparser diff --git a/tests/test_raw_mail_post_processing.py b/tests/test_raw_mail_post_processing.py index 
d6a58e4..73f338f 100644 --- a/tests/test_raw_mail_post_processing.py +++ b/tests/test_raw_mail_post_processing.py @@ -75,7 +75,8 @@ def test_processors(self): p_ordered = [i[0] for i in sorted(mails.processors, key=itemgetter(1))] conf = { - "spamassassin": {"enabled": True}} + "spamassassin": {"enabled": True}, + "dialect": {"enabled": False}} results = {} self.assertFalse(results) @@ -85,6 +86,7 @@ def test_processors(self): self.assertTrue(results) self.assertIn("spamassassin", results) + self.assertNotIn("dialect", results) if __name__ == '__main__': diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..97c9b9a --- /dev/null +++ b/tox.ini @@ -0,0 +1,13 @@ +[tox] +envlist = begin, py27, end + +[testenv:begin] +commands = coverage erase + +[testenv] +deps = -rrequirements-dev.txt +commands = + coverage run --append -m unittest discover -s tests -f -v + +[testenv:end] +commands = coverage html -d {toxinidir}/report