Skip to content

Commit

Permalink
Add inspector classes for spam checking. Updates to INSTALL
Browse files Browse the repository at this point in the history
 - Legacy-Id: 434
  • Loading branch information
rpcross committed Jun 7, 2016
1 parent 4069860 commit bd91bc6
Show file tree
Hide file tree
Showing 21 changed files with 410 additions and 50 deletions.
32 changes: 20 additions & 12 deletions INSTALL
Original file line number Diff line number Diff line change
Expand Up @@ -51,43 +51,50 @@ DEPENDENCIES
============
Python 2.7
MySQL 5.5
xapian-core 1.2.12
python-xapian 1.2.12
xapian-core 1.2.17
python-xapian 1.2.17
python-memcached
lxml 3.3.1
memcached 1.4.15
RabbitMQ Server 2.8.7
Celery v3.1.17
Celery v3.1.20
Django 1.7 (Included)
Haystack 2.1.1 (Included)


Python Packages
---------------
pip install python-dateutil
pip install pytest-django
pip install factory-boy
pip install -r requirements.txt

Prerequisites
-------------
0. zypper install xapian-core python-xapian

1. Install and configure memcached
zypper install memcached
chkconfig memcached on
systemctl enable memcached
systemctl start memcached
systemctl status memcached

2. Install and configure RabbitMQ
zypper install rabbitmq-server
zypper install rabbitmq-server-plugins
rabbitmq-plugins enable rabbitmq_management
chkconfig rabbitmq-server on
systemctl enable rabbitmq-server
systemctl start rabbitmq-server
systemctl status rabbitmq-server

see: http://www.rabbitmq.com/man/rabbitmqctl.1.man.html

3. Install and configure celery
pip install celery
cp INSTALL_DIR/celery/celeryd /etc/init.d
cp INSTALL_DIR/celery/celeryd.conf /etc/default/celeryd
add unprivileged user celery, GROUP=celery
chkconfig celeryd on
systemctl enable celeryd
systemctl start celeryd
systemctl status celeryd
# celery status
** NOTE ** Celery will fail to start unless log/mlarchive.log and
log/archive-mail.log exist and are writable

SETUP:
The following steps refer to INSTALL_DIR (e.g. /a/mailarch)
Expand Down Expand Up @@ -156,6 +163,7 @@ ietfarch-atompub-archive: "|/a/ietf/scripts/call-archives atompub"
x) Install Cronscripts


*** DEPRECATED ***
6. Xapian Replication
On the Server: add the following line to celeryd.conf:
export XAPIAN_MAX_CHANGESETS=10
Expand All @@ -168,7 +176,7 @@ On the Client: periodically run the following

cd /a/mailarch/data
xapian-replicate -o -h [server hostname] -p 7010 archive_index

*** *** *** ***

TESTING:
Testing requires an instance of the Xapian message index. It must contain an
Expand Down
2 changes: 1 addition & 1 deletion mlarchive/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .celeryapp import app

__version__ = "1.2.6"
__version__ = "1.3.0"

__date__ = "$Date$"

Expand Down
81 changes: 81 additions & 0 deletions mlarchive/archive/inspectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
'''This module contains classes which inherit from Inspector. They are used
to inspect incoming messages and perform some auxiliary processing. ie. spam
checkers'''

from django.conf import settings

class SpamMessage(Exception):
    '''Signals that an inspected message was identified as spam.'''


class InspectorMeta(type):
    '''Metaclass that maintains a registry of inspector classes.

    The first class created with this metaclass (the base class) receives an
    empty ``registry`` dictionary. Every class derived from it is added to
    that same dictionary, keyed by its lower-cased class name.
    '''
    def __init__(cls, name, bases, dct):
        if hasattr(cls, 'registry'):
            # A registry is already inherited, so this is a derived class:
            # record it under its lower-cased name.
            cls.registry[name.lower()] = cls
        else:
            # No registry yet, so this is the base class: start an empty one.
            cls.registry = {}

        super(InspectorMeta, cls).__init__(name, bases, dct)


class Inspector(object):
    '''The base class for inspector classes.

    Takes a MessageWrapper object and an optional dictionary of options.
    When no options are passed, the entry for this class name in
    settings.INSPECTORS is used; if there is none, the options are empty.

    Inherit from this class and implement the has_condition(), handle_file()
    and raise_error() methods. Call inspect() to run the inspection.

    Options read by inspect():
      includes   -- if present, only messages whose listname is in this
                    collection are inspected
      check_only -- if true, handle_file() is skipped when the condition
                    is found (raise_error() is still called)
    '''
    # Python 2 style metaclass declaration; registers subclasses in
    # Inspector.registry.
    __metaclass__ = InspectorMeta

    def __init__(self, message_wrapper, options=None):
        self.message_wrapper = message_wrapper
        self.listname = message_wrapper.listname
        if options:
            self.options = options
        else:
            # Fall back to the configured options for this class. The
            # setting itself, or the entry for this class, may be missing;
            # use an empty dict in that case so inspect() can always treat
            # self.options as a dictionary.
            configured = getattr(settings, 'INSPECTORS', None) or {}
            self.options = configured.get(self.__class__.__name__) or {}

    def inspect(self):
        '''Run the inspection.

        Returns without doing anything when an 'includes' option exists and
        this message's list is not in it. Otherwise, if has_condition() is
        true, calls handle_file() (unless 'check_only' is set) and then
        raise_error().
        '''
        if 'includes' in self.options and self.listname not in self.options['includes']:
            return
        if self.has_condition():
            if not self.options.get('check_only'):
                self.handle_file()
            self.raise_error()

    def has_condition(self):
        '''Return True if the message has the condition being checked for.'''
        raise NotImplementedError

    def handle_file(self):
        '''Dispose of the message file once the condition has been found.'''
        raise NotImplementedError

    def raise_error(self):
        '''Raise the exception that reports the condition to the caller.'''
        raise NotImplementedError


class SpamInspector(Inspector):
    '''Base spam handling class.

    To write a spam filter, inherit from this class and implement
    has_condition(). When spam is found the message is written to the
    '_spam' subdirectory (handle_file) and SpamMessage is raised
    (raise_error).
    '''

    def has_condition(self):
        raise NotImplementedError

    def handle_file(self):
        wrapper = self.message_wrapper
        wrapper.write_msg(subdir='_spam')

    def raise_error(self):
        msgid = self.message_wrapper.msgid
        raise SpamMessage('Spam Detected. Message-ID: {}'.format(msgid))


class ListIdSpamInspector(SpamInspector):
    '''Treats a message as spam when its List-Id header is missing or bogus.

    A List-Id header is considered bogus when it does not contain the name
    of the list the message was sent to.
    '''
    def has_condition(self):
        list_id = self.message_wrapper.email_message.get('List-Id')
        # Legitimate only if the header is present and mentions the list name.
        is_legitimate = bool(list_id) and self.listname in list_id
        return not is_legitimate


14 changes: 13 additions & 1 deletion mlarchive/archive/management/commands/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from mlarchive.archive.models import Attachment, EmailList, Legacy, Message, Thread
from mlarchive.archive.management.commands._mimetypes import CONTENT_TYPES, UNKNOWN_CONTENT_TYPE
from mlarchive.archive.inspectors import *
from mlarchive.utils.decorators import check_datetime
from mlarchive.utils.encoding import decode_safely, decode_rfc2047_header

Expand Down Expand Up @@ -118,6 +119,10 @@ def archive_message(data,listname,private=False,save_failed=True):
# if DuplicateMessage it's already been saved to _dupes
logger.error('Archive message failed [{0}]'.format(error.args))
return 0
except SpamMessage as error:
# if SpamMessage it's already been saved to _spam
logger.error('Archive message failed [{0}]'.format(error.args))
return 0
except Exception as error:
logger.error('Archive message failed [{0}]'.format(error.args))
if not save_failed:
Expand Down Expand Up @@ -827,6 +832,13 @@ def save(self, test=False):
"""Ensure message is not duplicate message-id or hash. Save message to database.
Save to disk (if not test mode) and process attachments.
"""
# check for spam
if hasattr(settings, 'INSPECTORS'):
for inspector_name in settings.INSPECTORS:
inspector_class = eval(inspector_name)
inspector = inspector_class(self)
inspector.inspect()

# check for duplicate message id, and skip
if Message.objects.filter(msgid=self.msgid,email_list__name=self.listname):
self.write_msg(subdir='_dupes')
Expand All @@ -838,7 +850,7 @@ def save(self, test=False):
raise CommandError('Duplicate hash, msgid: %s' % self.msgid)

# ensure message has been processed
x = self.archive_message
_ = self.archive_message

# write message to disk and then save, post_save signal calls indexer
# which requires file to be present
Expand Down
14 changes: 7 additions & 7 deletions mlarchive/archive/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from mlarchive.utils.decorators import check_access, superuser_only, pad_id
from mlarchive.archive import actions
from mlarchive.archive.query_utils import get_kwargs
from mlarchive.archive.view_funcs import (initialize_formsets, get_columns, get_export,
from mlarchive.archive.view_funcs import (initialize_formsets, get_columns, get_export,
find_message_date, find_message_date_reverse, find_message_gbt)

from models import *
Expand Down Expand Up @@ -65,12 +65,12 @@ def build_form(self, form_kwargs=None):
return super(self.__class__,self).build_form(form_kwargs={ 'request' : self.request })

def build_page(self):
"""Returns tuple of:
"""Returns tuple of:
- subset of results for display
- queryset offset: the offset of results subset within entire queryset
- selected offset: the offset of message specified in query arguments within
results subset
If request arguments include "index", returns slice of results containing
message named in "index" with appropriate offset within slice, otherwise returns
first #(results_per_page) messages and offsets=0.
Expand Down Expand Up @@ -136,7 +136,7 @@ def find_message(self,hash):
msg = Message.objects.get(hashcode=hash+'=')
except Message.DoesNotExist:
raise Http404("No such message!")

if self.request.GET.get('gbt'):
return find_message_gbt(self.results,msg)
elif self.request.GET.get('so') == 'date':
Expand Down Expand Up @@ -173,7 +173,7 @@ def create_response(self):
@superuser_only
def admin(request):
"""Administrator View. Only accessible by the superuser this view allows
the administrator to run queries and perform actions, ie. remove spam, on the
the administrator to run queries and perform actions, ie. remove spam, on the
results. Available actions are defined in actions.py
"""
results = None
Expand All @@ -183,7 +183,7 @@ def admin(request):
if form.is_valid():
kwargs = get_kwargs(form.cleaned_data)
if kwargs:
results = SearchQuerySet().filter(**kwargs)
results = SearchQuerySet().filter(**kwargs).order_by('id')
else:
action = request.POST.get('action')
func = getattr(actions, action)
Expand Down Expand Up @@ -240,7 +240,7 @@ def browse(request, list_name=None):
if list_name:
redirect_url = '%s?%s' % (reverse('archive_search'), 'email_list=' + list_name)
return redirect(redirect_url)

form = BrowseForm()
columns = get_columns(request.user)

Expand Down
70 changes: 70 additions & 0 deletions mlarchive/bin/check_spam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/python
'''
This script will scan messages in the archive, identify spam and remove it (move it
to the _spam directory)
'''

# Set PYTHONPATH and load environment variables for standalone script -----------------
# for file living in project/bin/
import os
import sys
# Resolve the directory three levels above this file and make sure it is on
# sys.path so the mlarchive package can be imported by this standalone script.
path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if not path in sys.path:
sys.path.insert(0, path)

import django
# The settings module must be set before django.setup() is called; the
# project imports further down are placed after this setup.
os.environ['DJANGO_SETTINGS_MODULE'] = 'mlarchive.settings.development'
django.setup()

# -------------------------------------------------------------------------------------
import argparse
import email

from celery_haystack.utils import get_update_task
from django.conf import settings

from mlarchive.archive.forms import get_list_info
from mlarchive.archive.inspectors import *
from mlarchive.archive.management.commands._classes import MessageWrapper
from mlarchive.archive.models import *

import logging
# Send log output, at DEBUG level and above, to log/check_spam.log under
# settings.DATA_ROOT.
logpath = os.path.join(settings.DATA_ROOT,'log/check_spam.log')
logging.basicConfig(filename=logpath,level=logging.DEBUG)


def main():
    '''Scan the archived messages of one list with an inspector class.

    Command line arguments:
      -i / --inspector  name of the inspector class to use
      -l / --list       name of the email list to check
      -r / --remove     remove spam; without it the run is check only

    Each message of the list is read from disk, wrapped in a MessageWrapper
    and passed to the inspector. Messages for which the inspector raises
    SpamMessage are counted and, with --remove, deleted from the database.
    The "scanned" and "spam" counters are printed at the end.
    '''
    # parse arguments
    parser = argparse.ArgumentParser(description='Check archive for spam')
    parser.add_argument('-i', '--inspector', help="enter the inspector class to use")
    parser.add_argument('-l', '--list', help="enter the email list name to check")
    parser.add_argument('-r', '--remove', help="remove spam. default is check only", action='store_true')
    args = parser.parse_args()

    if not EmailList.objects.filter(name=args.list).exists():
        parser.error('List {} does not exist'.format(args.list))

    # Without this check a missing -i would reach eval(None) and die with a
    # TypeError instead of a usage message.
    if not args.inspector:
        parser.error('An inspector class is required (-i/--inspector)')

    # NOTE(review): eval() evaluates an arbitrary expression taken from the
    # command line. Consider resolving the name through Inspector.registry
    # (keyed by lower-cased class name) instead.
    try:
        inspector_class = eval(args.inspector)
    except (NameError, SyntaxError):
        parser.error('Unknown inspector class: {}'.format(args.inspector))

    # Build the queryset once and use it for both the count and the scan.
    messages = Message.objects.filter(email_list__name=args.list)
    stat = {'scanned': messages.count(), 'spam': 0}

    for message in messages:
        msg_path = message.get_file_path()
        with open(msg_path) as f:
            msg = email.message_from_file(f)
        mw = MessageWrapper(msg, args.list)
        # check_only is the inverse of --remove: only with --remove does the
        # inspector call handle_file() on the message.
        inspector = inspector_class(mw, {'check_only': not args.remove})
        try:
            inspector.inspect()
        except SpamMessage:
            stat['spam'] += 1
            if args.remove:
                message.delete()

    for k, v in stat.items():
        # Parenthesized form prints the same text under Python 2 and 3.
        print("{}:{}".format(k, v))

if __name__ == "__main__":
main()
File renamed without changes.
Loading

0 comments on commit bd91bc6

Please sign in to comment.