Skip to content
This repository has been archived by the owner on Dec 16, 2017. It is now read-only.

Updated to use combine plugins vs. hardcoded urls/parsing #171

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,13 @@

## Maltrieve

Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mwcrawler). It retrieves malware directly from the sources as listed at a number of sites. Currently we crawl the following:
Maltrieve originated as a fork of [mwcrawler](https://github.com/ricardo-dias/mwcrawler). It retrieves malware directly from the sources as listed at a number of sites. Currently we crawl the following (via included plugins):

* [Malc0de](http://malc0de.com/rss)
* [Malware Domain List](http://www.malwaredomainlist.com/hostslist/mdl.xml)
* [Malware URLs](http://malwareurls.joxeankoret.com/normal.txt)
* [VX Vault](http://vxvault.siri-urz.net/URL_List.php)
* [URLquery](http://urlquery.net/)
* [CleanMX](http://support.clean-mx.de/clean-mx/xmlviruses.php?)
* [ZeusTracker](https://zeustracker.abuse.ch/monitor.php?urlfeed=binaries)
* Additional plugins available: [Combine plugins](https://github.com/mlsecproject/combine/tree/dev/combine/plugins)

Other improvements include:

Expand All @@ -42,6 +40,7 @@ Maltrieve requires the following dependencies:
* [feedparser](https://pypi.python.org/pypi/feedparser)
* [python-magic](https://pypi.python.org/pypi/python-magic/)
* [Requests](http://www.python-requests.org)
* [Yapsy](https://pypi.python.org/pypi/Yapsy)

With the exception of the Python header files, these can all be found in [requirements.txt](./requirements.txt). On Debian-based distributions, run `sudo apt-get install python-dev`. On Red Hat-based distributions, run `sudo yum install python-devel`. After that, just run `pip install -e .`. You may need to prefix that command with `sudo` if you are not running in a virtual environment, but using such an environment is highly encouraged.

Expand Down
1 change: 1 addition & 0 deletions maltrieve.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ dumpdir = archive
logfile = maltrieve.log
logheaders = true
User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
plugin_dir = ./plugins/

#viper = http://127.0.0.1:8080
#cuckoo = http://127.0.0.1:8090
Expand Down
95 changes: 44 additions & 51 deletions maltrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
import requests
from bs4 import BeautifulSoup

from yapsy.PluginManager import PluginManager


class config(object):

Expand All @@ -45,6 +47,13 @@ class config(object):
def __init__(self, args, filename='maltrieve.cfg'):
self.configp = ConfigParser.ConfigParser()
self.configp.read(filename)
self.plugin_dir = './plugins'

try:
if self.configp.get('Maltrieve', 'plugin_dir'):
self.plugin_dir = self.configp.get('Maltrieve', 'plugin_dir')
except Exception as e:
pass

if args.logfile or self.configp.get('Maltrieve', 'logfile'):
if args.logfile:
Expand Down Expand Up @@ -343,45 +352,6 @@ def save_malware(response, cfg):
return True


def process_xml_list_desc(response):
    """Extract URLs from the description field of each entry in a feed.

    The description is split on single spaces. The second field (with any
    trailing comma removed) is taken as the URL; when that field is ``-``
    the fifth field is used instead. HTML-escaped ampersands are restored
    and ``http://`` is prefixed to values that do not already start with
    ``http``.

    Entries whose description is too short to contain the expected field,
    or whose selected field is empty, are skipped rather than raising
    ``IndexError`` or producing a bare ``http://``.

    :param response: raw feed document, passed straight to
        ``feedparser.parse``.
    :returns: a ``set`` of URL strings.
    """
    feed = feedparser.parse(response)
    urls = set()

    for entry in feed.entries:
        fields = entry.description.split(' ')
        if len(fields) < 2:
            continue

        url = fields[1].rstrip(',')
        if url == '-':
            # Placeholder in the usual position: fall back to the fifth field.
            if len(fields) < 5:
                continue
            url = fields[4].rstrip(',')
        if not url:
            continue

        # Feed text is HTML-escaped; restore literal ampersands in queries.
        url = url.replace('&amp;', '&')
        if not url.startswith('http'):
            url = 'http://' + url
        urls.add(url)

    return urls


def process_xml_list_title(response):
    """Return the set of entry titles from a feed, treated as URLs.

    Each entry's title is used verbatim except that HTML-escaped
    ampersands (``&amp;``) are converted back to ``&``.

    :param response: raw feed document, passed straight to
        ``feedparser.parse``.
    :returns: a ``set`` of title strings.
    """
    feed = feedparser.parse(response)
    return set(entry.title.replace('&amp;', '&') for entry in feed.entries)


def process_simple_list(response):
    """Extract URLs from a newline-separated plain-text list.

    Only lines that begin with ``http`` are kept (a line with leading
    whitespace is therefore ignored). Kept lines have surrounding
    whitespace stripped and HTML-escaped ampersands (``&amp;``) converted
    back to ``&``.

    :param response: the list body as a single string.
    :returns: a ``set`` of URL strings.
    """
    return set(
        line.strip().replace('&amp;', '&')
        for line in response.split('\n')
        if line.startswith('http')
    )


def process_urlquery(response):
    """Extract URLs from the anchors inside ``<table class="test">`` elements.

    The text of every ``<a>`` found in such a table is taken as a URL.
    HTML-escaped ampersands are restored, empty anchors are skipped, and
    ``http://`` is prefixed only when the text does not already start with
    ``http`` (so an anchor that already carries a scheme is not doubled).

    :param response: HTML document as a string.
    :returns: a ``set`` of URL strings.
    """
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; left unchanged here to avoid altering parse results.
    soup = BeautifulSoup(response)
    urls = set()
    for table in soup.find_all("table", class_="test"):
        for anchor in table.find_all("a"):
            text = anchor.text.replace('&amp;', '&')
            if not text:
                continue
            if not text.startswith('http'):
                text = 'http://' + text
            urls.add(text)
    return urls


def chunker(seq, size):
    """Yield consecutive slices of ``seq``, each at most ``size`` items long.

    The final slice is shorter when ``len(seq)`` is not a multiple of
    ``size``; an empty sequence yields nothing.

    :param seq: any sliceable sequence that supports ``len()``.
    :param size: chunk length; a value of zero raises ``ValueError`` because
        it is used as the step of ``range``.
    :returns: a generator of slices of ``seq``.
    """
    # range() instead of xrange() so the helper works on Python 2 and 3.
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

Expand Down Expand Up @@ -463,28 +433,51 @@ def main():
hashes = load_hashes('hashes.json')
past_urls = load_urls('urls.json')

print "Processing source URLs"
print 'Loading Plugins'
# Load the plugins from the plugin directory.
manager = PluginManager()
manager.setPluginPlaces([cfg.plugin_dir])
manager.collectPlugins()

# TODO: Replace with plugins
source_urls = {'https://zeustracker.abuse.ch/monitor.php?urlfeed=binaries': process_xml_list_desc,
'http://www.malwaredomainlist.com/hostslist/mdl.xml': process_xml_list_desc,
'http://malc0de.com/rss/': process_xml_list_desc,
'http://vxvault.net/URL_List.php': process_simple_list,
'http://urlquery.net/': process_urlquery,
'http://support.clean-mx.de/clean-mx/rss?scope=viruses&limit=0%2C64': process_xml_list_title,
'http://malwareurls.joxeankoret.com/normal.txt': process_simple_list}
headers = {'User-Agent': 'Maltrieve'}
source_urls = []
for plugin in manager.getAllPlugins():
print 'Processing: ' + plugin.plugin_object.get_name()
o_headers = None
try:
o_headers = plugin.plugin_object.get_headers()
except Exception as e:
pass # because we don't care if this isn't implemented in plugins
for url in plugin.plugin_object.get_URLs():
if url.startswith('file://'):
files.append(url.partition('://')[2])
else:
source_urls.append(url)

headers = {'User-Agent': 'Maltrieve'}
reqs = [grequests.get(url, timeout=60, headers=headers, proxies=cfg.proxy) for url in source_urls]
source_lists = grequests.map(reqs)

print "Completed source processing"
print "Completed source retrieval"

headers['User-Agent'] = cfg.useragent
malware_urls = set()
for response in source_lists:
if hasattr(response, 'status_code') and response.status_code == 200:
malware_urls.update(source_urls[response.url](response.text))
print "Processing feed from %s" % response.url
# Loop through all the plugins and see which ones have matching names
for plugin in manager.getAllPlugins():
urls = set(plugin.plugin_object.URLS)
# For those plugins that build dynamic URLs, we should get those for the comparison for parsing
urls = set(plugin.plugin_object.get_URLs())
if response.url in urls:
print 'Parsing feed from %s' % response.url
result = plugin.plugin_object.process_data(response.url, response.text)
for r in result:
if r['indicator_type'] == 'IPv4' or r['indicator_type'] == 'FQDN':
indicator = 'http://' + r['indicator']
malware_urls.add(indicator)
elif r['indicator_type'] == 'URL':
malware_urls.add(r['indicator'])

if cfg.inputfile:
with open(cfg.inputfile, 'rb') as f:
Expand Down
27 changes: 27 additions & 0 deletions plugins/joxeankoret.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import datetime

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Yapsy plugin for the Joxean Koret plain-text malware URL list."""

    NAME = "joxeankoret"
    DIRECTION = "outbound"
    URLS = ['http://malwareurls.joxeankoret.com/normal.txt']

    def get_URLs(self):
        """Return the list of feed URLs this plugin knows how to parse."""
        return self.URLS

    def get_direction(self):
        """Return the direction label attached to every indicator."""
        return self.DIRECTION

    def get_name(self):
        """Return the plugin's short name."""
        return self.NAME

    def process_data(self, source, response):
        """Turn the downloaded list into indicator records.

        Each line is stripped of surrounding whitespace; lines that then
        start with ``http`` become one record each, with HTML-escaped
        ampersands (``&amp;``) converted back to ``&``. All other lines
        are ignored.

        :param source: URL the data was fetched from; copied into each
            record under ``'source'``.
        :param response: body of the list as a single string.
        :returns: a list of dicts with the keys ``indicator``,
            ``indicator_type`` (always ``"URL"``), ``indicator_direction``,
            ``source_name``, ``source`` and ``date`` (today's date as a
            string).
        """
        current_date = str(datetime.date.today())
        data = []
        for line in response.splitlines():
            indicator = line.strip()
            if not indicator.startswith('http'):
                continue
            data.append({
                'indicator': indicator.replace('&amp;', '&'),
                'indicator_type': "URL",
                'indicator_direction': self.DIRECTION,
                'source_name': self.NAME,
                'source': source,
                'date': current_date,
            })
        return data
9 changes: 9 additions & 0 deletions plugins/joxeankoret.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = joxeankoret
Module = joxeankoret

[Documentation]
Author = [email protected]
Version = 0.1
Website = http://secrepo.com
Description = Joxean Koret Malware URLs
39 changes: 39 additions & 0 deletions plugins/malwaredomainlist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import datetime
import feedparser
import re

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Yapsy plugin for the Malware Domain List XML feed."""

    NAME = "malwaredomainlist"
    DIRECTION = "outbound"
    URLS = ['http://www.malwaredomainlist.com/hostslist/mdl.xml']

    def get_URLs(self):
        """Return the list of feed URLs this plugin knows how to parse."""
        return self.URLS

    def get_direction(self):
        """Return the direction label attached to every indicator."""
        return self.DIRECTION

    def get_name(self):
        """Return the plugin's short name."""
        return self.NAME

    def process_data(self, source, response):
        """Turn the downloaded feed into indicator records.

        Each entry's description is split on single spaces. The second
        field (trailing comma removed) is the URL; when it is ``-`` the
        fifth field is used instead. HTML-escaped ampersands are restored
        and ``http://`` is prefixed when the value does not already start
        with ``http``. Entries that are too short to hold the expected
        field, or whose selected field is empty, are skipped instead of
        raising ``IndexError``.

        :param source: URL the data was fetched from; copied into each
            record under ``'source'``.
        :param response: raw feed document, passed to ``feedparser.parse``.
        :returns: a list of dicts with the keys ``indicator``,
            ``indicator_type`` (always ``"URL"``), ``indicator_direction``,
            ``source_name``, ``source``, ``note`` (the full description)
            and ``date`` (today's date as a string).
        """
        current_date = str(datetime.date.today())
        data = []
        feed = feedparser.parse(response)

        for entry in feed.entries:
            desc = entry.description
            fields = desc.split(' ')
            if len(fields) < 2:
                continue

            url = fields[1].rstrip(',')
            if url == '-':
                # Placeholder in the usual position: use the fifth field.
                if len(fields) < 5:
                    continue
                url = fields[4].rstrip(',')
            if not url:
                continue

            url = url.replace('&amp;', '&')
            if not url.startswith('http'):
                url = 'http://' + url
            data.append({
                'indicator': url,
                'indicator_type': "URL",
                'indicator_direction': self.DIRECTION,
                'source_name': self.NAME,
                'source': source,
                'note': desc,
                'date': current_date,
            })
        return data
9 changes: 9 additions & 0 deletions plugins/malwaredomainlist.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = malwaredomainlist
Module = malwaredomainlist

[Documentation]
Author = [email protected]
Version = 0.1
Website = http://secrepo.com
Description = Malware Domain List
31 changes: 31 additions & 0 deletions plugins/urlquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import datetime
import bs4
import re

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Yapsy plugin that scrapes URLs from the urlquery.net front page."""

    NAME = "urlquery"
    DIRECTION = "outbound"
    URLS = ['https://urlquery.net/']

    def get_URLs(self):
        """Return the list of page URLs this plugin knows how to parse."""
        return self.URLS

    def get_direction(self):
        """Return the direction label attached to every indicator."""
        return self.DIRECTION

    def get_name(self):
        """Return the plugin's short name."""
        return self.NAME

    def process_data(self, source, response):
        """Turn the downloaded page into indicator records.

        The text of every ``<a>`` inside a ``<table class="test">`` is
        taken as a URL. HTML-escaped ampersands are restored, empty
        anchors are skipped, and ``http://`` is prefixed only when the
        text does not already start with ``http``.

        :param source: URL the page was fetched from; copied into each
            record under ``'source'``.
        :param response: HTML document as a string.
        :returns: a list of dicts with the keys ``indicator``,
            ``indicator_type`` (always ``"URL"``), ``indicator_direction``,
            ``source_name``, ``source`` and ``date`` (today's date as a
            string).
        """
        current_date = str(datetime.date.today())
        data = []
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # is installed; left unchanged to avoid altering parse results.
        soup = bs4.BeautifulSoup(response)
        for table in soup.find_all("table", class_="test"):
            for anchor in table.find_all("a"):
                indicator = anchor.text.replace('&amp;', '&')
                if not indicator:
                    continue
                if not indicator.startswith('http'):
                    indicator = 'http://' + indicator
                data.append({
                    'indicator': indicator,
                    'indicator_type': "URL",
                    'indicator_direction': self.DIRECTION,
                    'source_name': self.NAME,
                    'source': source,
                    'date': current_date,
                })
        return data
9 changes: 9 additions & 0 deletions plugins/urlquery.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = urlquery
Module = urlquery

[Documentation]
Author = [email protected]
Version = 0.1
Website = http://secrepo.com
Description = URLQuery
28 changes: 28 additions & 0 deletions plugins/vxvault.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import datetime
import re

from yapsy.IPlugin import IPlugin


class PluginOne(IPlugin):
    """Yapsy plugin for the VX Vault plain-text URL list."""

    NAME = "vxvault"
    DIRECTION = "outbound"
    URLS = ['http://vxvault.net/URL_List.php']

    def get_URLs(self):
        """Return the list of feed URLs this plugin knows how to parse."""
        return self.URLS

    def get_direction(self):
        """Return the direction label attached to every indicator."""
        return self.DIRECTION

    def get_name(self):
        """Return the plugin's short name."""
        return self.NAME

    def process_data(self, source, response):
        """Turn the downloaded list into indicator records.

        Each line is stripped of surrounding whitespace; lines that then
        start with ``http`` become one record each, with HTML-escaped
        ampersands (``&amp;``) converted back to ``&``. All other lines
        are ignored.

        :param source: URL the data was fetched from; copied into each
            record under ``'source'``.
        :param response: body of the list as a single string.
        :returns: a list of dicts with the keys ``indicator``,
            ``indicator_type`` (always ``"URL"``), ``indicator_direction``,
            ``source_name``, ``source`` and ``date`` (today's date as a
            string).
        """
        current_date = str(datetime.date.today())
        data = []
        for line in response.splitlines():
            indicator = line.strip()
            if not indicator.startswith('http'):
                continue
            data.append({
                'indicator': indicator.replace('&amp;', '&'),
                'indicator_type': "URL",
                'indicator_direction': self.DIRECTION,
                'source_name': self.NAME,
                'source': source,
                'date': current_date,
            })
        return data
9 changes: 9 additions & 0 deletions plugins/vxvault.yapsy-plugin
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[Core]
Name = vxvault
Module = vxvault

[Documentation]
Author = [email protected]
Version = 0.1
Website = http://secrepo.com
Description = VX Vault
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
'pytest-cov',
'coveralls',
'LinkChecker',
'yapsy',
'markdown'
],
package_dir={'maltrieve': 'src'},
Expand Down
18 changes: 0 additions & 18 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,24 +71,6 @@ def test_create_default_dumpdir_when_specified_doesnt_exist():
assert cfg.dumpdir == '/tmp/malware'


def test_parse_simple_list():
    """Check process_simple_list against the remote plain-text fixture."""
    expected = set(['http://example.org/mylist', 'http://example.com/yourlist'])
    fixture = requests.get('http://xwell.org/assets/maltrieve-test.txt')
    parsed = maltrieve.process_simple_list(fixture.text)
    assert parsed == expected


def test_parse_xml_list():
    """Check process_xml_list_title against the remote XML fixture."""
    expected = set(['http://example.org/mylist', 'http://example.com/yourlist'])
    fixture = requests.get('http://xwell.org/assets/maltrieve-test-list.xml')
    parsed = maltrieve.process_xml_list_title(fixture.text)
    assert parsed == expected


def test_parse_xml_desc():
    """Check process_xml_list_desc against the remote XML fixture."""
    expected = set(['http://example.org/mylist', 'http://example.com/yourlist'])
    fixture = requests.get('http://xwell.org/assets/maltrieve-test-desc.xml')
    parsed = maltrieve.process_xml_list_desc(fixture.text)
    assert parsed == expected


def test_load_hashes(hashfile='test-load-hashes.json'):
assert maltrieve.load_hashes(hashfile) == \
set(['d41d8cd98f00b204e9800998ecf8427e'])
Expand Down