From 7ffcca41ae01767ea9e8626bb935e74bbc7fec2f Mon Sep 17 00:00:00 2001
From: "maciej.skorski"
Date: Wed, 19 Jul 2023 18:09:13 +0000
Subject: [PATCH 1/3] fix

---
 build/lib/ember/__init__.py | 233 +++++++++++++++
 build/lib/ember/features.py | 556 ++++++++++++++++++++++++++++++++++++
 2 files changed, 789 insertions(+)
 create mode 100644 build/lib/ember/__init__.py
 create mode 100644 build/lib/ember/features.py

diff --git a/build/lib/ember/__init__.py b/build/lib/ember/__init__.py
new file mode 100644
index 00000000..d0bee0d0
--- /dev/null
+++ b/build/lib/ember/__init__.py
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+
+import os
+import json
+import tqdm
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+import multiprocessing
+from .features import PEFeatureExtractor
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import TimeSeriesSplit
+from sklearn.metrics import (roc_auc_score, make_scorer)
+
+
+def raw_feature_iterator(file_paths):
+    """
+    Yield raw feature strings from the input file paths
+    """
+    for path in file_paths:
+        with open(path, "r") as fin:
+            for line in fin:
+                yield line
+
+
+def vectorize(irow, raw_features_string, X_path, y_path, extractor, nrows):
+    """
+    Vectorize a single sample of raw features and write it to a large numpy file
+    """
+    raw_features = json.loads(raw_features_string)
+    feature_vector = extractor.process_raw_features(raw_features)
+
+    y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=nrows)
+    y[irow] = raw_features["label"]
+
+    X = np.memmap(X_path, dtype=np.float32, mode="r+", shape=(nrows, extractor.dim))
+    X[irow] = feature_vector
+
+
+def vectorize_unpack(args):
+    """
+    Pass-through function for unpacking vectorize arguments
+    """
+    return vectorize(*args)
+
+
+def vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows):
+    """
+    Vectorize a subset of data and write it to disk
+    """
+    # Create space on disk to write features to
+    X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(nrows, extractor.dim))
+    y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=nrows)
+    del X, y
+
+    # Distribute the vectorization work
+    pool = multiprocessing.Pool()
+    argument_iterator = ((irow, raw_features_string, X_path, y_path, extractor, nrows)
+                         for irow, raw_features_string in enumerate(raw_feature_iterator(raw_feature_paths)))
+    for _ in tqdm.tqdm(pool.imap_unordered(vectorize_unpack, argument_iterator), total=nrows):
+        pass
+
+
+def create_vectorized_features(data_dir, feature_version=2):
+    """
+    Create feature vectors from raw features and write them to disk
+    """
+    extractor = PEFeatureExtractor(feature_version)
+
+    print("Vectorizing training set")
+    X_path = os.path.join(data_dir, "X_train.dat")
+    y_path = os.path.join(data_dir, "y_train.dat")
+    raw_feature_paths = [os.path.join(data_dir, "train_features_{}.jsonl".format(i)) for i in range(6)]
+    nrows = sum([1 for fp in raw_feature_paths for line in open(fp)])
+    vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows)
+
+    print("Vectorizing test set")
+    X_path = os.path.join(data_dir, "X_test.dat")
+    y_path = os.path.join(data_dir, "y_test.dat")
+    raw_feature_paths = [os.path.join(data_dir, "test_features.jsonl")]
+    nrows = sum([1 for fp in raw_feature_paths for line in open(fp)])
+    vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows)
+
+
+def read_vectorized_features(data_dir, subset=None, feature_version=2):
+    """
+    Read vectorized features into memory-mapped numpy arrays
+    """
+    if subset is not None and
subset not in ["train", "test"]:
+        return None
+
+    extractor = PEFeatureExtractor(feature_version)
+    ndim = extractor.dim
+    X_train = None
+    y_train = None
+    X_test = None
+    y_test = None
+
+    if subset is None or subset == "train":
+        X_train_path = os.path.join(data_dir, "X_train.dat")
+        y_train_path = os.path.join(data_dir, "y_train.dat")
+        y_train = np.memmap(y_train_path, dtype=np.float32, mode="r")
+        N = y_train.shape[0]
+        X_train = np.memmap(X_train_path, dtype=np.float32, mode="r", shape=(N, ndim))
+        if subset == "train":
+            return X_train, y_train
+
+    if subset is None or subset == "test":
+        X_test_path = os.path.join(data_dir, "X_test.dat")
+        y_test_path = os.path.join(data_dir, "y_test.dat")
+        y_test = np.memmap(y_test_path, dtype=np.float32, mode="r")
+        N = y_test.shape[0]
+        X_test = np.memmap(X_test_path, dtype=np.float32, mode="r", shape=(N, ndim))
+        if subset == "test":
+            return X_test, y_test
+
+    return X_train, y_train, X_test, y_test
+
+
+def read_metadata_record(raw_features_string):
+    """
+    Decode a raw features string and return the metadata fields
+    """
+    all_data = json.loads(raw_features_string)
+    metadata_keys = {"sha256", "appeared", "label", "avclass"}
+    return {k: all_data[k] for k in all_data.keys() & metadata_keys}
+
+
+def create_metadata(data_dir):
+    """
+    Write metadata to a csv file and return its dataframe
+    """
+    pool = multiprocessing.Pool()
+
+    train_feature_paths = [os.path.join(data_dir, "train_features_{}.jsonl".format(i)) for i in range(6)]
+    train_records = list(pool.imap(read_metadata_record, raw_feature_iterator(train_feature_paths)))
+
+    metadata_keys = ["sha256", "appeared", "label", "avclass"]
+    ordered_metadata_keys = [k for k in metadata_keys if k in train_records[0].keys()]
+
+    train_metadf = pd.DataFrame(train_records)[ordered_metadata_keys]
+    train_metadf.to_csv(os.path.join(data_dir, "train_metadata.csv"))
+
+    train_records = [dict(record, **{"subset": "train"}) for record in train_records]
+
+    test_feature_paths = [os.path.join(data_dir, "test_features.jsonl")]
+    test_records = list(pool.imap(read_metadata_record, raw_feature_iterator(test_feature_paths)))
+
+    test_metadf = pd.DataFrame(test_records)[ordered_metadata_keys]
+    test_metadf.to_csv(os.path.join(data_dir, "test_metadata.csv"))
+
+    test_records = [dict(record, **{"subset": "test"}) for record in test_records]
+
+    all_metadata_keys = ordered_metadata_keys + ["subset"]
+    metadf = pd.DataFrame(train_records + test_records)[all_metadata_keys]
+    metadf.to_csv(os.path.join(data_dir, "metadata.csv"))
+    return metadf
+
+
+def read_metadata(data_dir):
+    """
+    Read an already created metadata file and return its dataframe
+    """
+    return pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0)
+
+
+def optimize_model(data_dir):
+    """
+    Run a grid search to find the best LightGBM parameters
+    """
+    # Read data
+    X_train, y_train = read_vectorized_features(data_dir, subset="train")
+
+    # Filter unlabeled rows out of the training dataset
+    train_rows = (y_train != -1)
+    X_train = X_train[train_rows]
+    y_train = y_train[train_rows]
+
+    # score by ROC AUC; we are interested in low false positive rates,
+    # so we consider only the AUC for FPRs in [0, 5e-3]
+    score = make_scorer(roc_auc_score, max_fpr=5e-3)
+
+    # define search grid
+    param_grid = {
+        'boosting_type': ['gbdt'],
+        'objective': ['binary'],
+        'num_iterations': [500, 1000],
+        'learning_rate': [0.005, 0.05],
+        'num_leaves': [512, 1024, 2048],
+        'feature_fraction': [0.5, 0.8, 1.0],
+        'bagging_fraction': [0.5, 0.8, 1.0]
+    }
+    model = 
lgb.LGBMClassifier(boosting_type="gbdt", n_jobs=-1, silent=True)
+
+    # each row in X_train appears in chronological order of "appeared",
+    # so this works for progressive time series splitting
+    progressive_cv = TimeSeriesSplit(n_splits=3).split(X_train)
+
+    grid = GridSearchCV(estimator=model, cv=progressive_cv, param_grid=param_grid, scoring=score, n_jobs=1, verbose=3)
+    grid.fit(X_train, y_train)
+
+    return grid.best_params_
+
+
+def train_model(data_dir, params=None, feature_version=2):
+    """
+    Train a LightGBM model on the vectorized features of the EMBER dataset
+    """
+    # avoid a mutable default argument; do not mutate the caller's dict
+    params = {} if params is None else dict(params)
+    params.update({"application": "binary"})
+
+    # Read data
+    X_train, y_train = read_vectorized_features(data_dir, "train", feature_version)
+
+    # Filter unlabeled data
+    train_rows = (y_train != -1)
+
+    # Train
+    lgbm_dataset = lgb.Dataset(X_train[train_rows], y_train[train_rows])
+    lgbm_model = lgb.train(params, lgbm_dataset)
+
+    return lgbm_model
+
+
+def predict_sample(lgbm_model, file_data, feature_version=2):
+    """
+    Predict a PE file with a LightGBM model
+    """
+    extractor = PEFeatureExtractor(feature_version)
+    features = np.array(extractor.feature_vector(file_data), dtype=np.float32)
+    return lgbm_model.predict([features])[0]
diff --git a/build/lib/ember/features.py b/build/lib/ember/features.py
new file mode 100644
index 00000000..8e179f05
--- /dev/null
+++ b/build/lib/ember/features.py
@@ -0,0 +1,556 @@
+#!/usr/bin/python
+''' Extracts some basic features from PE files. Many of the features
+implemented have been used in previously published works. For more information,
+check out the following resources:
+* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
+* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
+* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
+* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
+* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
+
+It may be useful to do feature selection to reduce this set of features to a meaningful set
+for your modeling problem.
+'''
+
+import re
+import lief
+import hashlib
+import numpy as np
+import os
+import json
+from sklearn.feature_extraction import FeatureHasher
+
+LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.')
+LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10)
+LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11)
+
+
+class FeatureType(object):
+    ''' Base class from which each feature type may inherit '''
+
+    name = ''
+    dim = 0
+
+    def __repr__(self):
+        return '{}({})'.format(self.name, self.dim)
+
+    def raw_features(self, bytez, lief_binary):
+        ''' Generate a JSON-able representation of the file '''
+        raise NotImplementedError
+
+    def process_raw_features(self, raw_obj):
+        ''' Generate a feature vector from the raw features '''
+        raise NotImplementedError
+
+    def feature_vector(self, bytez, lief_binary):
+        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
+        if there are significant speedups to be gained from combining the two functions.
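+
+        A subclass that can skip the JSON-able intermediate entirely could
+        override it along these lines (hypothetical sketch, not an EMBER feature):
+
+            class FileSize(FeatureType):
+                name = 'filesize'
+                dim = 1
+
+                def feature_vector(self, bytez, lief_binary):
+                    # no raw_features round-trip needed for a trivial feature
+                    return np.array([len(bytez)], dtype=np.float32)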
'''
+        return self.process_raw_features(self.raw_features(bytez, lief_binary))
+
+
+class ByteHistogram(FeatureType):
+    ''' Byte histogram (raw counts, not normalized) over the entire binary file '''
+
+    name = 'histogram'
+    dim = 256
+
+    def __init__(self):
+        super(FeatureType, self).__init__()
+
+    def raw_features(self, bytez, lief_binary):
+        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
+        return counts.tolist()
+
+    def process_raw_features(self, raw_obj):
+        counts = np.array(raw_obj, dtype=np.float32)
+        total = counts.sum()
+        normalized = counts / total
+        return normalized
+
+
+class ByteEntropyHistogram(FeatureType):
+    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
+    This roughly approximates the joint probability of byte value and local entropy.
+    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
+    '''
+
+    name = 'byteentropy'
+    dim = 256
+
+    def __init__(self, step=1024, window=2048):
+        super(FeatureType, self).__init__()
+        self.window = window
+        self.step = step
+
+    def _entropy_bin_counts(self, block):
+        # coarse histogram, 16 bytes per bin
+        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
+        p = c.astype(np.float32) / self.window
+        wh = np.where(c)[0]
+        # multiply by 2 because quantizing 256 byte values (8 bits) down to
+        # 16 bins (4 bits) halved the information content
+        H = np.sum(-p[wh] * np.log2(p[wh])) * 2
+
+        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
+        if Hbin == 16:  # handle entropy = 8.0 bits
+            Hbin = 15
+
+        return Hbin, c
+
+    def raw_features(self, bytez, lief_binary):
+        output = np.zeros((16, 16), dtype=np.int64)
+        a = np.frombuffer(bytez, dtype=np.uint8)
+        if a.shape[0] < self.window:
+            Hbin, c = self._entropy_bin_counts(a)
+            output[Hbin, :] += c
+        else:
+            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
+            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
+            strides = a.strides + (a.strides[-1],)
+            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]
+
+            # from the blocks, compute histogram
+            for block in blocks:
+                Hbin, c = self._entropy_bin_counts(block)
+                output[Hbin, :] += c
+
+        return output.flatten().tolist()
+
+    def process_raw_features(self, raw_obj):
+        counts = np.array(raw_obj, dtype=np.float32)
+        total = counts.sum()
+        normalized = counts / total
+        return normalized
+
+
+class SectionInfo(FeatureType):
+    ''' Information about section names, sizes and entropy. Uses the hashing trick
+    to summarize all this section info into a feature vector.
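+
+    A sketch of the hashing trick as used below (illustrative; FeatureHasher
+    projects an arbitrary set of (name, value) pairs onto a fixed 50-dim vector):
+
+        >>> from sklearn.feature_extraction import FeatureHasher
+        >>> pairs = [(".text", 6.2), (".rdata", 4.1)]  # hypothetical sections
+        >>> FeatureHasher(50, input_type="pair").transform([pairs]).toarray()[0].shape
+        (50,)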
+ ''' + + name = 'section' + dim = 5 + 50 + 50 + 50 + 50 + 50 + + def __init__(self): + super(FeatureType, self).__init__() + + @staticmethod + def _properties(s): + return [str(c).split('.')[-1] for c in s.characteristics_lists] + + def raw_features(self, bytez, lief_binary): + if lief_binary is None: + return {"entry": "", "sections": []} + + # properties of entry point, or if invalid, the first executable section + + try: + if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12): + section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase) + if section is None: + raise lief.not_found + entry_section = section.name + else: # lief < 0.12 + entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name + except lief.not_found: + # bad entry point, let's find the first executable section + entry_section = "" + for s in lief_binary.sections: + if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: + entry_section = s.name + break + + raw_obj = {"entry": entry_section} + raw_obj["sections"] = [{ + 'name': s.name, + 'size': s.size, + 'entropy': s.entropy, + 'vsize': s.virtual_size, + 'props': self._properties(s) + } for s in lief_binary.sections] + return raw_obj + + def process_raw_features(self, raw_obj): + sections = raw_obj['sections'] + general = [ + len(sections), # total number of sections + # number of sections with zero size + sum(1 for s in sections if s['size'] == 0), + # number of sections with an empty name + sum(1 for s in sections if s['name'] == ""), + # number of RX + sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']), + # number of W + sum(1 for s in sections if 'MEM_WRITE' in s['props']) + ] + # gross characteristics of each section + section_sizes = [(s['name'], s['size']) for s in sections] + section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0] + section_entropy = [(s['name'], s['entropy']) for s in sections] + section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0] + section_vsize = [(s['name'], s['vsize']) for s in sections] + section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0] + entry_name_hashed = FeatureHasher(50, input_type="string").transform([ [raw_obj['entry']] ]).toarray()[0] + characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] + characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] + + return np.hstack([ + general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed, + characteristics_hashed + ]).astype(np.float32) + + +class ImportsInfo(FeatureType): + ''' Information about imported libraries and functions from the + import address table. Note that the total number of imported + functions is contained in GeneralFileInfo. 
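+
+    The raw form is a dict mapping each library to its imported symbols, with
+    ordinal-only imports encoded as "ordinal<N>", e.g. (hypothetical values):
+
+        {"KERNEL32.dll": ["CreateFileMappingA", "ordinal42"],
+         "USER32.dll": ["MessageBoxA"]}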
+ ''' + + name = 'imports' + dim = 1280 + + def __init__(self): + super(FeatureType, self).__init__() + + def raw_features(self, bytez, lief_binary): + imports = {} + if lief_binary is None: + return imports + + for lib in lief_binary.imports: + if lib.name not in imports: + imports[lib.name] = [] # libraries can be duplicated in listing, extend instead of overwrite + + # Clipping assumes there are diminishing returns on the discriminatory power of imported functions + # beyond the first 10000 characters, and this will help limit the dataset size + for entry in lib.entries: + if entry.is_ordinal: + imports[lib.name].append("ordinal" + str(entry.ordinal)) + else: + imports[lib.name].append(entry.name[:10000]) + + return imports + + def process_raw_features(self, raw_obj): + # unique libraries + libraries = list(set([l.lower() for l in raw_obj.keys()])) + libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0] + + # A string like "kernel32.dll:CreateFileMappingA" for each imported function + imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist] + imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0] + + # Two separate elements: libraries (alone) and fully-qualified names of imported functions + return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32) + + +class ExportsInfo(FeatureType): + ''' Information about exported functions. Note that the total number of exported + functions is contained in GeneralFileInfo. + ''' + + name = 'exports' + dim = 128 + + def __init__(self): + super(FeatureType, self).__init__() + + def raw_features(self, bytez, lief_binary): + if lief_binary is None: + return [] + + # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond + # the first 10000 characters, and this will help limit the dataset size + if LIEF_EXPORT_OBJECT: + # export is an object with .name attribute (0.10.0 and later) + clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions] + else: + # export is a string (LIEF 0.9.0 and earlier) + clipped_exports = [export[:10000] for export in lief_binary.exported_functions] + + + return clipped_exports + + def process_raw_features(self, raw_obj): + exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0] + return exports_hashed.astype(np.float32) + + +class GeneralFileInfo(FeatureType): + ''' General information about the file ''' + + name = 'general' + dim = 10 + + def __init__(self): + super(FeatureType, self).__init__() + + def raw_features(self, bytez, lief_binary): + if lief_binary is None: + return { + 'size': len(bytez), + 'vsize': 0, + 'has_debug': 0, + 'exports': 0, + 'imports': 0, + 'has_relocations': 0, + 'has_resources': 0, + 'has_signature': 0, + 'has_tls': 0, + 'symbols': 0 + } + + return { + 'size': len(bytez), + 'vsize': lief_binary.virtual_size, + 'has_debug': int(lief_binary.has_debug), + 'exports': len(lief_binary.exported_functions), + 'imports': len(lief_binary.imported_functions), + 'has_relocations': int(lief_binary.has_relocations), + 'has_resources': int(lief_binary.has_resources), + 'has_signature': int(lief_binary.has_signatures) if LIEF_HAS_SIGNATURE else int(lief_binary.has_signature), + 'has_tls': int(lief_binary.has_tls), + 'symbols': len(lief_binary.symbols), + } + + def process_raw_features(self, raw_obj): + return np.asarray([ + raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], 
raw_obj['imports'],
+            raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
+            raw_obj['symbols']
+        ],
+                          dtype=np.float32)
+
+
+class HeaderFileInfo(FeatureType):
+    ''' Machine, architecture, OS, linker and other information extracted from the header '''
+
+    name = 'header'
+    dim = 62
+
+    def __init__(self):
+        super(FeatureType, self).__init__()
+
+    def raw_features(self, bytez, lief_binary):
+        raw_obj = {}
+        raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
+        raw_obj['optional'] = {
+            'subsystem': "",
+            'dll_characteristics': [],
+            'magic': "",
+            'major_image_version': 0,
+            'minor_image_version': 0,
+            'major_linker_version': 0,
+            'minor_linker_version': 0,
+            'major_operating_system_version': 0,
+            'minor_operating_system_version': 0,
+            'major_subsystem_version': 0,
+            'minor_subsystem_version': 0,
+            'sizeof_code': 0,
+            'sizeof_headers': 0,
+            'sizeof_heap_commit': 0
+        }
+        if lief_binary is None:
+            return raw_obj
+
+        raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps
+        raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1]
+        raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in lief_binary.header.characteristics_list]
+        raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1]
+        raw_obj['optional']['dll_characteristics'] = [
+            str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists
+        ]
+        raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1]
+        raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version
+        raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version
+        raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version
+        raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version
+        raw_obj['optional'][
+            'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version
+        raw_obj['optional'][
+            'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version
+        raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version
+        raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version
+        raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code
+        raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers
+        raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit
+        return raw_obj
+
+    def process_raw_features(self, raw_obj):
+        return np.hstack([
+            raw_obj['coff']['timestamp'],
+            FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0],
+            FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0],
+            FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0],
+            FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0],
+            FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0],
+            raw_obj['optional']['major_image_version'],
+            raw_obj['optional']['minor_image_version'],
+            raw_obj['optional']['major_linker_version'],
+            raw_obj['optional']['minor_linker_version'],
+            raw_obj['optional']['major_operating_system_version'],
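+            # in total: 1 timestamp + 5 hashed blocks of 10 dims + 11 scalar header
+            # fields = 62 dims, matching the class-level dim declared above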
+            raw_obj['optional']['minor_operating_system_version'],
+            raw_obj['optional']['major_subsystem_version'],
+            raw_obj['optional']['minor_subsystem_version'],
+            raw_obj['optional']['sizeof_code'],
+            raw_obj['optional']['sizeof_headers'],
+            raw_obj['optional']['sizeof_heap_commit'],
+        ]).astype(np.float32)
+
+
+class StringExtractor(FeatureType):
+    ''' Extracts strings from the raw byte stream '''
+
+    name = 'strings'
+    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1
+
+    def __init__(self):
+        super(FeatureType, self).__init__()
+        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
+        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
+        # occurrences of the string 'C:\'; not actually extracting the path
+        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
+        # occurrences of http:// or https://; not actually extracting the URLs
+        self._urls = re.compile(b'https?://', re.IGNORECASE)
+        # occurrences of the string prefix HKEY_; not actually extracting registry names
+        self._registry = re.compile(b'HKEY_')
+        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
+        self._mz = re.compile(b'MZ')
+
+    def raw_features(self, bytez, lief_binary):
+        allstrings = self._allstrings.findall(bytez)
+        if allstrings:
+            # statistics about strings:
+            string_lengths = [len(s) for s in allstrings]
+            avlength = sum(string_lengths) / len(string_lengths)
+            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
+            as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)]
+            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
+            # distribution of characters in printable strings
+            csum = c.sum()
+            p = c.astype(np.float32) / csum
+            wh = np.where(c)[0]
+            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
+        else:
+            avlength = 0
+            c = np.zeros((96,), dtype=np.float32)
+            H = 0
+            csum = 0
+
+        return {
+            'numstrings': len(allstrings),
+            'avlength': avlength,
+            'printabledist': c.tolist(),  # store non-normalized histogram
+            'printables': int(csum),
+            'entropy': float(H),
+            'paths': len(self._paths.findall(bytez)),
+            'urls': len(self._urls.findall(bytez)),
+            'registry': len(self._registry.findall(bytez)),
+            'MZ': len(self._mz.findall(bytez))
+        }
+
+    def process_raw_features(self, raw_obj):
+        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
+        return np.hstack([
+            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
+            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
+            raw_obj['registry'], raw_obj['MZ']
+        ]).astype(np.float32)
+
+
+class DataDirectories(FeatureType):
+    ''' Extracts size and virtual address of the first 15 data directories '''
+
+    name = 'datadirectories'
+    dim = 15 * 2
+
+    def __init__(self):
+        super(FeatureType, self).__init__()
+        self._name_order = [
+            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
+            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
+            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
+        ]
+
+    def raw_features(self, bytez, lief_binary):
+        output = []
+        if lief_binary is None:
+            return output
+
+        for data_directory in lief_binary.data_directories:
+            output.append({
+                "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
+                "size": data_directory.size,
+                "virtual_address": data_directory.rva
+            })
+        return output
+
+    def process_raw_features(self, raw_obj):
+        features = np.zeros(2 * len(self._name_order), dtype=np.float32)
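+        # the PE data directory table has a fixed order, so entry i of raw_obj is
+        # assumed to line up with self._name_order[i]; each directory contributes
+        # an adjacent (size, virtual_address) pair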
+        for i in range(len(self._name_order)):
+            if i < len(raw_obj):
+                features[2 * i] = raw_obj[i]["size"]
+                features[2 * i + 1] = raw_obj[i]["virtual_address"]
+        return features
+
+
+class PEFeatureExtractor(object):
+    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''
+
+    def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
+        self.features = []
+        features = {
+            'ByteHistogram': ByteHistogram(),
+            'ByteEntropyHistogram': ByteEntropyHistogram(),
+            'StringExtractor': StringExtractor(),
+            'GeneralFileInfo': GeneralFileInfo(),
+            'HeaderFileInfo': HeaderFileInfo(),
+            'SectionInfo': SectionInfo(),
+            'ImportsInfo': ImportsInfo(),
+            'ExportsInfo': ExportsInfo()
+        }
+
+        if os.path.exists(features_file):
+            with open(features_file, encoding='utf8') as f:
+                x = json.load(f)
+                self.features = [features[feature] for feature in x['features'] if feature in features]
+        else:
+            self.features = list(features.values())
+
+        if feature_version == 1:
+            if not lief.__version__.startswith("0.8.3"):
+                if print_feature_warning:
+                    print(f"WARNING: EMBER feature version 1 was computed using lief version 0.8.3-18d5b75")
+                    print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
+                    print(f"WARNING: in the feature calculations.")
+        elif feature_version == 2:
+            self.features.append(DataDirectories())
+            if not lief.__version__.startswith("0.9.0"):
+                if print_feature_warning:
+                    print(f"WARNING: EMBER feature version 2 was computed using lief version 0.9.0")
+                    print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
+                    print(f"WARNING: in the feature calculations.")
+        else:
+            raise Exception(f"EMBER feature version must be 1 or 2, not {feature_version}")
+        self.dim = sum([fe.dim for fe in self.features])
+
+    def raw_features(self, bytez):
+        lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound,
+                       RuntimeError)
+        try:
+            lief_binary = lief.PE.parse(list(bytez))
+        except lief_errors as e:
+            print("lief error: ", str(e))
+            lief_binary = None
+        except Exception:  # everything else (KeyboardInterrupt, SystemExit, ValueError, ...)
+            raise
+
+        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
+        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
+        return features
+
+    def process_raw_features(self, raw_obj):
+        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
+        return np.hstack(feature_vectors).astype(np.float32)
+
+    def feature_vector(self, bytez):
+        return self.process_raw_features(self.raw_features(bytez))

From c587a83dd72b9688aab612713f24ea36a8dd033f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Sk=C3=B3rski?=
Date: Wed, 19 Jul 2023 18:52:12 +0000
Subject: [PATCH 2/3] cleanup

---
 ember/features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ember/features.py b/ember/features.py
index bbaa1381..8e179f05 100644
--- a/ember/features.py
+++ b/ember/features.py
@@ -189,7 +189,7 @@ def process_raw_features(self, raw_obj):
         section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
         section_vsize = [(s['name'], s['vsize']) for s in sections]
         section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
-        entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0]
+        entry_name_hashed = FeatureHasher(50, 
input_type="string").transform([ [raw_obj['entry']] ]).toarray()[0] characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] From bbd4f36b680d82972a25f7dede2d9e407fdd7cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Sk=C3=B3rski?= Date: Wed, 19 Jul 2023 18:52:42 +0000 Subject: [PATCH 3/3] don't track builds --- build/lib/ember/__init__.py | 233 --------------- build/lib/ember/features.py | 556 ------------------------------------ 2 files changed, 789 deletions(-) delete mode 100644 build/lib/ember/__init__.py delete mode 100644 build/lib/ember/features.py diff --git a/build/lib/ember/__init__.py b/build/lib/ember/__init__.py deleted file mode 100644 index d0bee0d0..00000000 --- a/build/lib/ember/__init__.py +++ /dev/null @@ -1,233 +0,0 @@ -# -*- coding: utf-8 -*- - -import os -import json -import tqdm -import numpy as np -import pandas as pd -import lightgbm as lgb -import multiprocessing -from .features import PEFeatureExtractor -from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import TimeSeriesSplit -from sklearn.metrics import (roc_auc_score, make_scorer) - - -def raw_feature_iterator(file_paths): - """ - Yield raw feature strings from the inputed file paths - """ - for path in file_paths: - with open(path, "r") as fin: - for line in fin: - yield line - - -def vectorize(irow, raw_features_string, X_path, y_path, extractor, nrows): - """ - Vectorize a single sample of raw features and write to a large numpy file - """ - raw_features = json.loads(raw_features_string) - feature_vector = extractor.process_raw_features(raw_features) - - y = np.memmap(y_path, dtype=np.float32, mode="r+", shape=nrows) - y[irow] = raw_features["label"] - - X = np.memmap(X_path, dtype=np.float32, mode="r+", shape=(nrows, extractor.dim)) - X[irow] = feature_vector - - -def vectorize_unpack(args): - """ - Pass through function for unpacking vectorize arguments - """ - return vectorize(*args) - - -def vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows): - """ - Vectorize a subset of data and write it to disk - """ - # Create space on disk to write features to - X = np.memmap(X_path, dtype=np.float32, mode="w+", shape=(nrows, extractor.dim)) - y = np.memmap(y_path, dtype=np.float32, mode="w+", shape=nrows) - del X, y - - # Distribute the vectorization work - pool = multiprocessing.Pool() - argument_iterator = ((irow, raw_features_string, X_path, y_path, extractor, nrows) - for irow, raw_features_string in enumerate(raw_feature_iterator(raw_feature_paths))) - for _ in tqdm.tqdm(pool.imap_unordered(vectorize_unpack, argument_iterator), total=nrows): - pass - - -def create_vectorized_features(data_dir, feature_version=2): - """ - Create feature vectors from raw features and write them to disk - """ - extractor = PEFeatureExtractor(feature_version) - - print("Vectorizing training set") - X_path = os.path.join(data_dir, "X_train.dat") - y_path = os.path.join(data_dir, "y_train.dat") - raw_feature_paths = [os.path.join(data_dir, "train_features_{}.jsonl".format(i)) for i in range(6)] - nrows = sum([1 for fp in raw_feature_paths for line in open(fp)]) - vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows) - - print("Vectorizing test set") - X_path = os.path.join(data_dir, "X_test.dat") - y_path = os.path.join(data_dir, "y_test.dat") - raw_feature_paths = [os.path.join(data_dir, "test_features.jsonl")] - nrows = 
sum([1 for fp in raw_feature_paths for line in open(fp)]) - vectorize_subset(X_path, y_path, raw_feature_paths, extractor, nrows) - - -def read_vectorized_features(data_dir, subset=None, feature_version=2): - """ - Read vectorized features into memory mapped numpy arrays - """ - if subset is not None and subset not in ["train", "test"]: - return None - - extractor = PEFeatureExtractor(feature_version) - ndim = extractor.dim - X_train = None - y_train = None - X_test = None - y_test = None - - if subset is None or subset == "train": - X_train_path = os.path.join(data_dir, "X_train.dat") - y_train_path = os.path.join(data_dir, "y_train.dat") - y_train = np.memmap(y_train_path, dtype=np.float32, mode="r") - N = y_train.shape[0] - X_train = np.memmap(X_train_path, dtype=np.float32, mode="r", shape=(N, ndim)) - if subset == "train": - return X_train, y_train - - if subset is None or subset == "test": - X_test_path = os.path.join(data_dir, "X_test.dat") - y_test_path = os.path.join(data_dir, "y_test.dat") - y_test = np.memmap(y_test_path, dtype=np.float32, mode="r") - N = y_test.shape[0] - X_test = np.memmap(X_test_path, dtype=np.float32, mode="r", shape=(N, ndim)) - if subset == "test": - return X_test, y_test - - return X_train, y_train, X_test, y_test - - -def read_metadata_record(raw_features_string): - """ - Decode a raw features string and return the metadata fields - """ - all_data = json.loads(raw_features_string) - metadata_keys = {"sha256", "appeared", "label", "avclass"} - return {k: all_data[k] for k in all_data.keys() & metadata_keys} - - -def create_metadata(data_dir): - """ - Write metadata to a csv file and return its dataframe - """ - pool = multiprocessing.Pool() - - train_feature_paths = [os.path.join(data_dir, "train_features_{}.jsonl".format(i)) for i in range(6)] - train_records = list(pool.imap(read_metadata_record, raw_feature_iterator(train_feature_paths))) - - metadata_keys = ["sha256", "appeared", "label", "avclass"] - ordered_metadata_keys = [k for k in metadata_keys if k in train_records[0].keys()] - - train_metadf = pd.DataFrame(train_records)[ordered_metadata_keys] - train_metadf.to_csv(os.path.join(data_dir, "train_metadata.csv")) - - train_records = [dict(record, **{"subset": "train"}) for record in train_records] - - test_feature_paths = [os.path.join(data_dir, "test_features.jsonl")] - test_records = list(pool.imap(read_metadata_record, raw_feature_iterator(test_feature_paths))) - - test_metadf = pd.DataFrame(test_records)[ordered_metadata_keys] - test_metadf.to_csv(os.path.join(data_dir, "test_metadata.csv")) - - test_records = [dict(record, **{"subset": "test"}) for record in test_records] - - all_metadata_keys = ordered_metadata_keys + ["subset"] - metadf = pd.DataFrame(train_records + test_records)[all_metadata_keys] - metadf.to_csv(os.path.join(data_dir, "metadata.csv")) - return metadf - - -def read_metadata(data_dir): - """ - Read an already created metadata file and return its dataframe - """ - return pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0) - - -def optimize_model(data_dir): - """ - Run a grid search to find the best LightGBM parameters - """ - # Read data - X_train, y_train = read_vectorized_features(data_dir, subset="train") - - # Filter unlabeled data - train_rows = (y_train != -1) - - # read training dataset - X_train = X_train[train_rows] - y_train = y_train[train_rows] - - # score by roc auc - # we're interested in low FPR rates, so we'll consider only the AUC for FPRs in [0,5e-3] - score = make_scorer(roc_auc_score, 
max_fpr=5e-3) - - # define search grid - param_grid = { - 'boosting_type': ['gbdt'], - 'objective': ['binary'], - 'num_iterations': [500, 1000], - 'learning_rate': [0.005, 0.05], - 'num_leaves': [512, 1024, 2048], - 'feature_fraction': [0.5, 0.8, 1.0], - 'bagging_fraction': [0.5, 0.8, 1.0] - } - model = lgb.LGBMClassifier(boosting_type="gbdt", n_jobs=-1, silent=True) - - # each row in X_train appears in chronological order of "appeared" - # so this works for progrssive time series splitting - progressive_cv = TimeSeriesSplit(n_splits=3).split(X_train) - - grid = GridSearchCV(estimator=model, cv=progressive_cv, param_grid=param_grid, scoring=score, n_jobs=1, verbose=3) - grid.fit(X_train, y_train) - - return grid.best_params_ - - -def train_model(data_dir, params={}, feature_version=2): - """ - Train the LightGBM model from the EMBER dataset from the vectorized features - """ - # update params - params.update({"application": "binary"}) - - # Read data - X_train, y_train = read_vectorized_features(data_dir, "train", feature_version) - - # Filter unlabeled data - train_rows = (y_train != -1) - - # Train - lgbm_dataset = lgb.Dataset(X_train[train_rows], y_train[train_rows]) - lgbm_model = lgb.train(params, lgbm_dataset) - - return lgbm_model - - -def predict_sample(lgbm_model, file_data, feature_version=2): - """ - Predict a PE file with an LightGBM model - """ - extractor = PEFeatureExtractor(feature_version) - features = np.array(extractor.feature_vector(file_data), dtype=np.float32) - return lgbm_model.predict([features])[0] diff --git a/build/lib/ember/features.py b/build/lib/ember/features.py deleted file mode 100644 index 8e179f05..00000000 --- a/build/lib/ember/features.py +++ /dev/null @@ -1,556 +0,0 @@ -#!/usr/bin/python -''' Extracts some basic features from PE files. Many of the features -implemented have been used in previously published works. For more information, -check out the following resources: -* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf -* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf -* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf -* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf -* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf - -It may be useful to do feature selection to reduce this set of features to a meaningful set -for your modeling problem. 
-''' - -import re -import lief -import hashlib -import numpy as np -import os -import json -from sklearn.feature_extraction import FeatureHasher - -LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.') -LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 ) -LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11) - - -class FeatureType(object): - ''' Base class from which each feature type may inherit ''' - - name = '' - dim = 0 - - def __repr__(self): - return '{}({})'.format(self.name, self.dim) - - def raw_features(self, bytez, lief_binary): - ''' Generate a JSON-able representation of the file ''' - raise (NotImplementedError) - - def process_raw_features(self, raw_obj): - ''' Generate a feature vector from the raw features ''' - raise (NotImplementedError) - - def feature_vector(self, bytez, lief_binary): - ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently - if there are significant speedups to be gained from combining the two functions. ''' - return self.process_raw_features(self.raw_features(bytez, lief_binary)) - - -class ByteHistogram(FeatureType): - ''' Byte histogram (count + non-normalized) over the entire binary file ''' - - name = 'histogram' - dim = 256 - - def __init__(self): - super(FeatureType, self).__init__() - - def raw_features(self, bytez, lief_binary): - counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256) - return counts.tolist() - - def process_raw_features(self, raw_obj): - counts = np.array(raw_obj, dtype=np.float32) - sum = counts.sum() - normalized = counts / sum - return normalized - - -class ByteEntropyHistogram(FeatureType): - ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015). - This roughly approximates the joint probability of byte value and local entropy. - See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info. - ''' - - name = 'byteentropy' - dim = 256 - - def __init__(self, step=1024, window=2048): - super(FeatureType, self).__init__() - self.window = window - self.step = step - - def _entropy_bin_counts(self, block): - # coarse histogram, 16 bytes per bin - c = np.bincount(block >> 4, minlength=16) # 16-bin histogram - p = c.astype(np.float32) / self.window - wh = np.where(c)[0] - H = np.sum(-p[wh] * np.log2( - p[wh])) * 2 # * x2 b.c. 
we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits) - - Hbin = int(H * 2) # up to 16 bins (max entropy is 8 bits) - if Hbin == 16: # handle entropy = 8.0 bits - Hbin = 15 - - return Hbin, c - - def raw_features(self, bytez, lief_binary): - output = np.zeros((16, 16), dtype=np.int) - a = np.frombuffer(bytez, dtype=np.uint8) - if a.shape[0] < self.window: - Hbin, c = self._entropy_bin_counts(a) - output[Hbin, :] += c - else: - # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html - shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window) - strides = a.strides + (a.strides[-1],) - blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :] - - # from the blocks, compute histogram - for block in blocks: - Hbin, c = self._entropy_bin_counts(block) - output[Hbin, :] += c - - return output.flatten().tolist() - - def process_raw_features(self, raw_obj): - counts = np.array(raw_obj, dtype=np.float32) - sum = counts.sum() - normalized = counts / sum - return normalized - - -class SectionInfo(FeatureType): - ''' Information about section names, sizes and entropy. Uses hashing trick - to summarize all this section info into a feature vector. - ''' - - name = 'section' - dim = 5 + 50 + 50 + 50 + 50 + 50 - - def __init__(self): - super(FeatureType, self).__init__() - - @staticmethod - def _properties(s): - return [str(c).split('.')[-1] for c in s.characteristics_lists] - - def raw_features(self, bytez, lief_binary): - if lief_binary is None: - return {"entry": "", "sections": []} - - # properties of entry point, or if invalid, the first executable section - - try: - if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12): - section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase) - if section is None: - raise lief.not_found - entry_section = section.name - else: # lief < 0.12 - entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name - except lief.not_found: - # bad entry point, let's find the first executable section - entry_section = "" - for s in lief_binary.sections: - if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: - entry_section = s.name - break - - raw_obj = {"entry": entry_section} - raw_obj["sections"] = [{ - 'name': s.name, - 'size': s.size, - 'entropy': s.entropy, - 'vsize': s.virtual_size, - 'props': self._properties(s) - } for s in lief_binary.sections] - return raw_obj - - def process_raw_features(self, raw_obj): - sections = raw_obj['sections'] - general = [ - len(sections), # total number of sections - # number of sections with zero size - sum(1 for s in sections if s['size'] == 0), - # number of sections with an empty name - sum(1 for s in sections if s['name'] == ""), - # number of RX - sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']), - # number of W - sum(1 for s in sections if 'MEM_WRITE' in s['props']) - ] - # gross characteristics of each section - section_sizes = [(s['name'], s['size']) for s in sections] - section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0] - section_entropy = [(s['name'], s['entropy']) for s in sections] - section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0] - section_vsize = [(s['name'], s['vsize']) for s in sections] - section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0] - entry_name_hashed = 
FeatureHasher(50, input_type="string").transform([ [raw_obj['entry']] ]).toarray()[0] - characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] - characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] - - return np.hstack([ - general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed, - characteristics_hashed - ]).astype(np.float32) - - -class ImportsInfo(FeatureType): - ''' Information about imported libraries and functions from the - import address table. Note that the total number of imported - functions is contained in GeneralFileInfo. - ''' - - name = 'imports' - dim = 1280 - - def __init__(self): - super(FeatureType, self).__init__() - - def raw_features(self, bytez, lief_binary): - imports = {} - if lief_binary is None: - return imports - - for lib in lief_binary.imports: - if lib.name not in imports: - imports[lib.name] = [] # libraries can be duplicated in listing, extend instead of overwrite - - # Clipping assumes there are diminishing returns on the discriminatory power of imported functions - # beyond the first 10000 characters, and this will help limit the dataset size - for entry in lib.entries: - if entry.is_ordinal: - imports[lib.name].append("ordinal" + str(entry.ordinal)) - else: - imports[lib.name].append(entry.name[:10000]) - - return imports - - def process_raw_features(self, raw_obj): - # unique libraries - libraries = list(set([l.lower() for l in raw_obj.keys()])) - libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0] - - # A string like "kernel32.dll:CreateFileMappingA" for each imported function - imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist] - imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0] - - # Two separate elements: libraries (alone) and fully-qualified names of imported functions - return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32) - - -class ExportsInfo(FeatureType): - ''' Information about exported functions. Note that the total number of exported - functions is contained in GeneralFileInfo. 
- ''' - - name = 'exports' - dim = 128 - - def __init__(self): - super(FeatureType, self).__init__() - - def raw_features(self, bytez, lief_binary): - if lief_binary is None: - return [] - - # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond - # the first 10000 characters, and this will help limit the dataset size - if LIEF_EXPORT_OBJECT: - # export is an object with .name attribute (0.10.0 and later) - clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions] - else: - # export is a string (LIEF 0.9.0 and earlier) - clipped_exports = [export[:10000] for export in lief_binary.exported_functions] - - - return clipped_exports - - def process_raw_features(self, raw_obj): - exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0] - return exports_hashed.astype(np.float32) - - -class GeneralFileInfo(FeatureType): - ''' General information about the file ''' - - name = 'general' - dim = 10 - - def __init__(self): - super(FeatureType, self).__init__() - - def raw_features(self, bytez, lief_binary): - if lief_binary is None: - return { - 'size': len(bytez), - 'vsize': 0, - 'has_debug': 0, - 'exports': 0, - 'imports': 0, - 'has_relocations': 0, - 'has_resources': 0, - 'has_signature': 0, - 'has_tls': 0, - 'symbols': 0 - } - - return { - 'size': len(bytez), - 'vsize': lief_binary.virtual_size, - 'has_debug': int(lief_binary.has_debug), - 'exports': len(lief_binary.exported_functions), - 'imports': len(lief_binary.imported_functions), - 'has_relocations': int(lief_binary.has_relocations), - 'has_resources': int(lief_binary.has_resources), - 'has_signature': int(lief_binary.has_signatures) if LIEF_HAS_SIGNATURE else int(lief_binary.has_signature), - 'has_tls': int(lief_binary.has_tls), - 'symbols': len(lief_binary.symbols), - } - - def process_raw_features(self, raw_obj): - return np.asarray([ - raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'], - raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'], - raw_obj['symbols'] - ], - dtype=np.float32) - - -class HeaderFileInfo(FeatureType): - ''' Machine, architecure, OS, linker and other information extracted from header ''' - - name = 'header' - dim = 62 - - def __init__(self): - super(FeatureType, self).__init__() - - def raw_features(self, bytez, lief_binary): - raw_obj = {} - raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []} - raw_obj['optional'] = { - 'subsystem': "", - 'dll_characteristics': [], - 'magic': "", - 'major_image_version': 0, - 'minor_image_version': 0, - 'major_linker_version': 0, - 'minor_linker_version': 0, - 'major_operating_system_version': 0, - 'minor_operating_system_version': 0, - 'major_subsystem_version': 0, - 'minor_subsystem_version': 0, - 'sizeof_code': 0, - 'sizeof_headers': 0, - 'sizeof_heap_commit': 0 - } - if lief_binary is None: - return raw_obj - - raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps - raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1] - raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in lief_binary.header.characteristics_list] - raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1] - raw_obj['optional']['dll_characteristics'] = [ - str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists - ] - raw_obj['optional']['magic'] = 
str(lief_binary.optional_header.magic).split('.')[-1] - raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version - raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version - raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version - raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version - raw_obj['optional'][ - 'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version - raw_obj['optional'][ - 'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version - raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version - raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version - raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code - raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers - raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit - return raw_obj - - def process_raw_features(self, raw_obj): - return np.hstack([ - raw_obj['coff']['timestamp'], - FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0], - FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0], - FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0], - FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0], - FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0], - raw_obj['optional']['major_image_version'], - raw_obj['optional']['minor_image_version'], - raw_obj['optional']['major_linker_version'], - raw_obj['optional']['minor_linker_version'], - raw_obj['optional']['major_operating_system_version'], - raw_obj['optional']['minor_operating_system_version'], - raw_obj['optional']['major_subsystem_version'], - raw_obj['optional']['minor_subsystem_version'], - raw_obj['optional']['sizeof_code'], - raw_obj['optional']['sizeof_headers'], - raw_obj['optional']['sizeof_heap_commit'], - ]).astype(np.float32) - - -class StringExtractor(FeatureType): - ''' Extracts strings from raw byte stream ''' - - name = 'strings' - dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1 - - def __init__(self): - super(FeatureType, self).__init__() - # all consecutive runs of 0x20 - 0x7f that are 5+ characters - self._allstrings = re.compile(b'[\x20-\x7f]{5,}') - # occurances of the string 'C:\'. Not actually extracting the path - self._paths = re.compile(b'c:\\\\', re.IGNORECASE) - # occurances of http:// or https://. Not actually extracting the URLs - self._urls = re.compile(b'https?://', re.IGNORECASE) - # occurances of the string prefix HKEY_. No actually extracting registry names - self._registry = re.compile(b'HKEY_') - # crude evidence of an MZ header (dropper?) 
somewhere in the byte stream - self._mz = re.compile(b'MZ') - - def raw_features(self, bytez, lief_binary): - allstrings = self._allstrings.findall(bytez) - if allstrings: - # statistics about strings: - string_lengths = [len(s) for s in allstrings] - avlength = sum(string_lengths) / len(string_lengths) - # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive - as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)] - c = np.bincount(as_shifted_string, minlength=96) # histogram count - # distribution of characters in printable strings - csum = c.sum() - p = c.astype(np.float32) / csum - wh = np.where(c)[0] - H = np.sum(-p[wh] * np.log2(p[wh])) # entropy - else: - avlength = 0 - c = np.zeros((96,), dtype=np.float32) - H = 0 - csum = 0 - - return { - 'numstrings': len(allstrings), - 'avlength': avlength, - 'printabledist': c.tolist(), # store non-normalized histogram - 'printables': int(csum), - 'entropy': float(H), - 'paths': len(self._paths.findall(bytez)), - 'urls': len(self._urls.findall(bytez)), - 'registry': len(self._registry.findall(bytez)), - 'MZ': len(self._mz.findall(bytez)) - } - - def process_raw_features(self, raw_obj): - hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0 - return np.hstack([ - raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'], - np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'], - raw_obj['registry'], raw_obj['MZ'] - ]).astype(np.float32) - - -class DataDirectories(FeatureType): - ''' Extracts size and virtual address of the first 15 data directories ''' - - name = 'datadirectories' - dim = 15 * 2 - - def __init__(self): - super(FeatureType, self).__init__() - self._name_order = [ - "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE", - "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE", - "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER" - ] - - def raw_features(self, bytez, lief_binary): - output = [] - if lief_binary is None: - return output - - for data_directory in lief_binary.data_directories: - output.append({ - "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""), - "size": data_directory.size, - "virtual_address": data_directory.rva - }) - return output - - def process_raw_features(self, raw_obj): - features = np.zeros(2 * len(self._name_order), dtype=np.float32) - for i in range(len(self._name_order)): - if i < len(raw_obj): - features[2 * i] = raw_obj[i]["size"] - features[2 * i + 1] = raw_obj[i]["virtual_address"] - return features - - -class PEFeatureExtractor(object): - ''' Extract useful features from a PE file, and return as a vector of fixed size. 
''' - - def __init__(self, feature_version=2, print_feature_warning=True, features_file=''): - self.features = [] - features = { - 'ByteHistogram': ByteHistogram(), - 'ByteEntropyHistogram': ByteEntropyHistogram(), - 'StringExtractor': StringExtractor(), - 'GeneralFileInfo': GeneralFileInfo(), - 'HeaderFileInfo': HeaderFileInfo(), - 'SectionInfo': SectionInfo(), - 'ImportsInfo': ImportsInfo(), - 'ExportsInfo': ExportsInfo() - } - - if os.path.exists(features_file): - with open(features_file, encoding='utf8') as f: - x = json.load(f) - self.features = [features[feature] for feature in x['features'] if feature in features] - else: - self.features = list(features.values()) - - if feature_version == 1: - if not lief.__version__.startswith("0.8.3"): - if print_feature_warning: - print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75") - print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") - print(f"WARNING: in the feature calculations.") - elif feature_version == 2: - self.features.append(DataDirectories()) - if not lief.__version__.startswith("0.9.0"): - if print_feature_warning: - print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-") - print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") - print(f"WARNING: in the feature calculations.") - else: - raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}") - self.dim = sum([fe.dim for fe in self.features]) - - def raw_features(self, bytez): - lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, - RuntimeError) - try: - lief_binary = lief.PE.parse(list(bytez)) - except lief_errors as e: - print("lief error: ", str(e)) - lief_binary = None - except Exception: # everything else (KeyboardInterrupt, SystemExit, ValueError): - raise - - features = {"sha256": hashlib.sha256(bytez).hexdigest()} - features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features}) - return features - - def process_raw_features(self, raw_obj): - feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features] - return np.hstack(feature_vectors).astype(np.float32) - - def feature_vector(self, bytez): - return self.process_raw_features(self.raw_features(bytez))
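
For context, a minimal end-to-end sketch of how the package touched by this patch
series is typically driven (illustrative only: "model.txt" and "sample.exe" are
hypothetical paths, and a trained LightGBM model plus the lief and lightgbm
dependencies are assumed to be available):

    import lightgbm as lgb
    import ember

    # load a previously trained EMBER model
    lgbm_model = lgb.Booster(model_file="model.txt")

    # extract version-2 features from a PE file and score it
    with open("sample.exe", "rb") as f:
        file_data = f.read()
    score = ember.predict_sample(lgbm_model, file_data, feature_version=2)
    print(f"malicious score: {score:.4f}")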