diff --git a/hela/__init__.py b/hela/__init__.py new file mode 100644 index 0000000..e3dd264 --- /dev/null +++ b/hela/__init__.py @@ -0,0 +1,7 @@ + +from .dataset import * +from .feature_importance import * +from .models import * +from .plot import * +from .utils import * +from .wpercentile import * diff --git a/dataset.py b/hela/dataset.py similarity index 71% rename from dataset.py rename to hela/dataset.py index 727db15..59ba2be 100644 --- a/dataset.py +++ b/hela/dataset.py @@ -8,8 +8,7 @@ __all__ = ["Dataset", "load_dataset", "load_data_file"] -logger = logging.getLogger(__name__) - +LOGGER = logging.getLogger(__name__) Dataset = namedtuple("Dataset", ["training_x", "training_y", "testing_x", "testing_y", @@ -17,39 +16,40 @@ def load_data_file(data_file, num_features): - + data = np.load(data_file) - + if data.ndim == 1: data = data[None, :] - + x = data[:, :num_features] y = data[:, num_features:] - + return x, y def load_dataset(dataset_file): - + with open(dataset_file, "r") as f: dataset_info = json.load(f) - + metadata = dataset_info["metadata"] - + base_path = os.path.dirname(dataset_file) - + # Load training data - training_file = os.path.join(base_path, dataset_info["training_data"]) - logger.debug("Loading training data from '{}'...".format(training_file)) - training_x, training_y = load_data_file(training_file, metadata["num_features"]) - + training_file = os.path.join(base_path, dataset_info["training_data"]) + LOGGER.debug("Loading training data from '%s'...", training_file) + training_x, training_y = load_data_file(training_file, + metadata["num_features"]) + # Optionally, load testing data testing_x, testing_y = None, None if dataset_info["testing_data"] is not None: testing_file = os.path.join(base_path, dataset_info["testing_data"]) - logger.debug("Loading testing data from '{}'...".format(testing_file)) - testing_x, testing_y = load_data_file(testing_file, metadata["num_features"]) - + LOGGER.debug("Loading testing data from '%s'...", testing_file) + testing_x, testing_y = load_data_file(testing_file, + metadata["num_features"]) + return Dataset(training_x, training_y, testing_x, testing_y, metadata["names"], metadata["ranges"], metadata["colors"]) - diff --git a/hela/feature_importance.py b/hela/feature_importance.py new file mode 100644 index 0000000..1f1537e --- /dev/null +++ b/hela/feature_importance.py @@ -0,0 +1,183 @@ + +from collections import namedtuple +from multiprocessing import Pool +from typing import Optional, Tuple +import logging + +import numpy as np + +from sklearn.ensemble import RandomForestRegressor +from sklearn.tree import DecisionTreeRegressor + +from .models import _tree_weights +from .utils import tqdm + +__all__ = [ + 'importances_per_output' +] + +LOGGER = logging.getLogger(__name__) + + +def importances_per_output( + forest: RandomForestRegressor, + x: np.ndarray, + y: np.ndarray + ) -> np.ndarray: + """ + Compute the feature importances with a breakdown per output variable (i.e., + per label). + + Parameters + ---------- + forest : RandomForestRegressor + A trained random forest from which feature importances will be + extracted. + x : array-like of shape (n_samples, n_features) + The input samples of the training dataset used to train `forest`. + y : array-like of shape (n_samples, n_outputs) + + Returns + ------- + importances : np.ndarray of shape (n_features, n_outputs) + The feature importances splitted according to the contributions to each + output variable. Summing along the columns of this array will provide + the global feature importances contained in + forest.feature_importances_. + """ + + impurities = impurities_per_output(forest, x, y) + return importances_from_impurities(forest, impurities) + + +def importances_from_impurities( + forest: RandomForestRegressor, + impurities: np.ndarray + ) -> np.ndarray: + + LOGGER.info("Computing importances from impurities...") + with Pool(forest.n_jobs) as pool: + importances_ = list(tqdm( + pool.imap(_tree_importances_per_output, + zip(forest, impurities)), + total=len(forest) + )) + + importances = np.mean(importances_, axis=0) + + return importances / importances.sum() + + +def _tree_importances_per_output( + args: Tuple[DecisionTreeRegressor, np.ndarray] + ) -> np.ndarray: + + tree, impurities = args + + tree_ = tree.tree_ + importances = np.zeros((tree_.n_features, tree_.n_outputs)) + + Node = namedtuple( + "Node", + ['left', 'right', 'weighted_n_node_samples', 'feature', 'impurities'] + ) + + for node in zip(tree_.children_left, tree_.children_right, + tree_.weighted_n_node_samples, tree_.feature, impurities): + node = Node(*node) + if node.left == -1: + continue + + left = Node( + None, None, + tree_.weighted_n_node_samples[node.left], + tree_.feature[node.left], + impurities[node.left] + ) + right = Node( + None, None, + tree_.weighted_n_node_samples[node.right], + tree_.feature[node.right], impurities[node.right] + ) + + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurities - + left.weighted_n_node_samples * left.impurities - + right.weighted_n_node_samples * right.impurities + ) + + return importances / importances.sum() + + +# Global variables to avoid pickling very large arrays with multiprocessing +_X = None +_Y = None + + +def impurities_per_output( + forest: RandomForestRegressor, + x: np.ndarray, + y: np.ndarray + ) -> np.ndarray: + + LOGGER.info("Computing impurities...") + global _X + global _Y + _X = x + _Y = y + with Pool(forest.n_jobs) as pool: + impurities = list(tqdm( + pool.imap(_tree_impurities_per_output, forest), + total=len(forest) + )) + + return np.array(impurities) + + +def _tree_impurities_per_output(tree: DecisionTreeRegressor) -> np.ndarray: + + assert _X is not None and _Y is not None + + tree_ = tree.tree_ + x, y = _X, _Y + + impurities = np.zeros((tree.tree_.node_count, y.shape[1])) + weights = _tree_weights(tree, len(x)) + + def __tree_impurities_per_output(node_idx, x, y, weights): + + imp = _wvariance(y, weights, axis=0) + impurities[node_idx] = imp + + if tree_.children_left[node_idx] == -1: + return + + feature = tree_.feature[node_idx] + threshold = tree_.threshold[node_idx] + mask = x[:, feature] <= threshold + __tree_impurities_per_output( + tree_.children_left[node_idx], + x[mask], + y[mask], + weights[mask] + ) + mask = np.logical_not(mask) + __tree_impurities_per_output( + tree_.children_right[node_idx], + x[mask], + y[mask], + weights[mask] + ) + + __tree_impurities_per_output(0, x, y, weights) + return impurities + + +def _wvariance( + data: np.ndarray, + weights: np.ndarray, + axis: Optional[int] = None + ) -> np.ndarray: + + avg = np.average(data, weights=weights, axis=axis) + return np.average((data - avg)**2, weights=weights, axis=axis) diff --git a/models.py b/hela/models.py similarity index 96% rename from models.py rename to hela/models.py index 6dc3369..f4caa40 100644 --- a/models.py +++ b/hela/models.py @@ -8,11 +8,7 @@ from sklearn.utils import check_random_state from sklearn.preprocessing import MinMaxScaler -try: - from tqdm import tqdm -except ImportError: - def tqdm(x, *_, **__): - return x +from .utils import tqdm __all__ = [ "Model", @@ -75,14 +71,14 @@ def _scaler_fit(self, y): self.scaler.fit(y) - def _scaler_transform(self, y): + def scaler_transform(self, y): if y.ndim == 1: y = y[:, None] return self.scaler.transform(y)[:, 0] return self.scaler.transform(y) - def _scaler_inverse_transform(self, y): + def scaler_inverse_transform(self, y): if y.ndim == 1: y = y[:, None] @@ -92,7 +88,7 @@ def _scaler_inverse_transform(self, y): def fit(self, x, y): self._scaler_fit(y) - self.rf.fit(x, self._scaler_transform(y)) + self.rf.fit(x, self.scaler_transform(y)) # Build the structures to quickly compute the posteriors if self.enable_posterior: @@ -105,7 +101,7 @@ def fit(self, x, y): def predict(self, x): pred = self.rf.predict(x) - return self._scaler_inverse_transform(pred) + return self.scaler_inverse_transform(pred) def predict_median(self, x): return self.predict_percentile(x, 50) diff --git a/plot.py b/hela/plot.py similarity index 93% rename from plot.py rename to hela/plot.py index a3434f2..bcdfecf 100644 --- a/plot.py +++ b/hela/plot.py @@ -9,19 +9,15 @@ from sklearn import metrics, neighbors from sklearn.preprocessing import MinMaxScaler -from models import resample_posterior -from wpercentile import wmedian - -try: - from tqdm import tqdm -except ImportError: - def tqdm(x, *_, **__): - return x +from .models import resample_posterior +from .wpercentile import wmedian +from .utils import tqdm __all__ = [ - "predicted_vs_real", - "feature_importances", - "posterior_matrix" + 'predicted_vs_real', + 'feature_importances', + 'stacked_feature_importances', + 'posterior_matrix' ] POSTERIOR_MAX_SIZE = 10000 @@ -96,6 +92,26 @@ def feature_importances(forests, names, colors): return fig +def stacked_feature_importances(importances, names, colors): + + bottoms = np.zeros(importances.shape[0]) + + ind = np.arange(len(importances)) + + fig = plt.figure() + ax = fig.gca() + for data_i, name_i, color_i in zip(importances.T, names, colors): + ax.bar(ind, data_i, bottom=bottoms, color=color_i, label=name_i) + bottoms += data_i + + ax.set_xlabel("Feature index", fontsize=18) + ax.legend(fontsize=16) + ax.grid() + + fig.tight_layout() + return fig + + def posterior_matrix(posterior, names, ranges, colors, soft_colors=None): samples, weights = posterior diff --git a/utils.py b/hela/utils.py similarity index 88% rename from utils.py rename to hela/utils.py index 414cc49..2cba725 100644 --- a/utils.py +++ b/hela/utils.py @@ -1,40 +1,46 @@ -import os import logging +try: + from tqdm import tqdm +except ImportError: + def tqdm(x, *_, **__): + return x + __all__ = [ - "config_logger" + 'config_logger', + 'tqdm' ] def config_logger(log_file="/dev/null", level=logging.INFO): - + class MyFormatter(logging.Formatter): - + info_format = "\x1b[32;1m%(asctime)s [%(name)s]\x1b[0m %(message)s" error_format = "\x1b[31;1m%(asctime)s [%(name)s] [%(levelname)s]\x1b[0m %(message)s" - + def format(self, record): - + if record.levelno > logging.INFO: self._style._fmt = self.error_format else: self._style._fmt = self.info_format - + res = super(MyFormatter, self).format(record) return res - + rootLogger = logging.getLogger() - + fileHandler = logging.FileHandler(log_file) fileFormatter = logging.Formatter("%(asctime)s [%(name)s] [%(levelname)s]> %(message)s") fileHandler.setFormatter(fileFormatter) rootLogger.addHandler(fileHandler) - + consoleHandler = logging.StreamHandler() consoleFormatter = MyFormatter() consoleHandler.setFormatter(consoleFormatter) rootLogger.addHandler(consoleHandler) - + rootLogger.setLevel(level) diff --git a/wpercentile.py b/hela/wpercentile.py similarity index 95% rename from wpercentile.py rename to hela/wpercentile.py index f030db9..529e316 100644 --- a/wpercentile.py +++ b/hela/wpercentile.py @@ -8,26 +8,26 @@ def _wpercentile1d(data, weights, percentiles): - + if data.ndim > 1 or weights.ndim > 1: raise ValueError("data and weights must be one-dimensional arrays") - + if data.shape != weights.shape: raise ValueError("data and weights must have the same shape") - + data = np.asarray(data) weights = np.asarray(weights) percentiles = np.asarray(percentiles) - + sort_indices = np.argsort(data) sorted_data = data[sort_indices] sorted_weights = weights[sort_indices] - + cumsum_weights = np.cumsum(sorted_weights) sum_weights = cumsum_weights[-1] - + pn = 100 * (cumsum_weights - 0.5*sorted_weights) / sum_weights - + return np.interp(percentiles, pn, sorted_data) @@ -39,22 +39,23 @@ def wpercentile(data, weights, percentiles, axis=None): data = np.ravel(data) weights = np.ravel(weights) return _wpercentile1d(data, weights, percentiles) - + axis = np.atleast_1d(axis) - + # Reshape the arrays for proper computation # Move the requested axis to the final dimensions dest_axis = list(range(len(axis))) data2 = np.moveaxis(data, axis, dest_axis) - + ndim = len(axis) shape = data2.shape newshape = (np.prod(shape[:ndim]), np.prod(shape[ndim:])) newdata = np.reshape(data2, newshape) newweights = np.reshape(weights, newshape[0]) - - result = np.apply_along_axis(_wpercentile1d, 0, newdata, newweights, percentiles) - + + result = np.apply_along_axis(_wpercentile1d, 0, newdata, newweights, + percentiles) + final_shape = (*np.shape(percentiles), *shape[ndim:]) return np.reshape(result, final_shape) diff --git a/rfretrieval.py b/rfretrieval.py index 662265f..d051d94 100644 --- a/rfretrieval.py +++ b/rfretrieval.py @@ -7,122 +7,159 @@ from sklearn import metrics, multioutput import joblib -from dataset import load_dataset, load_data_file -from models import Model -from utils import config_logger -from wpercentile import wpercentile -import plot +import hela -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) def train_model(dataset, num_trees, num_jobs, verbose=1): - pipeline = Model(num_trees, num_jobs, - names=dataset.names, - ranges=dataset.ranges, - colors=dataset.colors, - verbose=verbose) + pipeline = hela.Model( + num_trees, num_jobs, + names=dataset.names, + ranges=dataset.ranges, + colors=dataset.colors, + verbose=verbose + ) pipeline.fit(dataset.training_x, dataset.training_y) return pipeline def test_model(model, dataset, output_path): - + if dataset.testing_x is None: return - - logger.info("Testing model...") + + LOGGER.info("Testing model...") pred = model.predict(dataset.testing_x) # pred = model.predict_median(dataset.testing_x) - r2scores = {name_i: metrics.r2_score(real_i, pred_i) - for name_i, real_i, pred_i in zip(dataset.names, dataset.testing_y.T, pred.T)} + r2scores = { + name_i: metrics.r2_score(real_i, pred_i) + for name_i, real_i, pred_i in zip(dataset.names, + dataset.testing_y.T, + pred.T) + } print("Testing scores:") for name, values in r2scores.items(): print("\tR^2 score for {}: {:.3f}".format(name, values)) - - logger.info("Plotting testing results...") - fig = plot.predicted_vs_real(dataset.testing_y, pred, dataset.names, dataset.ranges) + + LOGGER.info("Plotting testing results...") + fig = hela.predicted_vs_real( + dataset.testing_y, + pred, + names=dataset.names, + ranges=dataset.ranges + ) fig.savefig(os.path.join(output_path, "predicted_vs_real.pdf"), bbox_inches='tight') -def compute_feature_importance(model, dataset, output_path): - - logger.info("Computing feature importance for individual parameters...") +def _plot_feature_importances(model, dataset, output_path): + + LOGGER.info("Computing feature importances for individual parameters...") regr = multioutput.MultiOutputRegressor(model, n_jobs=1) regr.fit(dataset.training_x, dataset.training_y) - - fig = plot.feature_importances(forests=[i.rf for i in regr.estimators_] + [model.rf], - names=dataset.names + ["joint prediction"], - colors=dataset.colors + ["C0"]) - + + fig = hela.plot.feature_importances( + forests=[i.rf for i in regr.estimators_] + [model.rf], + names=dataset.names + ["joint prediction"], + colors=dataset.colors + ["C0"] + ) + fig.savefig(os.path.join(output_path, "feature_importances.pdf"), bbox_inches='tight') +def _plot_feature_importances_breakdown(model, dataset, output_path): + + LOGGER.info("Computing feature importances per output...") + importances = hela.importances_per_output( + model.rf, + dataset.training_x, + model.scaler_transform(dataset.training_y) + ) + + fig = hela.plot.stacked_feature_importances( + importances, + dataset.names, + dataset.colors + ) + + fig.tight_layout() + fig.savefig( + os.path.join(output_path, "feature_importances_breakdown.pdf"), + bbox_inches='tight' + ) + + def data_ranges(posterior, percentiles=(50, 16, 84)): - + samples, weights = posterior - values = wpercentile(samples, weights, percentiles, axis=0) + values = hela.wpercentile(samples, weights, percentiles, axis=0) ranges = np.array([values[0], values[2]-values[0], values[0]-values[1]]) return ranges.T def main_train(training_dataset, model_path, num_trees, num_jobs, - feature_importance, quiet, + feature_importances, + feature_importances_breakdown, + quiet, **_): - - logger.info("Loading dataset '{}'...".format(training_dataset)) - dataset = load_dataset(training_dataset) - - logger.info("Training model...") + + LOGGER.info("Loading dataset '%s'...", training_dataset) + dataset = hela.load_dataset(training_dataset) + + LOGGER.info("Training model...") model = train_model(dataset, num_trees, num_jobs, not quiet) - + os.makedirs(model_path, exist_ok=True) model_file = os.path.join(model_path, "model.pkl") - logger.info("Saving model to '{}'...".format(model_file)) + LOGGER.info("Saving model to '%s'...", model_file) joblib.dump(model, model_file) - - logger.info("Printing model information...") + + LOGGER.info("Printing model information...") print("OOB score: {:.4f}".format(model.rf.oob_score_)) - + test_model(model, dataset, model_path) - - if feature_importance: + + if feature_importances: model.enable_posterior = False - compute_feature_importance(model, dataset, model_path) + _plot_feature_importances(model, dataset, model_path) + + if feature_importances_breakdown: + _plot_feature_importances_breakdown(model, dataset, model_path) def main_predict(model_path, data_file, output_path, plot_posterior, **_): - + model_file = os.path.join(model_path, "model.pkl") - logger.info("Loading random forest from '{}'...".format(model_file)) + LOGGER.info("Loading random forest from '%s'...", model_file) model = joblib.load(model_file) - - logger.info("Loading data from '{}'...".format(data_file)) - data, _ = load_data_file(data_file, model.rf.n_features_) - + + LOGGER.info("Loading data from '%s'...", data_file) + data, _ = hela.load_data_file(data_file, model.rf.n_features_) + posterior = model.posterior(data[0]) - + posterior_ranges = data_ranges(posterior) for name_i, pred_range_i in zip(model.names, posterior_ranges): - print("Prediction for {}: {:.3g} [+{:.3g} -{:.3g}]".format(name_i, *pred_range_i)) - + format_str = "Prediction for {}: {:.3g} [+{:.3g} -{:.3g}]" + print(format_str.format(name_i, *pred_range_i)) + if plot_posterior: - logger.info("Plotting the posterior matrix...") - - fig = plot.posterior_matrix( + LOGGER.info("Plotting the posterior matrix...") + + fig = hela.plot.posterior_matrix( posterior, names=model.names, ranges=model.ranges, colors=model.colors ) os.makedirs(output_path, exist_ok=True) - logger.info("Saving the figure....") + LOGGER.info("Saving the figure....") fig.savefig(os.path.join(output_path, "posterior_matrix.pdf"), bbox_inches='tight') - logger.info("Done.") + LOGGER.info("Done.") def show_usage(parser, **_): @@ -130,38 +167,65 @@ def show_usage(parser, **_): def main(): - - parser = argparse.ArgumentParser(description="rfretrieval: Atmospheric retrieval with random forests.") + + parser = argparse.ArgumentParser( + description="rfretrieval: Atmospheric retrieval with random forests." + ) parser.set_defaults(func=show_usage, parser=parser) parser.add_argument("--quiet", action='store_true') subparsers = parser.add_subparsers() - + parser_train = subparsers.add_parser('train', help="train a model") - parser_train.add_argument("training_dataset", type=str, - help="JSON file with the training dataset description") - parser_train.add_argument("model_path", type=str, - help="path where the trained model will be saved") - parser_train.add_argument("--num-trees", type=int, default=1000, - help="number of trees in the forest") - parser_train.add_argument("--num-jobs", type=int, default=5, - help="number of parallel jobs for fitting the random forest") - parser_train.add_argument("--feature-importance", action='store_true', - help="compute feature importances after training") parser_train.set_defaults(func=main_train) - - parser_test = subparsers.add_parser('predict', help="use a trained model to perform a prediction") + parser_train.add_argument( + "training_dataset", type=str, + help="JSON file with the training dataset description" + ) + parser_train.add_argument( + "model_path", type=str, + help="path where the trained model will be saved" + ) + parser_train.add_argument( + "--num-trees", type=int, default=1000, + help="number of trees in the forest" + ) + parser_train.add_argument( + "--num-jobs", type=int, default=5, + help="number of parallel jobs for fitting the random forest" + ) + parser_train.add_argument( + "--feature-importances", action='store_true', + help="plot feature importances after training" + ) + parser_train.add_argument( + "--feature-importances-breakdown", action='store_true', + help="plot feature importances per output after training" + ) + + parser_test = subparsers.add_parser( + 'predict', + help="use a trained model to perform a prediction" + ) parser_test.set_defaults(func=main_predict) - parser_test.add_argument("model_path", type=str, - help="path to the trained model") - parser_test.add_argument("data_file", type=str, - help="NPY file with the data for the prediction") - parser_test.add_argument("output_path", type=str, - help="path to write the results of the prediction") - parser_test.add_argument("--plot-posterior", action='store_true', - help="plot and save the scatter matrix of the posterior distribution") - + parser_test.add_argument( + "model_path", type=str, + help="path to the trained model" + ) + parser_test.add_argument( + "data_file", type=str, + help="NPY file with the data for the prediction" + ) + parser_test.add_argument( + "output_path", type=str, + help="path to write the results of the prediction" + ) + parser_test.add_argument( + "--plot-posterior", action='store_true', + help="plot and save the scatter matrix of the posterior distribution" + ) + args = parser.parse_args() - config_logger(level=logging.WARNING if args.quiet else logging.INFO) + hela.config_logger(level=logging.WARNING if args.quiet else logging.INFO) args.func(**vars(args))