Merge pull request #1430 from marc-vdm/ca_range_score_fix
additional contribution analysis fixes
marc-vdm authored Jan 16, 2025
2 parents 1e02314 + ed7d705 commit eeb1685
Showing 7 changed files with 214 additions and 91 deletions.
42 changes: 24 additions & 18 deletions activity_browser/bwutils/multilca.py
@@ -3,20 +3,20 @@
from typing import Iterable, Optional, Union
from logging import getLogger

import bw2analyzer as ba
import bw2calc as bc
import numpy as np
import pandas as pd
from PySide2.QtWidgets import QApplication, QMessageBox

from activity_browser.mod import bw2data as bd
from activity_browser.mod.bw2analyzer import ABContributionAnalysis

from .commontasks import wrap_text
from .errors import ReferenceFlowValueError
from .metadata import AB_metadata

log = getLogger(__name__)
ca = ba.ContributionAnalysis()
ca = ABContributionAnalysis()


class MLCA(object):
@@ -394,20 +394,24 @@ def __init__(self, mlca):
),
}

def normalize(self, contribution_array: np.ndarray) -> np.ndarray:
"""Normalise the contribution array.
def normalize(self, contribution_array: np.ndarray, total_range:bool=True) -> np.ndarray:
"""Normalize the contribution array based on range or score
Parameters
----------
contribution_array : A 2-dimensional contribution array
total_range : A bool, True for normalization based on range, False for score
Returns
-------
2-dimensional array of same shape, with scores normalized.
"""
scores = abs(contribution_array.sum(axis=1, keepdims=True))
return contribution_array / scores
if total_range: # total is based on the range
total = abs(abs(contribution_array).sum(axis=1, keepdims=True))
else: # total is based on the score
total = abs(contribution_array.sum(axis=1, keepdims=True))
return contribution_array / total
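
To illustrate what the new `total_range` flag changes, here is a minimal standalone sketch of the two normalization modes (plain NumPy, not the Activity Browser class itself):

```python
import numpy as np

def normalize(contribution_array: np.ndarray, total_range: bool = True) -> np.ndarray:
    """Normalize each row by its range (sum of absolute values) or its score (plain sum)."""
    if total_range:  # total is the range of the row
        total = np.abs(contribution_array).sum(axis=1, keepdims=True)
    else:  # total is the (absolute) score of the row
        total = np.abs(contribution_array.sum(axis=1, keepdims=True))
    return contribution_array / total

contributions = np.array([[10.0, -2.0]])
print(normalize(contributions, total_range=True))   # fractions of the range (12): [[ 0.83 -0.17]]
print(normalize(contributions, total_range=False))  # fractions of the score (8):  [[ 1.25 -0.25]]
```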

def _build_dict(
self,
@@ -437,12 +441,13 @@ def _build_dict
for fu_or_method, col in FU_M_index.items():
contribution_col = contributions[col, :]
if total_range: # total is based on the range
total = np.abs(contribution_col).sum()
normalize_to = np.abs(contribution_col).sum()
else: # total is based on the score
total = contribution_col.sum()
normalize_to = contribution_col.sum()
score = contribution_col.sum()

top_contribution = ca.sort_array(
contribution_col, limit=limit, limit_type=limit_type, total=total
contribution_col, limit=limit, limit_type=limit_type, total=normalize_to
)

# split and calculate remaining rest sections for positive and negative part
@@ -458,7 +463,7 @@
cont_per = OrderedDict()
cont_per.update(
{
("Total", ""): total,
("Score", ""): score,
("Rest (+)", ""): pos_rest,
("Rest (-)", ""): neg_rest,
}
@@ -602,20 +607,21 @@ def get_labelled_contribution_dict(
# If the cont_dict has tuples for keys, coerce df.columns into MultiIndex
if all(isinstance(k, tuple) for k in cont_dict.keys()):
df.columns = pd.MultiIndex.from_tuples(df.columns)
special_keys = [("Total", ""), ("Rest (+)", ""), ("Rest (-)", "")]

special_keys = [("Score", ""), ("Rest (+)", ""), ("Rest (-)", "")]
# replace all 0 values with NaN and drop all rows with only NaNs
df = df.replace(0, np.nan)

# sort on absolute mean of a row
df_bot = deepcopy(df.loc[df.index.difference(special_keys)].dropna(how="all"))

func = lambda row: np.nanmean(np.abs(row))
# sort on mean square of a row
df_bot = deepcopy(df.iloc[3:, :])
func = lambda row: np.nanmean(np.square(row))
if len(df_bot) > 1: # but only sort if there is something to sort
df_bot["_sort_me_"] = (df_bot.select_dtypes(include=np.number)).apply(func, axis=1)
df_bot.sort_values(by="_sort_me_", ascending=False, inplace=True)
del df_bot["_sort_me_"]

df = pd.concat([df.iloc[:3, :], df_bot], axis=0)
df.dropna(how="all", inplace=True)

if not mask:
joined = self.join_df_with_metadata(
@@ -638,7 +644,7 @@ def adjust_table_unit(df: pd.DataFrame, method: Optional[tuple]) -> pd.DataFrame
"""Given a dataframe, adjust the unit of the table to either match the given method, or not exist."""
if "unit" not in df.columns:
return df
keys = df.index[~df["index"].isin({"Total", "Rest (+)", "Rest (-)"})]
keys = df.index[~df["index"].isin({"Score", "Rest (+)", "Rest (-)"})]
unit = bd.Method(method).metadata.get("unit") if method else "unit"
df.loc[keys, "unit"] = unit
return df
@@ -850,7 +856,7 @@ def top_elementary_flow_contributions(

# Normalise if required
if normalize:
contributions = self.normalize(contributions)
contributions = self.normalize(contributions, total_range)

top_cont_dict = self._build_dict(
contributions, index, rev_index, limit, limit_type, total_range
@@ -906,7 +912,7 @@ def top_process_contributions(

# Normalise if required
if normalize:
contributions = self.normalize(contributions)
contributions = self.normalize(contributions, total_range)

top_cont_dict = self._build_dict(
contributions, index, rev_index, limit, limit_type, total_range
61 changes: 37 additions & 24 deletions activity_browser/docs/wiki/LCA-Results.md
@@ -98,26 +98,30 @@ The total impact is still 1.6.
In this section we generalize a little for the different contribution approaches:
we call the _from_ part of the contributions (the EFs, activities or FT above) _entities_.

There are several ways Activity Browser manipulates your results by default.
- The results are **sorted** so that the rows with the largest (absolute) average values are shown first.
There are several ways Activity Browser manipulates your results by default:
- All reference flows are compared to each other.
- The contributions are **sorted** so that the most important contributions are shown first.
- The sorting is done on the _mean square_ (ignoring zero values) of each row of contributing entities; see the sketch after this list.
- A `cut-off` of 5% is applied; this only shows results that contribute at least 5% to the total range of results,
all other entities are grouped into the `Rest (+)` or `Rest (-)` groups.
- The contributions are _normalized_ to the impact of that reference flow, meaning they are shown as a percentage,
counting up to 100% for every item you compare.

These actions are taken to show you the most relevant results.
all other entities are grouped into the `Rest (+)` and `Rest (-)` groups for positive and negative
contributions respectively.
- The contributions are _normalized_ to the LCA scores,
meaning contributions are shown as a percentage contribution of the score, counting up to 100%.
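
A reduced sketch of that sorting rule, assuming zeros have already been replaced with NaN (as the `multilca.py` diff above does) so they are ignored by the mean:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    [[9.0, np.nan], [2.0, 2.0], [-5.0, 4.0]],
    index=["entity a", "entity b", "entity c"],        # hypothetical contributing entities
    columns=["reference flow 1", "reference flow 2"],  # hypothetical reference flows
)

mean_square = df.apply(lambda row: np.nanmean(np.square(row)), axis=1)  # a: 81.0, b: 4.0, c: 20.5
sorted_df = df.loc[mean_square.sort_values(ascending=False).index]      # rows ordered a, c, b
```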

These defaults exist to show you the most relevant results in most cases, but you may often want to make this more
specific for your analysis.
You can manually manipulate the contribution results in the menu shown below, which we will explain bit by bit
in the next sections.
![contributions cutoff](./assets/contribution_manipulation.png)

#### Cut-off
You can manually change the `Cut-off type` of the results to one of two modes: `Relative` or `Top #`.
The `Relative` mode shows contributions _from_ entities of _x_% or higher.
The `Top #` mode shows contributions from the _x_ entities that contribute the most (as absolute).
- The `Relative` mode shows contributions _from_ entities of _x_% or higher.
- The `Top #` mode shows contributions from the _x_ entities that contribute the most (as absolute).

You can adjust the `Cut-off level` to change how many results you see.

All results that don't make the cut-off will be grouped into the `Rest (+)` and `Rest (-)` groups.
All contributions that are below the cut-off will be grouped into the `Rest (+)` and `Rest (-)` groups.
The Rest groups are only present when there are positive or negative numbers remaining for the respective rest groups.
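
To make this concrete, a hypothetical sketch (plain NumPy, not the Activity Browser implementation) of a 5% relative cut-off against the range:

```python
import numpy as np

contributions = np.array([6.0, 3.0, 0.4, -1.5, -0.1])  # hypothetical entity contributions
total_range = np.abs(contributions).sum()               # 11.0

keep = np.abs(contributions) / total_range >= 0.05      # entities at or above the 5% cut-off
rest = contributions[~keep]                              # everything below the cut-off

shown = contributions[keep]        # [ 6.   3.  -1.5] stays visible
rest_pos = rest[rest > 0].sum()    # Rest (+): 0.4
rest_neg = rest[rest < 0].sum()    # Rest (-): -0.1
```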

#### Compare
@@ -131,33 +135,42 @@ The compare mode defines what is shown in the figure.

#### Aggregation
The `Aggregate by` menu can be used to _group_ results based on field names.
As an example, EF contributions can be grouped on the name,
for example to group all flows with the same name.
As another example, process contributions can be grouped based on their reference product name.
This is useful to group contributors together so you have fewer (and larger) contributors.
As an example, EF contributions can be grouped on the name field
(which would, for example, group all EFs with the name _carbon dioxide_ together).
As another example, process contributions can be grouped based on their reference product name
(which would, for example, group all processes with the product name _electricity, high voltage_ together).
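
Conceptually this kind of aggregation is a group-and-sum on the chosen field; a hypothetical pandas sketch (made-up flow names, not Activity Browser code):

```python
import pandas as pd

contributions = pd.DataFrame({
    "name": ["Carbon dioxide", "Carbon dioxide", "Methane"],  # hypothetical EF names
    "compartment": ["air, urban", "air, rural", "air, urban"],
    "contribution": [4.0, 2.5, 1.0],
})

# "Aggregate by" name: all flows sharing a name are summed into a single contributor
aggregated = contributions.groupby("name", as_index=False)["contribution"].sum()
# -> "Carbon dioxide": 6.5, "Methane": 1.0
```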

#### Plot and Table
By default, Activity Browser shows a plot and a table.
You can disable one of them if you want to focus on one of them.
You can disable one of them if you want to focus on the other.

#### Relative and Absolute
You can choose between `Relative` and `Absolute` results.
The `Relative` results will sum to 100% (the total score), while the `Absolute` results will sum to the impact score.
The `Relative` results will sum to 100% (the total `Range` or `Score`),
while the `Absolute` results will sum to the impact score.
For `Relative`, you can choose what you use as the 100% reference: the `Range` or the `Score`.

#### Range and Score
If the Cut-off type is `Relative`, you can choose between `Range` and `Score`.
This determines what you use as the _total_ to which the relative contributions are counted.
For `Range`, this is the full _range_ of results. For example, if all your negative results together have a score of -2
and all your positive results together have a score of 10, the _range_ is 12 (-2 * -1 + 10).
For `Score`, this is the total score (sum) of the results. For example, if all your negative results together have a
score of -2 and all your positive results together have a score of 10, the _score_ is 8 (-2 + 10).
The `Range` or `Score` setting is only used when your results contain both positive and negative results.
The `Range`/`Score` determines what you use as the _total_ to which the contributions are counted.
- For `Range`, this is the full _range_ of results
- For example, if all your negative results together have a score of -2 and all your positive results together have a
score of 10, the _range_ is 12 (-2 * -1 + 10).
- An entity with a contribution of 4 would have a relative contribution of 4/12 = 33.3...%.
- For `Score`, this is the total score (sum) of the results
- For example, if all your negative results together have a score of -2 and all your positive results together have a
score of 10, the _score_ is 8 (-2 + 10).
- An entity with a contribution of 4 would have a relative contribution of 4/8 = 50%.

The `Range` or `Score` setting is only relevant when your results contain both positive and negative contributions.
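
The same arithmetic as the worked example above, written out in a few lines of Python (hypothetical numbers):

```python
negatives, positives = -2.0, 10.0  # hypothetical totals of the negative and positive contributions
entity = 4.0                       # contribution of a single entity

range_total = abs(negatives) + positives  # 12.0
score_total = negatives + positives       # 8.0

print(entity / range_total)  # ~0.333 -> 33.3 % relative to the Range
print(entity / score_total)  # 0.5    ->  50 %  relative to the Score
```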

### Positive and negative numbers in contribution results
It can happen in LCA that you get both positive and negative numbers in your contribution results.
Some of these reasons could be negative characterization factors, flows with negative numbers or using substitution flows.
Some reasons for this could be negative characterization factors, flows with negative numbers, or the use of
substitution flows.

When there are both positive and negative numbers in the result, Activity Browser will show a marker to indicate
where the total score is, and show positive and negative contributions to the impact separately.
where the total _score_ is, and show positive and negative contributions to the impact separately.

Below is a simple example (with unrealistic values) to demonstrate this:
