Add column name metadata to tabmat matrices (#278)

* Add column name getters
* Matrix names are also combined
* Add names to constructors
* Add indexing support for column names
* Remove unnecessary code
* Better default column names
* Reduce code duplication
* Saner defaults
* Add convenient getters and setters
* Fix indexing
* Smarter setter for categorical matrices
* Add tests
* Fix subsetting with np.newaxis
* Remove the walrus :(
* Fix test
* Fix indexing with np.ix_
* Propagate column names where it makes sense
* Fix merge mistake
* Add changelog entry
Quantco · Aug 15, 2023 · e042ce3 · e042ce3
1 parent d3d3d82
commit e042ce3
Show file tree

Hide file tree

Showing 9 changed files with 814 additions and 27 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -10,6 +10,10 @@ Changelog
 Unreleased
 ----------
 
+**New features:**
+
+- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties.
+
 **Other changes:**
 
 - Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables.

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
@@ -161,6 +161,7 @@ def matvec(mat, vec):
 
 """
 
+import re
 from typing import List, Optional, Tuple, Union
 
 import numpy as np
@@ -245,6 +246,9 @@ def __init__(
         cat_vec: Union[List, np.ndarray, pd.Categorical],
         drop_first: bool = False,
         dtype: np.dtype = np.float64,
+        column_name: Optional[str] = None,
+        term_name: Optional[str] = None,
+        column_name_format: str = "{name}[{category}]",
     ):
         if pd.isnull(cat_vec).any():
             raise ValueError("Categorical data can't have missing values.")
@@ -260,6 +264,13 @@ def __init__(
         self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
         self.dtype = np.dtype(dtype)
 
+        self._colname = column_name
+        if term_name is None:
+            self._term = self._colname
+        else:
+            self._term = term_name
+        self._colname_format = column_name_format
+
     __array_ufunc__ = None
 
     def recover_orig(self) -> np.ndarray:
@@ -466,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix:
         i %= self.shape[1]  # wrap-around indexing
 
         if self.drop_first:
-            i += 1
+            i_corr = i + 1
+        else:
+            i_corr = i
 
-        col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None])
-        return SparseMatrix(col_i)
+        col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None])
+        return SparseMatrix(
+            col_i,
+            column_names=[self.column_names[i]],
+            term_names=[self.term_names[i]],
+        )
 
     def tocsr(self) -> sps.csr_matrix:
         """Return scipy csr representation of matrix."""
@@ -492,7 +509,11 @@ def to_sparse_matrix(self):
         """Return a tabmat.SparseMatrix representation."""
         from .sparse_matrix import SparseMatrix
 
-        return SparseMatrix(self.tocsr())
+        return SparseMatrix(
+            self.tocsr(),
+            column_names=self.column_names,
+            term_names=self.term_names,
+        )
 
     def toarray(self) -> np.ndarray:
         """Return array representation of matrix."""
@@ -523,7 +544,11 @@ def __getitem__(self, item):
             if isinstance(row, np.ndarray):
                 row = row.ravel()
             return CategoricalMatrix(
-                self.cat[row], drop_first=self.drop_first, dtype=self.dtype
+                self.cat[row],
+                drop_first=self.drop_first,
+                dtype=self.dtype,
+                column_name=self._colname,
+                column_name_format=self._colname_format,
             )
         else:
             # return a SparseMatrix if we subset columns
@@ -638,8 +663,111 @@ def multiply(self, other) -> SparseMatrix:
                     np.arange(self.shape[0] + 1, dtype=int),
                 ),
                 shape=self.shape,
-            )
+            ),
+            column_names=self.column_names,
+            term_names=self.term_names,
         )
 
     def __repr__(self):
         return str(self.cat)
+
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: Optional[str] = None,
+        indices: Optional[List[int]] = None,
+    ) -> List[Optional[str]]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{first}-{last}"`` where ``first`` and
+        ``last`` are the first and last entries of ``indices``.
+
+        Parameters
+        ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (cf. ``formulaic`` docs).
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
+        indices: Optional[List[int]], default None
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
+
+        Returns
+        -------
+        List[Optional[str]]
+            Column names.
+        """
+        if type == "column":
+            name = self._colname
+        elif type == "term":
+            name = self._term
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
+
+        if indices is None:
+            indices = list(range(len(self.cat.categories) - self.drop_first))
+        if name is None and missing_prefix is None:
+            return [None] * (len(self.cat.categories) - self.drop_first)
+        elif name is None:
+            name = f"{missing_prefix}{indices[0]}-{indices[-1]}"
+
+        if type == "column":
+            return [
+                self._colname_format.format(name=name, category=cat)
+                for cat in self.cat.categories[self.drop_first :]
+            ]
+        else:
+            return [name] * (len(self.cat.categories) - self.drop_first)
+
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: Union[str, List[Optional[str]]]
+            Names to set. A single string is treated as a one-element list.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (cf. ``formulaic`` docs).
+        """
+        if isinstance(names, str):
+            names = [names]
+
+        if len(names) != 1:
+            if type == "column":
+                # Try finding the column name
+                base_names = []
+                for name, cat in zip(names, self.cat.categories[self.drop_first :]):
+                    partial_name = self._colname_format.format(
+                        name="__CAPTURE__", category=cat
+                    )
+                    pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)")
+                    if name is not None:
+                        match = re.search(pattern, name)
+                    else:
+                        match = None
+                    if match is not None:
+                        base_names.append(match.group(1))
+                    else:
+                        base_names.append(name)
+                names = base_names
+
+            if len(names) == self.shape[1] and all(name == names[0] for name in names):
+                names = [names[0]]
+
+        if len(names) != 1:
+            raise ValueError("A categorical matrix has only one name")
+
+        if type == "column":
+            self._colname = names[0]
+        elif type == "term":
+            self._term = names[0]
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import List, Tuple, Union
+from typing import List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -21,6 +21,7 @@ def from_pandas(
     object_as_cat: bool = False,
     cat_position: str = "expand",
     drop_first: bool = False,
+    categorical_format: str = "{name}[{category}]",
 ) -> MatrixBase:
     """
     Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
@@ -72,7 +73,14 @@ def from_pandas(
         if object_as_cat and coldata.dtype == object:
             coldata = coldata.astype("category")
         if isinstance(coldata.dtype, pd.CategoricalDtype):
-            cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
+            cat = CategoricalMatrix(
+                coldata,
+                drop_first=drop_first,
+                dtype=dtype,
+                column_name=colname,
+                term_name=colname,
+                column_name_format=categorical_format,
+            )
             if len(coldata.cat.categories) < cat_threshold:
                 (
                     X_dense_F,
@@ -82,6 +90,8 @@ def from_pandas(
                 ) = _split_sparse_and_dense_parts(
                     sps.csc_matrix(cat.tocsr(), dtype=dtype),
                     threshold=sparse_threshold,
+                    column_names=cat.get_names("column"),
+                    term_names=cat.get_names("term"),
                 )
                 matrices.append(X_dense_F)
                 is_cat.append(True)
@@ -128,13 +138,26 @@ def from_pandas(
             f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
         )
     if len(dense_dfidx) > 0:
-        matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
+        matrices.append(
+            DenseMatrix(
+                df.iloc[:, dense_dfidx].astype(dtype),
+                column_names=df.columns[dense_dfidx],
+                term_names=df.columns[dense_dfidx],
+            )
+        )
         indices.append(dense_mxidx)
         is_cat.append(False)
     if len(sparse_dfcols) > 0:
         sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)}
         full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo()
-        matrices.append(SparseMatrix(full_sparse, dtype=dtype))
+        matrices.append(
+            SparseMatrix(
+                full_sparse,
+                dtype=dtype,
+                column_names=[col.name for col in sparse_dfcols],
+                term_names=[col.name for col in sparse_dfcols],
+            )
+        )
         indices.append(sparse_mxidx)
         is_cat.append(False)
 
@@ -157,7 +180,10 @@ def from_pandas(
 
 
 def _split_sparse_and_dense_parts(
-    arg1: sps.csc_matrix, threshold: float = 0.1
+    arg1: sps.csc_matrix,
+    threshold: float = 0.1,
+    column_names: Optional[Sequence[Optional[str]]] = None,
+    term_names: Optional[Sequence[Optional[str]]] = None,
 ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]:
     """
     Split matrix.
@@ -176,12 +202,25 @@ def _split_sparse_and_dense_parts(
     dense_indices = np.where(densities > threshold)[0]
     sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices)
 
-    X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray()))
-    X_sparse = SparseMatrix(arg1[:, sparse_indices])
+    if column_names is None:
+        column_names = [None] * arg1.shape[1]
+    if term_names is None:
+        term_names = column_names
+
+    X_dense_F = DenseMatrix(
+        np.asfortranarray(arg1[:, dense_indices].toarray()),
+        column_names=[column_names[i] for i in dense_indices],
+        term_names=[term_names[i] for i in dense_indices],
+    )
+    X_sparse = SparseMatrix(
+        arg1[:, sparse_indices],
+        column_names=[column_names[i] for i in sparse_indices],
+        term_names=[term_names[i] for i in sparse_indices],
+    )
     return X_dense_F, X_sparse, dense_indices, sparse_indices
 
 
-def from_csc(mat: sps.csc_matrix, threshold=0.1):
+def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None):
     """
     Convert a CSC-format sparse matrix into a ``SplitMatrix``.