Skip to content

Commit

Permalink
Add column name metadata to tabmat matrices (#278)
Browse files Browse the repository at this point in the history
* Add column name getters

* Matrix names are also combined

* Add names to constructors

* Add indexing support for column names

* Remove unnecessary code

* Better default column names

* Reduce code duplication

* Saner defaults

* Add convenient getters and setters

* Fix indexing

* Smarter setter for categorical matrices

* Add tests

* Fix subsetting with np.newaxis

* Remove the walrus :(

* Fix test

* Fix indexing with np.ix_

* Propagate column names where it makes sense

* Fix merge mistake

* Add changelog entry
  • Loading branch information
stanmart authored Aug 15, 2023
1 parent d3d3d82 commit e042ce3
Show file tree
Hide file tree
Showing 9 changed files with 814 additions and 27 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ Changelog
Unreleased
----------

**New features:**

- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties.

**Other changes:**

- Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables.
Expand Down
140 changes: 134 additions & 6 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def matvec(mat, vec):
"""

import re
from typing import List, Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -245,6 +246,9 @@ def __init__(
cat_vec: Union[List, np.ndarray, pd.Categorical],
drop_first: bool = False,
dtype: np.dtype = np.float64,
column_name: Optional[str] = None,
term_name: Optional[str] = None,
column_name_format: str = "{name}[{category}]",
):
if pd.isnull(cat_vec).any():
raise ValueError("Categorical data can't have missing values.")
Expand All @@ -260,6 +264,13 @@ def __init__(
self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
self.dtype = np.dtype(dtype)

self._colname = column_name
if term_name is None:
self._term = self._colname
else:
self._term = term_name
self._colname_format = column_name_format

__array_ufunc__ = None

def recover_orig(self) -> np.ndarray:
Expand Down Expand Up @@ -466,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix:
i %= self.shape[1] # wrap-around indexing

if self.drop_first:
i += 1
i_corr = i + 1
else:
i_corr = i

col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None])
return SparseMatrix(col_i)
col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None])
return SparseMatrix(
col_i,
column_names=[self.column_names[i]],
term_names=[self.term_names[i]],
)

def tocsr(self) -> sps.csr_matrix:
"""Return scipy csr representation of matrix."""
Expand All @@ -492,7 +509,11 @@ def to_sparse_matrix(self):
"""Return a tabmat.SparseMatrix representation."""
from .sparse_matrix import SparseMatrix

return SparseMatrix(self.tocsr())
return SparseMatrix(
self.tocsr(),
column_names=self.column_names,
term_names=self.term_names,
)

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
Expand Down Expand Up @@ -523,7 +544,11 @@ def __getitem__(self, item):
if isinstance(row, np.ndarray):
row = row.ravel()
return CategoricalMatrix(
self.cat[row], drop_first=self.drop_first, dtype=self.dtype
self.cat[row],
drop_first=self.drop_first,
dtype=self.dtype,
column_name=self._colname,
column_name_format=self._colname_format,
)
else:
# return a SparseMatrix if we subset columns
Expand Down Expand Up @@ -638,8 +663,111 @@ def multiply(self, other) -> SparseMatrix:
np.arange(self.shape[0] + 1, dtype=int),
),
shape=self.shape,
)
),
column_names=self.column_names,
term_names=self.term_names,
)

def __repr__(self):
    """Return the string form of the underlying categorical, ``str(self.cat)``."""
    return str(self.cat)

def get_names(
    self,
    type: str = "column",
    missing_prefix: Optional[str] = None,
    indices: Optional[List[int]] = None,
) -> List[Optional[str]]:
    """Get column names.

    If the matrix has no name of the requested type, a default base name is
    created using the pattern ``"{missing_prefix}{first}-{last}"``, where
    ``first`` and ``last`` are the first and last entries of ``indices``.
    Column names are built from the base name and each (non-dropped) category
    via the matrix's column name format; term names repeat the base name once
    per column.

    Parameters
    ----------
    type: str {'column'|'term'}
        Whether to get column names or term names. The main difference is that
        a categorical submatrix is counted as a single term, whereas it is
        counted as multiple columns. Furthermore, matrices created from formulas
        have a difference between a column and term (c.f. ``formulaic`` docs).
    missing_prefix: Optional[str], default None
        Prefix to use for the default name if the matrix does not have a name.
        If None, then no default name is created.
    indices
        The indices used to build the default name if the matrix does not have
        a name. If ``None``, then the indices are ``0, ..., n_columns - 1``.

    Returns
    -------
    List[Optional[str]]
        One name per column; ``None`` entries if the matrix has no name and
        ``missing_prefix`` is ``None``. Empty if the matrix has no columns.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'column'`` nor ``'term'``.
    """
    if type == "column":
        name = self._colname
    elif type == "term":
        name = self._term
    else:
        raise ValueError(f"Type must be 'column' or 'term', got {type}")

    n_cols = len(self.cat.categories) - self.drop_first
    if n_cols <= 0:
        # A matrix without columns (e.g. a single category that is dropped)
        # has nothing to name. Returning early also avoids indexing into an
        # empty ``indices`` when building the default name below.
        return []

    if name is None:
        if missing_prefix is None:
            return [None] * n_cols
        if indices is None:
            indices = list(range(n_cols))
        name = f"{missing_prefix}{indices[0]}-{indices[-1]}"

    if type == "term":
        return [name] * n_cols
    return [
        self._colname_format.format(name=name, category=cat)
        for cat in self.cat.categories[self.drop_first :]
    ]

def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
    """Set column names.

    A categorical matrix stores a single base name. ``names`` may therefore be
    a single name, or one name per column as long as they all reduce to the
    same base name. For ``type='column'``, per-column names that follow the
    matrix's column name format (e.g. ``"x[a]"``, ``"x[b]"``) are reduced to
    the part standing in for ``{name}`` before being compared.

    Parameters
    ----------
    names: Union[str, List[Optional[str]]]
        Names to set. Either a single name (as a string or a one-element
        list), or a list with exactly one entry per column.
    type: str {'column'|'term'}
        Whether to set column names or term names. The main difference is that
        a categorical submatrix is counted as a single term, whereas it is
        counted as multiple columns. Furthermore, matrices created from formulas
        have a difference between a column and term (c.f. ``formulaic`` docs).

    Raises
    ------
    ValueError
        If ``names`` cannot be reduced to a single name, or if ``type`` is
        neither ``'column'`` nor ``'term'``.
    """
    if isinstance(names, str):
        names = [names]

    # Only try to collapse per-column names when there is exactly one per
    # column. Pairing names with categories via ``zip`` would otherwise
    # silently discard surplus names instead of rejecting them.
    if len(names) > 1 and len(names) == self.shape[1]:
        if type == "column":
            # Try finding the base name inside each formatted column name
            base_names = []
            for name, cat in zip(names, self.cat.categories[self.drop_first :]):
                partial_name = self._colname_format.format(
                    name="__CAPTURE__", category=cat
                )
                pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)")
                match = re.search(pattern, name) if name is not None else None
                # Names that do not follow the format are kept unchanged
                base_names.append(match.group(1) if match is not None else name)
            names = base_names

        if all(name == names[0] for name in names):
            names = [names[0]]

    if len(names) != 1:
        raise ValueError("A categorical matrix has only one name")

    if type == "column":
        self._colname = names[0]
    elif type == "term":
        self._term = names[0]
    else:
        raise ValueError(f"Type must be 'column' or 'term', got {type}")
55 changes: 47 additions & 8 deletions src/tabmat/constructor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import warnings
from typing import List, Tuple, Union
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -21,6 +21,7 @@ def from_pandas(
object_as_cat: bool = False,
cat_position: str = "expand",
drop_first: bool = False,
categorical_format: str = "{name}[{category}]",
) -> MatrixBase:
"""
Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
Expand Down Expand Up @@ -72,7 +73,14 @@ def from_pandas(
if object_as_cat and coldata.dtype == object:
coldata = coldata.astype("category")
if isinstance(coldata.dtype, pd.CategoricalDtype):
cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
cat = CategoricalMatrix(
coldata,
drop_first=drop_first,
dtype=dtype,
column_name=colname,
term_name=colname,
column_name_format=categorical_format,
)
if len(coldata.cat.categories) < cat_threshold:
(
X_dense_F,
Expand All @@ -82,6 +90,8 @@ def from_pandas(
) = _split_sparse_and_dense_parts(
sps.csc_matrix(cat.tocsr(), dtype=dtype),
threshold=sparse_threshold,
column_names=cat.get_names("column"),
term_names=cat.get_names("term"),
)
matrices.append(X_dense_F)
is_cat.append(True)
Expand Down Expand Up @@ -128,13 +138,26 @@ def from_pandas(
f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
)
if len(dense_dfidx) > 0:
matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
matrices.append(
DenseMatrix(
df.iloc[:, dense_dfidx].astype(dtype),
column_names=df.columns[dense_dfidx],
term_names=df.columns[dense_dfidx],
)
)
indices.append(dense_mxidx)
is_cat.append(False)
if len(sparse_dfcols) > 0:
sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)}
full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo()
matrices.append(SparseMatrix(full_sparse, dtype=dtype))
matrices.append(
SparseMatrix(
full_sparse,
dtype=dtype,
column_names=[col.name for col in sparse_dfcols],
term_names=[col.name for col in sparse_dfcols],
)
)
indices.append(sparse_mxidx)
is_cat.append(False)

Expand All @@ -157,7 +180,10 @@ def from_pandas(


def _split_sparse_and_dense_parts(
arg1: sps.csc_matrix, threshold: float = 0.1
arg1: sps.csc_matrix,
threshold: float = 0.1,
column_names: Optional[Sequence[Optional[str]]] = None,
term_names: Optional[Sequence[Optional[str]]] = None,
) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]:
"""
Split matrix.
Expand All @@ -176,12 +202,25 @@ def _split_sparse_and_dense_parts(
dense_indices = np.where(densities > threshold)[0]
sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices)

X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray()))
X_sparse = SparseMatrix(arg1[:, sparse_indices])
if column_names is None:
column_names = [None] * arg1.shape[1]
if term_names is None:
term_names = column_names

X_dense_F = DenseMatrix(
np.asfortranarray(arg1[:, dense_indices].toarray()),
column_names=[column_names[i] for i in dense_indices],
term_names=[term_names[i] for i in dense_indices],
)
X_sparse = SparseMatrix(
arg1[:, sparse_indices],
column_names=[column_names[i] for i in sparse_indices],
term_names=[term_names[i] for i in sparse_indices],
)
return X_dense_F, X_sparse, dense_indices, sparse_indices


def from_csc(mat: sps.csc_matrix, threshold=0.1):
def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None):
"""
Convert a CSC-format sparse matrix into a ``SplitMatrix``.
Expand Down
Loading

0 comments on commit e042ce3

Please sign in to comment.