Skip to content

Commit

Permalink
Merge branch 'tabmat-v4' into formula
Browse files Browse the repository at this point in the history
  • Loading branch information
stanmart committed Aug 15, 2023
2 parents f1ba304 + e042ce3 commit 01e20b3
Show file tree
Hide file tree
Showing 13 changed files with 821 additions and 34 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/conda-build-win.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ jobs:
- name: Build conda package
shell: pwsh
run: |
mamba install -n base -y conda-build
conda build -m .ci_support/${{ matrix.CONDA_BUILD_YML }}.yaml conda.recipe
mamba install -n base -y conda-build boa
conda mambabuild -m .ci_support/${{ matrix.CONDA_BUILD_YML }}.yaml conda.recipe
4 changes: 2 additions & 2 deletions .github/workflows/macos-conda-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

set -exo pipefail

mamba install -y conda-build
mamba install -y conda-build boa

# Don't test cross-compiled result (there is no emulation) and use the latest MacOS SDK.
if grep -q "osx-arm64" .ci_support/${CONDA_BUILD_YML}.yaml; then
Expand All @@ -13,4 +13,4 @@ CONDA_BUILD_SYSROOT:
- "${CONDA_BUILD_SYSROOT}"
EOF
fi
conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe ${CONDA_BUILD_ARGS:-}
conda mambabuild -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe ${CONDA_BUILD_ARGS:-}
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
repos:
- repo: https://github.com/Quantco/pre-commit-mirrors-black
rev: 23.3.0
rev: 23.7.0
hooks:
- id: black-conda
additional_dependencies: [flake8-docstrings, flake8-rst-docstrings]
args:
- --safe
- --target-version=py36
- repo: https://github.com/Quantco/pre-commit-mirrors-flake8
rev: 6.0.0
rev: 6.1.0
hooks:
- id: flake8-conda
additional_dependencies: [
Expand All @@ -33,7 +33,7 @@ repos:
additional_dependencies:
- python=3.8
- repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade
rev: 3.7.0
rev: 3.9.0
hooks:
- id: pyupgrade-conda
- repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ Changelog
Unreleased
----------

**New features:**

- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties.

**Other changes:**

- Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables.
Expand Down
140 changes: 134 additions & 6 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def matvec(mat, vec):
"""

import re
from typing import List, Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -245,6 +246,9 @@ def __init__(
cat_vec: Union[List, np.ndarray, pd.Categorical],
drop_first: bool = False,
dtype: np.dtype = np.float64,
column_name: Optional[str] = None,
term_name: Optional[str] = None,
column_name_format: str = "{name}[{category}]",
):
if pd.isnull(cat_vec).any():
raise ValueError("Categorical data can't have missing values.")
Expand All @@ -260,6 +264,13 @@ def __init__(
self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
self.dtype = np.dtype(dtype)

self._colname = column_name
if term_name is None:
self._term = self._colname
else:
self._term = term_name
self._colname_format = column_name_format

__array_ufunc__ = None

def recover_orig(self) -> np.ndarray:
Expand Down Expand Up @@ -466,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix:
i %= self.shape[1] # wrap-around indexing

if self.drop_first:
i += 1
i_corr = i + 1
else:
i_corr = i

col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None])
return SparseMatrix(col_i)
col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None])
return SparseMatrix(
col_i,
column_names=[self.column_names[i]],
term_names=[self.term_names[i]],
)

def tocsr(self) -> sps.csr_matrix:
"""Return scipy csr representation of matrix."""
Expand All @@ -492,7 +509,11 @@ def to_sparse_matrix(self):
"""Return a tabmat.SparseMatrix representation."""
from .sparse_matrix import SparseMatrix

return SparseMatrix(self.tocsr())
return SparseMatrix(
self.tocsr(),
column_names=self.column_names,
term_names=self.term_names,
)

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
Expand Down Expand Up @@ -523,7 +544,11 @@ def __getitem__(self, item):
if isinstance(row, np.ndarray):
row = row.ravel()
return CategoricalMatrix(
self.cat[row], drop_first=self.drop_first, dtype=self.dtype
self.cat[row],
drop_first=self.drop_first,
dtype=self.dtype,
column_name=self._colname,
column_name_format=self._colname_format,
)
else:
# return a SparseMatrix if we subset columns
Expand Down Expand Up @@ -638,8 +663,111 @@ def multiply(self, other) -> SparseMatrix:
np.arange(self.shape[0] + 1, dtype=int),
),
shape=self.shape,
)
),
column_names=self.column_names,
term_names=self.term_names,
)

def __repr__(self):
    """Return the string form of ``self.cat`` as this matrix's representation."""
    return f"{self.cat!s}"

def get_names(
    self,
    type: str = "column",
    missing_prefix: Optional[str] = None,
    indices: Optional[List[int]] = None,
) -> List[Optional[str]]:
    """Get column names.

    If the matrix has no name and ``missing_prefix`` is given, a default base
    name is created using the pattern ``"{missing_prefix}{first}-{last}"``,
    where ``first`` and ``last`` are the first and last entries of ``indices``.
    For ``type="column"`` the base name is then expanded to one name per
    category using the matrix's column name format.

    Parameters
    ----------
    type: str {'column'|'term'}
        Whether to get column names or term names. The main difference is that
        a categorical submatrix is counted as a single term, whereas it is
        counted as multiple columns. Furthermore, matrices created from formulas
        have a difference between a column and term (c.f. ``formulaic`` docs).
    missing_prefix: Optional[str], default None
        Prefix to use for columns that do not have a name. If None, then no
        default name is created.
    indices
        The indices used for columns that do not have a name. If ``None``,
        then the indices are ``0, ..., n_columns - 1``.

    Returns
    -------
    List[Optional[str]]
        One name per column (all ``None`` if the matrix is unnamed and no
        ``missing_prefix`` is given). Empty if the matrix has no columns.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'column'`` nor ``'term'``.
    """
    if type == "column":
        name = self._colname
    elif type == "term":
        name = self._term
    else:
        raise ValueError(f"Type must be 'column' or 'term', got {type}")

    # Categories that actually correspond to columns (the first one is
    # skipped when ``drop_first`` is set).
    categories = self.cat.categories[int(self.drop_first) :]
    n_cols = len(categories)

    # Nothing to name; also avoids indexing into an empty ``indices`` below.
    if n_cols == 0:
        return []

    if name is None:
        if missing_prefix is None:
            return [None] * n_cols
        if indices is None:
            indices = list(range(n_cols))
        # A single default base name spanning the whole index range.
        name = f"{missing_prefix}{indices[0]}-{indices[-1]}"

    if type == "column":
        return [
            self._colname_format.format(name=name, category=cat)
            for cat in categories
        ]
    return [name] * n_cols

def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
    """Set column names.

    A categorical matrix stores a single base name. ``names`` may therefore be
    a single name (a string or a one-element list) or a list with one entry
    per column. In the latter case, for ``type="column"``, each entry is
    matched against the column name format for its category to recover the
    base name; all entries must then agree on the same name.

    Parameters
    ----------
    names: List[Optional[str]]
        Names to set.
    type: str {'column'|'term'}
        Whether to set column names or term names. The main difference is that
        a categorical submatrix is counted as a single term, whereas it is
        counted as multiple columns. Furthermore, matrices created from formulas
        have a difference between a column and term (c.f. ``formulaic`` docs).

    Raises
    ------
    ValueError
        If ``names`` has neither one entry nor one entry per column, if the
        entries do not reduce to a single name, or if ``type`` is neither
        ``'column'`` nor ``'term'``.
    """
    if isinstance(names, str):
        names = [names]

    if len(names) != 1:
        n_cols = self.shape[1]
        # Check the length up front: ``zip`` below would otherwise silently
        # drop surplus names and accept a list of the wrong length.
        if len(names) != n_cols:
            raise ValueError(
                "A categorical matrix has only one name; expected a single "
                f"name or {n_cols} names (one per column), got {len(names)}"
            )

        if type == "column":
            # Try finding the column name
            base_names = []
            for name, cat in zip(names, self.cat.categories[self.drop_first :]):
                partial_name = self._colname_format.format(
                    name="__CAPTURE__", category=cat
                )
                pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)")
                match = re.search(pattern, name) if name is not None else None
                # Fall back to the raw entry when it does not follow the format.
                base_names.append(match.group(1) if match is not None else name)
            names = base_names

        # ``names`` may be empty for a zero-column matrix; leave it as is so
        # that the check below raises instead of indexing an empty list.
        if names and all(name == names[0] for name in names):
            names = [names[0]]

    if len(names) != 1:
        raise ValueError("A categorical matrix has only one name")

    if type == "column":
        self._colname = names[0]
    elif type == "term":
        self._term = names[0]
    else:
        raise ValueError(f"Type must be 'column' or 'term', got {type}")
31 changes: 27 additions & 4 deletions src/tabmat/constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def from_pandas(
object_as_cat: bool = False,
cat_position: str = "expand",
drop_first: bool = False,
categorical_format: str = "{name}[{category}]",
) -> MatrixBase:
"""
Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
Expand Down Expand Up @@ -79,7 +80,14 @@ def from_pandas(
if object_as_cat and coldata.dtype == object:
coldata = coldata.astype("category")
if isinstance(coldata.dtype, pd.CategoricalDtype):
cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
cat = CategoricalMatrix(
coldata,
drop_first=drop_first,
dtype=dtype,
column_name=colname,
term_name=colname,
column_name_format=categorical_format,
)
if len(coldata.cat.categories) < cat_threshold:
(
X_dense_F,
Expand All @@ -89,6 +97,8 @@ def from_pandas(
) = _split_sparse_and_dense_parts(
sps.csc_matrix(cat.tocsr(), dtype=dtype),
threshold=sparse_threshold,
column_names=cat.get_names("column"),
term_names=cat.get_names("term"),
)
matrices.append(X_dense_F)
is_cat.append(True)
Expand Down Expand Up @@ -135,13 +145,26 @@ def from_pandas(
f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
)
if len(dense_dfidx) > 0:
matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
matrices.append(
DenseMatrix(
df.iloc[:, dense_dfidx].astype(dtype),
column_names=df.columns[dense_dfidx],
term_names=df.columns[dense_dfidx],
)
)
indices.append(dense_mxidx)
is_cat.append(False)
if len(sparse_dfcols) > 0:
sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)}
full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo()
matrices.append(SparseMatrix(full_sparse, dtype=dtype))
matrices.append(
SparseMatrix(
full_sparse,
dtype=dtype,
column_names=[col.name for col in sparse_dfcols],
term_names=[col.name for col in sparse_dfcols],
)
)
indices.append(sparse_mxidx)
is_cat.append(False)

Expand All @@ -163,7 +186,7 @@ def from_pandas(
return matrices[0]


def from_csc(mat: sps.csc_matrix, threshold=0.1):
def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None):
"""
Convert a CSC-format sparse matrix into a ``SplitMatrix``.
Expand Down
24 changes: 20 additions & 4 deletions src/tabmat/constructor_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Tuple
from typing import Optional, Sequence, Tuple

import numpy as np
import scipy.sparse as sps
Expand All @@ -8,7 +8,10 @@


def _split_sparse_and_dense_parts(
arg1: sps.csc_matrix, threshold: float = 0.1
arg1: sps.csc_matrix,
threshold: float = 0.1,
column_names: Optional[Sequence[Optional[str]]] = None,
term_names: Optional[Sequence[Optional[str]]] = None,
) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]:
"""
Split matrix.
Expand All @@ -27,6 +30,19 @@ def _split_sparse_and_dense_parts(
dense_indices = np.where(densities > threshold)[0]
sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices)

X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray()))
X_sparse = SparseMatrix(arg1[:, sparse_indices])
if column_names is None:
column_names = [None] * arg1.shape[1]
if term_names is None:
term_names = column_names

X_dense_F = DenseMatrix(
np.asfortranarray(arg1[:, dense_indices].toarray()),
column_names=[column_names[i] for i in dense_indices],
term_names=[term_names[i] for i in dense_indices],
)
X_sparse = SparseMatrix(
arg1[:, sparse_indices],
column_names=[column_names[i] for i in sparse_indices],
term_names=[term_names[i] for i in sparse_indices],
)
return X_dense_F, X_sparse, dense_indices, sparse_indices
Loading

0 comments on commit 01e20b3

Please sign in to comment.