From d91fd1c70d550876ad30331f450cd274bc8ad853 Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 08:10:06 +0100 Subject: [PATCH 01/32] Pre-commit autoupdate (#274) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2fa1dbd2..94355447 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: isort-conda additional_dependencies: [toml] - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.3.0" + rev: "1.4.0" hooks: - id: mypy-conda additional_dependencies: From b788ddd44c22805ed81a9b92bd88e939c26fe556 Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:39:23 +0100 Subject: [PATCH 02/32] Pre-commit autoupdate (#276) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94355447..5a37c63e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: isort-conda additional_dependencies: [toml] - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.4.0" + rev: "1.4.1" hooks: - id: mypy-conda additional_dependencies: From 212a1c6858f2d7b1e7e20772b513d4c2767f374c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:40:15 +0100 Subject: [PATCH 03/32] Bump pypa/gh-action-pypi-publish from 1.8.6 to 1.8.7 (#277) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.6 to 1.8.7. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.6...v1.8.7) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheels_release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels_release.yml b/.github/workflows/build_wheels_release.yml index 0edbde90..3f61d37c 100644 --- a/.github/workflows/build_wheels_release.yml +++ b/.github/workflows/build_wheels_release.yml @@ -60,7 +60,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.6 + - uses: pypa/gh-action-pypi-publish@v1.8.7 with: user: __token__ password: ${{ secrets.GH_TESTPYPI_UPLOAD }} @@ -75,7 +75,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.6 + - uses: pypa/gh-action-pypi-publish@v1.8.7 with: user: __token__ password: ${{ secrets.GH_PYPI_UPLOAD }} From 2391ada89e76dd19dc3549ee5bea17b04433d8b3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jul 2023 08:29:56 -0400 Subject: [PATCH 04/32] Bump pypa/gh-action-pypi-publish from 1.8.7 to 1.8.8 (#279) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.7 to 1.8.8. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.7...v1.8.8) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheels_release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels_release.yml b/.github/workflows/build_wheels_release.yml index 3f61d37c..b43f3018 100644 --- a/.github/workflows/build_wheels_release.yml +++ b/.github/workflows/build_wheels_release.yml @@ -60,7 +60,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.7 + - uses: pypa/gh-action-pypi-publish@v1.8.8 with: user: __token__ password: ${{ secrets.GH_TESTPYPI_UPLOAD }} @@ -75,7 +75,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.7 + - uses: pypa/gh-action-pypi-publish@v1.8.8 with: user: __token__ password: ${{ secrets.GH_PYPI_UPLOAD }} From 6e756dd55523fe9d8e338d95c13feba80a17f896 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jul 2023 08:30:20 -0400 Subject: [PATCH 05/32] Bump pypa/cibuildwheel from 2.13.1 to 2.14.1 (#280) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.13.1 to 2.14.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.13.1...v2.14.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheels.yml | 2 +- .github/workflows/build_wheels_release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index b531b3bc..e147b2bf 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -20,7 +20,7 @@ jobs: with: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.13.1 + uses: pypa/cibuildwheel@v2.14.1 env: CIBW_ARCHS_LINUX: auto CIBW_ARCHS_MACOS: x86_64 arm64 diff --git a/.github/workflows/build_wheels_release.yml b/.github/workflows/build_wheels_release.yml index b43f3018..9f6cd123 100644 --- a/.github/workflows/build_wheels_release.yml +++ b/.github/workflows/build_wheels_release.yml @@ -21,7 +21,7 @@ jobs: with: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.13.1 + uses: pypa/cibuildwheel@v2.14.1 env: CIBW_ARCHS_LINUX: auto aarch64 CIBW_ARCHS_MACOS: x86_64 arm64 From 31ca04674e94d69f67b8f9fe844269405f31dad6 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 18 Jul 2023 17:17:08 +0200 Subject: [PATCH 06/32] Minimal implementation (tests green) --- src/tabmat/categorical_matrix.py | 6 ++- src/tabmat/dense_matrix.py | 84 +++++++++++++++++++++++--------- src/tabmat/sparse_matrix.py | 9 ++-- tests/test_matrices.py | 4 +- 4 files changed, 71 insertions(+), 32 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 44c47efd..7a751fdb 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -452,8 +452,10 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ Y.""" - if isinstance(other, np.ndarray): - return self._cross_dense(other, d, rows, L_cols, R_cols) + from .dense_matrix import DenseMatrix + + if isinstance(other, (np.ndarray, DenseMatrix)): + return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) if isinstance(other, sps.csc_matrix): return self._cross_sparse(other, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 84ef1f1d..c854041f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -17,7 +17,7 @@ ) -class DenseMatrix(np.ndarray, MatrixBase): +class DenseMatrix(np.lib.mixins.NDArrayOperatorsMixin, MatrixBase): """ A ``numpy.ndarray`` subclass with several additional functions that allow it to share the MatrixBase API with SparseMatrix and CategoricalMatrix. @@ -32,29 +32,65 @@ class DenseMatrix(np.ndarray, MatrixBase): """ - def __new__(cls, input_array): # noqa - """ - Details of how to subclass np.ndarray are explained here: + def __init__(self, input_array): + self._array = np.asarray(input_array) - https://docs.scipy.org/doc/numpy/user/basics.subclassing.html\ - #slightly-more-realistic-example-attribute-added-to-existing-array - """ - obj = np.asarray(input_array).view(cls) - if not np.issubdtype(obj.dtype, np.floating): - raise NotImplementedError("DenseMatrix is only implemented for float data") - return obj + def __getitem__(self, key): + return type(self)(self._array.__getitem__(key)) + + def __array__(self, dtype=None): + return self._array.astype(dtype, copy=False) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + if method in ["__call__", "accumulate"]: + return type(self)(result) + else: + return result + + def __matmul__(self, other): + return self._array.__matmul__(other) + + def __rmatmul__(self, other): + return self._array.__rmatmul__(other) + + @property + def shape(self): + """Tuple of array dimensions.""" + return self._array.shape + + @property + def ndim(self): + """Number of array dimensions.""" # noqa: D401 + return self._array.ndim + + @property + def dtype(self): + """Data-type of the array’s elements.""" # noqa: D401 + return self._array.dtype + + def transpose(self): + """Returns a view of the array with axes transposed.""" # noqa: D401 + return type(self)(self._array.T) + + T = property(transpose) + + def astype(self, dtype, order="K", casting="unsafe", copy=True): + """Copy of the array, cast to a specified type.""" + return type(self)(self._array.astype(dtype, order, casting, copy)) - def __array_finalize__(self, obj): - if obj is None: - return + def sum(self, *args, **kwargs): + """Return the sum of the array elements over the given axis.""" + return self._array.sum(*args, **kwargs) def getcol(self, i): """Return matrix column at specified index.""" - return self[:, [i]] + return type(self)(self._array[:, [i]]) def toarray(self): """Return array representation of matrix.""" - return np.asarray(self) + return self._array def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None @@ -62,7 +98,7 @@ def sandwich( """Perform a sandwich product: X.T @ diag(d) @ X.""" d = np.asarray(d) rows, cols = setup_restrictions(self.shape, rows, cols) - return dense_sandwich(self, d, rows, cols) + return dense_sandwich(self._array, d, rows, cols) def _cross_sandwich( self, @@ -81,7 +117,7 @@ def _cross_sandwich( def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: """Get standard deviations of columns.""" - sqrt_arg = transpose_square_dot_weights(self, weights) - col_means**2 + sqrt_arg = transpose_square_dot_weights(self._array, weights) - col_means**2 # Minor floating point errors above can result in a very slightly # negative sqrt_arg (e.g. -5e-16). We just set those values equal to # zero. @@ -105,7 +141,7 @@ def _matvec_helper( # this without an explosion of code? vec = np.asarray(vec) check_matvec_dimensions(self, vec, transpose=transpose) - X = self.T if transpose else self + X = self._array.T if transpose else self._array # NOTE: We assume that rows and cols are unique unrestricted_rows = rows is None or len(rows) == self.shape[0] @@ -122,11 +158,11 @@ def _matvec_helper( # TODO: should take 'out' parameter fast_fnc = dense_rmatvec if transpose else dense_matvec if vec.ndim == 1: - res = fast_fnc(self, vec, rows, cols) + res = fast_fnc(self._array, vec, rows, cols) elif vec.ndim == 2 and vec.shape[1] == 1: - res = fast_fnc(self, vec[:, 0], rows, cols)[:, None] + res = fast_fnc(self._array, vec[:, 0], rows, cols)[:, None] else: - subset = self[np.ix_(rows, cols)] + subset = self._array[np.ix_(rows, cols)] res = subset.T.dot(vec[rows]) if transpose else subset.dot(vec[cols]) if out is None: return res @@ -164,5 +200,5 @@ def multiply(self, other): This assumes that ``other`` is a vector of size ``self.shape[0]``. """ if np.asanyarray(other).ndim == 1: - return super().__mul__(other[:, np.newaxis]) - return super().__mul__(other) + return type(self)(self._array.__mul__(other[:, np.newaxis])) + return type(self)(self._array.__mul__(other)) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 7f1b44ad..3befbad9 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -59,8 +59,7 @@ def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ X.""" - if not hasattr(d, "dtype"): - d = np.asarray(d) + d = np.asarray(d) if not self.dtype == d.dtype: raise TypeError( f"""self and d need to be of same dtype, either np.float64 @@ -80,9 +79,11 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ): """Perform a sandwich product: X.T @ diag(d) @ Y.""" - if isinstance(other, np.ndarray): - return self.sandwich_dense(other, d, rows, L_cols, R_cols) from .categorical_matrix import CategoricalMatrix + from .dense_matrix import DenseMatrix + + if isinstance(other, (np.ndarray, DenseMatrix)): + return self.sandwich_dense(np.asarray(other), d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return other._cross_sandwich(self, d, rows, R_cols, L_cols).T diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 64317747..5d314c7e 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -24,7 +24,7 @@ def dense_matrix_C() -> tm.DenseMatrix: def dense_matrix_not_writeable() -> tm.DenseMatrix: mat = dense_matrix_F() - mat.setflags(write=False) + mat._array.setflags(write=False) return mat @@ -440,7 +440,7 @@ def test_rmatmul(mat: Union[tm.MatrixBase, tm.StandardizedMatrix], vec_type): expected = vec_as_list @ mat.A np.testing.assert_allclose(res, expected) np.testing.assert_allclose(res2, expected) - assert isinstance(res, np.ndarray) + assert isinstance(res, (np.ndarray, tm.DenseMatrix)) @pytest.mark.parametrize("mat", get_matrices()) From cce460fadff5b95e95c07ff24a68dc0f647958b4 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 18 Jul 2023 20:29:18 +0200 Subject: [PATCH 07/32] Improve the performance of `from_pandas` in the case of low-cardinality categoricals (#275) * Improve the performance of `from_pandas` * Update changelog according to review --- CHANGELOG.rst | 7 +++++++ src/tabmat/constructor.py | 12 ++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 83c8aceb..331695b1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ Changelog ========= +Unreleased +---------- + +**Other changes:** + +- Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables. + 3.1.10 - 2023-06-23 ------------------- diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index b782f2da..f8e23c31 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -72,6 +72,7 @@ def from_pandas( if object_as_cat and coldata.dtype == object: coldata = coldata.astype("category") if isinstance(coldata.dtype, pd.CategoricalDtype): + cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) if len(coldata.cat.categories) < cat_threshold: ( X_dense_F, @@ -79,15 +80,7 @@ def from_pandas( dense_indices, sparse_indices, ) = _split_sparse_and_dense_parts( - pd.get_dummies( - coldata, - prefix=colname, - sparse=True, - drop_first=drop_first, - dtype=np.float64, - ) - .sparse.to_coo() - .tocsc(), + sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, ) matrices.append(X_dense_F) @@ -103,7 +96,6 @@ def from_pandas( indices.append(sparse_indices) else: - cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) matrices.append(cat) is_cat.append(True) if cat_position == "expand": From 24525c8fd5c82a705addd5f828909d859eebb1ad Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 09:33:51 +0200 Subject: [PATCH 08/32] Remove sum method and rely on np.sum --- src/tabmat/dense_matrix.py | 4 ---- src/tabmat/standardized_mat.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index c854041f..ade5d355 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -80,10 +80,6 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True): """Copy of the array, cast to a specified type.""" return type(self)(self._array.astype(dtype, order, casting, copy)) - def sum(self, *args, **kwargs): - """Return the sum of the array elements over the given axis.""" - return self._array.sum(*args, **kwargs) - def getcol(self, i): """Return matrix column at specified index.""" return type(self)(self._array[:, [i]]) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index 0d8a0190..19b04f5a 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -147,7 +147,7 @@ def sandwich( limited_shift = self.shift[cols] if cols is not None else self.shift limited_d = d[rows] if rows is not None else d - term3_and_4 = np.outer(limited_shift, d_mat + limited_shift * limited_d.sum()) + term3_and_4 = np.outer(limited_shift, d_mat + limited_shift * np.sum(limited_d)) res = term2 + term3_and_4 if isinstance(term1, sps.dia_matrix): idx = np.arange(res.shape[0]) From 1e31779bd9b98190d3f0037b1c41280c47c71f58 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 16:00:07 +0200 Subject: [PATCH 09/32] Force DenseMatrix to always be 2-dimensional --- src/tabmat/dense_matrix.py | 16 +++++++++++++++- tests/test_matrices.py | 11 ++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index ade5d355..dcad444a 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -33,9 +33,22 @@ class DenseMatrix(np.lib.mixins.NDArrayOperatorsMixin, MatrixBase): """ def __init__(self, input_array): + input_array = np.asarray(input_array) + + if input_array.ndim == 1: + input_array = input_array.reshape(-1, 1) + elif input_array.ndim > 2: + raise ValueError("Input array must be 1- or 2-dimensional") + self._array = np.asarray(input_array) def __getitem__(self, key): + if not isinstance(key, tuple): + key = (key,) + + # Always return a 2d array + key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) + return type(self)(self._array.__getitem__(key)) def __array__(self, dtype=None): @@ -44,7 +57,8 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) - if method in ["__call__", "accumulate"]: + if method in ("call", "accumulate") and ufunc.signature is None: + # Does not change shape return type(self)(result) else: return result diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 5d314c7e..779b160c 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -233,7 +233,7 @@ def test_to_array_standardized_mat(mat: tm.StandardizedMatrix): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "other_type", - [lambda x: x, np.asarray, tm.DenseMatrix], + [lambda x: x, np.asarray], ) @pytest.mark.parametrize("cols", [None, [], [1], np.array([1])]) @pytest.mark.parametrize("other_shape", [[], [1], [2]]) @@ -243,7 +243,7 @@ def test_matvec( """ Mat. - other_type: Function transforming list to list, array, or DenseMatrix + t: Function transforming list to list, array, or DenseMatrix cols: Argument 1 to matvec, specifying which columns of the matrix (and which elements of 'other') to use other_shape: Second dimension of 'other.shape', if any. If other_shape is [], then @@ -303,7 +303,7 @@ def process_mat_vec_subsets(mat, vec, mat_rows, mat_cols, vec_idxs): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "other_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) @pytest.mark.parametrize("rows", [None, [], [2], np.arange(2)]) @pytest.mark.parametrize("cols", [None, [], [1], np.arange(1)]) @@ -373,7 +373,7 @@ def test_cross_sandwich( @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "vec_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) @pytest.mark.parametrize("rows", [None, [], [1], np.arange(2)]) @pytest.mark.parametrize("cols", [None, [], [0], np.arange(1)]) @@ -430,7 +430,7 @@ def test_transpose(mat): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "vec_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) def test_rmatmul(mat: Union[tm.MatrixBase, tm.StandardizedMatrix], vec_type): vec_as_list = [3.0, -0.1, 0] @@ -559,6 +559,7 @@ def test_indexing_int_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): @pytest.mark.parametrize("mat", get_matrices()) def test_indexing_range_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): res = mat[0:2, :] + assert res.ndim == 2 if not isinstance(res, np.ndarray): res = res.A expected = mat.A[0:2, :] From 755e6341747fb807dbb3ef5dd87b91f857b88630 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 16:40:17 +0200 Subject: [PATCH 10/32] Add __repr__ and __str__ methods --- src/tabmat/dense_matrix.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index dcad444a..842d4464 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,3 +1,4 @@ +import textwrap from typing import List, Optional, Union import numpy as np @@ -69,6 +70,18 @@ def __matmul__(self, other): def __rmatmul__(self, other): return self._array.__rmatmul__(other) + def __str__(self): + return "{}x{} DenseMatrix:\n\n".format(*self.shape) + np.array_str(self._array) + + def __repr__(self): + class_name = type(self).__name__ + array_str = f"{class_name}({np.array2string(self._array, separator=', ')})" + return textwrap.indent( + array_str, + " " * (len(class_name) + 1), + predicate=lambda line: not line.startswith(class_name), + ) + @property def shape(self): """Tuple of array dimensions.""" From 8c89462f196e6e9ca7de6da3605913d7fd8a77eb Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 20:02:42 +0200 Subject: [PATCH 11/32] Add benchmark data to .gitignore (#282) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c528d376..186ba948 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Project-specific benchmark/*.csv +benchmark/data/*.pkl # Files created by templating dense.cpp From 0560529f441775bcbd129dfaf26f206ce8be934e Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 10:31:35 +0200 Subject: [PATCH 12/32] Fix as_mx --- src/tabmat/split_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index a7618912..aaf88414 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -29,7 +29,7 @@ def as_mx(a: Any): return a elif sps.issparse(a): return SparseMatrix(a) - elif isinstance(a, np.ndarray): + elif isinstance(a, (np.ndarray, DenseMatrix)): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") From 80143ef6550d537e53defc73649a3f112aa1ec3e Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 12:31:38 +0200 Subject: [PATCH 13/32] Fix ufunc return value --- src/tabmat/dense_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 842d4464..953fe505 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -58,7 +58,7 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) - if method in ("call", "accumulate") and ufunc.signature is None: + if method in ("__call__", "accumulate") and ufunc.signature is None: # Does not change shape return type(self)(result) else: From 34d6f37cc6c442d877ec3cb2b191c6f519130228 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 17:08:49 +0200 Subject: [PATCH 14/32] Wrap SparseMatrix, too --- src/tabmat/sparse_matrix.py | 96 ++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 3befbad9..11043bb8 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -31,29 +31,65 @@ class SparseMatrix(sps.csc_matrix, MatrixBase): """ def __init__(self, arg1, shape=None, dtype=None, copy=False): - super().__init__(arg1, shape, dtype, copy) - self.idx_dtype = max(self.indices.dtype, self.indptr.dtype) - if self.indices.dtype != self.idx_dtype: - self.indices = self.indices.astype(self.idx_dtype) - if self.indptr.dtype != self.idx_dtype: - self.indptr = self.indptr.astype(self.idx_dtype) + self._array = sps.csc_matrix(arg1, shape, dtype, copy) + + self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype) + if self._array.indices.dtype != self.idx_dtype: + self._array.indices = self._array.indices.astype(self.idx_dtype) + if self._array.indptr.dtype != self.idx_dtype: + self._array.indptr = self._array.indptr.astype(self.idx_dtype) assert self.indices.dtype == self.idx_dtype - if not self.has_sorted_indices: - self.sort_indices() - self._x_csr = None + if not self._array.has_sorted_indices: + self._array.sort_indices() + self._array_csr = None + + @property + def shape(self): + """Tuple of array dimensions.""" + return self._array.shape + + @property + def ndim(self): + """Number of array dimensions.""" # noqa: D401 + return self._array.ndim + + @property + def dtype(self): + """Data-type of the array’s elements.""" # noqa: D401 + return self._array.dtype + + @property + def indices(self): + """Indices of the matrix.""" # noqa: D401 + return self._array.indices + + @property + def indptr(self): + """Indptr of the matrix.""" # noqa: D401 + return self._array.indptr + + @property + def data(self): + """Data of the matrix.""" # noqa: D401 + return self._array.data + + @property + def array_csc(self): + """Return the CSC representation of the matrix.""" + return self._array @property - def x_csr(self): + def array_csr(self): """Cache the CSR representation of the matrix.""" - if self._x_csr is None: - self._x_csr = self.tocsr(copy=False) - if self._x_csr.indices.dtype != self.idx_dtype: - self._x_csr.indices = self._x_csr.indices.astype(self.idx_dtype) - if self._x_csr.indptr.dtype != self.idx_dtype: - self._x_csr.indptr = self._x_csr.indptr.astype(self.idx_dtype) + if self._array_csr is None: + self._array_csr = self._array.tocsr(copy=False) + if self._array_csr.indices.dtype != self.idx_dtype: + self._array_csr.indices = self._array_csr.indices.astype(self.idx_dtype) + if self._array_csr.indptr.dtype != self.idx_dtype: + self._array_csr.indptr = self._array_csr.indptr.astype(self.idx_dtype) - return self._x_csr + return self._array_csr def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None @@ -68,7 +104,7 @@ def sandwich( ) rows, cols = setup_restrictions(self.shape, rows, cols, dtype=self.idx_dtype) - return sparse_sandwich(self, self.x_csr, d, rows, cols) + return sparse_sandwich(self, self.array_csr, d, rows, cols) def _cross_sandwich( self, @@ -112,7 +148,7 @@ def sandwich_dense( rows, L_cols = setup_restrictions(self.shape, rows, L_cols) R_cols = set_up_rows_or_cols(R_cols, B.shape[1]) - return csr_dense_sandwich(self.x_csr, B, d, rows, L_cols, R_cols) + return csr_dense_sandwich(self.array_csr, B, d, rows, L_cols, R_cols) def _matvec_helper( self, @@ -129,9 +165,11 @@ def _matvec_helper( unrestricted_cols = cols is None or len(cols) == self.shape[1] if unrestricted_rows and unrestricted_cols and vec.ndim == 1: if transpose: - return csc_rmatvec_unrestricted(self, vec, out, self.indices) + return csc_rmatvec_unrestricted(self.array_csc, vec, out, self.indices) else: - return csr_matvec_unrestricted(self.x_csr, vec, out, self.x_csr.indices) + return csr_matvec_unrestricted( + self.array_csr, vec, out, self.array_csr.indices + ) matrix_matvec = lambda x, v: sps.csc_matrix.dot(x, v) if transpose: @@ -139,9 +177,9 @@ def _matvec_helper( rows, cols = setup_restrictions(self.shape, rows, cols, dtype=self.idx_dtype) if transpose: - fast_fnc = lambda v: csc_rmatvec(self, v, rows, cols) + fast_fnc = lambda v: csc_rmatvec(self.array_csc, v, rows, cols) else: - fast_fnc = lambda v: csr_matvec(self.x_csr, v, rows, cols) + fast_fnc = lambda v: csr_matvec(self.array_csr, v, rows, cols) if vec.ndim == 1: res = fast_fnc(vec) elif vec.ndim == 2 and vec.shape[1] == 1: @@ -180,7 +218,11 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra """Get standard deviations of columns.""" sqrt_arg = ( transpose_square_dot_weights( - self.data, self.indices, self.indptr, weights, weights.dtype + self._array.data, + self._array.indices, + self._array.indptr, + weights, + weights.dtype, ) - col_means**2 ) @@ -192,7 +234,7 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return SparseMatrix cast to new type.""" - return super().astype(dtype, casting, copy) + return type(self)(self._array.astype(dtype, casting, copy)) def multiply(self, other): """Element-wise multiplication. @@ -202,5 +244,5 @@ def multiply(self, other): ``self.shape[0]``. """ if other.ndim == 1: - return SparseMatrix(super().multiply(other[:, np.newaxis])) - return SparseMatrix(super().multiply(other)) + return type(self)(self._array.multiply(other[:, np.newaxis])) + return type(self)(self._array.multiply(other)) From 97349f4a7d1a5bfd826d602af14b9b01046ef9e4 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 17:31:25 +0200 Subject: [PATCH 15/32] Demo of how the ufunc interface can be implemented --- src/tabmat/sparse_matrix.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 11043bb8..8074a67c 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -44,6 +44,26 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + from .dense_matrix import DenseMatrix + + if ufunc.nin == 1 and ufunc.nout == 1: + if getattr(ufunc, method)(0) == 0: + result_matrix = sps.csc_matrix( + ( + getattr(ufunc, method)(self._array.data, **kwargs), + self._array.indices, + self._array.indptr, + ), + shape=self._array.shape, + ) + return type(self)(result_matrix) + else: + result_matrix = getattr(ufunc, method)(self._array.todense(), **kwargs) + return DenseMatrix(result_matrix) + else: + return NotImplemented + @property def shape(self): """Tuple of array dimensions.""" @@ -201,8 +221,6 @@ def matvec(self, vec, cols: np.ndarray = None, out: np.ndarray = None): check_matvec_out_shape(self, out) return self._matvec_helper(vec, None, cols, out, False) - __array_priority__ = 12 - def transpose_matvec( self, vec: Union[np.ndarray, List], From e86c0058350f2b0bb9451d923feedfb56f0673a0 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 19:12:12 +0200 Subject: [PATCH 16/32] Do not subclass csc_matrix --- src/tabmat/categorical_matrix.py | 2 ++ src/tabmat/sparse_matrix.py | 31 +++++++++++++++++++++++++++++-- src/tabmat/split_matrix.py | 10 ++++++++-- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 7a751fdb..6e781691 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -458,6 +458,8 @@ def _cross_sandwich( return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) if isinstance(other, sps.csc_matrix): return self._cross_sparse(other, d, rows, L_cols, R_cols) + if isinstance(other, SparseMatrix): + return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return self._cross_categorical(other, d, rows, L_cols, R_cols) raise TypeError diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 8074a67c..40644763 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -22,7 +22,7 @@ ) -class SparseMatrix(sps.csc_matrix, MatrixBase): +class SparseMatrix(MatrixBase): """ A scipy.sparse csc matrix subclass that allows such objects to conform to the ``MatrixBase`` interface. @@ -44,6 +44,15 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None + def __getitem__(self, key): + if not isinstance(key, tuple): + key = (key,) + + # Always return a 2d array + key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) + + return type(self)(self._array.__getitem__(key)) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): from .dense_matrix import DenseMatrix @@ -111,6 +120,24 @@ def array_csr(self): return self._array_csr + def transpose(self): + """Returns a view of the array with axes transposed.""" # noqa: D401 + return type(self)(self._array.T) + + T = property(transpose) + + def getcol(self, i): + """Return matrix column at specified index.""" + return type(self)(self._array.getcol(i)) + + def toarray(self): + """Return a dense ndarray representation of the matrix.""" + return self._array.toarray() + + def dot(self, other): + """Return the dot product as a scipy sparse matrix.""" + return self._array.dot(other) + def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: @@ -206,7 +233,7 @@ def _matvec_helper( res = fast_fnc(vec[:, 0])[:, None] else: res = matrix_matvec( - self[np.ix_(rows, cols)], vec[rows] if transpose else vec[cols] + self[np.ix_(rows, cols)]._array, vec[rows] if transpose else vec[cols] ) if out is None: return res diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index aaf88414..2f7438fa 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -75,8 +75,14 @@ def _combine_matrices(matrices, indices): n_row = matrices[0].shape[0] for mat_type_, stack_fn in [ - (DenseMatrix, np.hstack), - (SparseMatrix, sps.hstack), + ( + DenseMatrix, + lambda matrices: np.hstack([mat._array for mat in matrices]), + ), + ( + SparseMatrix, + lambda matrices: sps.hstack([mat._array for mat in matrices]), + ), ]: this_type_matrices = [ i for i, mat in enumerate(matrices) if isinstance(mat, mat_type_) From 5a88fbc45a0eb80f7a8cb9907899e29eeac06bea Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 09:03:01 +0200 Subject: [PATCH 17/32] Demonstrate binary ufuncs for sparse --- src/tabmat/categorical_matrix.py | 4 +--- src/tabmat/dense_matrix.py | 3 +++ src/tabmat/sparse_matrix.py | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 6e781691..1a996cc0 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -454,10 +454,8 @@ def _cross_sandwich( """Perform a sandwich product: X.T @ diag(d) @ Y.""" from .dense_matrix import DenseMatrix - if isinstance(other, (np.ndarray, DenseMatrix)): + if isinstance(other, DenseMatrix): return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) - if isinstance(other, sps.csc_matrix): - return self._cross_sparse(other, d, rows, L_cols, R_cols) if isinstance(other, SparseMatrix): return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 953fe505..4bd76501 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -56,6 +56,9 @@ def __array__(self, dtype=None): return self._array.astype(dtype, copy=False) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + if not all(isinstance(x, (np.ndarray, DenseMatrix)) for x in inputs): + return NotImplemented + inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) if method in ("__call__", "accumulate") and ufunc.signature is None: diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 40644763..b144dcc1 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -70,6 +70,17 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: result_matrix = getattr(ufunc, method)(self._array.todense(), **kwargs) return DenseMatrix(result_matrix) + + elif ufunc == np.multiply: + if isinstance(inputs[0], SparseMatrix) and isinstance( + inputs[1], SparseMatrix + ): + return SparseMatrix(inputs[0].array_csc.multiply(inputs[1].array_csc)) + elif isinstance(inputs[0], SparseMatrix): + return SparseMatrix(inputs[0].array_csc.multiply(inputs[1])) + else: + return SparseMatrix(inputs[1].array_csc.multiply(inputs[0])) + else: return NotImplemented From 44e1970422db6258d19b179219e7c81c13ddeac8 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 09:07:14 +0200 Subject: [PATCH 18/32] Add tocsc method --- src/tabmat/sparse_matrix.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index b144dcc1..bda43efd 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -131,6 +131,10 @@ def array_csr(self): return self._array_csr + def tocsc(self, copy=False): + """Return the matrix in CSC format.""" + return self._array.tocsc(copy=copy) + def transpose(self): """Returns a view of the array with axes transposed.""" # noqa: D401 return type(self)(self._array.T) From ffe918e4378c8b1b5d0bfb9c7ade57bb77fccc5a Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 09:39:56 +0200 Subject: [PATCH 19/32] Fix type checks --- src/tabmat/dense_matrix.py | 4 ++-- src/tabmat/sparse_matrix.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 4bd76501..fced90c4 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -56,10 +56,10 @@ def __array__(self, dtype=None): return self._array.astype(dtype, copy=False) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if not all(isinstance(x, (np.ndarray, DenseMatrix)) for x in inputs): + if not all(isinstance(x, (np.ndarray, type(self))) for x in inputs): return NotImplemented - inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) + inputs = (x._array if isinstance(x, type(self)) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) if method in ("__call__", "accumulate") and ufunc.signature is None: # Does not change shape diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index bda43efd..06f54505 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -72,14 +72,12 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return DenseMatrix(result_matrix) elif ufunc == np.multiply: - if isinstance(inputs[0], SparseMatrix) and isinstance( - inputs[1], SparseMatrix - ): - return SparseMatrix(inputs[0].array_csc.multiply(inputs[1].array_csc)) - elif isinstance(inputs[0], SparseMatrix): - return SparseMatrix(inputs[0].array_csc.multiply(inputs[1])) + if isinstance(inputs[0], type(self)) and isinstance(inputs[1], type(self)): + return type(self)(inputs[0].array_csc.multiply(inputs[1].array_csc)) + elif isinstance(inputs[0], type(self)): + return type(self)(inputs[0].array_csc.multiply(inputs[1])) else: - return SparseMatrix(inputs[1].array_csc.multiply(inputs[0])) + return type(self)(inputs[1].array_csc.multiply(inputs[0])) else: return NotImplemented From 3f94e4d4f4626d4249be8e5dc0cf42ed5a8c875c Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 11:18:07 +0200 Subject: [PATCH 20/32] Minor improvements --- src/tabmat/dense_matrix.py | 5 ++++- src/tabmat/sparse_matrix.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index fced90c4..a520389d 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,3 +1,4 @@ +import numbers import textwrap from typing import List, Optional, Union @@ -56,7 +57,9 @@ def __array__(self, dtype=None): return self._array.astype(dtype, copy=False) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if not all(isinstance(x, (np.ndarray, type(self))) for x in inputs): + if not all( + isinstance(x, (np.ndarray, type(self), numbers.Number)) for x in inputs + ): return NotImplemented inputs = (x._array if isinstance(x, type(self)) else x for x in inputs) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 06f54505..7d52773a 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -56,6 +56,9 @@ def __getitem__(self, key): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): from .dense_matrix import DenseMatrix + if "out" in kwargs: + raise NotImplementedError("out argument is not supported") + if ufunc.nin == 1 and ufunc.nout == 1: if getattr(ufunc, method)(0) == 0: result_matrix = sps.csc_matrix( From 9f943d83c7b96d3a804c7081fe6b6ee57f7bfe79 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 11:43:19 +0200 Subject: [PATCH 21/32] ufunc support for categoricals --- src/tabmat/categorical_matrix.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 1a996cc0..2c688053 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -265,6 +265,12 @@ def __init__( self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + inputs = ( + x.to_sparse_matrix() if isinstance(x, type(self)) else x for x in inputs + ) + return getattr(ufunc, method)(*inputs, **kwargs) + def recover_orig(self) -> np.ndarray: """ Return 1d numpy array with same data as what was initially fed to __init__. @@ -491,6 +497,12 @@ def tocsr(self) -> sps.csr_matrix: shape=self.shape, ) + def to_sparse_matrix(self): + """Return a tabmat.SparseMatrix representation.""" + from .sparse_matrix import SparseMatrix + + return SparseMatrix(self.tocsr()) + def toarray(self) -> np.ndarray: """Return array representation of matrix.""" return self.tocsr().A From 006497092238a7df46fd76a3a84bb39ec77b619d Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 06:44:08 +0100 Subject: [PATCH 22/32] Pre-commit autoupdate (#283) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5a37c63e..150cd678 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/Quantco/pre-commit-mirrors-black - rev: 23.3.0 + rev: 23.7.0 hooks: - id: black-conda additional_dependencies: [flake8-docstrings, flake8-rst-docstrings] @@ -33,7 +33,7 @@ repos: additional_dependencies: - python=3.8 - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade - rev: 3.7.0 + rev: 3.9.0 hooks: - id: pyupgrade-conda - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint From 34cc13c51e2846ecae88e0feb7994141e761e5c1 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 25 Jul 2023 09:15:43 +0200 Subject: [PATCH 23/32] Remove __array_ufunc__ interface --- src/tabmat/categorical_matrix.py | 8 ++----- src/tabmat/dense_matrix.py | 18 +--------------- src/tabmat/sparse_matrix.py | 36 +++----------------------------- 3 files changed, 6 insertions(+), 56 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 2c688053..f6e84c1d 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -265,11 +265,7 @@ def __init__( self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - inputs = ( - x.to_sparse_matrix() if isinstance(x, type(self)) else x for x in inputs - ) - return getattr(ufunc, method)(*inputs, **kwargs) + __array_ufunc__ = None def recover_orig(self) -> np.ndarray: """ @@ -461,7 +457,7 @@ def _cross_sandwich( from .dense_matrix import DenseMatrix if isinstance(other, DenseMatrix): - return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) + return self._cross_dense(other._array, d, rows, L_cols, R_cols) if isinstance(other, SparseMatrix): return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index a520389d..464d1f70 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,4 +1,3 @@ -import numbers import textwrap from typing import List, Optional, Union @@ -53,22 +52,7 @@ def __getitem__(self, key): return type(self)(self._array.__getitem__(key)) - def __array__(self, dtype=None): - return self._array.astype(dtype, copy=False) - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if not all( - isinstance(x, (np.ndarray, type(self), numbers.Number)) for x in inputs - ): - return NotImplemented - - inputs = (x._array if isinstance(x, type(self)) else x for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - if method in ("__call__", "accumulate") and ufunc.signature is None: - # Does not change shape - return type(self)(result) - else: - return result + __array_ufunc__ = None def __matmul__(self, other): return self._array.__matmul__(other) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 7d52773a..5eba5adc 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -53,37 +53,7 @@ def __getitem__(self, key): return type(self)(self._array.__getitem__(key)) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - from .dense_matrix import DenseMatrix - - if "out" in kwargs: - raise NotImplementedError("out argument is not supported") - - if ufunc.nin == 1 and ufunc.nout == 1: - if getattr(ufunc, method)(0) == 0: - result_matrix = sps.csc_matrix( - ( - getattr(ufunc, method)(self._array.data, **kwargs), - self._array.indices, - self._array.indptr, - ), - shape=self._array.shape, - ) - return type(self)(result_matrix) - else: - result_matrix = getattr(ufunc, method)(self._array.todense(), **kwargs) - return DenseMatrix(result_matrix) - - elif ufunc == np.multiply: - if isinstance(inputs[0], type(self)) and isinstance(inputs[1], type(self)): - return type(self)(inputs[0].array_csc.multiply(inputs[1].array_csc)) - elif isinstance(inputs[0], type(self)): - return type(self)(inputs[0].array_csc.multiply(inputs[1])) - else: - return type(self)(inputs[1].array_csc.multiply(inputs[0])) - - else: - return NotImplemented + __array_ufunc__ = None @property def shape(self): @@ -181,8 +151,8 @@ def _cross_sandwich( from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix - if isinstance(other, (np.ndarray, DenseMatrix)): - return self.sandwich_dense(np.asarray(other), d, rows, L_cols, R_cols) + if isinstance(other, DenseMatrix): + return self.sandwich_dense(other._array, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return other._cross_sandwich(self, d, rows, R_cols, L_cols).T From a396a09ff49c47d21c538cdf2c342cc5e1076614 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 25 Jul 2023 11:33:00 +0200 Subject: [PATCH 24/32] Remove numpy operator mixin --- src/tabmat/categorical_matrix.py | 4 ++-- src/tabmat/dense_matrix.py | 2 +- src/tabmat/split_matrix.py | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index f6e84c1d..7783d5fd 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -466,7 +466,7 @@ def _cross_sandwich( # TODO: best way to return this depends on the use case. See what that is # See how csr getcol works - def getcol(self, i: int) -> sps.csc_matrix: + def getcol(self, i: int) -> SparseMatrix: """Return matrix column at specified index.""" i %= self.shape[1] # wrap-around indexing @@ -474,7 +474,7 @@ def getcol(self, i: int) -> sps.csc_matrix: i += 1 col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None]) - return col_i + return SparseMatrix(col_i) def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 464d1f70..587d244b 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -18,7 +18,7 @@ ) -class DenseMatrix(np.lib.mixins.NDArrayOperatorsMixin, MatrixBase): +class DenseMatrix(MatrixBase): """ A ``numpy.ndarray`` subclass with several additional functions that allow it to share the MatrixBase API with SparseMatrix and CategoricalMatrix. diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index 2f7438fa..f936bfb1 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -1,10 +1,9 @@ import warnings -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Sequence, Tuple, Union import numpy as np from scipy import sparse as sps -from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix from .ext.split import is_sorted, split_col_subsets from .matrix_base import MatrixBase @@ -29,7 +28,7 @@ def as_mx(a: Any): return a elif sps.issparse(a): return SparseMatrix(a) - elif isinstance(a, (np.ndarray, DenseMatrix)): + elif isinstance(a, np.ndarray): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") @@ -135,7 +134,7 @@ class SplitMatrix(MatrixBase): def __init__( self, - matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]], + matrices: Sequence[MatrixBase], indices: Optional[List[np.ndarray]] = None, ): flatten_matrices = [] @@ -149,7 +148,7 @@ def __init__( if isinstance(mat, SplitMatrix): # Flatten out the SplitMatrix current_idx = 0 - for iind, imat in zip(mat.indices, mat.matrices): + for iind, imat in zip(mat.indices, mat.matrices): # type: ignore flatten_matrices.append(imat) index_corrections.append( iind - np.arange(len(iind), dtype=np.int64) - current_idx From e046dcdb81c25467b6c446342e6c131bfd72fec1 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 12:18:14 +0200 Subject: [PATCH 25/32] Add hstack function --- src/tabmat/__init__.py | 4 +++- src/tabmat/split_matrix.py | 30 +++++++++++++++++++++++++++--- tests/test_matrices.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index 9f4a8889..53fb1c55 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -3,7 +3,7 @@ from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix -from .split_matrix import SplitMatrix +from .split_matrix import SplitMatrix, as_tabmat, hstack from .standardized_mat import StandardizedMatrix __all__ = [ @@ -15,4 +15,6 @@ "CategoricalMatrix", "from_csc", "from_pandas", + "as_tabmat", + "hstack", ] diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index f936bfb1..a091949f 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np from scipy import sparse as sps @@ -16,7 +16,7 @@ ) -def as_mx(a: Any): +def as_tabmat(a: Union[MatrixBase, StandardizedMatrix, np.ndarray, sps.spmatrix]): """Convert an array to a corresponding MatrixBase type. If the input is already a MatrixBase, return untouched. @@ -27,13 +27,37 @@ def as_mx(a: Any): if isinstance(a, (MatrixBase, StandardizedMatrix)): return a elif sps.issparse(a): - return SparseMatrix(a) + return SparseMatrix(a.tocsc(copy=False)) elif isinstance(a, np.ndarray): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") +def hstack(tup: Sequence[Union[MatrixBase, np.ndarray, sps.spmatrix]]) -> MatrixBase: + """Stack arrays in sequence horizontally (column wise). + + This is equivalent to concatenation along the second axis, + except for 1-D arrays where it concatenates along the first axis. + + Parameters + ---------- + tup: sequence of arrays + The arrays must have the same shape along all but the second axis. + """ + matrices = [as_tabmat(a) for a in tup] + + if len(matrices) == 0: + raise ValueError("Need at least one array to concatenate.") + + if all(isinstance(mat, SparseMatrix) for mat in matrices): + return SparseMatrix(sps.hstack([mat._array for mat in matrices])) + elif all(isinstance(mat, DenseMatrix) for mat in matrices): + return DenseMatrix(np.hstack([mat._array for mat in matrices])) + else: + return SplitMatrix(matrices) + + def _prepare_out_array(out: Optional[np.ndarray], out_shape, out_dtype): if out is None: out = np.zeros(out_shape, out_dtype) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 779b160c..34f6a5bb 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -632,3 +632,32 @@ def test_multiply(mat): for act in actual: assert isinstance(act, MatrixBase) np.testing.assert_allclose(act.A, expected) + + +@pytest.mark.parametrize( + "mat_1", + get_all_matrix_base_subclass_mats() + + [base_array()] + + [sps.csc_matrix(base_array())], +) +@pytest.mark.parametrize( + "mat_2", + get_all_matrix_base_subclass_mats() + + [base_array()] + + [sps.csc_matrix(base_array())], +) +def test_hstack(mat_1, mat_2): + mats = [mat_1, mat_2] + stacked = tm.hstack(mats) + + if all(isinstance(mat, (np.ndarray, tm.DenseMatrix)) for mat in mats): + assert isinstance(stacked, tm.DenseMatrix) + elif all(isinstance(mat, (sps.csc_matrix, tm.SparseMatrix)) for mat in mats): + assert isinstance(stacked, tm.SparseMatrix) + else: + assert isinstance(stacked, tm.SplitMatrix) + + np.testing.assert_array_equal( + stacked.A, + np.hstack([mat.A if not isinstance(mat, np.ndarray) else mat for mat in mats]), + ) From e7f216ca391dff72e7ddca795f728fb9da686906 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 14:31:13 +0200 Subject: [PATCH 26/32] Add method for unpacking underlying array --- src/tabmat/categorical_matrix.py | 4 ++++ src/tabmat/dense_matrix.py | 4 ++++ src/tabmat/sparse_matrix.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 7783d5fd..4968c628 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -503,6 +503,10 @@ def toarray(self) -> np.ndarray: """Return array representation of matrix.""" return self.tocsr().A + def unpack(self): + """Return the underlying pandas.Categorical.""" + return self.cat + def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return CategoricalMatrix cast to new type.""" self.dtype = dtype diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 587d244b..1a70457f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -105,6 +105,10 @@ def toarray(self): """Return array representation of matrix.""" return self._array + def unpack(self): + """Return the underlying numpy.ndarray.""" + return self._array + def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 5eba5adc..8d7a30bc 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -116,6 +116,10 @@ def getcol(self, i): """Return matrix column at specified index.""" return type(self)(self._array.getcol(i)) + def unpack(self): + """Return the underlying scipy.sparse.csc_matrix.""" + return self._array + def toarray(self): """Return a dense ndarray representation of the matrix.""" return self._array.toarray() From c66e026e30cc2d02ab5e1baf81a3df2762011846 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 14:38:32 +0200 Subject: [PATCH 27/32] Add __matmul__ methods to SparseMatrix --- src/tabmat/sparse_matrix.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 8d7a30bc..188f6862 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -53,6 +53,12 @@ def __getitem__(self, key): return type(self)(self._array.__getitem__(key)) + def __matmul__(self, other): + return self._array.__matmul__(other) + + def __rmatmul__(self, other): + return self._array.__rmatmul__(other) + __array_ufunc__ = None @property From 38813e7482a8b92d83d91cfd17230b1a12c63982 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 27 Jul 2023 12:13:54 +0200 Subject: [PATCH 28/32] Stricter and more consistent indexing --- src/tabmat/categorical_matrix.py | 48 ++++++++++----------------- src/tabmat/dense_matrix.py | 9 ++--- src/tabmat/sparse_matrix.py | 9 ++--- src/tabmat/util.py | 48 +++++++++++++++++++++++++++ tests/test_matrices.py | 57 ++++++++++++++++++++++++++++++-- 5 files changed, 124 insertions(+), 47 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 4968c628..68161445 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -161,7 +161,7 @@ def matvec(mat, vec): """ -from typing import Any, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -181,6 +181,7 @@ def matvec(mat, vec): from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -189,21 +190,15 @@ def matvec(mat, vec): ) -def _is_indexer_full_length(full_length: int, indexer: Any): - if isinstance(indexer, int): - return full_length == 1 - elif isinstance(indexer, list): - if (np.asarray(indexer) > full_length - 1).any(): - raise IndexError("Index out-of-range.") - return len(set(indexer)) == full_length - elif isinstance(indexer, np.ndarray): +def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]): + if isinstance(indexer, np.ndarray): if (indexer > full_length - 1).any(): raise IndexError("Index out-of-range.") - return len(np.unique(indexer)) == full_length + # Order is important in indexing. Could achieve similar results + # by rearranging categories. + return np.array_equal(indexer.ravel(), np.arange(full_length)) elif isinstance(indexer, slice): return len(range(*indexer.indices(full_length))) == full_length - else: - raise ValueError(f"Indexing with {type(indexer)} is not allowed.") def _row_col_indexing( @@ -522,25 +517,18 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra return np.sqrt(mean - col_means**2) def __getitem__(self, item): - if isinstance(item, tuple): - row, col = item - if _is_indexer_full_length(self.shape[1], col): - if isinstance(row, int): - row = [row] - return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype - ) - else: - # return a SparseMatrix if we subset columns - # TODO: this is inefficient. See issue #101. - return SparseMatrix(self.tocsr()[row, col], dtype=self.dtype) + row, col = _check_indexer(item) + + if _is_indexer_full_length(self.shape[1], col): + if isinstance(row, np.ndarray): + row = row.ravel() + return CategoricalMatrix( + self.cat[row], drop_first=self.drop_first, dtype=self.dtype + ) else: - row = item - if isinstance(row, int): - row = [row] - return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype - ) + # return a SparseMatrix if we subset columns + # TODO: this is inefficient. See issue #101. + return self.to_sparse_matrix()[row, col] def _cross_dense( self, diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 1a70457f..55c9a088 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -11,6 +11,7 @@ ) from .matrix_base import MatrixBase from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -44,13 +45,7 @@ def __init__(self, input_array): self._array = np.asarray(input_array) def __getitem__(self, key): - if not isinstance(key, tuple): - key = (key,) - - # Always return a 2d array - key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) - - return type(self)(self._array.__getitem__(key)) + return type(self)(self._array.__getitem__(_check_indexer(key))) __array_ufunc__ = None diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 188f6862..8c2a3b2b 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -14,6 +14,7 @@ ) from .matrix_base import MatrixBase from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -45,13 +46,7 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array_csr = None def __getitem__(self, key): - if not isinstance(key, tuple): - key = (key,) - - # Always return a 2d array - key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) - - return type(self)(self._array.__getitem__(key)) + return type(self)(self._array.__getitem__(_check_indexer(key))) def __matmul__(self, other): return self._array.__matmul__(other) diff --git a/src/tabmat/util.py b/src/tabmat/util.py index 2dd570ec..24cfbe30 100644 --- a/src/tabmat/util.py +++ b/src/tabmat/util.py @@ -50,3 +50,51 @@ def check_matvec_dimensions(mat, vec: np.ndarray, transpose: bool) -> None: f"shapes {mat.shape} and {vec.shape} not aligned: " f"{mat.shape[match_dim]} (dim {match_dim}) != {vec.shape[0]} (dim 0)" ) + + +def _check_indexer(indexer): + """Check that the indexer is valid, and transform it to a canonical format.""" + if not isinstance(indexer, tuple): + indexer = (indexer, slice(None, None, None)) + + if len(indexer) > 2: + raise ValueError("More than two indexers are not supported.") + + row_indexer, col_indexer = indexer + + if isinstance(row_indexer, slice): + if isinstance(col_indexer, slice): + return row_indexer, col_indexer + else: + col_indexer = np.asarray(col_indexer) + if col_indexer.ndim > 1: + raise ValueError( + "Indexing would result in a matrix with more than 2 dimensions." + ) + else: + return row_indexer, col_indexer.reshape(-1) + + elif isinstance(col_indexer, slice): + row_indexer = np.asarray(row_indexer) + if row_indexer.ndim > 1: + raise ValueError( + "Indexing would result in a matrix with more than 2 dimensions." + ) + else: + return row_indexer.reshape(-1), col_indexer + + else: + row_indexer = np.asarray(row_indexer) + col_indexer = np.asarray(col_indexer) + if row_indexer.ndim <= 1 and col_indexer.ndim <= 1: + return np.ix_(row_indexer.reshape(-1), col_indexer.reshape(-1)) + elif ( + row_indexer.ndim == 2 + and row_indexer.shape[1] == 1 + and col_indexer.ndim == 2 + and col_indexer.shape[0] == 1 + ): + # support for np.ix_-ed indices + return row_indexer, col_indexer + else: + raise ValueError("This type of indexing is not supported.") diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 34f6a5bb..815c48e3 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -552,8 +552,8 @@ def test_indexing_int_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): res = mat[0, :] if not isinstance(res, np.ndarray): res = res.A - expected = mat.A[0, :] - np.testing.assert_allclose(np.squeeze(res), expected) + expected = mat.A[[0], :] + np.testing.assert_allclose(res, expected) @pytest.mark.parametrize("mat", get_matrices()) @@ -563,7 +563,58 @@ def test_indexing_range_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): if not isinstance(res, np.ndarray): res = res.A expected = mat.A[0:2, :] - np.testing.assert_allclose(np.squeeze(res), expected) + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_int_col(mat): + res = mat[:, 0] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (mat.shape[0], 1) + expected = mat.A[:, [0]] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_range_col(mat): + res = mat[:, 0:2] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (mat.shape[0], 2) + expected = mat.A[:, 0:2] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_int_both(mat): + res = mat[0, 0] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (1, 1) + expected = mat.A[0, 0] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_seq_both(mat): + res = mat[[0, 1], [0, 1]] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (2, 2) + expected = mat.A[np.ix_([0, 1], [0, 1])] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_ix_both(mat): + indexer = np.ix_([0, 1], [0, 1]) + res = mat[indexer] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (2, 2) + expected = mat.A[indexer] + np.testing.assert_array_equal(res, expected) def test_pandas_to_matrix(): From 1ba081d8d9fcc367ac54fa12d543a41d8bf2e66a Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 06:56:45 +0100 Subject: [PATCH 29/32] Pre-commit autoupdate (#284) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 150cd678..d66edb54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: - --safe - --target-version=py36 - repo: https://github.com/Quantco/pre-commit-mirrors-flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8-conda additional_dependencies: [ From 693b1058633765e467ee61975584ccbb56903e89 Mon Sep 17 00:00:00 2001 From: Jan Tilly Date: Mon, 7 Aug 2023 18:22:51 +0200 Subject: [PATCH 30/32] Use boa to build conda packages. (#285) --- .github/workflows/conda-build-win.yml | 4 ++-- .github/workflows/macos-conda-build.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/conda-build-win.yml b/.github/workflows/conda-build-win.yml index 8528298e..85d01459 100644 --- a/.github/workflows/conda-build-win.yml +++ b/.github/workflows/conda-build-win.yml @@ -30,5 +30,5 @@ jobs: - name: Build conda package shell: pwsh run: | - mamba install -n base -y conda-build - conda build -m .ci_support/${{ matrix.CONDA_BUILD_YML }}.yaml conda.recipe + mamba install -n base -y conda-build boa + conda mambabuild -m .ci_support/${{ matrix.CONDA_BUILD_YML }}.yaml conda.recipe diff --git a/.github/workflows/macos-conda-build.sh b/.github/workflows/macos-conda-build.sh index 1901f4c7..8c9880b9 100755 --- a/.github/workflows/macos-conda-build.sh +++ b/.github/workflows/macos-conda-build.sh @@ -2,7 +2,7 @@ set -exo pipefail -mamba install -y conda-build +mamba install -y conda-build boa # Don't test cross-compiled result (there is no emulation) and use the latest MacOS SDK. if grep -q "osx-arm64" .ci_support/${CONDA_BUILD_YML}.yaml; then @@ -13,4 +13,4 @@ CONDA_BUILD_SYSROOT: - "${CONDA_BUILD_SYSROOT}" EOF fi -conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe ${CONDA_BUILD_ARGS:-} +conda mambabuild -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe ${CONDA_BUILD_ARGS:-} From 78d0278755555e934d68024b3fbf4a15f65443ad Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 9 Aug 2023 15:09:39 +0200 Subject: [PATCH 31/32] Be consistent when instantiating from 1d arrays --- src/tabmat/sparse_matrix.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 8c2a3b2b..d98f180f 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -31,8 +31,14 @@ class SparseMatrix(MatrixBase): SparseMatrix is instantiated in the same way as scipy.sparse.csc_matrix. """ - def __init__(self, arg1, shape=None, dtype=None, copy=False): - self._array = sps.csc_matrix(arg1, shape, dtype, copy) + def __init__(self, input_array, shape=None, dtype=None, copy=False): + if isinstance(input_array, np.ndarray): + if input_array.ndim == 1: + input_array = input_array.reshape(-1, 1) + elif input_array.ndim > 2: + raise ValueError("Input array must be 1- or 2-dimensional") + + self._array = sps.csc_matrix(input_array, shape, dtype, copy) self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype) if self._array.indices.dtype != self.idx_dtype: From e042ce3e2ec5b5aeaaa8405b81b0b172b7ec958f Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 15 Aug 2023 09:07:53 +0200 Subject: [PATCH 32/32] Add column name metadata to `tabmat` matrices (#278) * Add column name getters * Matrix names are also combined * Add names to constructors * Add indexing support for column names * Remove unnecessary code * Better default column names * Reduce code duplication * Saner defaults * Add convenient getters and setters * Fix indexing * Smarter setter for categorical matrices * Add tests * Fix subsetting with np.newaxis * Remove the walrus :( * Fix test * Fix indexing with np.ix_ * Propagate column names where it makes sense * Fix merge mistake * Add changelog entry --- CHANGELOG.rst | 4 + src/tabmat/categorical_matrix.py | 140 ++++++++++++++++++++++- src/tabmat/constructor.py | 55 ++++++++-- src/tabmat/dense_matrix.py | 124 ++++++++++++++++++++- src/tabmat/matrix_base.py | 70 ++++++++++++ src/tabmat/sparse_matrix.py | 129 +++++++++++++++++++++- src/tabmat/split_matrix.py | 65 +++++++++++ src/tabmat/standardized_mat.py | 71 +++++++++++- tests/test_matrices.py | 183 +++++++++++++++++++++++++++++++ 9 files changed, 814 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 331695b1..9de08d2f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog Unreleased ---------- +**New features:** + +- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties. + **Other changes:** - Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables. diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 68161445..1180646c 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -161,6 +161,7 @@ def matvec(mat, vec): """ +import re from typing import List, Optional, Tuple, Union import numpy as np @@ -245,6 +246,9 @@ def __init__( cat_vec: Union[List, np.ndarray, pd.Categorical], drop_first: bool = False, dtype: np.dtype = np.float64, + column_name: Optional[str] = None, + term_name: Optional[str] = None, + column_name_format: str = "{name}[{category}]", ): if pd.isnull(cat_vec).any(): raise ValueError("Categorical data can't have missing values.") @@ -260,6 +264,13 @@ def __init__( self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) + self._colname = column_name + if term_name is None: + self._term = self._colname + else: + self._term = term_name + self._colname_format = column_name_format + __array_ufunc__ = None def recover_orig(self) -> np.ndarray: @@ -466,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix: i %= self.shape[1] # wrap-around indexing if self.drop_first: - i += 1 + i_corr = i + 1 + else: + i_corr = i - col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None]) - return SparseMatrix(col_i) + col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None]) + return SparseMatrix( + col_i, + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" @@ -492,7 +509,11 @@ def to_sparse_matrix(self): """Return a tabmat.SparseMatrix representation.""" from .sparse_matrix import SparseMatrix - return SparseMatrix(self.tocsr()) + return SparseMatrix( + self.tocsr(), + column_names=self.column_names, + term_names=self.term_names, + ) def toarray(self) -> np.ndarray: """Return array representation of matrix.""" @@ -523,7 +544,11 @@ def __getitem__(self, item): if isinstance(row, np.ndarray): row = row.ravel() return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype + self.cat[row], + drop_first=self.drop_first, + dtype=self.dtype, + column_name=self._colname, + column_name_format=self._colname_format, ) else: # return a SparseMatrix if we subset columns @@ -638,8 +663,111 @@ def multiply(self, other) -> SparseMatrix: np.arange(self.shape[0] + 1, dtype=int), ), shape=self.shape, - ) + ), + column_names=self.column_names, + term_names=self.term_names, ) def __repr__(self): return str(self.cat) + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[List[int]] = None, + ) -> List[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + List[Optional[str]] + Column names. + """ + if type == "column": + name = self._colname + elif type == "term": + name = self._term + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + + if indices is None: + indices = list(range(len(self.cat.categories) - self.drop_first)) + if name is None and missing_prefix is None: + return [None] * (len(self.cat.categories) - self.drop_first) + elif name is None: + name = f"{missing_prefix}{indices[0]}-{indices[-1]}" + + if type == "column": + return [ + self._colname_format.format(name=name, category=cat) + for cat in self.cat.categories[self.drop_first :] + ] + else: + return [name] * (len(self.cat.categories) - self.drop_first) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != 1: + if type == "column": + # Try finding the column name + base_names = [] + for name, cat in zip(names, self.cat.categories[self.drop_first :]): + partial_name = self._colname_format.format( + name="__CAPTURE__", category=cat + ) + pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)") + if name is not None: + match = re.search(pattern, name) + else: + match = None + if match is not None: + base_names.append(match.group(1)) + else: + base_names.append(name) + names = base_names + + if len(names) == self.shape[1] and all(name == names[0] for name in names): + names = [names[0]] + + if len(names) != 1: + raise ValueError("A categorical matrix has only one name") + + if type == "column": + self._colname = names[0] + elif type == "term": + self._term = names[0] + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index f8e23c31..d280140a 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,5 +1,5 @@ import warnings -from typing import List, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -21,6 +21,7 @@ def from_pandas( object_as_cat: bool = False, cat_position: str = "expand", drop_first: bool = False, + categorical_format: str = "{name}[{category}]", ) -> MatrixBase: """ Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this @@ -72,7 +73,14 @@ def from_pandas( if object_as_cat and coldata.dtype == object: coldata = coldata.astype("category") if isinstance(coldata.dtype, pd.CategoricalDtype): - cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) + cat = CategoricalMatrix( + coldata, + drop_first=drop_first, + dtype=dtype, + column_name=colname, + term_name=colname, + column_name_format=categorical_format, + ) if len(coldata.cat.categories) < cat_threshold: ( X_dense_F, @@ -82,6 +90,8 @@ def from_pandas( ) = _split_sparse_and_dense_parts( sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, + column_names=cat.get_names("column"), + term_names=cat.get_names("term"), ) matrices.append(X_dense_F) is_cat.append(True) @@ -128,13 +138,26 @@ def from_pandas( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." ) if len(dense_dfidx) > 0: - matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype))) + matrices.append( + DenseMatrix( + df.iloc[:, dense_dfidx].astype(dtype), + column_names=df.columns[dense_dfidx], + term_names=df.columns[dense_dfidx], + ) + ) indices.append(dense_mxidx) is_cat.append(False) if len(sparse_dfcols) > 0: sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)} full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo() - matrices.append(SparseMatrix(full_sparse, dtype=dtype)) + matrices.append( + SparseMatrix( + full_sparse, + dtype=dtype, + column_names=[col.name for col in sparse_dfcols], + term_names=[col.name for col in sparse_dfcols], + ) + ) indices.append(sparse_mxidx) is_cat.append(False) @@ -157,7 +180,10 @@ def from_pandas( def _split_sparse_and_dense_parts( - arg1: sps.csc_matrix, threshold: float = 0.1 + arg1: sps.csc_matrix, + threshold: float = 0.1, + column_names: Optional[Sequence[Optional[str]]] = None, + term_names: Optional[Sequence[Optional[str]]] = None, ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: """ Split matrix. @@ -176,12 +202,25 @@ def _split_sparse_and_dense_parts( dense_indices = np.where(densities > threshold)[0] sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) - X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray())) - X_sparse = SparseMatrix(arg1[:, sparse_indices]) + if column_names is None: + column_names = [None] * arg1.shape[1] + if term_names is None: + term_names = column_names + + X_dense_F = DenseMatrix( + np.asfortranarray(arg1[:, dense_indices].toarray()), + column_names=[column_names[i] for i in dense_indices], + term_names=[term_names[i] for i in dense_indices], + ) + X_sparse = SparseMatrix( + arg1[:, sparse_indices], + column_names=[column_names[i] for i in sparse_indices], + term_names=[term_names[i] for i in sparse_indices], + ) return X_dense_F, X_sparse, dense_indices, sparse_indices -def from_csc(mat: sps.csc_matrix, threshold=0.1): +def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``. diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 55c9a088..5de2c91f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -34,7 +34,7 @@ class DenseMatrix(MatrixBase): """ - def __init__(self, input_array): + def __init__(self, input_array, column_names=None, term_names=None): input_array = np.asarray(input_array) if input_array.ndim == 1: @@ -43,9 +43,32 @@ def __init__(self, input_array): raise ValueError("Input array must be 1- or 2-dimensional") self._array = np.asarray(input_array) + width = self._array.shape[1] + + if column_names is not None: + if len(column_names) != width: + raise ValueError( + f"Expected {width} column names, got {len(column_names)}" + ) + self._colnames = column_names + else: + self._colnames = [None] * width + + if term_names is not None: + if len(term_names) != width: + raise ValueError(f"Expected {width} term names, got {len(term_names)}") + self._terms = term_names + else: + self._terms = self._colnames def __getitem__(self, key): - return type(self)(self._array.__getitem__(_check_indexer(key))) + row, col = _check_indexer(key) + colnames = list(np.array(self.column_names)[col].ravel()) + terms = list(np.array(self.term_names)[col].ravel()) + + return type(self)( + self._array.__getitem__((row, col)), column_names=colnames, term_names=terms + ) __array_ufunc__ = None @@ -90,11 +113,19 @@ def transpose(self): def astype(self, dtype, order="K", casting="unsafe", copy=True): """Copy of the array, cast to a specified type.""" - return type(self)(self._array.astype(dtype, order, casting, copy)) + return type(self)( + self._array.astype(dtype, order, casting, copy), + column_names=self.column_names, + term_names=self.term_names, + ) def getcol(self, i): """Return matrix column at specified index.""" - return type(self)(self._array[:, [i]]) + return type(self)( + self._array[:, [i]], + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def toarray(self): """Return array representation of matrix.""" @@ -212,5 +243,86 @@ def multiply(self, other): This assumes that ``other`` is a vector of size ``self.shape[0]``. """ if np.asanyarray(other).ndim == 1: - return type(self)(self._array.__mul__(other[:, np.newaxis])) - return type(self)(self._array.__mul__(other)) + return type(self)( + self._array.__mul__(other[:, np.newaxis]), + column_names=self.column_names, + term_names=self.term_names, + ) + return type(self)( + self._array.__mul__(other), + column_names=self.column_names, + term_names=self.term_names, + ) + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[List[int]] = None, + ) -> List[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + List[Optional[str]] + Column names. + """ + if type == "column": + names = np.array(self._colnames) + elif type == "term": + names = np.array(self._terms) + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + + if indices is None: + indices = list(range(len(self._colnames))) + + if missing_prefix is not None: + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + + return list(names) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + if type == "column": + self._colnames = names + elif type == "term": + self._terms = names + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index 88091834..ac17d717 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -164,6 +164,76 @@ def standardize( def __getitem__(self, item): pass + @abstractmethod + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[List[int]] = None, + ) -> List[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + List[Optional[str]] + Column names. + """ + pass + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + pass + + @property + def column_names(self): + """Column names of the matrix.""" + return self.get_names(type="column") + + @column_names.setter + def column_names(self, names: List[Optional[str]]): + self.set_names(names, type="column") + + @property + def term_names(self): + """Term names of the matrix. + + For differences between column names and term names, see ``get_names``. + """ + return self.get_names(type="term") + + @term_names.setter + def term_names(self, names: List[Optional[str]]): + self.set_names(names, type="term") + # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the # behavior of this class __array_priority__ = 11 diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index d98f180f..1c568757 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -31,7 +31,15 @@ class SparseMatrix(MatrixBase): SparseMatrix is instantiated in the same way as scipy.sparse.csc_matrix. """ - def __init__(self, input_array, shape=None, dtype=None, copy=False): + def __init__( + self, + input_array, + shape=None, + dtype=None, + copy=False, + column_names=None, + term_names=None, + ): if isinstance(input_array, np.ndarray): if input_array.ndim == 1: input_array = input_array.reshape(-1, 1) @@ -51,8 +59,32 @@ def __init__(self, input_array, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None + if column_names is not None: + if len(column_names) != self.shape[1]: + raise ValueError( + f"Expected {self.shape[1]} column names, got {len(column_names)}" + ) + self._colnames = column_names + else: + self._colnames = [None] * self.shape[1] + + if term_names is not None: + if len(term_names) != self.shape[1]: + raise ValueError( + f"Expected {self.shape[1]} term names, got {len(term_names)}" + ) + self._terms = term_names + else: + self._terms = self._colnames + def __getitem__(self, key): - return type(self)(self._array.__getitem__(_check_indexer(key))) + row, col = _check_indexer(key) + colnames = list(np.array(self.column_names)[col].ravel()) + terms = list(np.array(self.term_names)[col].ravel()) + + return type(self)( + self._array.__getitem__((row, col)), column_names=colnames, term_names=terms + ) def __matmul__(self, other): return self._array.__matmul__(other) @@ -121,7 +153,11 @@ def transpose(self): def getcol(self, i): """Return matrix column at specified index.""" - return type(self)(self._array.getcol(i)) + return type(self)( + self._array.getcol(i), + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def unpack(self): """Return the underlying scipy.sparse.csc_matrix.""" @@ -285,6 +321,87 @@ def multiply(self, other): from the parent class except that ``other`` is assumed to be a vector of size ``self.shape[0]``. """ - if other.ndim == 1: - return type(self)(self._array.multiply(other[:, np.newaxis])) - return type(self)(self._array.multiply(other)) + if np.asanyarray(other).ndim == 1: + return type(self)( + self._array.multiply(other[:, np.newaxis]), + column_names=self.column_names, + term_names=self.term_names, + ) + return type(self)( + self._array.multiply(other), + column_names=self.column_names, + term_names=self.term_names, + ) + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[List[int]] = None, + ) -> List[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + List[Optional[str]] + Column names. + """ + if type == "column": + names = np.array(self._colnames) + elif type == "term": + names = np.array(self._terms) + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + + if indices is None: + indices = list(range(len(self._colnames))) + + if missing_prefix is not None: + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + + return list(names) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + if type == "column": + self._colnames = names + elif type == "term": + self._terms = names + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index a091949f..a60b1fb3 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -113,8 +113,16 @@ def _combine_matrices(matrices, indices): if len(this_type_matrices) > 1: new_matrix = mat_type_(stack_fn([matrices[i] for i in this_type_matrices])) new_indices = np.concatenate([indices[i] for i in this_type_matrices]) + new_colnames = np.concatenate( + [np.array(matrices[i]._colnames) for i in this_type_matrices] + ) + new_terms = np.concatenate( + [np.array(matrices[i]._terms) for i in this_type_matrices] + ) sorter = np.argsort(new_indices) sorted_matrix = new_matrix[:, sorter] + sorted_matrix._colnames = list(new_colnames[sorter]) + sorted_matrix._terms = list(new_terms[sorter]) sorted_indices = new_indices[sorter] assert sorted_matrix.shape[0] == n_row @@ -477,3 +485,60 @@ def __repr__(self): return out __array_priority__ = 13 + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[List[int]] = None, + ) -> List[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + List[Optional[str]] + Column names. + """ + names = np.empty(self.shape[1], dtype=object) + for idx, mat in zip(self.indices, self.matrices): + names[idx] = mat.get_names(type, missing_prefix, idx) + return list(names) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + names_array = np.array(names) + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + for idx, mat in zip(self.indices, self.matrices): + mat.set_names(list(names_array[idx]), type) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index 19b04f5a..2e88dbb0 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Optional, Union import numpy as np from scipy import sparse as sps @@ -298,3 +298,72 @@ def __repr__(self): Mult: {self.mult} """ return out + + def get_names( + self, + type: str = "column", + missing_prefix: Optional[str] = None, + indices: Optional[List[int]] = None, + ) -> List[Optional[str]]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. + + Returns + ------- + List[Optional[str]] + Column names. + """ + return self.mat.get_names(type, missing_prefix, indices) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + self.mat.set_names(names, type) + + @property + def column_names(self): + """Column names of the matrix.""" + return self.get_names(type="column") + + @column_names.setter + def column_names(self, names: List[Optional[str]]): + self.set_names(names, type="column") + + @property + def term_names(self): + """Term names of the matrix. + + For differences between column names and term names, see ``get_names``. + """ + return self.get_names(type="term") + + @term_names.setter + def term_names(self, names: List[Optional[str]]): + self.set_names(names, type="term") diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 815c48e3..85b2cd68 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -712,3 +712,186 @@ def test_hstack(mat_1, mat_2): stacked.A, np.hstack([mat.A if not isinstance(mat, np.ndarray) else mat for mat in mats]), ) + + +def test_names_against_expectation(): + X = tm.DenseMatrix( + np.ones((5, 2)), column_names=["a", None], term_names=["a", None] + ) + Xc = tm.CategoricalMatrix( + pd.Categorical(["a", "b", "c", "b", "a"]), column_name="c", term_name="c" + ) + Xc2 = tm.CategoricalMatrix(pd.Categorical(["a", "b", "c", "b", "a"])) + Xs = tm.SparseMatrix( + sps.csc_matrix(np.ones((5, 2))), + column_names=["s1", "s2"], + term_names=["s", "s"], + ) + + mat = tm.SplitMatrix(matrices=[X, Xc, Xc2, Xs]) + + assert mat.get_names(type="column") == [ + "a", + None, + "c[a]", + "c[b]", + "c[c]", + None, + None, + None, + "s1", + "s2", + ] + + assert mat.get_names(type="term") == [ + "a", + None, + "c", + "c", + "c", + None, + None, + None, + "s", + "s", + ] + + assert mat.get_names(type="column", missing_prefix="_col_") == [ + "a", + "_col_1", + "c[a]", + "c[b]", + "c[c]", + "_col_5-7[a]", + "_col_5-7[b]", + "_col_5-7[c]", + "s1", + "s2", + ] + + assert mat.get_names(type="term", missing_prefix="_col_") == [ + "a", + "_col_1", + "c", + "c", + "c", + "_col_5-7", + "_col_5-7", + "_col_5-7", + "s", + "s", + ] + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("missing_prefix", ["_col_", "X"]) +def test_names_getter_setter(mat, missing_prefix): + names = mat.get_names(missing_prefix=missing_prefix, type="column") + mat.column_names = names + assert mat.column_names == names + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("missing_prefix", ["_col_", "X"]) +def test_terms_getter_setter(mat, missing_prefix): + names = mat.get_names(missing_prefix=missing_prefix, type="term") + mat.term_names = names + assert mat.term_names == names + + +@pytest.mark.parametrize("indexer_1", [slice(None, None), 0, slice(2, 8)]) +@pytest.mark.parametrize("indexer_2", [[0], slice(1, 4), [0, 2, 3], [4, 3, 2, 1, 0]]) +@pytest.mark.parametrize("sparse", [True, False]) +def test_names_indexing(indexer_1, indexer_2, sparse): + X = np.ones((10, 5), dtype=np.float64) + colnames = ["a", "b", None, "d", "e"] + termnames = ["t1", "t1", None, "t4", "t5"] + + colnames_array = np.array(colnames) + termnames_array = np.array(termnames) + + if sparse: + X = tm.SparseMatrix( + sps.csc_matrix(X), column_names=colnames, term_names=termnames + ) + else: + X = tm.DenseMatrix(X, column_names=colnames, term_names=termnames) + + X_indexed = X[indexer_1, indexer_2] + if not isinstance(X_indexed, tm.MatrixBase): + pytest.skip("Does not return MatrixBase") + assert X_indexed.column_names == list(colnames_array[indexer_2]) + assert X_indexed.term_names == list(termnames_array[indexer_2]) + + +@pytest.mark.parametrize("mat_1", get_all_matrix_base_subclass_mats()) +@pytest.mark.parametrize("mat_2", get_all_matrix_base_subclass_mats()) +def test_combine_names(mat_1, mat_2): + mat_1.column_names = mat_1.get_names(missing_prefix="m1_", type="column") + mat_2.column_names = mat_2.get_names(missing_prefix="m2_", type="column") + + mat_1.term_names = mat_1.get_names(missing_prefix="m1_", type="term") + mat_2.term_names = mat_2.get_names(missing_prefix="m2_", type="term") + + combined = tm.SplitMatrix(matrices=[mat_1, mat_2]) + + assert combined.column_names == mat_1.column_names + mat_2.column_names + assert combined.term_names == mat_1.term_names + mat_2.term_names + + +@pytest.mark.parametrize("prefix_sep", ["_", ": "]) +@pytest.mark.parametrize("drop_first", [True, False]) +def test_names_pandas(prefix_sep, drop_first): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows) + + dense_ser = pd.Series(dense_column) + lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) + sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) + cat_ser_lowdim = pd.Categorical(cat_column_lowdim) + cat_ser_highdim = pd.Categorical(cat_column_highdim) + + df = pd.DataFrame( + data={ + "d": dense_ser, + "cl_obj": cat_ser_lowdim.astype(object), + "ch": cat_ser_highdim, + "ds": lowdense_ser, + "s": sparse_ser, + } + ) + + categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="end", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first) + assert mat_end.column_names == expanded_df.columns.tolist() + + mat_expand = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="expand", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + unique_terms = list(dict.fromkeys(mat_expand.term_names)) + assert unique_terms == df.columns.tolist()