Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expand_bits function #41

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
max-line-length = 120
exclude =
.pyi
typings
typings
ignore = E203, W503
119 changes: 119 additions & 0 deletions stackstac/bits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from typing import Optional

import xarray as xr
import dask.array as da
import numpy as np


def _unpack_block(arr: np.ndarray, axis: int) -> np.ndarray:
arr = np.moveaxis(arr, axis, -1)
# ^ When we do `.view("uint8")`, if the array is >8-bit, NumPy will {double, quadruple, etc.}
# the length of the last dimension to account for the extra elements in the 8-bit view.
# We want the expanded dimension to be our new `bits` dimension, not some other one.
u8_lil = (
arr.astype(arr.dtype.newbyteorder("L"), copy=False)
# ^ ensure it's little-endian
.view("uint8")
)
bits = np.unpackbits(u8_lil, axis=-1, bitorder="little")
# ^ `bitorder="little"` so element 0 -> bit 0
return np.moveaxis(bits, -1, axis)


def expand_bits(
arr: xr.DataArray, astype: Optional[np.dtype] = None, keep_mask=True, bit_axis=0
) -> xr.DataArray:
"""
Expand a DataArray of bitpacked values, giving each bit its own entry in a new ``"bits"`` dimension.

For ``dtype="uint16"``, the result would have a new dimension ``"bit"`` with 16 elements.
The Nth item in ``"bit"`` would correspond to the Nth bit in the original values
(after they'd been converted to uint16), starting from the least-significant bit.

This ordering is consistent with most data sheets from imagery providers;
if it says "Bit 3" on a data sheet, that's probaby ``unpack_bits(arr, ...).sel(bit=3)``.

Parameters
---------
arr:
Array whose values are all bitpacked
astype:
dtype of the original data source (typically ``uint8`` or ``uint18``).
``arr`` is first converted to this type. For example, if ``arr`` is a float64
array loaded from Landsat-8 data where the original GeoTIFFs are in uint16,
you'd pass ``"uint16"`` here.
keep_mask:
If True (default), the ``arr.isnull()`` mask is re-applied to the result.
Note that this will result in a ``float64`` array, where the possible values are 0, 1, or NaN.

If False, any NaN values will be filled with 0s before unpacking, and the result will be a ``uint8`` array
of values 0 or 1.

If ``arr`` has an integer dtype, ``keep_mask`` is ignored and always considered False
(since ``arr`` cannot contain NaNs).
bit_axis:
Insert the new ``"bit"`` dimension at this axis (default 0).

Returns
-------
xarray.DataArray:
DataArray with a ``"bit"`` dimension added, which is equal in length to the number of bytes in ``astype``.

Example
-------
>>> import xarray as xr
>>> import stackstac
>>> arr = xr.DataArray([0b10000011_11111110, 0b11100011_00000001], dims="x")
>>> arr
<xarray.DataArray (x: 2)>
array([33790, 58113])
Dimensions without coordinates: x

>>> expand_bits(arr, astype="uint16", bit_axis=-1)
<xarray.DataArray (x: 2, bit: 16)>
array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1]], dtype=uint8)
Dimensions without coordinates: x, bit

Notice how the bits are printed in opposite order of how we wrote them above.
That's because binary literals in Python (``0b10000011_11111110``) are right-to-left,
with the least-significant bit---bit 0---furthest right.
`expand_bits` puts bit 0 as item 0 in the ``bit`` dimension, which is then printed in
left-to-right order.
"""
if astype is not None:
astype = np.dtype(astype)
if arr.dtype.kind in ("i", "u"):
keep_mask = False
prep = arr
if astype is None:
astype = arr.dtype
else:
if astype is None:
raise ValueError(
f"Refusing to interpret {arr.dtype} data as bitpacked "
"(bitpacked data almost always should have an integer dtype). "
"Pass `astype=` to specify the dtype to convert this data to first "
"(typically 'uint8' or 'uint16')."
)
prep = arr.fillna(0)

prep = prep.astype(astype, copy=False).expand_dims("bit", bit_axis)

# TODO use xr.apply_ufunc? Or just add `unpackbits` to dask
if isinstance(prep.data, da.Array):
bits_arr = prep.data.map_blocks(
_unpack_block,
chunks=prep.chunks[:bit_axis]
+ (astype.itemsize * 8,)
+ prep.chunks[bit_axis + 1 :],
dtype="uint8",
axis=bit_axis,
)
else:
bits_arr = _unpack_block(prep.data, axis=bit_axis)

bits_arr_xr = xr.DataArray(bits_arr, coords=prep.coords)
if keep_mask:
bits_arr_xr = bits_arr_xr.where(~arr.isnull())
return bits_arr_xr