From 7125834bbf67e63ac1a436d7357221ea4f6d463f Mon Sep 17 00:00:00 2001
From: Carl Kadie
Date: Sun, 24 Dec 2023 16:40:32 -0800
Subject: [PATCH] Use read_cloud_i8 for read_i8; all tests pass.

---
 bed_reader/__init__.py                  |    4 +-
 bed_reader/_open_bed.py                 |    8 +-
 bed_reader/_open_bed_cloud.py           | 2920 +++++++++++------------
 bed_reader/tests/test_open_bed_cloud.py | 1816 +++++++-------
 src/bed_cloud.rs                        |    2 +-
 src/python_module.rs                    |  149 +-
 tests/tests_api_cloud.rs                |   18 +-
 useful.bat                              |    1 +
 8 files changed, 2488 insertions(+), 2430 deletions(-)

diff --git a/bed_reader/__init__.py b/bed_reader/__init__.py
index 3ef7abf..b588b3f 100644
--- a/bed_reader/__init__.py
+++ b/bed_reader/__init__.py
@@ -8,7 +8,7 @@
 from bed_reader._open_bed import get_num_threads, open_bed
 from bed_reader._sample_data import sample_file, tmp_path
 from bed_reader._to_bed import to_bed
-from bed_reader._open_bed_cloud import open_bed_cloud
+# cmk from bed_reader._open_bed_cloud import open_bed_cloud

 from .bed_reader import (
     file_aat_piece_f32_orderf,
@@ -20,7 +20,7 @@
     read_f32,
     read_f64,
     read_i8,
-    # cmk read_cloud_i8,
+    read_cloud_i8,
     standardize_f32,
     standardize_f64,
     subset_f32_f32,
diff --git a/bed_reader/_open_bed.py b/bed_reader/_open_bed.py
index e29a525..5134f79 100644
--- a/bed_reader/_open_bed.py
+++ b/bed_reader/_open_bed.py
@@ -13,7 +13,7 @@
 except ImportError:
     sparse = None

-from .bed_reader import read_f32, read_f64, read_i8  # type: ignore
+from .bed_reader import read_f32, read_f64, read_i8, read_cloud_i8  # type: ignore


 # https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
@@ -390,7 +390,8 @@ def read(

         if self.iid_count > 0 and self.sid_count > 0:
             if dtype == np.int8:
-                reader = read_i8
+                # cmk000
+                reader = read_cloud_i8
             elif dtype == np.float64:
                 reader = read_f64
             elif dtype == np.float32:
@@ -402,7 +403,8 @@ def read(
             )

             reader(
-                str(self.filepath),
+                # cmk000
+                str(self.filepath.as_posix()),
                 iid_count=self.iid_count,
                 sid_count=self.sid_count,
                 is_a1_counted=self.count_A1,
diff --git a/bed_reader/_open_bed_cloud.py b/bed_reader/_open_bed_cloud.py
index d56b396..62110a9 100644
--- a/bed_reader/_open_bed_cloud.py
+++ b/bed_reader/_open_bed_cloud.py
@@ -1,1582 +1,1582 @@
-import logging
-import multiprocessing
-import os
-from dataclasses import dataclass
-from itertools import repeat, takewhile
-from pathlib import Path
-from typing import Any, List, Mapping, Optional, Union
-
-import numpy as np
-
-try:
-    from scipy import sparse
-except ImportError:
-    sparse = None
-
-from .bed_reader import read_f32, read_f64, read_i8  # type: ignore
-
-
-# https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
-def _rawincount(filepath):
-    with open(filepath, "rb") as f:
-        bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024) for _ in repeat(None)))
-        return sum(buf.count(b"\n") for buf in bufgen)
-
-
-@dataclass
-class _MetaMeta:
-    suffix: str
-    column: int
-    dtype: type
-    missing_value: object
-    fill_sequence: object
-
-
-def _all_same(key, length, missing, dtype):
-    if np.issubdtype(dtype, np.str_):
-        dtype = f"`_
-    and `.bim specification `_
-    lists the dtypes and missing values for each property.
-
-    count_A1: bool, optional
-        True (default) to count the number of A1 alleles (the PLINK standard).
-        False to count the number of A2 alleles.
-    num_threads: None or int, optional
-        The number of threads with which to read data. Defaults to all available
-        processors.
- Can also be set with these environment variables (listed in priority order): - 'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'. - skip_format_check: bool, optional - False (default) to immediately check for expected starting bytes in - the .bed file. True to delay the check until (and if) data is read. - fam_filepath: pathlib.Path or str, optional - Path to the file containing information about each individual (sample). - Defaults to replacing the .bed file’s suffix with .fam. - bim_filepath: pathlib.Path or str, optional - Path to the file containing information about each SNP (variant). - Defaults to replacing the .bed file’s suffix with .bim. - - Returns - ------- - open_bed_cloud - an open_bed_cloud object - - Examples - -------- - - List individual (sample) :attr:`iid` and SNP (variant) :attr:`sid`, then :meth:`read` - the whole file. - - .. doctest:: - - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> bed = open_bed_cloud(file_name) - >>> print(bed.iid) - ['iid1' 'iid2' 'iid3'] - >>> print(bed.sid) - ['sid1' 'sid2' 'sid3' 'sid4'] - >>> print(bed.read()) - [[ 1. 0. nan 0.] - [ 2. 0. nan 2.] - [ 0. 1. 2. 0.]] - >>> del bed # optional: delete bed object - - Open the file and read data for one SNP (variant) - at index position 2. - - .. doctest:: - - >>> import numpy as np - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.read(np.s_[:,2])) - [[nan] - [nan] - [ 2.]] - - Replace :attr:`iid`. - - - >>> bed = open_bed_cloud(file_name, properties={"iid":["sample1","sample2","sample3"]}) - >>> print(bed.iid) # replaced - ['sample1' 'sample2' 'sample3'] - >>> print(bed.sid) # same as before - ['sid1' 'sid2' 'sid3' 'sid4'] - - Give the number of individuals (samples) and SNPs (variants) so that the .fam and - .bim files need never be opened. - - >>> with open_bed_cloud(file_name, iid_count=3, sid_count=4) as bed: - ... print(bed.read()) - [[ 1. 0. nan 0.] - [ 2. 0. nan 2.] - [ 0. 1. 2. 0.]] - - Mark some properties as "don’t read or offer". - - >>> bed = open_bed_cloud(file_name, properties={ - ... "father" : None, "mother" : None, "sex" : None, "pheno" : None, - ... "allele_1" : None, "allele_2":None }) - >>> print(bed.iid) # read from file - ['iid1' 'iid2' 'iid3'] - >>> print(bed.allele_2) # not read and not offered - None - - See the :meth:`read` for details of reading batches via slicing and fancy indexing. 
- - """ - - def __init__( - self, - filepath: Union[str, Path], - iid_count: Optional[int] = None, - sid_count: Optional[int] = None, - properties: Mapping[str, List[Any]] = {}, - count_A1: bool = True, - num_threads: Optional[int] = None, - fam_filepath: Union[str, Path] = None, - bim_filepath: Union[str, Path] = None, - ): - skip_format_check = True - - self.filepath = Path(filepath) - self.count_A1 = count_A1 - self._num_threads = num_threads - self.skip_format_check = skip_format_check - self._fam_filepath = ( - Path(fam_filepath) - if fam_filepath is not None - else self.filepath.parent / (self.filepath.stem + ".fam") - ) - self._bim_filepath = ( - Path(bim_filepath) - if bim_filepath is not None - else self.filepath.parent / (self.filepath.stem + ".bim") - ) - - self.properties_dict, self._counts = open_bed_cloud._fix_up_properties( - properties, iid_count, sid_count, use_fill_sequence=False - ) - self._iid_range = None - self._sid_range = None - - @classmethod - async def create( - cls, - filepath: Union[str, Path], - iid_count: Optional[int] = None, - sid_count: Optional[int] = None, - properties: Mapping[str, List[Any]] = {}, - count_A1: bool = True, - skip_format_check: bool = False, - num_threads: Optional[int] = None, - fam_filepath: Union[str, Path] = None, - bim_filepath: Union[str, Path] = None, - ): - print(type(filepath), filepath) - self = open_bed_cloud( - filepath, - iid_count=iid_count, - sid_count=sid_count, - properties=properties, - count_A1=count_A1, - num_threads=num_threads, - fam_filepath=fam_filepath, - bim_filepath=bim_filepath, - ) - self.skip_format_check = skip_format_check - if not self.skip_format_check: - with open(self.filepath, "rb") as filepointer: - self._check_file(filepointer) - return self - - async def __aenter__(self): - # Setup or additional operations when entering the context - return self - - async def __aexit__(self, exc_type, exc, tb): - # Cleanup or close operations when exiting the context - pass - - async def read( - self, - index: Optional[Any] = None, - dtype: Optional[Union[type, str]] = "float32", - order: Optional[str] = "F", - force_python_only: Optional[bool] = False, - num_threads=None, - ) -> np.ndarray: - """ - Read genotype information. - - Parameters - ---------- - index: - An optional expression specifying the individuals (samples) and SNPs - (variants) to read. (See examples, below). - Defaults to ``None``, meaning read all. - - (If index is a tuple, the first component indexes the individuals and the - second indexes - the SNPs. If it is not a tuple and not None, it indexes SNPs.) - - dtype: {'float32' (default), 'float64', 'int8'}, optional - The desired data-type for the returned array. - order : {'F','C'}, optional - The desired memory layout for the returned array. - Defaults to ``F`` (Fortran order, which is SNP-major). - force_python_only: bool, optional - If False (default), uses the faster Rust code; otherwise it uses the slower - pure Python code. - - num_threads: None or int, optional - The number of threads with which to read data. Defaults to all available - processors. - Can also be set with :class:`open_bed_cloud` or these - environment variables (listed in priority order): - 'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'. - - - Returns - ------- - numpy.ndarray - 2-D array containing values of 0, 1, 2, or missing - - - Rows represent individuals (samples). Columns represent SNPs (variants). - - For ``dtype`` 'float32' and 'float64', NaN indicates missing values. 
- For 'int8', -127 indicates missing values. - - Examples - -------- - - To read all data in a .bed file, set ``index`` to ``None``. This is the default. - - .. doctest:: - - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.read()) - [[ 1. 0. nan 0.] - [ 2. 0. nan 2.] - [ 0. 1. 2. 0.]] - - To read selected individuals (samples) and/or SNPs (variants), set each part of - a :data:`numpy.s_` to an `int`, a list of `int`, a slice expression, or - a list of `bool`. - Negative integers count from the end of the list. - - - .. doctest:: - - >>> import numpy as np - >>> bed = open_bed_cloud(file_name) - >>> print(bed.read(np.s_[:,2])) # read the SNPs indexed by 2. - [[nan] - [nan] - [ 2.]] - >>> print(bed.read(np.s_[:,[2,3,0]])) # read the SNPs indexed by 2, 3, and 0 - [[nan 0. 1.] - [nan 2. 2.] - [ 2. 0. 0.]] - >>> # read SNPs from 1 (inclusive) to 4 (exclusive) - >>> print(bed.read(np.s_[:,1:4])) - [[ 0. nan 0.] - [ 0. nan 2.] - [ 1. 2. 0.]] - >>> print(np.unique(bed.chromosome)) # print unique chrom values - ['1' '5' 'Y'] - >>> print(bed.read(np.s_[:,bed.chromosome=='5'])) # read all SNPs in chrom 5 - [[nan] - [nan] - [ 2.]] - >>> print(bed.read(np.s_[0,:])) # Read 1st individual (across all SNPs) - [[ 1. 0. nan 0.]] - >>> print(bed.read(np.s_[::2,:])) # Read every 2nd individual - [[ 1. 0. nan 0.] - [ 0. 1. 2. 0.]] - >>> #read last and 2nd-to-last individuals and the last SNPs - >>> print(bed.read(np.s_[[-1,-2],-1])) - [[0.] - [2.]] - - - You can give a dtype for the output. - - .. doctest:: - - >>> print(bed.read(dtype='int8')) - [[ 1 0 -127 0] - [ 2 0 -127 2] - [ 0 1 2 0]] - >>> del bed # optional: delete bed object - - """ - - iid_index_or_slice_etc, sid_index_or_slice_etc = self._split_index(index) - - dtype = np.dtype(dtype) - if order not in {"F", "C"}: - raise ValueError(f"order '{order}' not known, only 'F', 'C'") - - # Later happy with _iid_range and _sid_range or could it be done with - # allocation them? - if self._iid_range is None: - self._iid_range = np.arange(await self.iid_count(), dtype="intp") - if self._sid_range is None: - self._sid_range = np.arange(await self.sid_count(), dtype="intp") - - iid_index = np.ascontiguousarray( - self._iid_range[iid_index_or_slice_etc], - dtype="intp", - ) - sid_index = np.ascontiguousarray( - self._sid_range[sid_index_or_slice_etc], dtype="intp" - ) - - if not force_python_only: - num_threads = get_num_threads( - self._num_threads if num_threads is None else num_threads - ) - - val = np.zeros((len(iid_index), len(sid_index)), order=order, dtype=dtype) - - if await self.iid_count() > 0 and await self.sid_count() > 0: - if dtype == np.int8: - reader = read_i8 - elif dtype == np.float64: - reader = read_f64 - elif dtype == np.float32: - reader = read_f32 - else: - raise ValueError( - f"dtype '{val.dtype}' not known, only " - + "'int8', 'float32', and 'float64' are allowed." - ) - - reader( - str(self.filepath), - iid_count=await self.iid_count(), - sid_count=await self.sid_count(), - is_a1_counted=self.count_A1, - iid_index=iid_index, - sid_index=sid_index, - val=val, - num_threads=num_threads, - ) - - else: - if not self.count_A1: - byteZero = 0 - byteThree = 2 - else: - byteZero = 2 - byteThree = 0 - if dtype == np.int8: - missing = -127 - else: - missing = np.nan - # An earlier version of this code had a way to read consecutive SNPs of code - # in one read. May want - # to add that ability back to the code. 
- # Also, note that reading with python will often result in - # non-contiguous memory - # logging.warn("using pure python plink parser (might be much slower!!)") - val = np.zeros( - ((int(np.ceil(0.25 * self.iid_count)) * 4), len(sid_index)), - order=order, - dtype=dtype, - ) # allocate it a little big - - nbyte = int(np.ceil(0.25 * self.iid_count)) - with open(self.filepath, "rb") as filepointer: - for SNPsIndex, bimIndex in enumerate(sid_index): - startbit = int(np.ceil(0.25 * self.iid_count) * bimIndex + 3) - filepointer.seek(startbit) - bytes = np.array(bytearray(filepointer.read(nbyte))).reshape( - (int(np.ceil(0.25 * self.iid_count)), 1), order="F" - ) - - val[3::4, SNPsIndex : SNPsIndex + 1] = byteZero - val[3::4, SNPsIndex : SNPsIndex + 1][bytes >= 64] = missing - val[3::4, SNPsIndex : SNPsIndex + 1][bytes >= 128] = 1 - val[3::4, SNPsIndex : SNPsIndex + 1][bytes >= 192] = byteThree - bytes = np.mod(bytes, 64) - val[2::4, SNPsIndex : SNPsIndex + 1] = byteZero - val[2::4, SNPsIndex : SNPsIndex + 1][bytes >= 16] = missing - val[2::4, SNPsIndex : SNPsIndex + 1][bytes >= 32] = 1 - val[2::4, SNPsIndex : SNPsIndex + 1][bytes >= 48] = byteThree - bytes = np.mod(bytes, 16) - val[1::4, SNPsIndex : SNPsIndex + 1] = byteZero - val[1::4, SNPsIndex : SNPsIndex + 1][bytes >= 4] = missing - val[1::4, SNPsIndex : SNPsIndex + 1][bytes >= 8] = 1 - val[1::4, SNPsIndex : SNPsIndex + 1][bytes >= 12] = byteThree - bytes = np.mod(bytes, 4) - val[0::4, SNPsIndex : SNPsIndex + 1] = byteZero - val[0::4, SNPsIndex : SNPsIndex + 1][bytes >= 1] = missing - val[0::4, SNPsIndex : SNPsIndex + 1][bytes >= 2] = 1 - val[0::4, SNPsIndex : SNPsIndex + 1][bytes >= 3] = byteThree - val = val[iid_index, :] # reorder or trim any extra allocation - assert val.dtype == np.dtype(dtype) # real assert - if not open_bed_cloud._array_properties_are_ok(val, order): - val = val.copy(order=order) - - return val - - def __str__(self) -> str: - return f"{self.__class__.__name__}('{self.filepath}',...)" - - async def fid(self) -> np.ndarray: - """ - Family id of each individual (sample). - - Returns - ------- - numpy.ndarray - array of str - - - '0' represents a missing value. - - If needed, will cause a one-time read of the .fam file. - - Example - ------- - - .. doctest:: - - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.fid) - ['fid1' 'fid1' 'fid2'] - - """ - - return self.property_item("fid") - - # @property - async def iid(self) -> np.ndarray: - """ - Individual id of each individual (sample). - - Returns - ------- - numpy.ndarray - array of str - - - If needed, will cause a one-time read of the .fam file. - - Example - ------- - - .. doctest:: - - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... 
print(bed.iid) - ['iid1' 'iid2' 'iid3'] +# import logging +# import multiprocessing +# import os +# from dataclasses import dataclass +# from itertools import repeat, takewhile +# from pathlib import Path +# from typing import Any, List, Mapping, Optional, Union + +# import numpy as np + +# try: +# from scipy import sparse +# except ImportError: +# sparse = None + +# from .bed_reader import read_f32, read_f64, read_i8 # type: ignore + + +# # https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python +# def _rawincount(filepath): +# with open(filepath, "rb") as f: +# bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024) for _ in repeat(None))) +# return sum(buf.count(b"\n") for buf in bufgen) + + +# @dataclass +# class _MetaMeta: +# suffix: str +# column: int +# dtype: type +# missing_value: object +# fill_sequence: object + + +# def _all_same(key, length, missing, dtype): +# if np.issubdtype(dtype, np.str_): +# dtype = f"`_ +# and `.bim specification `_ +# lists the dtypes and missing values for each property. + +# count_A1: bool, optional +# True (default) to count the number of A1 alleles (the PLINK standard). +# False to count the number of A2 alleles. +# num_threads: None or int, optional +# The number of threads with which to read data. Defaults to all available +# processors. +# Can also be set with these environment variables (listed in priority order): +# 'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'. +# skip_format_check: bool, optional +# False (default) to immediately check for expected starting bytes in +# the .bed file. True to delay the check until (and if) data is read. +# fam_filepath: pathlib.Path or str, optional +# Path to the file containing information about each individual (sample). +# Defaults to replacing the .bed file’s suffix with .fam. +# bim_filepath: pathlib.Path or str, optional +# Path to the file containing information about each SNP (variant). +# Defaults to replacing the .bed file’s suffix with .bim. + +# Returns +# ------- +# open_bed_cloud +# an open_bed_cloud object + +# Examples +# -------- + +# List individual (sample) :attr:`iid` and SNP (variant) :attr:`sid`, then :meth:`read` +# the whole file. + +# .. doctest:: + +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> bed = open_bed_cloud(file_name) +# >>> print(bed.iid) +# ['iid1' 'iid2' 'iid3'] +# >>> print(bed.sid) +# ['sid1' 'sid2' 'sid3' 'sid4'] +# >>> print(bed.read()) +# [[ 1. 0. nan 0.] +# [ 2. 0. nan 2.] +# [ 0. 1. 2. 0.]] +# >>> del bed # optional: delete bed object + +# Open the file and read data for one SNP (variant) +# at index position 2. + +# .. doctest:: + +# >>> import numpy as np +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.read(np.s_[:,2])) +# [[nan] +# [nan] +# [ 2.]] + +# Replace :attr:`iid`. + + +# >>> bed = open_bed_cloud(file_name, properties={"iid":["sample1","sample2","sample3"]}) +# >>> print(bed.iid) # replaced +# ['sample1' 'sample2' 'sample3'] +# >>> print(bed.sid) # same as before +# ['sid1' 'sid2' 'sid3' 'sid4'] + +# Give the number of individuals (samples) and SNPs (variants) so that the .fam and +# .bim files need never be opened. + +# >>> with open_bed_cloud(file_name, iid_count=3, sid_count=4) as bed: +# ... print(bed.read()) +# [[ 1. 0. nan 0.] +# [ 2. 0. nan 2.] +# [ 0. 1. 2. 0.]] + +# Mark some properties as "don’t read or offer". + +# >>> bed = open_bed_cloud(file_name, properties={ +# ... 
"father" : None, "mother" : None, "sex" : None, "pheno" : None, +# ... "allele_1" : None, "allele_2":None }) +# >>> print(bed.iid) # read from file +# ['iid1' 'iid2' 'iid3'] +# >>> print(bed.allele_2) # not read and not offered +# None + +# See the :meth:`read` for details of reading batches via slicing and fancy indexing. + +# """ + +# def __init__( +# self, +# filepath: Union[str, Path], +# iid_count: Optional[int] = None, +# sid_count: Optional[int] = None, +# properties: Mapping[str, List[Any]] = {}, +# count_A1: bool = True, +# num_threads: Optional[int] = None, +# fam_filepath: Union[str, Path] = None, +# bim_filepath: Union[str, Path] = None, +# ): +# skip_format_check = True + +# self.filepath = Path(filepath) +# self.count_A1 = count_A1 +# self._num_threads = num_threads +# self.skip_format_check = skip_format_check +# self._fam_filepath = ( +# Path(fam_filepath) +# if fam_filepath is not None +# else self.filepath.parent / (self.filepath.stem + ".fam") +# ) +# self._bim_filepath = ( +# Path(bim_filepath) +# if bim_filepath is not None +# else self.filepath.parent / (self.filepath.stem + ".bim") +# ) + +# self.properties_dict, self._counts = open_bed_cloud._fix_up_properties( +# properties, iid_count, sid_count, use_fill_sequence=False +# ) +# self._iid_range = None +# self._sid_range = None + +# @classmethod +# async def create( +# cls, +# filepath: Union[str, Path], +# iid_count: Optional[int] = None, +# sid_count: Optional[int] = None, +# properties: Mapping[str, List[Any]] = {}, +# count_A1: bool = True, +# skip_format_check: bool = False, +# num_threads: Optional[int] = None, +# fam_filepath: Union[str, Path] = None, +# bim_filepath: Union[str, Path] = None, +# ): +# print(type(filepath), filepath) +# self = open_bed_cloud( +# filepath, +# iid_count=iid_count, +# sid_count=sid_count, +# properties=properties, +# count_A1=count_A1, +# num_threads=num_threads, +# fam_filepath=fam_filepath, +# bim_filepath=bim_filepath, +# ) +# self.skip_format_check = skip_format_check +# if not self.skip_format_check: +# with open(self.filepath, "rb") as filepointer: +# self._check_file(filepointer) +# return self + +# async def __aenter__(self): +# # Setup or additional operations when entering the context +# return self + +# async def __aexit__(self, exc_type, exc, tb): +# # Cleanup or close operations when exiting the context +# pass + +# async def read( +# self, +# index: Optional[Any] = None, +# dtype: Optional[Union[type, str]] = "float32", +# order: Optional[str] = "F", +# force_python_only: Optional[bool] = False, +# num_threads=None, +# ) -> np.ndarray: +# """ +# Read genotype information. + +# Parameters +# ---------- +# index: +# An optional expression specifying the individuals (samples) and SNPs +# (variants) to read. (See examples, below). +# Defaults to ``None``, meaning read all. + +# (If index is a tuple, the first component indexes the individuals and the +# second indexes +# the SNPs. If it is not a tuple and not None, it indexes SNPs.) + +# dtype: {'float32' (default), 'float64', 'int8'}, optional +# The desired data-type for the returned array. +# order : {'F','C'}, optional +# The desired memory layout for the returned array. +# Defaults to ``F`` (Fortran order, which is SNP-major). +# force_python_only: bool, optional +# If False (default), uses the faster Rust code; otherwise it uses the slower +# pure Python code. + +# num_threads: None or int, optional +# The number of threads with which to read data. Defaults to all available +# processors. 
+# Can also be set with :class:`open_bed_cloud` or these +# environment variables (listed in priority order): +# 'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'. + + +# Returns +# ------- +# numpy.ndarray +# 2-D array containing values of 0, 1, 2, or missing + + +# Rows represent individuals (samples). Columns represent SNPs (variants). + +# For ``dtype`` 'float32' and 'float64', NaN indicates missing values. +# For 'int8', -127 indicates missing values. + +# Examples +# -------- + +# To read all data in a .bed file, set ``index`` to ``None``. This is the default. + +# .. doctest:: + +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.read()) +# [[ 1. 0. nan 0.] +# [ 2. 0. nan 2.] +# [ 0. 1. 2. 0.]] + +# To read selected individuals (samples) and/or SNPs (variants), set each part of +# a :data:`numpy.s_` to an `int`, a list of `int`, a slice expression, or +# a list of `bool`. +# Negative integers count from the end of the list. + + +# .. doctest:: + +# >>> import numpy as np +# >>> bed = open_bed_cloud(file_name) +# >>> print(bed.read(np.s_[:,2])) # read the SNPs indexed by 2. +# [[nan] +# [nan] +# [ 2.]] +# >>> print(bed.read(np.s_[:,[2,3,0]])) # read the SNPs indexed by 2, 3, and 0 +# [[nan 0. 1.] +# [nan 2. 2.] +# [ 2. 0. 0.]] +# >>> # read SNPs from 1 (inclusive) to 4 (exclusive) +# >>> print(bed.read(np.s_[:,1:4])) +# [[ 0. nan 0.] +# [ 0. nan 2.] +# [ 1. 2. 0.]] +# >>> print(np.unique(bed.chromosome)) # print unique chrom values +# ['1' '5' 'Y'] +# >>> print(bed.read(np.s_[:,bed.chromosome=='5'])) # read all SNPs in chrom 5 +# [[nan] +# [nan] +# [ 2.]] +# >>> print(bed.read(np.s_[0,:])) # Read 1st individual (across all SNPs) +# [[ 1. 0. nan 0.]] +# >>> print(bed.read(np.s_[::2,:])) # Read every 2nd individual +# [[ 1. 0. nan 0.] +# [ 0. 1. 2. 0.]] +# >>> #read last and 2nd-to-last individuals and the last SNPs +# >>> print(bed.read(np.s_[[-1,-2],-1])) +# [[0.] +# [2.]] + + +# You can give a dtype for the output. + +# .. doctest:: + +# >>> print(bed.read(dtype='int8')) +# [[ 1 0 -127 0] +# [ 2 0 -127 2] +# [ 0 1 2 0]] +# >>> del bed # optional: delete bed object + +# """ + +# iid_index_or_slice_etc, sid_index_or_slice_etc = self._split_index(index) + +# dtype = np.dtype(dtype) +# if order not in {"F", "C"}: +# raise ValueError(f"order '{order}' not known, only 'F', 'C'") + +# # Later happy with _iid_range and _sid_range or could it be done with +# # allocation them? +# if self._iid_range is None: +# self._iid_range = np.arange(await self.iid_count(), dtype="intp") +# if self._sid_range is None: +# self._sid_range = np.arange(await self.sid_count(), dtype="intp") + +# iid_index = np.ascontiguousarray( +# self._iid_range[iid_index_or_slice_etc], +# dtype="intp", +# ) +# sid_index = np.ascontiguousarray( +# self._sid_range[sid_index_or_slice_etc], dtype="intp" +# ) + +# if not force_python_only: +# num_threads = get_num_threads( +# self._num_threads if num_threads is None else num_threads +# ) + +# val = np.zeros((len(iid_index), len(sid_index)), order=order, dtype=dtype) + +# if await self.iid_count() > 0 and await self.sid_count() > 0: +# if dtype == np.int8: +# reader = read_i8 +# elif dtype == np.float64: +# reader = read_f64 +# elif dtype == np.float32: +# reader = read_f32 +# else: +# raise ValueError( +# f"dtype '{val.dtype}' not known, only " +# + "'int8', 'float32', and 'float64' are allowed." 
+# ) + +# reader( +# str(self.filepath), +# iid_count=await self.iid_count(), +# sid_count=await self.sid_count(), +# is_a1_counted=self.count_A1, +# iid_index=iid_index, +# sid_index=sid_index, +# val=val, +# num_threads=num_threads, +# ) + +# else: +# if not self.count_A1: +# byteZero = 0 +# byteThree = 2 +# else: +# byteZero = 2 +# byteThree = 0 +# if dtype == np.int8: +# missing = -127 +# else: +# missing = np.nan +# # An earlier version of this code had a way to read consecutive SNPs of code +# # in one read. May want +# # to add that ability back to the code. +# # Also, note that reading with python will often result in +# # non-contiguous memory +# # logging.warn("using pure python plink parser (might be much slower!!)") +# val = np.zeros( +# ((int(np.ceil(0.25 * self.iid_count)) * 4), len(sid_index)), +# order=order, +# dtype=dtype, +# ) # allocate it a little big + +# nbyte = int(np.ceil(0.25 * self.iid_count)) +# with open(self.filepath, "rb") as filepointer: +# for SNPsIndex, bimIndex in enumerate(sid_index): +# startbit = int(np.ceil(0.25 * self.iid_count) * bimIndex + 3) +# filepointer.seek(startbit) +# bytes = np.array(bytearray(filepointer.read(nbyte))).reshape( +# (int(np.ceil(0.25 * self.iid_count)), 1), order="F" +# ) + +# val[3::4, SNPsIndex : SNPsIndex + 1] = byteZero +# val[3::4, SNPsIndex : SNPsIndex + 1][bytes >= 64] = missing +# val[3::4, SNPsIndex : SNPsIndex + 1][bytes >= 128] = 1 +# val[3::4, SNPsIndex : SNPsIndex + 1][bytes >= 192] = byteThree +# bytes = np.mod(bytes, 64) +# val[2::4, SNPsIndex : SNPsIndex + 1] = byteZero +# val[2::4, SNPsIndex : SNPsIndex + 1][bytes >= 16] = missing +# val[2::4, SNPsIndex : SNPsIndex + 1][bytes >= 32] = 1 +# val[2::4, SNPsIndex : SNPsIndex + 1][bytes >= 48] = byteThree +# bytes = np.mod(bytes, 16) +# val[1::4, SNPsIndex : SNPsIndex + 1] = byteZero +# val[1::4, SNPsIndex : SNPsIndex + 1][bytes >= 4] = missing +# val[1::4, SNPsIndex : SNPsIndex + 1][bytes >= 8] = 1 +# val[1::4, SNPsIndex : SNPsIndex + 1][bytes >= 12] = byteThree +# bytes = np.mod(bytes, 4) +# val[0::4, SNPsIndex : SNPsIndex + 1] = byteZero +# val[0::4, SNPsIndex : SNPsIndex + 1][bytes >= 1] = missing +# val[0::4, SNPsIndex : SNPsIndex + 1][bytes >= 2] = 1 +# val[0::4, SNPsIndex : SNPsIndex + 1][bytes >= 3] = byteThree +# val = val[iid_index, :] # reorder or trim any extra allocation +# assert val.dtype == np.dtype(dtype) # real assert +# if not open_bed_cloud._array_properties_are_ok(val, order): +# val = val.copy(order=order) + +# return val + +# def __str__(self) -> str: +# return f"{self.__class__.__name__}('{self.filepath}',...)" + +# async def fid(self) -> np.ndarray: +# """ +# Family id of each individual (sample). + +# Returns +# ------- +# numpy.ndarray +# array of str + + +# '0' represents a missing value. + +# If needed, will cause a one-time read of the .fam file. + +# Example +# ------- + +# .. doctest:: + +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.fid) +# ['fid1' 'fid1' 'fid2'] + +# """ + +# return self.property_item("fid") + +# # @property +# async def iid(self) -> np.ndarray: +# """ +# Individual id of each individual (sample). + +# Returns +# ------- +# numpy.ndarray +# array of str + + +# If needed, will cause a one-time read of the .fam file. + +# Example +# ------- + +# .. 
doctest:: + +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.iid) +# ['iid1' 'iid2' 'iid3'] - """ - return self.property_item("iid") +# """ +# return self.property_item("iid") - # @property - async def father(self) -> np.ndarray: - """ - Father id of each individual (sample). +# # @property +# async def father(self) -> np.ndarray: +# """ +# Father id of each individual (sample). - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - '0' represents a missing value. +# '0' represents a missing value. - If needed, will cause a one-time read of the .fam file. +# If needed, will cause a one-time read of the .fam file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.father) - ['iid23' 'iid23' 'iid22'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.father) +# ['iid23' 'iid23' 'iid22'] - """ - return self.property_item("father") +# """ +# return self.property_item("father") - # @property - async def mother(self) -> np.ndarray: - """ - Mother id of each individual (sample). +# # @property +# async def mother(self) -> np.ndarray: +# """ +# Mother id of each individual (sample). - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - '0' represents a missing value. +# '0' represents a missing value. - If needed, will cause a one-time read of the .fam file. +# If needed, will cause a one-time read of the .fam file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.mother) - ['iid34' 'iid34' 'iid33'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.mother) +# ['iid34' 'iid34' 'iid33'] - """ - return self.property_item("mother") +# """ +# return self.property_item("mother") - # @property - async def sex(self) -> np.ndarray: - """ - Sex of each individual (sample). +# # @property +# async def sex(self) -> np.ndarray: +# """ +# Sex of each individual (sample). - Returns - ------- - numpy.ndarray - array of 0, 1, or 2 +# Returns +# ------- +# numpy.ndarray +# array of 0, 1, or 2 - 0 is unknown, 1 is male, 2 is female +# 0 is unknown, 1 is male, 2 is female - If needed, will cause a one-time read of the .fam file. +# If needed, will cause a one-time read of the .fam file. - Example - ------- - .. doctest:: +# Example +# ------- +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.sex) - [1 2 0] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... 
print(bed.sex) +# [1 2 0] - """ - return self.property_item("sex") +# """ +# return self.property_item("sex") - # @property - async def pheno(self) -> np.ndarray: - """ - A phenotype for each individual (sample) - (seldom used). +# # @property +# async def pheno(self) -> np.ndarray: +# """ +# A phenotype for each individual (sample) +# (seldom used). - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - '0' may represent a missing value. +# '0' may represent a missing value. - If needed, will cause a one-time read of the .fam file. +# If needed, will cause a one-time read of the .fam file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.pheno) - ['red' 'red' 'blue'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.pheno) +# ['red' 'red' 'blue'] - """ - return self.property_item("pheno") +# """ +# return self.property_item("pheno") - @property - def properties(self) -> Mapping[str, np.array]: - """ - All the properties returned as a dictionary. +# @property +# def properties(self) -> Mapping[str, np.array]: +# """ +# All the properties returned as a dictionary. - Returns - ------- - dict - all the properties +# Returns +# ------- +# dict +# all the properties - The keys of the dictionary are the names of the properties, namely: +# The keys of the dictionary are the names of the properties, namely: - "fid" (family id), "iid" (individual or sample id), "father" (father id), - "mother" (mother id), "sex", "pheno" (phenotype), "chromosome", "sid" - (SNP or variant id), "cm_position" (centimorgan position), "bp_position" - (base-pair position), "allele_1", "allele_2". +# "fid" (family id), "iid" (individual or sample id), "father" (father id), +# "mother" (mother id), "sex", "pheno" (phenotype), "chromosome", "sid" +# (SNP or variant id), "cm_position" (centimorgan position), "bp_position" +# (base-pair position), "allele_1", "allele_2". - The values are :class:`numpy.ndarray`. +# The values are :class:`numpy.ndarray`. - If needed, will cause a one-time read of the .fam and .bim file. +# If needed, will cause a one-time read of the .fam and .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(len(bed.properties)) #length of dict - 12 +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(len(bed.properties)) #length of dict +# 12 - """ - for key in _meta_meta: - self.property_item(key) - return self.properties_dict +# """ +# for key in _meta_meta: +# self.property_item(key) +# return self.properties_dict - def property_item(self, name: str) -> np.ndarray: - """ - Retrieve one property by name. +# def property_item(self, name: str) -> np.ndarray: +# """ +# Retrieve one property by name. 
- Returns - ------- - numpy.ndarray - a property value +# Returns +# ------- +# numpy.ndarray +# a property value - The name is one of these: +# The name is one of these: - "fid" (family id), "iid" (individual or sample id), "father" (father id), - "mother" (mother id), "sex", "pheno" (phenotype), "chromosome", "sid" - (SNP or variant id), "cm_position" (centimorgan position), "bp_position" - (base-pair position), "allele_1", "allele_2". +# "fid" (family id), "iid" (individual or sample id), "father" (father id), +# "mother" (mother id), "sex", "pheno" (phenotype), "chromosome", "sid" +# (SNP or variant id), "cm_position" (centimorgan position), "bp_position" +# (base-pair position), "allele_1", "allele_2". - If needed, will cause a one-time read of the .fam or .bim file. +# If needed, will cause a one-time read of the .fam or .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.property_item('chromosome')) - ['1' '1' '5' 'Y'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.property_item('chromosome')) +# ['1' '1' '5' 'Y'] - """ - if name not in self.properties_dict: - mm = _meta_meta[name] - self._read_fam_or_bim(suffix=mm.suffix) - return self.properties_dict[name] +# """ +# if name not in self.properties_dict: +# mm = _meta_meta[name] +# self._read_fam_or_bim(suffix=mm.suffix) +# return self.properties_dict[name] - # @property - async def chromosome(self) -> np.ndarray: - """ - Chromosome of each SNP (variant) +# # @property +# async def chromosome(self) -> np.ndarray: +# """ +# Chromosome of each SNP (variant) - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - '0' represents a missing value. +# '0' represents a missing value. - If needed, will cause a one-time read of the .bim file. +# If needed, will cause a one-time read of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.chromosome) - ['1' '1' '5' 'Y'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.chromosome) +# ['1' '1' '5' 'Y'] - """ - return self.property_item("chromosome") +# """ +# return self.property_item("chromosome") - # @property - async def sid(self) -> np.ndarray: - """ - SNP id of each SNP (variant). +# # @property +# async def sid(self) -> np.ndarray: +# """ +# SNP id of each SNP (variant). - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - If needed, will cause a one-time read of the .bim file. +# If needed, will cause a one-time read of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... 
print(bed.sid) - ['sid1' 'sid2' 'sid3' 'sid4'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.sid) +# ['sid1' 'sid2' 'sid3' 'sid4'] - """ - return self.property_item("sid") +# """ +# return self.property_item("sid") - # @property - async def cm_position(self) -> np.ndarray: - """ - Centimorgan position of each SNP (variant). +# # @property +# async def cm_position(self) -> np.ndarray: +# """ +# Centimorgan position of each SNP (variant). - Returns - ------- - numpy.ndarray - array of float +# Returns +# ------- +# numpy.ndarray +# array of float - 0.0 represents a missing value. +# 0.0 represents a missing value. - If needed, will cause a one-time read of the .bim file. +# If needed, will cause a one-time read of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.cm_position) - [ 100.4 2000.5 4000.7 7000.9] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.cm_position) +# [ 100.4 2000.5 4000.7 7000.9] - """ - return self.property_item("cm_position") +# """ +# return self.property_item("cm_position") - # @property - async def bp_position(self) -> np.ndarray: - """ - Base-pair position of each SNP (variant). +# # @property +# async def bp_position(self) -> np.ndarray: +# """ +# Base-pair position of each SNP (variant). - Returns - ------- - numpy.ndarray - array of int +# Returns +# ------- +# numpy.ndarray +# array of int - 0 represents a missing value. +# 0 represents a missing value. - If needed, will cause a one-time read of the .bim file. +# If needed, will cause a one-time read of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.bp_position) - [ 1 100 1000 1004] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.bp_position) +# [ 1 100 1000 1004] - """ - return self.property_item("bp_position") +# """ +# return self.property_item("bp_position") - # @property - async def allele_1(self) -> np.ndarray: - """ - First allele of each SNP (variant). +# # @property +# async def allele_1(self) -> np.ndarray: +# """ +# First allele of each SNP (variant). - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - If needed, will cause a one-time read of the .bim file. +# If needed, will cause a one-time read of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.allele_1) - ['A' 'T' 'A' 'T'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... 
print(bed.allele_1) +# ['A' 'T' 'A' 'T'] - """ - return self.property_item("allele_1") +# """ +# return self.property_item("allele_1") - # @property - async def allele_2(self) -> np.ndarray: - """ - Second allele of each SNP (variant), +# # @property +# async def allele_2(self) -> np.ndarray: +# """ +# Second allele of each SNP (variant), - Returns - ------- - numpy.ndarray - array of str +# Returns +# ------- +# numpy.ndarray +# array of str - If needed, will cause a one-time read of the .bim file. +# If needed, will cause a one-time read of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.allele_2) - ['A' 'C' 'C' 'G'] +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.allele_2) +# ['A' 'C' 'C' 'G'] - """ - return self.property_item("allele_2") +# """ +# return self.property_item("allele_2") - # @property # remove - async def iid_count(self) -> np.ndarray: - """ - Number of individuals (samples). +# # @property # remove +# async def iid_count(self) -> np.ndarray: +# """ +# Number of individuals (samples). - Returns - ------- - int - number of individuals +# Returns +# ------- +# int +# number of individuals - If needed, will cause a fast line-count of the .fam file. +# If needed, will cause a fast line-count of the .fam file. - Example - ------- +# Example +# ------- - .. doctest:: +# .. doctest:: - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.iid_count) - 3 +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.iid_count) +# 3 - """ - return await self._count("fam") +# """ +# return await self._count("fam") - # @property - async def sid_count(self) -> np.ndarray: - """ - Number of SNPs (variants). +# # @property +# async def sid_count(self) -> np.ndarray: +# """ +# Number of SNPs (variants). - Returns - ------- - int - number of SNPs +# Returns +# ------- +# int +# number of SNPs - If needed, will cause a fast line-count of the .bim file. +# If needed, will cause a fast line-count of the .bim file. - Example - ------- +# Example +# ------- - .. doctest:: - - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... 
print(bed.sid_count) - 4 - - """ - return await self._count("bim") - - def _property_filepath(self, suffix): - if suffix == "fam": - return self._fam_filepath - else: - assert suffix == "bim" # real assert - return self._bim_filepath - - async def _count(self, suffix): - count = self._counts[suffix] - if count is None: - count = _rawincount(self._property_filepath(suffix)) - self._counts[suffix] = count - return count - - @staticmethod - def _check_file(filepointer): - mode = filepointer.read(2) - if mode != b"l\x1b": - raise ValueError("Not a valid .bed file") - mode = filepointer.read(1) # \x01 = SNP major \x00 = individual major - if mode != b"\x01": - raise ValueError("only SNP-major is implemented") - - def __del__(self): - self.__exit__() - - def __enter__(self): - return self - - def __exit__(self, *_): - pass - - @staticmethod - def _array_properties_are_ok(val, order): - if order == "F": - return val.flags["F_CONTIGUOUS"] - else: - assert order == "C" # real assert - return val.flags["C_CONTIGUOUS"] - - # @property - async def shape(self): - """ - Number of individuals (samples) and SNPs (variants). - - Returns - ------- - (int, int) - number of individuals, number of SNPs - - - If needed, will cause a fast line-count of the .fam and .bim files. - - Example - ------- - - .. doctest:: - - >>> from bed_reader import open_bed_cloud, sample_file - >>> - >>> file_name = sample_file("small.bed") - >>> with open_bed_cloud(file_name) as bed: - ... print(bed.shape) - (3, 4) - - """ - return (len(await self.iid()), len(await self.sid())) - - @staticmethod - def _split_index(index): - if not isinstance(index, tuple): - index = (None, index) - iid_index = open_bed_cloud._fix_up_index(index[0]) - sid_index = open_bed_cloud._fix_up_index(index[1]) - return iid_index, sid_index - - @staticmethod - def _fix_up_index(index): - if index is None: # make a shortcut for None - return slice(None) - try: # If index is an int, return it in an array - index = index.__index__() # (see - # https://stackoverflow.com/questions/3501382/checking-whether-a-variable-is-an-integer-or-not) - return [index] - except Exception: - pass - return index - - @staticmethod - def _write_fam_or_bim(base_filepath, properties, suffix, property_filepath): - assert suffix in {"fam", "bim"}, "real assert" - - filepath = ( - Path(property_filepath) - if property_filepath is not None - else base_filepath.parent / (base_filepath.stem + "." + suffix) - ) - - fam_bim_list = [] - for key, mm in _meta_meta.items(): - if mm.suffix == suffix: - assert len(fam_bim_list) == mm.column, "real assert" - fam_bim_list.append(properties[key]) - - sep = " " if suffix == "fam" else "\t" - - with open(filepath, "w") as filepointer: - for index in range(len(fam_bim_list[0])): - filepointer.write( - sep.join(str(seq[index]) for seq in fam_bim_list) + "\n" - ) - - @staticmethod - def _fix_up_properties_array(input, dtype, missing_value, key): - if input is None: - return None - if len(input) == 0: - return np.zeros([0], dtype=dtype) - - if not isinstance(input, np.ndarray): - return open_bed_cloud._fix_up_properties_array( - np.array(input), dtype, missing_value, key - ) - - if len(input.shape) != 1: - raise ValueError(f"{key} should be one dimensional") - - if not np.issubdtype(input.dtype, dtype): - # This will convert, for example, numerical sids to string sids or - # floats that happen to be integers into ints, - # but there will be a warning generated. 
- output = np.array(input, dtype=dtype) - else: - output = input - - # Change NaN in input to correct missing value - if np.issubdtype(input.dtype, np.floating): - output[input != input] = missing_value - - return output - - @staticmethod - def _fix_up_properties(properties, iid_count, sid_count, use_fill_sequence): - for key in properties: - if key not in _meta_meta: - raise KeyError(f"properties key '{key}' not known") - - count_dict = {"fam": iid_count, "bim": sid_count} - properties_dict = {} - for key, mm in _meta_meta.items(): - count = count_dict[mm.suffix] - - if key not in properties or (use_fill_sequence and properties[key] is None): - if use_fill_sequence: - output = mm.fill_sequence(key, count, mm.missing_value, mm.dtype) - else: - continue # Test coverage reaches this, but doesn't report it. - else: - output = open_bed_cloud._fix_up_properties_array( - properties[key], mm.dtype, mm.missing_value, key - ) - - if output is not None: - if count is None: - count_dict[mm.suffix] = len(output) - else: - if count != len(output): - raise ValueError( - f"The length of override {key}, {len(output)}, should not " - + "be different from the current " - + f"{_count_name[mm.suffix]}, {count}" - ) - properties_dict[key] = output - return properties_dict, count_dict - - def _read_fam_or_bim(self, suffix): - property_filepath = self._property_filepath(suffix) - - logging.info("Loading {0} file {1}".format(suffix, property_filepath)) - - count = self._counts[suffix] - - delimiter = _delimiters[suffix] - if delimiter in {r"\s+"}: - delimiter = None - - usecolsdict = {} - dtype_dict = {} - for key, mm in _meta_meta.items(): - if mm.suffix is suffix and key not in self.properties_dict: - usecolsdict[key] = mm.column - dtype_dict[mm.column] = mm.dtype - assert list(usecolsdict.values()) == sorted(usecolsdict.values()) # real assert - assert len(usecolsdict) > 0 # real assert - - if os.path.getsize(property_filepath) == 0: - columns, row_count = [], 0 - else: - columns, row_count = _read_csv( - property_filepath, - delimiter=delimiter, - dtype=dtype_dict, - usecols=usecolsdict.values(), - ) - - if count is None: - self._counts[suffix] = row_count - else: - if count != row_count: - raise ValueError( - f"The number of lines in the *.{suffix} file, {row_count}, " - + "should not be different from the current " - + "f{_count_name[suffix]}, {count}" - ) - for i, key in enumerate(usecolsdict.keys()): - mm = _meta_meta[key] - if row_count == 0: - output = np.array([], dtype=mm.dtype) - else: - output = columns[i] - if not np.issubdtype(output.dtype, mm.dtype): - output = np.array(output, dtype=mm.dtype) - self.properties_dict[key] = output - - # def read_sparse( - # self, - # index: Optional[Any] = None, - # dtype: Optional[Union[type, str]] = "float32", - # batch_size: Optional[int] = None, - # format: Optional[str] = "csc", - # num_threads=None, - # ) -> (Union[sparse.csc_matrix, sparse.csr_matrix]) if sparse is not None else None: - # """ - # Read genotype information into a :mod:`scipy.sparse` matrix. Sparse matrices - # may be useful when the data is mostly zeros. - - # .. note:: - # This method requires :mod:`scipy`. Install `scipy` with: - - # .. code-block:: bash - - # pip install --upgrade bed-reader[sparse] - - # Parameters - # ---------- - # index: - # An optional expression specifying the individuals (samples) and SNPs - # (variants) to read. (See examples, below). - # Defaults to ``None``, meaning read all. 
- - # (If index is a tuple, the first component indexes the individuals and the - # second indexes - # the SNPs. If it is not a tuple and not None, it indexes SNPs.) - - # dtype: {'float32' (default), 'float64', 'int8'}, optional - # The desired data-type for the returned array. - # batch_size: None or int, optional - # Number of dense columns or rows to read at a time, internally. - # Defaults to round(sqrt(total-number-of-columns-or-rows-to-read)). - # format : {'csc','csr'}, optional - # The desired format of the sparse matrix. - # Defaults to ``csc`` (Compressed Sparse Column, which is SNP-major). - # num_threads: None or int, optional - # The number of threads with which to read data. Defaults to all available - # processors. - # Can also be set with :class:`open_bed_cloud` or these - # environment variables (listed in priority order): - # 'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'. - - # Returns - # ------- - # a :class:`scipy.sparse.csc_matrix` (default) or :class:`scipy.sparse.csr_matrix` - - # Rows represent individuals (samples). Columns represent SNPs (variants). - - # For ``dtype`` 'float32' and 'float64', NaN indicates missing values. - # For 'int8', -127 indicates missing values. - - # The memory used by the final sparse matrix is approximately: - - # # of non-zero values * (4 bytes + 1 byte (for int8)) - - # For example, consider reading 1000 individuals (samples) x 50,000 SNPs (variants) - # into csc format where the data is 97% sparse. - # The memory used will be about 7.5 MB (1000 x 50,000 x 3% x 5 bytes). - # This is 15% of the 50 MB needed by a dense matrix. - - # Internally, the function reads the data via small dense matrices. - # For this example, by default, the function will read 1000 individuals x 224 SNPs - # (because 224 * 224 is about 50,000). - # The memory used by the small dense matrix is 1000 x 244 x 1 byte (for int8) = 0.224 MB. - - # You can set `batch_size`. Larger values will be faster. - # Smaller values will use less memory. - - # For this example, we might want to set the `batch_size` to 5000. Then, - # the memory used by the small dense matrix - # would be 1000 x 5000 x 1 byte (for int8) = 5 MB, - # similar to the 7.5 MB needed for the final sparse matrix. - - # Examples - # -------- - - # Read all data in a .bed file into a :class:`scipy.sparse.csc_matrix`. - # The file has 10 individuals (samples) by 20 SNPs (variants). - # All but eight values are 0. - - # .. doctest:: - - # >>> # pip install bed-reader[samples,sparse] # if needed - # >>> from bed_reader import open_bed_cloud, sample_file - # >>> - # >>> file_name = sample_file("sparse.bed") - # >>> with open_bed_cloud(file_name) as bed: - # ... print(bed.shape) - # ... val_sparse = bed.read_sparse(dtype="int8") - # ... print(val_sparse) # doctest:+NORMALIZE_WHITESPACE - # (10, 20) - # (8, 4) 1 - # (8, 5) 2 - # (0, 8) 2 - # (4, 9) 1 - # (7, 9) 1 - # (5, 11) 1 - # (2, 12) 1 - # (3, 12) 1 - - # To read selected individuals (samples) and/or SNPs (variants), set each part of - # a :data:`numpy.s_` to an `int`, a list of `int`, a slice expression, or - # a list of `bool`. - # Negative integers count from the end of the list. - - # .. doctest:: - - # >>> import numpy as np - # >>> bed = open_bed_cloud(file_name) - # >>> print(bed.read_sparse(np.s_[:,5], dtype="int8")) # read the SNPs indexed by 5. 
# doctest:+NORMALIZE_WHITESPACE - # (8, 0) 2 - # >>> # read the SNPs indexed by 5, 4, and 0 - # >>> print(bed.read_sparse(np.s_[:,[5,4,0]], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE - # (8, 0) 2 - # (8, 1) 1 - # >>> # read SNPs from 1 (inclusive) to 11 (exclusive) - # >>> print(bed.read_sparse(np.s_[:,1:11], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE - # (8, 3) 1 - # (8, 4) 2 - # (0, 7) 2 - # (4, 8) 1 - # (7, 8) 1 - # >>> print(np.unique(bed.chromosome)) # print unique chrom values - # ['1' '5' 'Y'] - # >>> # read all SNPs in chrom 5 - # >>> print(bed.read_sparse(np.s_[:,bed.chromosome=='5'], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE - # (8, 0) 1 - # (8, 1) 2 - # (0, 4) 2 - # (4, 5) 1 - # (7, 5) 1 - # (5, 7) 1 - # (2, 8) 1 - # (3, 8) 1 - # >>> # Read 1st individual (across all SNPs) - # >>> print(bed.read_sparse(np.s_[0,:], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE - # (0, 8) 2 - # >>> print(bed.read_sparse(np.s_[::2,:], dtype="int8")) # Read every 2nd individual # doctest:+NORMALIZE_WHITESPACE - # (4, 4) 1 - # (4, 5) 2 - # (0, 8) 2 - # (2, 9) 1 - # (1, 12) 1 - # >>> # read last and 2nd-to-last individuals and the 15th-from-the-last SNP - # >>> print(bed.read_sparse(np.s_[[-1,-2],-15], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE - # (1, 0) 2 - - # """ - # if sparse is None: - # raise ImportError( - # "The function read_sparse() requires scipy. " - # + "Install it with 'pip install --upgrade bed-reader[sparse]'." - # ) - # iid_index_or_slice_etc, sid_index_or_slice_etc = self._split_index(index) - - # dtype = np.dtype(dtype) - - # # Similar code in read(). - # # Later happy with _iid_range and _sid_range or could it be done with - # # allocation them? - # if self._iid_range is None: - # self._iid_range = np.arange(self.iid_count, dtype="intp") - # if self._sid_range is None: - # self._sid_range = np.arange(self.sid_count, dtype="intp") - - # iid_index = np.ascontiguousarray( - # self._iid_range[iid_index_or_slice_etc], - # dtype="intp", - # ) - # sid_index = np.ascontiguousarray( - # self._sid_range[sid_index_or_slice_etc], dtype="intp" - # ) - - # if ( - # len(iid_index) > np.iinfo(np.int32).max - # or len(sid_index) > np.iinfo(np.int32).max - # ): - # raise ValueError( - # "Too (many Individuals or SNPs (variants) requested. Maximum is {np.iinfo(np.int32).max}." - # ) - - # if batch_size is None: - # batch_size = round(np.sqrt(len(sid_index))) - - # num_threads = get_num_threads( - # self._num_threads if num_threads is None else num_threads - # ) - - # if format == "csc": - # order = "F" - # indptr = np.zeros(len(sid_index) + 1, dtype=np.int32) - # elif format == "csr": - # order = "C" - # indptr = np.zeros(len(iid_index) + 1, dtype=np.int32) - # else: - # raise ValueError(f"format '{format}' not known. Expected 'csc' or 'csr'.") - - # # We init data and indices with zero element arrays to set their dtype. - # data = [np.empty(0, dtype=dtype)] - # indices = [np.empty(0, dtype=np.int32)] - - # if self.iid_count > 0 and self.sid_count > 0: - # if dtype == np.int8: - # reader = read_i8 - # elif dtype == np.float64: - # reader = read_f64 - # elif dtype == np.float32: - # reader = read_f32 - # else: - # raise ValueError( - # f"dtype '{dtype}' not known, only " - # + "'int8', 'float32', and 'float64' are allowed." 
- # ) - - # if format == "csc": - # val = np.zeros((len(iid_index), batch_size), order=order, dtype=dtype) - # for batch_start in range(0, len(sid_index), batch_size): - # batch_end = batch_start + batch_size - # if batch_end > len(sid_index): - # batch_end = len(sid_index) - # del val - # val = np.zeros( - # (len(iid_index), batch_end - batch_start), - # order=order, - # dtype=dtype, - # ) - # batch_slice = np.s_[batch_start:batch_end] - # batch_index = sid_index[batch_slice] - - # reader( - # str(self.filepath), - # iid_count=self.iid_count, - # sid_count=self.sid_count, - # is_a1_counted=self.count_A1, - # iid_index=iid_index, - # sid_index=batch_index, - # val=val, - # num_threads=num_threads, - # ) - - # self.sparsify( - # val, order, iid_index, batch_slice, data, indices, indptr - # ) - # else: - # assert format == "csr" # real assert - # val = np.zeros((batch_size, len(sid_index)), order=order, dtype=dtype) - # for batch_start in range(0, len(iid_index), batch_size): - # batch_end = batch_start + batch_size - # if batch_end > len(iid_index): - # batch_end = len(iid_index) - # del val - # val = np.zeros( - # (batch_end - batch_start, len(sid_index)), - # order=order, - # dtype=dtype, - # ) - - # batch_slice = np.s_[batch_start:batch_end] - # batch_index = iid_index[batch_slice] - - # reader( - # str(self.filepath), - # iid_count=self.iid_count, - # sid_count=self.sid_count, - # is_a1_counted=self.count_A1, - # iid_index=batch_index, - # sid_index=sid_index, - # val=val, - # num_threads=num_threads, - # ) - - # self.sparsify( - # val, order, sid_index, batch_slice, data, indices, indptr - # ) - - # data = np.concatenate(data) - # indices = np.concatenate(indices) - - # if format == "csc": - # return sparse.csc_matrix( - # (data, indices, indptr), (len(iid_index), len(sid_index)) - # ) - # else: - # assert format == "csr" # real assert - # return sparse.csr_matrix( - # (data, indices, indptr), (len(iid_index), len(sid_index)) - # ) - - # def sparsify(self, val, order, minor_index, batch_slice, data, indices, indptr): - # flatten = np.ravel(val, order=order) - # nz_indices = np.flatnonzero(flatten).astype(np.int32) - # column_indexes = nz_indices // len(minor_index) - # counts = np.bincount( - # column_indexes, minlength=batch_slice.stop - batch_slice.start - # ).astype(np.int32) - # counts_with_initial = np.r_[ - # indptr[batch_slice.start : batch_slice.start + 1], counts - # ] - - # data.append(flatten[nz_indices]) - # indices.append(np.mod(nz_indices, len(minor_index))) - # indptr[1:][batch_slice] = np.cumsum(counts_with_initial)[1:] - - -def _read_csv(filepath, delimiter=None, dtype=None, usecols=None): - # Prepare the usecols by ensuring it is a list of indices - usecols_indices = list(usecols) - transposed = np.loadtxt( - filepath, - dtype=np.str_, - delimiter=delimiter, - usecols=usecols_indices, - unpack=True, - ) - if transposed.ndim == 1: - transposed = transposed.reshape(-1, 1) - row_count = transposed.shape[1] # because unpack=True - - # Convert column lists to numpy arrays with the specified dtype - columns = [] - for output_index, input_index in enumerate(usecols_indices): - col = transposed[output_index] - # Find the dtype for this column - col_dtype = dtype.get(input_index, np.str_) - # Convert the column list to a numpy array with the specified dtype - columns.append(_convert_to_dtype(col, col_dtype)) - - return columns, row_count - - -def _convert_to_dtype(str_arr, dtype): - assert dtype in [np.str_, np.float32, np.int32] # real assert - - if dtype == np.str_: - return 
str_arr - - try: - new_arr = str_arr.astype(dtype) - except ValueError as e: - if dtype == np.float32: - raise e - # for backwards compatibility, see if intermediate float helps int conversion - try: - assert dtype == np.int32 # real assert - float_arr = str_arr.astype(np.float32) - except ValueError: - raise e - new_arr = float_arr.astype(np.int32) - if not np.array_equal(new_arr, float_arr): - raise ValueError( - f"invalid literal for int: '{str_arr[np.where(new_arr != float_arr)][:1]}')" - ) - return new_arr - - -if __name__ == "__main__": - import pytest - - logging.basicConfig(level=logging.INFO) - - pytest.main(["--doctest-modules", __file__]) +# .. doctest:: + +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.sid_count) +# 4 + +# """ +# return await self._count("bim") + +# def _property_filepath(self, suffix): +# if suffix == "fam": +# return self._fam_filepath +# else: +# assert suffix == "bim" # real assert +# return self._bim_filepath + +# async def _count(self, suffix): +# count = self._counts[suffix] +# if count is None: +# count = _rawincount(self._property_filepath(suffix)) +# self._counts[suffix] = count +# return count + +# @staticmethod +# def _check_file(filepointer): +# mode = filepointer.read(2) +# if mode != b"l\x1b": +# raise ValueError("Not a valid .bed file") +# mode = filepointer.read(1) # \x01 = SNP major \x00 = individual major +# if mode != b"\x01": +# raise ValueError("only SNP-major is implemented") + +# def __del__(self): +# self.__exit__() + +# def __enter__(self): +# return self + +# def __exit__(self, *_): +# pass + +# @staticmethod +# def _array_properties_are_ok(val, order): +# if order == "F": +# return val.flags["F_CONTIGUOUS"] +# else: +# assert order == "C" # real assert +# return val.flags["C_CONTIGUOUS"] + +# # @property +# async def shape(self): +# """ +# Number of individuals (samples) and SNPs (variants). + +# Returns +# ------- +# (int, int) +# number of individuals, number of SNPs + + +# If needed, will cause a fast line-count of the .fam and .bim files. + +# Example +# ------- + +# .. doctest:: + +# >>> from bed_reader import open_bed_cloud, sample_file +# >>> +# >>> file_name = sample_file("small.bed") +# >>> with open_bed_cloud(file_name) as bed: +# ... print(bed.shape) +# (3, 4) + +# """ +# return (len(await self.iid()), len(await self.sid())) + +# @staticmethod +# def _split_index(index): +# if not isinstance(index, tuple): +# index = (None, index) +# iid_index = open_bed_cloud._fix_up_index(index[0]) +# sid_index = open_bed_cloud._fix_up_index(index[1]) +# return iid_index, sid_index + +# @staticmethod +# def _fix_up_index(index): +# if index is None: # make a shortcut for None +# return slice(None) +# try: # If index is an int, return it in an array +# index = index.__index__() # (see +# # https://stackoverflow.com/questions/3501382/checking-whether-a-variable-is-an-integer-or-not) +# return [index] +# except Exception: +# pass +# return index + +# @staticmethod +# def _write_fam_or_bim(base_filepath, properties, suffix, property_filepath): +# assert suffix in {"fam", "bim"}, "real assert" + +# filepath = ( +# Path(property_filepath) +# if property_filepath is not None +# else base_filepath.parent / (base_filepath.stem + "." 
+ suffix) +# ) + +# fam_bim_list = [] +# for key, mm in _meta_meta.items(): +# if mm.suffix == suffix: +# assert len(fam_bim_list) == mm.column, "real assert" +# fam_bim_list.append(properties[key]) + +# sep = " " if suffix == "fam" else "\t" + +# with open(filepath, "w") as filepointer: +# for index in range(len(fam_bim_list[0])): +# filepointer.write( +# sep.join(str(seq[index]) for seq in fam_bim_list) + "\n" +# ) + +# @staticmethod +# def _fix_up_properties_array(input, dtype, missing_value, key): +# if input is None: +# return None +# if len(input) == 0: +# return np.zeros([0], dtype=dtype) + +# if not isinstance(input, np.ndarray): +# return open_bed_cloud._fix_up_properties_array( +# np.array(input), dtype, missing_value, key +# ) + +# if len(input.shape) != 1: +# raise ValueError(f"{key} should be one dimensional") + +# if not np.issubdtype(input.dtype, dtype): +# # This will convert, for example, numerical sids to string sids or +# # floats that happen to be integers into ints, +# # but there will be a warning generated. +# output = np.array(input, dtype=dtype) +# else: +# output = input + +# # Change NaN in input to correct missing value +# if np.issubdtype(input.dtype, np.floating): +# output[input != input] = missing_value + +# return output + +# @staticmethod +# def _fix_up_properties(properties, iid_count, sid_count, use_fill_sequence): +# for key in properties: +# if key not in _meta_meta: +# raise KeyError(f"properties key '{key}' not known") + +# count_dict = {"fam": iid_count, "bim": sid_count} +# properties_dict = {} +# for key, mm in _meta_meta.items(): +# count = count_dict[mm.suffix] + +# if key not in properties or (use_fill_sequence and properties[key] is None): +# if use_fill_sequence: +# output = mm.fill_sequence(key, count, mm.missing_value, mm.dtype) +# else: +# continue # Test coverage reaches this, but doesn't report it. 
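+#             # The property was supplied explicitly (not filled): validate it
+#             # and coerce it to the expected dtype below.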
+# else:
+# output = open_bed_cloud._fix_up_properties_array(
+# properties[key], mm.dtype, mm.missing_value, key
+# )
+
+# if output is not None:
+# if count is None:
+# count_dict[mm.suffix] = len(output)
+# else:
+# if count != len(output):
+# raise ValueError(
+# f"The length of override {key}, {len(output)}, should not "
+# + "be different from the current "
+# + f"{_count_name[mm.suffix]}, {count}"
+# )
+# properties_dict[key] = output
+# return properties_dict, count_dict

+# def _read_fam_or_bim(self, suffix):
+# property_filepath = self._property_filepath(suffix)

+# logging.info("Loading {0} file {1}".format(suffix, property_filepath))

+# count = self._counts[suffix]

+# delimiter = _delimiters[suffix]
+# if delimiter in {r"\s+"}:
+# delimiter = None # np.loadtxt splits on any whitespace when delimiter is None

+# usecolsdict = {}
+# dtype_dict = {}
+# for key, mm in _meta_meta.items():
+# if mm.suffix == suffix and key not in self.properties_dict:
+# usecolsdict[key] = mm.column
+# dtype_dict[mm.column] = mm.dtype
+# assert list(usecolsdict.values()) == sorted(usecolsdict.values()) # real assert
+# assert len(usecolsdict) > 0 # real assert

+# if os.path.getsize(property_filepath) == 0:
+# columns, row_count = [], 0
+# else:
+# columns, row_count = _read_csv(
+# property_filepath,
+# delimiter=delimiter,
+# dtype=dtype_dict,
+# usecols=usecolsdict.values(),
+# )

+# if count is None:
+# self._counts[suffix] = row_count
+# else:
+# if count != row_count:
+# raise ValueError(
+# f"The number of lines in the *.{suffix} file, {row_count}, "
+# + "should not be different from the current "
+# + f"{_count_name[suffix]}, {count}"
+# )
+# for i, key in enumerate(usecolsdict.keys()):
+# mm = _meta_meta[key]
+# if row_count == 0:
+# output = np.array([], dtype=mm.dtype)
+# else:
+# output = columns[i]
+# if not np.issubdtype(output.dtype, mm.dtype):
+# output = np.array(output, dtype=mm.dtype)
+# self.properties_dict[key] = output
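+
+# # For illustration (column values borrowed from the small test file later in
+# # this patch; no new API is implied): a .fam line is whitespace-delimited,
+# # one column per property in _meta_meta column order, e.g.
+# #
+# # fid1 iid1 iid23 iid34 1 red
+# #
+# # _read_fam_or_bim("fam") parses these into the fid, iid, father, mother,
+# # sex, and pheno properties, with _read_csv/_convert_to_dtype coercing sex
+# # to np.int32 and leaving the string columns as np.str_.
+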
+# # def read_sparse(
+# # self,
+# # index: Optional[Any] = None,
+# # dtype: Optional[Union[type, str]] = "float32",
+# # batch_size: Optional[int] = None,
+# # format: Optional[str] = "csc",
+# # num_threads=None,
+# # ) -> (Union[sparse.csc_matrix, sparse.csr_matrix]) if sparse is not None else None:
+# # """
+# # Read genotype information into a :mod:`scipy.sparse` matrix. Sparse matrices
+# # may be useful when the data is mostly zeros.

+# # .. note::
+# # This method requires :mod:`scipy`. Install `scipy` with:

+# # .. code-block:: bash

+# # pip install --upgrade bed-reader[sparse]

+# # Parameters
+# # ----------
+# # index:
+# # An optional expression specifying the individuals (samples) and SNPs
+# # (variants) to read. (See examples, below.)
+# # Defaults to ``None``, meaning read all.

+# # (If index is a tuple, the first component indexes the individuals and
+# # the second indexes the SNPs. If it is not a tuple and not None, it
+# # indexes SNPs.)

+# # dtype: {'float32' (default), 'float64', 'int8'}, optional
+# # The desired data-type for the returned array.
+# # batch_size: None or int, optional
+# # Number of dense columns or rows to read at a time, internally.
+# # Defaults to round(sqrt(total-number-of-columns-or-rows-to-read)).
+# # format : {'csc','csr'}, optional
+# # The desired format of the sparse matrix.
+# # Defaults to ``csc`` (Compressed Sparse Column, which is SNP-major).
+# # num_threads: None or int, optional
+# # The number of threads with which to read data. Defaults to all available
+# # processors.
+# # Can also be set with :class:`open_bed_cloud` or these
+# # environment variables (listed in priority order):
+# # 'PST_NUM_THREADS', 'NUM_THREADS', 'MKL_NUM_THREADS'.

+# # Returns
+# # -------
+# # a :class:`scipy.sparse.csc_matrix` (default) or :class:`scipy.sparse.csr_matrix`

+# # Rows represent individuals (samples). Columns represent SNPs (variants).

+# # For ``dtype`` 'float32' and 'float64', NaN indicates missing values.
+# # For 'int8', -127 indicates missing values.

+# # The memory used by the final sparse matrix is approximately:

+# # # of non-zero values * (4 bytes for the int32 index + 1 byte for an int8 value)

+# # For example, consider reading 1000 individuals (samples) x 50,000 SNPs (variants)
+# # into csc format where the data is 97% sparse.
+# # The memory used will be about 7.5 MB (1000 x 50,000 x 3% x 5 bytes).
+# # This is 15% of the 50 MB needed by a dense matrix.

+# # Internally, the function reads the data via small dense matrices.
+# # For this example, by default, the function will read 1000 individuals x 224 SNPs
+# # (because 224 * 224 is about 50,000).
+# # The memory used by the small dense matrix is 1000 x 224 x 1 byte (for int8) = 0.224 MB.

+# # You can set `batch_size`. Larger values will be faster.
+# # Smaller values will use less memory.

+# # For this example, we might want to set the `batch_size` to 5000. Then,
+# # the memory used by the small dense matrix
+# # would be 1000 x 5000 x 1 byte (for int8) = 5 MB,
+# # similar to the 7.5 MB needed for the final sparse matrix.
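+
+# # As a quick check of that arithmetic (illustrative only):

+# # .. doctest::

+# # >>> nnz = round(1000 * 50_000 * 0.03) # non-zero values at 97% sparsity
+# # >>> round(nnz * (4 + 1) / 1e6, 1) # 4-byte index + 1-byte int8 value, in MB
+# # 7.5
+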
+# # Examples
+# # --------

+# # Read all data in a .bed file into a :class:`scipy.sparse.csc_matrix`.
+# # The file has 10 individuals (samples) by 20 SNPs (variants).
+# # All but eight values are 0.

+# # .. doctest::

+# # >>> # pip install bed-reader[samples,sparse] # if needed
+# # >>> from bed_reader import open_bed_cloud, sample_file
+# # >>>
+# # >>> file_name = sample_file("sparse.bed")
+# # >>> with open_bed_cloud(file_name) as bed:
+# # ... print(bed.shape)
+# # ... val_sparse = bed.read_sparse(dtype="int8")
+# # ... print(val_sparse) # doctest:+NORMALIZE_WHITESPACE
+# # (10, 20)
+# # (8, 4) 1
+# # (8, 5) 2
+# # (0, 8) 2
+# # (4, 9) 1
+# # (7, 9) 1
+# # (5, 11) 1
+# # (2, 12) 1
+# # (3, 12) 1

+# # To read selected individuals (samples) and/or SNPs (variants), set each part of
+# # a :data:`numpy.s_` to an `int`, a list of `int`, a slice expression, or
+# # a list of `bool`.
+# # Negative integers count from the end of the list.

+# # .. doctest::

+# # >>> import numpy as np
+# # >>> bed = open_bed_cloud(file_name)
+# # >>> print(bed.read_sparse(np.s_[:,5], dtype="int8")) # read the SNPs indexed by 5. # doctest:+NORMALIZE_WHITESPACE
+# # (8, 0) 2
+# # >>> # read the SNPs indexed by 5, 4, and 0
+# # >>> print(bed.read_sparse(np.s_[:,[5,4,0]], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE
+# # (8, 0) 2
+# # (8, 1) 1
+# # >>> # read SNPs from 1 (inclusive) to 11 (exclusive)
+# # >>> print(bed.read_sparse(np.s_[:,1:11], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE
+# # (8, 3) 1
+# # (8, 4) 2
+# # (0, 7) 2
+# # (4, 8) 1
+# # (7, 8) 1
+# # >>> print(np.unique(bed.chromosome)) # print unique chrom values
+# # ['1' '5' 'Y']
+# # >>> # read all SNPs in chrom 5
+# # >>> print(bed.read_sparse(np.s_[:,bed.chromosome=='5'], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE
+# # (8, 0) 1
+# # (8, 1) 2
+# # (0, 4) 2
+# # (4, 5) 1
+# # (7, 5) 1
+# # (5, 7) 1
+# # (2, 8) 1
+# # (3, 8) 1
+# # >>> # Read 1st individual (across all SNPs)
+# # >>> print(bed.read_sparse(np.s_[0,:], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE
+# # (0, 8) 2
+# # >>> print(bed.read_sparse(np.s_[::2,:], dtype="int8")) # Read every 2nd individual # doctest:+NORMALIZE_WHITESPACE
+# # (4, 4) 1
+# # (4, 5) 2
+# # (0, 8) 2
+# # (2, 9) 1
+# # (1, 12) 1
+# # >>> # read last and 2nd-to-last individuals and the 15th-from-the-last SNP
+# # >>> print(bed.read_sparse(np.s_[[-1,-2],-15], dtype="int8")) # doctest:+NORMALIZE_WHITESPACE
+# # (1, 0) 2

+# # """
+# # if sparse is None:
+# # raise ImportError(
+# # "The function read_sparse() requires scipy. "
+# # + "Install it with 'pip install --upgrade bed-reader[sparse]'."
+# # )
+# # iid_index_or_slice_etc, sid_index_or_slice_etc = self._split_index(index)

+# # dtype = np.dtype(dtype)

+# # # Similar code in read(). Should _iid_range and _sid_range be cached
+# # # like this, or could they be allocated on demand?
+# # if self._iid_range is None:
+# # self._iid_range = np.arange(self.iid_count, dtype="intp")
+# # if self._sid_range is None:
+# # self._sid_range = np.arange(self.sid_count, dtype="intp")

+# # iid_index = np.ascontiguousarray(
+# # self._iid_range[iid_index_or_slice_etc],
+# # dtype="intp",
+# # )
+# # sid_index = np.ascontiguousarray(
+# # self._sid_range[sid_index_or_slice_etc], dtype="intp"
+# # )

+# # if (
+# # len(iid_index) > np.iinfo(np.int32).max
+# # or len(sid_index) > np.iinfo(np.int32).max
+# # ):
+# # raise ValueError(
+# # f"Too many individuals (samples) or SNPs (variants) requested. Maximum is {np.iinfo(np.int32).max}."
+# # )

+# # if batch_size is None:
+# # batch_size = round(np.sqrt(len(sid_index)))

+# # num_threads = get_num_threads(
+# # self._num_threads if num_threads is None else num_threads
+# # )

+# # if format == "csc":
+# # order = "F"
+# # indptr = np.zeros(len(sid_index) + 1, dtype=np.int32)
+# # elif format == "csr":
+# # order = "C"
+# # indptr = np.zeros(len(iid_index) + 1, dtype=np.int32)
+# # else:
+# # raise ValueError(f"format '{format}' not known. Expected 'csc' or 'csr'.")

+# # # We init data and indices with zero element arrays to set their dtype.
+# # data = [np.empty(0, dtype=dtype)]
+# # indices = [np.empty(0, dtype=np.int32)]

+# # if self.iid_count > 0 and self.sid_count > 0:
+# # if dtype == np.int8:
+# # reader = read_i8
+# # elif dtype == np.float64:
+# # reader = read_f64
+# # elif dtype == np.float32:
+# # reader = read_f32
+# # else:
+# # raise ValueError(
+# # f"dtype '{dtype}' not known, only "
+# # + "'int8', 'float32', and 'float64' are allowed."
+# # ) + +# # if format == "csc": +# # val = np.zeros((len(iid_index), batch_size), order=order, dtype=dtype) +# # for batch_start in range(0, len(sid_index), batch_size): +# # batch_end = batch_start + batch_size +# # if batch_end > len(sid_index): +# # batch_end = len(sid_index) +# # del val +# # val = np.zeros( +# # (len(iid_index), batch_end - batch_start), +# # order=order, +# # dtype=dtype, +# # ) +# # batch_slice = np.s_[batch_start:batch_end] +# # batch_index = sid_index[batch_slice] + +# # reader( +# # str(self.filepath), +# # iid_count=self.iid_count, +# # sid_count=self.sid_count, +# # is_a1_counted=self.count_A1, +# # iid_index=iid_index, +# # sid_index=batch_index, +# # val=val, +# # num_threads=num_threads, +# # ) + +# # self.sparsify( +# # val, order, iid_index, batch_slice, data, indices, indptr +# # ) +# # else: +# # assert format == "csr" # real assert +# # val = np.zeros((batch_size, len(sid_index)), order=order, dtype=dtype) +# # for batch_start in range(0, len(iid_index), batch_size): +# # batch_end = batch_start + batch_size +# # if batch_end > len(iid_index): +# # batch_end = len(iid_index) +# # del val +# # val = np.zeros( +# # (batch_end - batch_start, len(sid_index)), +# # order=order, +# # dtype=dtype, +# # ) + +# # batch_slice = np.s_[batch_start:batch_end] +# # batch_index = iid_index[batch_slice] + +# # reader( +# # str(self.filepath), +# # iid_count=self.iid_count, +# # sid_count=self.sid_count, +# # is_a1_counted=self.count_A1, +# # iid_index=batch_index, +# # sid_index=sid_index, +# # val=val, +# # num_threads=num_threads, +# # ) + +# # self.sparsify( +# # val, order, sid_index, batch_slice, data, indices, indptr +# # ) + +# # data = np.concatenate(data) +# # indices = np.concatenate(indices) + +# # if format == "csc": +# # return sparse.csc_matrix( +# # (data, indices, indptr), (len(iid_index), len(sid_index)) +# # ) +# # else: +# # assert format == "csr" # real assert +# # return sparse.csr_matrix( +# # (data, indices, indptr), (len(iid_index), len(sid_index)) +# # ) + +# # def sparsify(self, val, order, minor_index, batch_slice, data, indices, indptr): +# # flatten = np.ravel(val, order=order) +# # nz_indices = np.flatnonzero(flatten).astype(np.int32) +# # column_indexes = nz_indices // len(minor_index) +# # counts = np.bincount( +# # column_indexes, minlength=batch_slice.stop - batch_slice.start +# # ).astype(np.int32) +# # counts_with_initial = np.r_[ +# # indptr[batch_slice.start : batch_slice.start + 1], counts +# # ] + +# # data.append(flatten[nz_indices]) +# # indices.append(np.mod(nz_indices, len(minor_index))) +# # indptr[1:][batch_slice] = np.cumsum(counts_with_initial)[1:] + + +# def _read_csv(filepath, delimiter=None, dtype=None, usecols=None): +# # Prepare the usecols by ensuring it is a list of indices +# usecols_indices = list(usecols) +# transposed = np.loadtxt( +# filepath, +# dtype=np.str_, +# delimiter=delimiter, +# usecols=usecols_indices, +# unpack=True, +# ) +# if transposed.ndim == 1: +# transposed = transposed.reshape(-1, 1) +# row_count = transposed.shape[1] # because unpack=True + +# # Convert column lists to numpy arrays with the specified dtype +# columns = [] +# for output_index, input_index in enumerate(usecols_indices): +# col = transposed[output_index] +# # Find the dtype for this column +# col_dtype = dtype.get(input_index, np.str_) +# # Convert the column list to a numpy array with the specified dtype +# columns.append(_convert_to_dtype(col, col_dtype)) + +# return columns, row_count + + +# def _convert_to_dtype(str_arr, 
dtype): +# assert dtype in [np.str_, np.float32, np.int32] # real assert + +# if dtype == np.str_: +# return str_arr + +# try: +# new_arr = str_arr.astype(dtype) +# except ValueError as e: +# if dtype == np.float32: +# raise e +# # for backwards compatibility, see if intermediate float helps int conversion +# try: +# assert dtype == np.int32 # real assert +# float_arr = str_arr.astype(np.float32) +# except ValueError: +# raise e +# new_arr = float_arr.astype(np.int32) +# if not np.array_equal(new_arr, float_arr): +# raise ValueError( +# f"invalid literal for int: '{str_arr[np.where(new_arr != float_arr)][:1]}')" +# ) +# return new_arr + + +# if __name__ == "__main__": +# import pytest + +# logging.basicConfig(level=logging.INFO) + +# pytest.main(["--doctest-modules", __file__]) diff --git a/bed_reader/tests/test_open_bed_cloud.py b/bed_reader/tests/test_open_bed_cloud.py index 3da1ae0..288796d 100644 --- a/bed_reader/tests/test_open_bed_cloud.py +++ b/bed_reader/tests/test_open_bed_cloud.py @@ -1,932 +1,932 @@ -import logging -import os -import platform -from pathlib import Path - -import numpy as np -import pytest - -from bed_reader import open_bed_cloud, to_bed - - -@pytest.mark.asyncio -async def test_cloud_read1(shared_datadir): - - file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" - print(type(file), file) - async with await open_bed_cloud.create(file) as bed_cloud: - assert await bed_cloud.iid_count() == 10 - assert (await bed_cloud.fid())[-1] == "0" - assert (await bed_cloud.iid())[-1] == "9" - assert await bed_cloud.shape() == (10, 100) - - val = await bed_cloud.read(dtype="int8") - # really shouldn't do mean on data where -127 represents missing - assert val.mean() == -13.142 - assert (await bed_cloud.chromosome())[-1] == "1" - assert (await bed_cloud.bp_position())[-1] == 100 - - -@pytest.mark.asyncio -async def test_cloud_write(tmp_path, shared_datadir): - in_file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" - out_file = tmp_path / "out.bed" - async with await open_bed_cloud.create(in_file) as bed_cloud: - val0 = await bed_cloud.read() - properties0 = { - "fid": await bed_cloud.fid(), - "iid": await bed_cloud.iid(), - "sid": await bed_cloud.sid(), - "chromosome": await bed_cloud.chromosome(), - "cm_position": await bed_cloud.cm_position(), - "bp_position": await bed_cloud.bp_position(), - } - # cmk write - to_bed(out_file, val0, properties=properties0) - async with await open_bed_cloud.create(out_file) as bed_cloud_1: - assert np.allclose(val0, await bed_cloud_1.read(), equal_nan=True) - assert np.array_equal(await bed_cloud_1.fid(), properties0["fid"]) - assert np.array_equal(await bed_cloud_1.iid(), properties0["iid"]) - assert np.array_equal(await bed_cloud_1.sid(), properties0["sid"]) - assert np.issubdtype((await bed_cloud_1.sid()).dtype, np.str_) - assert np.array_equal(await bed_cloud_1.chromosome(), properties0["chromosome"]) - assert np.allclose(await bed_cloud_1.cm_position(), properties0["cm_position"]) - assert np.allclose(await bed_cloud_1.bp_position(), properties0["bp_position"]) - - # val_float = val0.astype("float") - # val_float[0, 0] = 0.5 - - # for force_python_only in [False, True]: - # with pytest.raises(ValueError): - # to_bed( - # out_file, - # val_float, - # properties=properties0, - # force_python_only=force_python_only, - # ) - # val0[np.isnan(val0)] = 0 # set any nan to 0 - # val_int8 = val0.astype("int8") - # val_int8[0, 0] = -1 - # for force_python_only in [False, True]: - # with pytest.raises(ValueError): - # to_bed( - # out_file, - # 
val_int8, - # properties=properties0, - # force_python_only=force_python_only, - # ) - - -def test_cloud_overrides(shared_datadir): - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - fid = bed.fid - iid = bed.iid - father = bed.father - mother = bed.mother - sex = bed.sex - pheno = bed.pheno - chromosome = bed.chromosome - sid = bed.sid - cm_position = bed.cm_position - bp_position = bed.bp_position - allele_1 = bed.allele_1 - allele_2 = bed.allele_2 - # lock in the expected results: - # np.savez( - # shared_datadir / "some_missing.properties.npz", - # fid=fid, - # iid=iid, - # father=father, - # mother=mother, - # sex=sex, - # pheno=pheno, - # chromosome=chromosome, - # sid=sid, - # cm_position=cm_position, - # bp_position=bp_position, - # allele_1=allele_1, - # allele_2=allele_2, - # ) - property_dict = np.load(shared_datadir / "some_missing.properties.npz") - assert np.array_equal(property_dict["fid"], fid) - assert np.array_equal(property_dict["iid"], iid) - assert np.array_equal(property_dict["father"], father) - assert np.array_equal(property_dict["mother"], mother) - assert np.array_equal(property_dict["sex"], sex) - assert np.array_equal(property_dict["pheno"], pheno) - assert np.array_equal(property_dict["chromosome"], chromosome) - assert np.array_equal(property_dict["sid"], sid) - assert np.array_equal(property_dict["cm_position"], cm_position) - assert np.array_equal(property_dict["bp_position"], bp_position) - assert np.array_equal(property_dict["allele_1"], allele_1) - assert np.array_equal(property_dict["allele_2"], allele_2) - - with pytest.raises(KeyError): - open_bed_cloud( - shared_datadir / "some_missing.bed", properties={"unknown": [3, 4, 4]} - ) - with open_bed_cloud( - shared_datadir / "some_missing.bed", properties={"iid": None} - ) as bed1: - assert bed1.iid is None - with open_bed_cloud( - shared_datadir / "some_missing.bed", properties={"iid": []} - ) as bed1: - assert np.issubdtype(bed1.iid.dtype, np.str_) - assert len(bed1.iid) == 0 - with pytest.raises(ValueError): - bed1.father - - with open_bed_cloud( - shared_datadir / "some_missing.bed", - properties={"sid": [i for i in range(len(sid))]}, - ) as bed1: - assert np.issubdtype(bed1.sid.dtype, np.str_) - assert bed1.sid[0] == "0" - with pytest.raises(ValueError): - open_bed_cloud( - shared_datadir / "some_missing.bed", - properties={"sex": ["F" for i in range(len(sex))]}, - ) # Sex must be coded as a number - with open_bed_cloud( - shared_datadir / "some_missing.bed", - properties={"sid": np.array([i for i in range(len(sid))])}, - ) as bed1: - assert np.issubdtype(bed1.sid.dtype, np.str_) - assert bed1.sid[0] == "0" - with pytest.raises(ValueError): - open_bed_cloud( - shared_datadir / "some_missing.bed", - properties={"sid": np.array([(i, i) for i in range(len(sid))])}, - ) - with open_bed_cloud( - shared_datadir / "some_missing.bed", properties={"sid": [1, 2, 3]} - ) as bed1: - with pytest.raises(ValueError): - bed1.chromosome - - -def test_cloud_str(shared_datadir): - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - assert "open_bed_cloud(" in str(bed) - - -def test_cloud_bad_bed(shared_datadir): - with pytest.raises(ValueError): - open_bed_cloud(shared_datadir / "badfile.bed") - open_bed_cloud(shared_datadir / "badfile.bed", skip_format_check=True) - - -def test_cloud_bad_dtype_or_order(shared_datadir): - with pytest.raises(ValueError): - open_bed_cloud(shared_datadir / "some_missing.bed").read(dtype=np.int32) - with pytest.raises(ValueError): - 
open_bed_cloud(shared_datadir / "some_missing.bed").read(order="X") - with pytest.raises(ValueError): - open_bed_cloud(shared_datadir / "some_missing.bed").read_sparse(dtype=np.int32) - - -def setting_generator(seq_dict, seed=9392): - import itertools - - from numpy.random import RandomState - - longest = max((len(value_list) for value_list in seq_dict.values())) - - for test_index in range(longest): - setting = {} - for offset, (key, value_list) in enumerate(seq_dict.items()): - val = value_list[(test_index + offset) % len(value_list)] - if not (isinstance(val, str) and "leave_out" == val): - setting[key] = val - yield setting - - all_combo = list(itertools.product(*seq_dict.values())) - - random_state = RandomState(seed) - random_state.shuffle(all_combo) - for combo in all_combo: - setting = { - key: value_list - for key, value_list in itertools.zip_longest(seq_dict, combo) - if not (isinstance(value_list, str) and "leave_out" == value_list) - } - yield setting - - -def test_cloud_properties(shared_datadir): - file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" - with open_bed_cloud(file) as bed: - iid_list = bed.iid.tolist() - sid_list = bed.sid.tolist() - chromosome_list = bed.chromosome.tolist() - - test_count = 75 - - seq_dict = { - "iid": ["leave_out", None, iid_list, np.array(iid_list)], - "iid_count": ["leave_out", len(iid_list)], - "iid_before_read": [False, True], - "iid_after_read": [False, True], - "sid": ["leave_out", None, sid_list, np.array(sid_list)], - "sid_count": [None, len(sid_list)], - "sid_before_read": [False, True], - "sid_after_read": [False, True], - "chromosome": ["leave_out", None, chromosome_list, np.array(chromosome_list)], - "chromosome_before_read": [False, True], - "chromosome_after_read": [False, True], - } - - def _not_set_to_none(settings, key): - return key not in settings or settings[key] is not None - - for test_index, settings in enumerate(setting_generator(seq_dict)): - if test_index >= test_count: - break - with open_bed_cloud( - file, - iid_count=settings.get("iid_count"), - sid_count=settings.get("sid_count"), - properties={ - k: v for k, v in settings.items() if k in {"iid", "sid", "chromosome"} - }, - ) as bed: - logging.info(f"Test {test_count}") - if settings["iid_before_read"]: - if _not_set_to_none(settings, "iid"): - assert np.array_equal(bed.iid, iid_list) - else: - assert bed.iid is None - if settings["sid_before_read"]: - if _not_set_to_none(settings, "sid"): - assert np.array_equal(bed.sid, sid_list) - else: - assert bed.sid is None - if settings["chromosome_before_read"]: - if _not_set_to_none(settings, "chromosome"): - assert np.array_equal(bed.chromosome, chromosome_list) - else: - assert bed.chromosome is None - val = bed.read() - assert val.shape == ( - len(iid_list), - len(sid_list), - ) - val_sparse = bed.read_sparse() - assert np.allclose(val, val_sparse.toarray(), equal_nan=True) - if settings["iid_after_read"]: - if _not_set_to_none(settings, "iid"): - assert np.array_equal(bed.iid, iid_list) - else: - assert bed.iid is None - if settings["sid_after_read"]: - if _not_set_to_none(settings, "sid"): - assert np.array_equal(bed.sid, sid_list) - else: - assert bed.sid is None - if settings["chromosome_after_read"]: - if _not_set_to_none(settings, "chromosome"): - assert np.array_equal(bed.chromosome, chromosome_list) - else: - assert bed.chromosome is None - # bed._assert_iid_sid_chromosome() - - -def test_cloud_c_reader_bed(shared_datadir): - for force_python_only, format in [(False, "csc"), (True, "csr")]: - bed = 
open_bed_cloud(shared_datadir / "some_missing.bed", count_A1=False) - - val = bed.read(order="F", force_python_only=force_python_only) - assert val.dtype == np.float32 - ref_val = reference_val(shared_datadir) - ref_val = ref_val * -1 + 2 - assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True) - - val_sparse = bed.read_sparse(format=format) - assert val_sparse.dtype == np.float32 - assert np.allclose( - ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True - ) - - val = bed.read(order="F", dtype="int8", force_python_only=False) - assert val.dtype == np.int8 - ref_val[ref_val != ref_val] = -127 - ref_val = ref_val.astype("int8") - assert np.all(ref_val == val) - - del bed - - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - val = bed.read( - order="F", dtype="float64", force_python_only=force_python_only - ) - ref_val = reference_val(shared_datadir) - assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True) - val_sparse = bed.read_sparse(dtype="float64") - assert np.allclose( - ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True - ) - - -def reference_val(shared_datadir): - val = np.load(shared_datadir / "some_missing.val.npy") - return val - - -# def test_cloud_bed_int8(tmp_path, shared_datadir): +# import logging +# import os +# import platform +# from pathlib import Path + +# import numpy as np +# import pytest + +# from bed_reader import open_bed_cloud, to_bed + + +# @pytest.mark.asyncio +# async def test_cloud_read1(shared_datadir): + +# file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" +# print(type(file), file) +# async with await open_bed_cloud.create(file) as bed_cloud: +# assert await bed_cloud.iid_count() == 10 +# assert (await bed_cloud.fid())[-1] == "0" +# assert (await bed_cloud.iid())[-1] == "9" +# assert await bed_cloud.shape() == (10, 100) + +# val = await bed_cloud.read(dtype="int8") +# # really shouldn't do mean on data where -127 represents missing +# assert val.mean() == -13.142 +# assert (await bed_cloud.chromosome())[-1] == "1" +# assert (await bed_cloud.bp_position())[-1] == 100 + + +# @pytest.mark.asyncio +# async def test_cloud_write(tmp_path, shared_datadir): +# in_file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" +# out_file = tmp_path / "out.bed" +# async with await open_bed_cloud.create(in_file) as bed_cloud: +# val0 = await bed_cloud.read() +# properties0 = { +# "fid": await bed_cloud.fid(), +# "iid": await bed_cloud.iid(), +# "sid": await bed_cloud.sid(), +# "chromosome": await bed_cloud.chromosome(), +# "cm_position": await bed_cloud.cm_position(), +# "bp_position": await bed_cloud.bp_position(), +# } +# # cmk write +# to_bed(out_file, val0, properties=properties0) +# async with await open_bed_cloud.create(out_file) as bed_cloud_1: +# assert np.allclose(val0, await bed_cloud_1.read(), equal_nan=True) +# assert np.array_equal(await bed_cloud_1.fid(), properties0["fid"]) +# assert np.array_equal(await bed_cloud_1.iid(), properties0["iid"]) +# assert np.array_equal(await bed_cloud_1.sid(), properties0["sid"]) +# assert np.issubdtype((await bed_cloud_1.sid()).dtype, np.str_) +# assert np.array_equal(await bed_cloud_1.chromosome(), properties0["chromosome"]) +# assert np.allclose(await bed_cloud_1.cm_position(), properties0["cm_position"]) +# assert np.allclose(await bed_cloud_1.bp_position(), properties0["bp_position"]) + +# # val_float = val0.astype("float") +# # val_float[0, 0] = 0.5 + +# # for force_python_only in [False, True]: +# # with 
pytest.raises(ValueError): +# # to_bed( +# # out_file, +# # val_float, +# # properties=properties0, +# # force_python_only=force_python_only, +# # ) +# # val0[np.isnan(val0)] = 0 # set any nan to 0 +# # val_int8 = val0.astype("int8") +# # val_int8[0, 0] = -1 +# # for force_python_only in [False, True]: +# # with pytest.raises(ValueError): +# # to_bed( +# # out_file, +# # val_int8, +# # properties=properties0, +# # force_python_only=force_python_only, +# # ) + + +# def test_cloud_overrides(shared_datadir): # with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: -# for force_python_only in [False, True]: -# for order, format in [("F", "csc"), ("C", "csr")]: -# val = bed.read( -# dtype="int8", force_python_only=force_python_only, order=order -# ) -# assert val.dtype == np.int8 -# assert (val.flags["C_CONTIGUOUS"] and order == "C") or ( -# val.flags["F_CONTIGUOUS"] and order == "F" -# ) -# ref_val = reference_val(shared_datadir) -# ref_val[ref_val != ref_val] = -127 -# ref_val = ref_val.astype("int8") -# assert np.array_equal(ref_val, val) -# output = str(tmp_path / "int8.bed") -# for count_A1 in [False, True]: -# to_bed( -# output, -# ref_val, -# count_A1=count_A1, -# force_python_only=force_python_only, -# ) -# with open_bed_cloud(output, count_A1=count_A1) as bed2: -# assert np.array_equal( -# bed2.read( -# dtype="int8", force_python_only=force_python_only -# ), -# ref_val, -# ) -# val_sparse = bed2.read_sparse(dtype="int8", format=format) -# assert np.allclose(val_sparse.toarray(), ref_val) - - -# def test_cloud_write1_bed_f64cpp(tmp_path, shared_datadir): +# fid = bed.fid +# iid = bed.iid +# father = bed.father +# mother = bed.mother +# sex = bed.sex +# pheno = bed.pheno +# chromosome = bed.chromosome +# sid = bed.sid +# cm_position = bed.cm_position +# bp_position = bed.bp_position +# allele_1 = bed.allele_1 +# allele_2 = bed.allele_2 +# # lock in the expected results: +# # np.savez( +# # shared_datadir / "some_missing.properties.npz", +# # fid=fid, +# # iid=iid, +# # father=father, +# # mother=mother, +# # sex=sex, +# # pheno=pheno, +# # chromosome=chromosome, +# # sid=sid, +# # cm_position=cm_position, +# # bp_position=bp_position, +# # allele_1=allele_1, +# # allele_2=allele_2, +# # ) +# property_dict = np.load(shared_datadir / "some_missing.properties.npz") +# assert np.array_equal(property_dict["fid"], fid) +# assert np.array_equal(property_dict["iid"], iid) +# assert np.array_equal(property_dict["father"], father) +# assert np.array_equal(property_dict["mother"], mother) +# assert np.array_equal(property_dict["sex"], sex) +# assert np.array_equal(property_dict["pheno"], pheno) +# assert np.array_equal(property_dict["chromosome"], chromosome) +# assert np.array_equal(property_dict["sid"], sid) +# assert np.array_equal(property_dict["cm_position"], cm_position) +# assert np.array_equal(property_dict["bp_position"], bp_position) +# assert np.array_equal(property_dict["allele_1"], allele_1) +# assert np.array_equal(property_dict["allele_2"], allele_2) + +# with pytest.raises(KeyError): +# open_bed_cloud( +# shared_datadir / "some_missing.bed", properties={"unknown": [3, 4, 4]} +# ) +# with open_bed_cloud( +# shared_datadir / "some_missing.bed", properties={"iid": None} +# ) as bed1: +# assert bed1.iid is None +# with open_bed_cloud( +# shared_datadir / "some_missing.bed", properties={"iid": []} +# ) as bed1: +# assert np.issubdtype(bed1.iid.dtype, np.str_) +# assert len(bed1.iid) == 0 +# with pytest.raises(ValueError): +# bed1.father + +# with open_bed_cloud( +# 
shared_datadir / "some_missing.bed", +# properties={"sid": [i for i in range(len(sid))]}, +# ) as bed1: +# assert np.issubdtype(bed1.sid.dtype, np.str_) +# assert bed1.sid[0] == "0" +# with pytest.raises(ValueError): +# open_bed_cloud( +# shared_datadir / "some_missing.bed", +# properties={"sex": ["F" for i in range(len(sex))]}, +# ) # Sex must be coded as a number +# with open_bed_cloud( +# shared_datadir / "some_missing.bed", +# properties={"sid": np.array([i for i in range(len(sid))])}, +# ) as bed1: +# assert np.issubdtype(bed1.sid.dtype, np.str_) +# assert bed1.sid[0] == "0" +# with pytest.raises(ValueError): +# open_bed_cloud( +# shared_datadir / "some_missing.bed", +# properties={"sid": np.array([(i, i) for i in range(len(sid))])}, +# ) +# with open_bed_cloud( +# shared_datadir / "some_missing.bed", properties={"sid": [1, 2, 3]} +# ) as bed1: +# with pytest.raises(ValueError): +# bed1.chromosome + + +# def test_cloud_str(shared_datadir): # with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: -# for iid_index in [0, 1, 5]: -# for force_python_only, format in [(False, "csc"), (True, "csr")]: -# val_sparse = bed.read_sparse( -# np.s_[0:iid_index, :], dtype=np.float64, format=format -# ) -# assert val_sparse.shape == (iid_index, 100) -# val = bed.read( -# np.s_[0:iid_index, :], -# order="F", -# dtype=np.float64, -# force_python_only=force_python_only, -# ) -# assert val.shape == (iid_index, 100) -# output = str(tmp_path / f"toydata.F64cpp.{iid_index}") -# to_bed(output, val, count_A1=False) -# val2 = open_bed_cloud(output, count_A1=False).read(dtype="float64") -# assert np.allclose(val, val2, equal_nan=True) -# assert np.allclose(val_sparse.toarray(), val2, equal_nan=True) - - -# def test_cloud_write1_x_x_cpp(tmp_path, shared_datadir): -# for count_A1 in [False, True]: +# assert "open_bed_cloud(" in str(bed) + + +# def test_cloud_bad_bed(shared_datadir): +# with pytest.raises(ValueError): +# open_bed_cloud(shared_datadir / "badfile.bed") +# open_bed_cloud(shared_datadir / "badfile.bed", skip_format_check=True) + + +# def test_cloud_bad_dtype_or_order(shared_datadir): +# with pytest.raises(ValueError): +# open_bed_cloud(shared_datadir / "some_missing.bed").read(dtype=np.int32) +# with pytest.raises(ValueError): +# open_bed_cloud(shared_datadir / "some_missing.bed").read(order="X") +# with pytest.raises(ValueError): +# open_bed_cloud(shared_datadir / "some_missing.bed").read_sparse(dtype=np.int32) + + +# def setting_generator(seq_dict, seed=9392): +# import itertools + +# from numpy.random import RandomState + +# longest = max((len(value_list) for value_list in seq_dict.values())) + +# for test_index in range(longest): +# setting = {} +# for offset, (key, value_list) in enumerate(seq_dict.items()): +# val = value_list[(test_index + offset) % len(value_list)] +# if not (isinstance(val, str) and "leave_out" == val): +# setting[key] = val +# yield setting + +# all_combo = list(itertools.product(*seq_dict.values())) + +# random_state = RandomState(seed) +# random_state.shuffle(all_combo) +# for combo in all_combo: +# setting = { +# key: value_list +# for key, value_list in itertools.zip_longest(seq_dict, combo) +# if not (isinstance(value_list, str) and "leave_out" == value_list) +# } +# yield setting + + +# def test_cloud_properties(shared_datadir): +# file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" +# with open_bed_cloud(file) as bed: +# iid_list = bed.iid.tolist() +# sid_list = bed.sid.tolist() +# chromosome_list = bed.chromosome.tolist() + +# test_count = 75 + +# 
seq_dict = { +# "iid": ["leave_out", None, iid_list, np.array(iid_list)], +# "iid_count": ["leave_out", len(iid_list)], +# "iid_before_read": [False, True], +# "iid_after_read": [False, True], +# "sid": ["leave_out", None, sid_list, np.array(sid_list)], +# "sid_count": [None, len(sid_list)], +# "sid_before_read": [False, True], +# "sid_after_read": [False, True], +# "chromosome": ["leave_out", None, chromosome_list, np.array(chromosome_list)], +# "chromosome_before_read": [False, True], +# "chromosome_after_read": [False, True], +# } + +# def _not_set_to_none(settings, key): +# return key not in settings or settings[key] is not None + +# for test_index, settings in enumerate(setting_generator(seq_dict)): +# if test_index >= test_count: +# break # with open_bed_cloud( -# shared_datadir / "some_missing.bed", count_A1=count_A1 +# file, +# iid_count=settings.get("iid_count"), +# sid_count=settings.get("sid_count"), +# properties={ +# k: v for k, v in settings.items() if k in {"iid", "sid", "chromosome"} +# }, # ) as bed: -# for order, format in [("F", "csc"), ("C", "csr")]: -# for dtype in [np.float32, np.float64]: -# val = bed.read(order=order, dtype=dtype) -# properties = bed.properties -# val[-1, 0] = float("NAN") -# output = str( -# tmp_path -# / "toydata.{0}{1}.cpp".format( -# order, "32" if dtype == np.float32 else "64" -# ) -# ) -# to_bed(output, val, properties=properties, count_A1=count_A1) -# val2 = open_bed_cloud(output, count_A1=count_A1).read(dtype=dtype) -# assert np.allclose(val, val2, equal_nan=True) -# val_sparse = open_bed_cloud(output, count_A1=count_A1).read_sparse( -# dtype=dtype, format=format -# ) -# assert np.allclose(val, val_sparse.toarray(), equal_nan=True) - - -def test_cloud_respect_read_inputs(shared_datadir): - import scipy.sparse as sparse - - ref_val_float = reference_val(shared_datadir) - ref_val_float2 = ref_val_float.copy() - ref_val_float2[ref_val_float != ref_val_float] = -127 - ref_val_int8 = ref_val_float2.astype("int8") - - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - for order, format in [("F", "csc"), ("C", "csr")]: - for dtype in [np.int8, np.float32, np.float64]: - for force_python_only in [True, False]: - val = bed.read( - order=order, dtype=dtype, force_python_only=force_python_only - ) - has_right_order = (order == "C" and val.flags["C_CONTIGUOUS"]) or ( - order == "F" and val.flags["F_CONTIGUOUS"] - ) - assert val.dtype == dtype and has_right_order - ref_val = ref_val_int8 if dtype == np.int8 else ref_val_float - assert np.allclose(ref_val, val, equal_nan=True) - - val_sparse = bed.read_sparse(dtype=dtype, format=format) - has_right_format = ( - format == "csc" and isinstance(val_sparse, sparse.csc_matrix) - ) or (format == "csr" and isinstance(val_sparse, sparse.csr_matrix)) - assert val_sparse.dtype == dtype and has_right_format - assert np.allclose(ref_val, val_sparse.toarray(), equal_nan=True) - - -def test_cloud_threads(shared_datadir): - ref_val_float = reference_val(shared_datadir) - ref_val_float2 = ref_val_float.copy() - ref_val_float2[ref_val_float != ref_val_float] = -127 - ref_val_int8 = ref_val_float2.astype("int8") - - for num_threads in [1, 4]: - with open_bed_cloud( - shared_datadir / "some_missing.bed", num_threads=num_threads - ) as bed: - val = bed.read(dtype="int8") - assert np.allclose(ref_val_int8, val, equal_nan=True) - val_sparse = bed.read_sparse(dtype="int8") - assert np.allclose(ref_val_int8, val_sparse.toarray(), equal_nan=True) - - -# def test_cloud_write12(tmp_path): -# # 
=================================== -# # Starting main function -# # =================================== -# logging.info("starting 'test_writes'") -# np.random.seed(0) -# output_template = str(tmp_path / "writes.{0}.bed") -# i = 0 -# for row_count in [0, 5, 2, 1]: -# for col_count in [0, 4, 2, 1]: -# val = np.random.randint(0, 4, size=(row_count, col_count)) * 1.0 -# val[val == 3] = np.NaN -# row0 = ["0", "1", "2", "3", "4"][:row_count] -# row1 = ["0", "1", "2", "3", "4"][:row_count] -# col = ["s0", "s1", "s2", "s3", "s4"][:col_count] -# for is_none in [True, False]: -# properties = {"fid": row0, "iid": row1, "sid": col} -# if is_none: -# col_prop012 = [x for x in range(5)][:col_count] -# properties["chromosome"] = col_prop012 -# properties["bp_position"] = col_prop012 -# properties["cm_position"] = col_prop012 +# logging.info(f"Test {test_count}") +# if settings["iid_before_read"]: +# if _not_set_to_none(settings, "iid"): +# assert np.array_equal(bed.iid, iid_list) # else: -# col_prop012 = None - -# filename = output_template.format(i) -# logging.info(filename) -# i += 1 -# to_bed(filename, val, properties=properties) -# for subsetter in [None, np.s_[::2, ::3]]: -# with open_bed_cloud(filename) as bed: -# val2 = bed.read(index=subsetter, order="C", dtype="float32") -# if subsetter is None: -# expected = val -# else: -# expected = val[subsetter[0], :][:, subsetter[1]] -# assert np.allclose(val2, expected, equal_nan=True) -# assert np.array_equal(bed.fid, np.array(row0, dtype="str")) -# assert np.array_equal(bed.iid, np.array(row1, dtype="str")) -# assert np.array_equal(bed.sid, np.array(col, dtype="str")) -# if col_prop012 is not None: -# assert np.array_equal( -# bed.chromosome, np.array(col_prop012, dtype="str") -# ) -# assert np.array_equal( -# bed.bp_position, np.array(col_prop012) -# ) -# assert np.array_equal( -# bed.cm_position, np.array(col_prop012) -# ) -# try: -# os.remove(filename) -# except Exception: -# pass -# logging.info("done with 'test_writes'") - - -# def test_cloud_writes_small(tmp_path): -# output_file = tmp_path / "small.bed" - -# val = [[1.0, 0, np.nan, 0], [2, 0, np.nan, 2], [0, 1, 2, 0]] - -# properties = { -# "fid": ["fid1", "fid1", "fid2"], -# "iid": ["iid1", "iid2", "iid3"], -# "father": ["iid23", "iid23", "iid22"], -# "mother": ["iid34", "iid34", "iid33"], -# "sex": [1, 2, 0], -# "pheno": ["red", "red", "blue"], -# "chromosome": ["1", "1", "5", "Y"], -# "sid": ["sid1", "sid2", "sid3", "sid4"], -# "cm_position": [100.4, 2000.5, 4000.7, 7000.9], -# "bp_position": [1, 100, 1000, 1004], -# "allele_1": ["A", "T", "A", "T"], -# "allele_2": ["A", "C", "C", "G"], -# } +# assert bed.iid is None +# if settings["sid_before_read"]: +# if _not_set_to_none(settings, "sid"): +# assert np.array_equal(bed.sid, sid_list) +# else: +# assert bed.sid is None +# if settings["chromosome_before_read"]: +# if _not_set_to_none(settings, "chromosome"): +# assert np.array_equal(bed.chromosome, chromosome_list) +# else: +# assert bed.chromosome is None +# val = bed.read() +# assert val.shape == ( +# len(iid_list), +# len(sid_list), +# ) +# val_sparse = bed.read_sparse() +# assert np.allclose(val, val_sparse.toarray(), equal_nan=True) +# if settings["iid_after_read"]: +# if _not_set_to_none(settings, "iid"): +# assert np.array_equal(bed.iid, iid_list) +# else: +# assert bed.iid is None +# if settings["sid_after_read"]: +# if _not_set_to_none(settings, "sid"): +# assert np.array_equal(bed.sid, sid_list) +# else: +# assert bed.sid is None +# if settings["chromosome_after_read"]: +# if 
_not_set_to_none(settings, "chromosome"): +# assert np.array_equal(bed.chromosome, chromosome_list) +# else: +# assert bed.chromosome is None +# # bed._assert_iid_sid_chromosome() -# to_bed(output_file, val, properties=properties) -# with open_bed_cloud(output_file) as bed: -# assert np.allclose(bed.read(), val, equal_nan=True) -# for key, value in bed.properties.items(): -# assert np.array_equal(value, properties[key]) or np.allclose( -# value, properties[key] +# def test_cloud_c_reader_bed(shared_datadir): +# for force_python_only, format in [(False, "csc"), (True, "csr")]: +# bed = open_bed_cloud(shared_datadir / "some_missing.bed", count_A1=False) + +# val = bed.read(order="F", force_python_only=force_python_only) +# assert val.dtype == np.float32 +# ref_val = reference_val(shared_datadir) +# ref_val = ref_val * -1 + 2 +# assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True) + +# val_sparse = bed.read_sparse(format=format) +# assert val_sparse.dtype == np.float32 +# assert np.allclose( +# ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True +# ) + +# val = bed.read(order="F", dtype="int8", force_python_only=False) +# assert val.dtype == np.int8 +# ref_val[ref_val != ref_val] = -127 +# ref_val = ref_val.astype("int8") +# assert np.all(ref_val == val) + +# del bed + +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# val = bed.read( +# order="F", dtype="float64", force_python_only=force_python_only +# ) +# ref_val = reference_val(shared_datadir) +# assert np.allclose(ref_val, val, rtol=1e-05, atol=1e-05, equal_nan=True) +# val_sparse = bed.read_sparse(dtype="float64") +# assert np.allclose( +# ref_val, val_sparse.toarray(), rtol=1e-05, atol=1e-05, equal_nan=True # ) -def test_cloud_index(shared_datadir): - ref_val_float = reference_val(shared_datadir) - - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - val = bed.read() - assert np.allclose(ref_val_float, val, equal_nan=True) - val_sparse = bed.read_sparse() - assert np.allclose(ref_val_float, val_sparse.toarray(), equal_nan=True) - - val = bed.read(2) - assert np.allclose(ref_val_float[:, [2]], val, equal_nan=True) - val_sparse = bed.read_sparse(2) - assert np.allclose(ref_val_float[:, [2]], val_sparse.toarray(), equal_nan=True) - - val = bed.read((2)) - assert np.allclose(ref_val_float[:, [2]], val, equal_nan=True) - val_sparse = bed.read_sparse((2)) - assert np.allclose(ref_val_float[:, [2]], val_sparse.toarray(), equal_nan=True) - - val = bed.read((None, 2)) - assert np.allclose(ref_val_float[:, [2]], val, equal_nan=True) - val_sparse = bed.read_sparse((None, 2)) - assert np.allclose(ref_val_float[:, [2]], val_sparse.toarray(), equal_nan=True) - - val = bed.read((1, 2)) - assert np.allclose(ref_val_float[[1], [2]], val, equal_nan=True) - val_sparse = bed.read_sparse((1, 2)) - assert np.allclose( - ref_val_float[[1], [2]], val_sparse.toarray(), equal_nan=True - ) - - val = bed.read([2, -2]) - assert np.allclose(ref_val_float[:, [2, -2]], val, equal_nan=True) - val_sparse = bed.read_sparse([2, -2]) - assert np.allclose( - ref_val_float[:, [2, -2]], val_sparse.toarray(), equal_nan=True - ) - - val = bed.read(([1, -1], [2, -2])) - assert np.allclose(ref_val_float[[1, -1], :][:, [2, -2]], val, equal_nan=True) - val_sparse = bed.read_sparse(([1, -1], [2, -2])) - assert np.allclose( - ref_val_float[[1, -1], :][:, [2, -2]], val_sparse.toarray(), equal_nan=True - ) - - iid_bool = ([False, False, True] * bed.iid_count)[: bed.iid_count] - sid_bool = ([True, False, True] * 
bed.sid_count)[: bed.sid_count] - val = bed.read(sid_bool) - assert np.allclose(ref_val_float[:, sid_bool], val, equal_nan=True) - val_sparse = bed.read_sparse(sid_bool) - assert np.allclose( - ref_val_float[:, sid_bool], val_sparse.toarray(), equal_nan=True - ) - - val = bed.read((iid_bool, sid_bool)) - assert np.allclose(ref_val_float[iid_bool, :][:, sid_bool], val, equal_nan=True) - val_sparse = bed.read_sparse((iid_bool, sid_bool)) - - val = bed.read((1, sid_bool)) - assert np.allclose(ref_val_float[[1], :][:, sid_bool], val, equal_nan=True) - val_sparse = bed.read_sparse((1, sid_bool)) - assert np.allclose( - ref_val_float[[1], :][:, sid_bool], val_sparse.toarray(), equal_nan=True - ) - - slicer = np.s_[::2, ::3] - val = bed.read(slicer[1]) - assert np.allclose(ref_val_float[:, slicer[1]], val, equal_nan=True) - val_sparse = bed.read_sparse(slicer[1]) - assert np.allclose( - ref_val_float[:, slicer[1]], val_sparse.toarray(), equal_nan=True - ) - - val = bed.read(slicer) - assert np.allclose(ref_val_float[slicer], val, equal_nan=True) - val_sparse = bed.read_sparse(slicer) - assert np.allclose(ref_val_float[slicer], val_sparse.toarray(), equal_nan=True) - - val = bed.read((1, slicer[1])) - assert np.allclose(ref_val_float[[1], slicer[1]], val, equal_nan=True) - val_sparse = bed.read_sparse((1, slicer[1])) - assert np.allclose( - ref_val_float[[1], slicer[1]], val_sparse.toarray(), equal_nan=True - ) - - -def test_cloud_shape(shared_datadir): - with open_bed_cloud(shared_datadir / "plink_sim_10s_100v_10pmiss.bed") as bed: - assert bed.shape == (10, 100) - - -# def test_cloud_zero_files(tmp_path): -# for force_python_only, format in [(False, "csc"), (True, "csr")]: -# for iid_count in [3, 0]: -# for sid_count in [0, 5]: -# for dtype in [np.int8, np.float32, np.float64]: -# val = np.zeros((iid_count, sid_count), dtype=dtype) -# if iid_count * sid_count > 0: -# val[0, 0] = 2 -# val[0, 1] = -127 if np.dtype(dtype) == np.int8 else np.nan -# filename = str(tmp_path / "zero_files.bed") - -# # Write -# to_bed(filename, val, force_python_only=force_python_only) - -# # Read -# with open_bed_cloud(filename) as bed2: -# val2 = bed2.read(dtype=dtype) -# assert np.allclose(val, val2, equal_nan=True) -# val_sparse = bed2.read_sparse(dtype=dtype, format=format) -# assert np.allclose(val, val_sparse.toarray(), equal_nan=True) -# properties2 = bed2.properties -# for prop in properties2.values(): -# assert len(prop) in {iid_count, sid_count} - -# # Change properties and write again -# if iid_count > 0: -# properties2["iid"][0] = "iidx" -# if sid_count > 0: -# properties2["sid"][0] = "sidx" -# to_bed( -# filename, -# val2, -# properties=properties2, -# force_python_only=force_python_only, +# def reference_val(shared_datadir): +# val = np.load(shared_datadir / "some_missing.val.npy") +# return val + + +# # def test_cloud_bed_int8(tmp_path, shared_datadir): +# # with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# # for force_python_only in [False, True]: +# # for order, format in [("F", "csc"), ("C", "csr")]: +# # val = bed.read( +# # dtype="int8", force_python_only=force_python_only, order=order +# # ) +# # assert val.dtype == np.int8 +# # assert (val.flags["C_CONTIGUOUS"] and order == "C") or ( +# # val.flags["F_CONTIGUOUS"] and order == "F" +# # ) +# # ref_val = reference_val(shared_datadir) +# # ref_val[ref_val != ref_val] = -127 +# # ref_val = ref_val.astype("int8") +# # assert np.array_equal(ref_val, val) +# # output = str(tmp_path / "int8.bed") +# # for count_A1 in [False, True]: +# # 
to_bed( +# # output, +# # ref_val, +# # count_A1=count_A1, +# # force_python_only=force_python_only, +# # ) +# # with open_bed_cloud(output, count_A1=count_A1) as bed2: +# # assert np.array_equal( +# # bed2.read( +# # dtype="int8", force_python_only=force_python_only +# # ), +# # ref_val, +# # ) +# # val_sparse = bed2.read_sparse(dtype="int8", format=format) +# # assert np.allclose(val_sparse.toarray(), ref_val) + + +# # def test_cloud_write1_bed_f64cpp(tmp_path, shared_datadir): +# # with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# # for iid_index in [0, 1, 5]: +# # for force_python_only, format in [(False, "csc"), (True, "csr")]: +# # val_sparse = bed.read_sparse( +# # np.s_[0:iid_index, :], dtype=np.float64, format=format +# # ) +# # assert val_sparse.shape == (iid_index, 100) +# # val = bed.read( +# # np.s_[0:iid_index, :], +# # order="F", +# # dtype=np.float64, +# # force_python_only=force_python_only, +# # ) +# # assert val.shape == (iid_index, 100) +# # output = str(tmp_path / f"toydata.F64cpp.{iid_index}") +# # to_bed(output, val, count_A1=False) +# # val2 = open_bed_cloud(output, count_A1=False).read(dtype="float64") +# # assert np.allclose(val, val2, equal_nan=True) +# # assert np.allclose(val_sparse.toarray(), val2, equal_nan=True) + + +# # def test_cloud_write1_x_x_cpp(tmp_path, shared_datadir): +# # for count_A1 in [False, True]: +# # with open_bed_cloud( +# # shared_datadir / "some_missing.bed", count_A1=count_A1 +# # ) as bed: +# # for order, format in [("F", "csc"), ("C", "csr")]: +# # for dtype in [np.float32, np.float64]: +# # val = bed.read(order=order, dtype=dtype) +# # properties = bed.properties +# # val[-1, 0] = float("NAN") +# # output = str( +# # tmp_path +# # / "toydata.{0}{1}.cpp".format( +# # order, "32" if dtype == np.float32 else "64" +# # ) +# # ) +# # to_bed(output, val, properties=properties, count_A1=count_A1) +# # val2 = open_bed_cloud(output, count_A1=count_A1).read(dtype=dtype) +# # assert np.allclose(val, val2, equal_nan=True) +# # val_sparse = open_bed_cloud(output, count_A1=count_A1).read_sparse( +# # dtype=dtype, format=format +# # ) +# # assert np.allclose(val, val_sparse.toarray(), equal_nan=True) + + +# def test_cloud_respect_read_inputs(shared_datadir): +# import scipy.sparse as sparse + +# ref_val_float = reference_val(shared_datadir) +# ref_val_float2 = ref_val_float.copy() +# ref_val_float2[ref_val_float != ref_val_float] = -127 +# ref_val_int8 = ref_val_float2.astype("int8") + +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# for order, format in [("F", "csc"), ("C", "csr")]: +# for dtype in [np.int8, np.float32, np.float64]: +# for force_python_only in [True, False]: +# val = bed.read( +# order=order, dtype=dtype, force_python_only=force_python_only # ) +# has_right_order = (order == "C" and val.flags["C_CONTIGUOUS"]) or ( +# order == "F" and val.flags["F_CONTIGUOUS"] +# ) +# assert val.dtype == dtype and has_right_order +# ref_val = ref_val_int8 if dtype == np.int8 else ref_val_float +# assert np.allclose(ref_val, val, equal_nan=True) -# # Read again -# with open_bed_cloud(filename) as bed3: -# val3 = bed3.read(dtype=dtype) -# assert np.allclose(val, val3, equal_nan=True) -# val_sparse = bed3.read_sparse(dtype=dtype, format=format) -# assert np.allclose(val, val_sparse.toarray(), equal_nan=True) -# properties3 = bed3.properties -# for key2, value_list2 in properties2.items(): -# value_list3 = properties3[key2] -# assert np.array_equal(value_list2, value_list3) - - -def 
test_cloud_iid_sid_count(shared_datadir): - iid_count_ref, sid_count_ref = open_bed_cloud( - shared_datadir / "plink_sim_10s_100v_10pmiss.bed" - ).shape - assert (iid_count_ref, sid_count_ref) == open_bed_cloud( - shared_datadir / "plink_sim_10s_100v_10pmiss.bed", iid_count=iid_count_ref - ).shape - assert (iid_count_ref, sid_count_ref) == open_bed_cloud( - shared_datadir / "plink_sim_10s_100v_10pmiss.bed", sid_count=sid_count_ref - ).shape - assert (iid_count_ref, sid_count_ref) == open_bed_cloud( - shared_datadir / "plink_sim_10s_100v_10pmiss.bed", - iid_count=iid_count_ref, - sid_count=sid_count_ref, - ).shape - - -def test_cloud_sample_file(): - from bed_reader import open_bed_cloud, sample_file - - file_name = sample_file("small.bed") - with open_bed_cloud(file_name) as bed: - print(bed.iid) - print(bed.sid) - print(bed.read()) - print(bed.read_sparse()) - - -# def test_cloud_coverage2(shared_datadir, tmp_path): -# with open_bed_cloud( -# shared_datadir / "plink_sim_10s_100v_10pmiss.bed", properties={"iid": None} -# ) as bed: -# assert bed.iid is None -# with pytest.raises(ValueError): -# open_bed_cloud( -# shared_datadir / "plink_sim_10s_100v_10pmiss.bed", -# properties={"iid": [1, 2, 3], "mother": [1, 2]}, -# ) -# val = np.zeros((3, 5))[::2] -# assert not val.flags["C_CONTIGUOUS"] and not val.flags["F_CONTIGUOUS"] -# with pytest.raises(ValueError): -# to_bed(tmp_path / "ignore", val) -# val = np.zeros((3, 5), dtype=np.str_) -# with pytest.raises(ValueError): -# to_bed(tmp_path / "ignore", val) +# val_sparse = bed.read_sparse(dtype=dtype, format=format) +# has_right_format = ( +# format == "csc" and isinstance(val_sparse, sparse.csc_matrix) +# ) or (format == "csr" and isinstance(val_sparse, sparse.csr_matrix)) +# assert val_sparse.dtype == dtype and has_right_format +# assert np.allclose(ref_val, val_sparse.toarray(), equal_nan=True) -# def test_cloud_coverage3(shared_datadir, tmp_path): -# with pytest.warns(RuntimeWarning, match="invalid value encountered in cast"): +# def test_cloud_threads(shared_datadir): +# ref_val_float = reference_val(shared_datadir) +# ref_val_float2 = ref_val_float.copy() +# ref_val_float2[ref_val_float != ref_val_float] = -127 +# ref_val_int8 = ref_val_float2.astype("int8") + +# for num_threads in [1, 4]: # with open_bed_cloud( -# shared_datadir / "small.bed", properties={"sex": [1.0, np.nan, 1.0, 2.0]} +# shared_datadir / "some_missing.bed", num_threads=num_threads # ) as bed: -# assert np.array_equal(bed.sex, np.array([1, 0, 1, 2])) +# val = bed.read(dtype="int8") +# assert np.allclose(ref_val_int8, val, equal_nan=True) +# val_sparse = bed.read_sparse(dtype="int8") +# assert np.allclose(ref_val_int8, val_sparse.toarray(), equal_nan=True) + + +# # def test_cloud_write12(tmp_path): +# # # =================================== +# # # Starting main function +# # # =================================== +# # logging.info("starting 'test_writes'") +# # np.random.seed(0) +# # output_template = str(tmp_path / "writes.{0}.bed") +# # i = 0 +# # for row_count in [0, 5, 2, 1]: +# # for col_count in [0, 4, 2, 1]: +# # val = np.random.randint(0, 4, size=(row_count, col_count)) * 1.0 +# # val[val == 3] = np.NaN +# # row0 = ["0", "1", "2", "3", "4"][:row_count] +# # row1 = ["0", "1", "2", "3", "4"][:row_count] +# # col = ["s0", "s1", "s2", "s3", "s4"][:col_count] +# # for is_none in [True, False]: +# # properties = {"fid": row0, "iid": row1, "sid": col} +# # if is_none: +# # col_prop012 = [x for x in range(5)][:col_count] +# # properties["chromosome"] = col_prop012 +# # 
properties["bp_position"] = col_prop012 +# # properties["cm_position"] = col_prop012 +# # else: +# # col_prop012 = None + +# # filename = output_template.format(i) +# # logging.info(filename) +# # i += 1 +# # to_bed(filename, val, properties=properties) +# # for subsetter in [None, np.s_[::2, ::3]]: +# # with open_bed_cloud(filename) as bed: +# # val2 = bed.read(index=subsetter, order="C", dtype="float32") +# # if subsetter is None: +# # expected = val +# # else: +# # expected = val[subsetter[0], :][:, subsetter[1]] +# # assert np.allclose(val2, expected, equal_nan=True) +# # assert np.array_equal(bed.fid, np.array(row0, dtype="str")) +# # assert np.array_equal(bed.iid, np.array(row1, dtype="str")) +# # assert np.array_equal(bed.sid, np.array(col, dtype="str")) +# # if col_prop012 is not None: +# # assert np.array_equal( +# # bed.chromosome, np.array(col_prop012, dtype="str") +# # ) +# # assert np.array_equal( +# # bed.bp_position, np.array(col_prop012) +# # ) +# # assert np.array_equal( +# # bed.cm_position, np.array(col_prop012) +# # ) +# # try: +# # os.remove(filename) +# # except Exception: +# # pass +# # logging.info("done with 'test_writes'") + + +# # def test_cloud_writes_small(tmp_path): +# # output_file = tmp_path / "small.bed" + +# # val = [[1.0, 0, np.nan, 0], [2, 0, np.nan, 2], [0, 1, 2, 0]] + +# # properties = { +# # "fid": ["fid1", "fid1", "fid2"], +# # "iid": ["iid1", "iid2", "iid3"], +# # "father": ["iid23", "iid23", "iid22"], +# # "mother": ["iid34", "iid34", "iid33"], +# # "sex": [1, 2, 0], +# # "pheno": ["red", "red", "blue"], +# # "chromosome": ["1", "1", "5", "Y"], +# # "sid": ["sid1", "sid2", "sid3", "sid4"], +# # "cm_position": [100.4, 2000.5, 4000.7, 7000.9], +# # "bp_position": [1, 100, 1000, 1004], +# # "allele_1": ["A", "T", "A", "T"], +# # "allele_2": ["A", "C", "C", "G"], +# # } + +# # to_bed(output_file, val, properties=properties) + +# # with open_bed_cloud(output_file) as bed: +# # assert np.allclose(bed.read(), val, equal_nan=True) +# # for key, value in bed.properties.items(): +# # assert np.array_equal(value, properties[key]) or np.allclose( +# # value, properties[key] +# # ) + + +# def test_cloud_index(shared_datadir): +# ref_val_float = reference_val(shared_datadir) -# with open_bed_cloud( -# shared_datadir / "small.bed", -# properties={"cm_position": [1000.0, np.nan, 2000.0, 3000.0]}, -# ) as bed: -# assert np.array_equal(bed.cm_position, np.array([1000, 0, 2000, 3000])) +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# val = bed.read() +# assert np.allclose(ref_val_float, val, equal_nan=True) +# val_sparse = bed.read_sparse() +# assert np.allclose(ref_val_float, val_sparse.toarray(), equal_nan=True) + +# val = bed.read(2) +# assert np.allclose(ref_val_float[:, [2]], val, equal_nan=True) +# val_sparse = bed.read_sparse(2) +# assert np.allclose(ref_val_float[:, [2]], val_sparse.toarray(), equal_nan=True) + +# val = bed.read((2)) +# assert np.allclose(ref_val_float[:, [2]], val, equal_nan=True) +# val_sparse = bed.read_sparse((2)) +# assert np.allclose(ref_val_float[:, [2]], val_sparse.toarray(), equal_nan=True) + +# val = bed.read((None, 2)) +# assert np.allclose(ref_val_float[:, [2]], val, equal_nan=True) +# val_sparse = bed.read_sparse((None, 2)) +# assert np.allclose(ref_val_float[:, [2]], val_sparse.toarray(), equal_nan=True) + +# val = bed.read((1, 2)) +# assert np.allclose(ref_val_float[[1], [2]], val, equal_nan=True) +# val_sparse = bed.read_sparse((1, 2)) +# assert np.allclose( +# ref_val_float[[1], [2]], 
val_sparse.toarray(), equal_nan=True +# ) -# list = [1.0, 0, np.nan, 0] -# output_file = tmp_path / "1d.bed" -# with pytest.raises(ValueError): -# to_bed(output_file, list) -# to_bed(output_file, np.array([list], dtype=np.float16)) -# with open_bed_cloud(output_file) as bed: -# assert np.allclose(bed.read(), [list], equal_nan=True) -# assert np.allclose(bed.read_sparse().toarray(), [list], equal_nan=True) - - -# def test_cloud_nones(shared_datadir, tmp_path): -# properties = { -# "father": None, -# "mother": None, -# "sex": None, -# "pheno": None, -# "allele_1": None, -# "allele_2": None, -# } +# val = bed.read([2, -2]) +# assert np.allclose(ref_val_float[:, [2, -2]], val, equal_nan=True) +# val_sparse = bed.read_sparse([2, -2]) +# assert np.allclose( +# ref_val_float[:, [2, -2]], val_sparse.toarray(), equal_nan=True +# ) -# with open_bed_cloud(shared_datadir / "small.bed", properties=properties) as bed: -# assert np.array_equal(bed.iid, ["iid1", "iid2", "iid3"]) -# assert bed.father is None +# val = bed.read(([1, -1], [2, -2])) +# assert np.allclose(ref_val_float[[1, -1], :][:, [2, -2]], val, equal_nan=True) +# val_sparse = bed.read_sparse(([1, -1], [2, -2])) +# assert np.allclose( +# ref_val_float[[1, -1], :][:, [2, -2]], val_sparse.toarray(), equal_nan=True +# ) -# val = [[1.0, 0, np.nan, 0], [2, 0, np.nan, 2], [0, 1, 2, 0]] -# out_file = tmp_path / "testnones.bed" -# to_bed(out_file, val, properties=properties) +# iid_bool = ([False, False, True] * bed.iid_count)[: bed.iid_count] +# sid_bool = ([True, False, True] * bed.sid_count)[: bed.sid_count] +# val = bed.read(sid_bool) +# assert np.allclose(ref_val_float[:, sid_bool], val, equal_nan=True) +# val_sparse = bed.read_sparse(sid_bool) +# assert np.allclose( +# ref_val_float[:, sid_bool], val_sparse.toarray(), equal_nan=True +# ) +# val = bed.read((iid_bool, sid_bool)) +# assert np.allclose(ref_val_float[iid_bool, :][:, sid_bool], val, equal_nan=True) +# val_sparse = bed.read_sparse((iid_bool, sid_bool)) -# def test_cloud_fam_bim_filepath(shared_datadir, tmp_path): -# with open_bed_cloud(shared_datadir / "small.bed") as bed: -# val = bed.read() -# properties = bed.properties -# output_file = tmp_path / "small.deb" -# fam_file = tmp_path / "small.maf" -# bim_file = tmp_path / "small.mib" -# to_bed( -# output_file, -# val, -# properties=properties, -# fam_filepath=fam_file, -# bim_filepath=bim_file, -# ) -# assert output_file.exists() and fam_file.exists() and bim_file.exists() +# val = bed.read((1, sid_bool)) +# assert np.allclose(ref_val_float[[1], :][:, sid_bool], val, equal_nan=True) +# val_sparse = bed.read_sparse((1, sid_bool)) +# assert np.allclose( +# ref_val_float[[1], :][:, sid_bool], val_sparse.toarray(), equal_nan=True +# ) -# with open_bed_cloud( -# output_file, fam_filepath=fam_file, bim_filepath=bim_file -# ) as deb: -# val2 = deb.read() -# assert np.allclose(val, val2, equal_nan=True) -# val_sparse = deb.read_sparse() -# assert np.allclose(val, val_sparse.toarray(), equal_nan=True) -# properties2 = deb.properties -# for key in properties: -# np.array_equal(properties[key], properties2[key]) - - -# def test_cloud_write_nan_properties(shared_datadir, tmp_path): -# with open_bed_cloud(shared_datadir / "small.bed") as bed: -# val = bed.read() -# properties = bed.properties -# chrom = bed.chromosome.copy() -# chrom[bed.chromosome == "Y"] = 0 -# chrom = np.array(chrom, dtype="float") -# chrom2 = chrom.copy() -# chrom2[chrom2 == 0] = np.nan -# cm_p = bed.cm_position.copy() -# cm_p[cm_p < 3000] = 0 -# cm_p2 = cm_p.copy() -# 
cm_p2[cm_p == 0] = np.nan -# properties["chromosome"] = chrom2 -# properties["cm_position"] = cm_p2 - -# output_file = tmp_path / "nan.bed" -# to_bed(output_file, val, properties=properties) - -# with open_bed_cloud(output_file) as bed2: -# assert np.array_equal(bed2.chromosome, ["1.0", "1.0", "5.0", "0"]) -# assert np.array_equal(bed2.cm_position, cm_p) +# slicer = np.s_[::2, ::3] +# val = bed.read(slicer[1]) +# assert np.allclose(ref_val_float[:, slicer[1]], val, equal_nan=True) +# val_sparse = bed.read_sparse(slicer[1]) +# assert np.allclose( +# ref_val_float[:, slicer[1]], val_sparse.toarray(), equal_nan=True +# ) -# with open_bed_cloud( -# shared_datadir / "small.bed", -# properties={"chromosome": chrom2, "cm_position": cm_p2}, -# ) as bed3: -# assert np.array_equal(bed3.chromosome, ["1.0", "1.0", "5.0", "0"]) -# assert np.array_equal(bed3.cm_position, cm_p) - - -def test_cloud_env(shared_datadir): - if platform.system() == "Darwin": - return - - key = "MKL_NUM_THREADS" - original_val = os.environ.get(key) - try: - os.environ[key] = "1" - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - _ = bed.read(np.s_[:100, :100]) - _ = bed.read_sparse(np.s_[:100, :100]) - os.environ[key] = "10" - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - _ = bed.read(np.s_[:100, :100]) - _ = bed.read_sparse(np.s_[:100, :100]) - os.environ[key] = "BADVALUE" - with pytest.raises(ValueError): - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - _ = bed.read(np.s_[:100, :100]) - with pytest.raises(ValueError): - with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: - _ = bed.read_sparse(np.s_[:100, :100]) - finally: - if original_val is None: - if key in os.environ: - del os.environ[key] - else: - os.environ[key] = original_val - - -def test_cloud_bed_reading_example(): - import numpy as np - - from bed_reader import open_bed_cloud, sample_file - - file_name = sample_file("small.bed") - with open_bed_cloud(file_name, count_A1=False) as bed: - val = bed.read(index=np.s_[:, :3], dtype="int8", order="C", num_threads=1) - print(val.shape) - - -def test_cloud_sparse(): - import numpy as np - - from bed_reader import open_bed_cloud, sample_file - - file_name = sample_file("small.bed") - with open_bed_cloud(file_name, count_A1=False) as bed: - val_sparse = bed.read_sparse(index=np.s_[:, :3], dtype="int8") - print(val_sparse.shape) - - -def test_cloud_convert_to_dtype(): - from bed_reader._open_bed import _convert_to_dtype - - input = [ - [["a", "b", "c"], ["a", "b", "c"], None, None], - [["1.0", "2.0", "3.0"], ["1.0", "2.0", "3.0"], [1, 2, 3], [1.0, 2.0, 3.0]], - [["1.0", "2.0", "3.5"], ["1.0", "2.0", "3.5"], None, [1.0, 2.0, 3.5]], - [["1", "2", "3"], ["1", "2", "3"], [1, 2, 3], [1.0, 2.0, 3.0]], - [["1", "A", "3"], ["1", "A", "3"], None, None], - ] - # convert all to np.array - input = [ - [np.array(inner) if inner is not None else None for inner in outer] - for outer in input - ] - - for ori, exp_str, exp_int, exp_float in input: - for dtype, exp in ( - [np.str_, exp_str], - [np.int32, exp_int], - [ - np.float32, - exp_float, - ], - ): - try: - actual = _convert_to_dtype(ori, dtype) - assert np.array_equal(actual, exp) - except ValueError as e: - print(e) - assert exp is None - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - shared_datadir = Path(r"D:\OneDrive\programs\bed-reader\bed_reader\tests\data") - tmp_path = Path(r"m:/deldir/tests") - # test_bed_reading_example() - # test_zero_files(tmp_path) - # 
test_index(shared_datadir) - # test_c_reader_bed(shared_datadir) - # test_read1(shared_datadir) - pytest.main([__file__]) +# val = bed.read(slicer) +# assert np.allclose(ref_val_float[slicer], val, equal_nan=True) +# val_sparse = bed.read_sparse(slicer) +# assert np.allclose(ref_val_float[slicer], val_sparse.toarray(), equal_nan=True) + +# val = bed.read((1, slicer[1])) +# assert np.allclose(ref_val_float[[1], slicer[1]], val, equal_nan=True) +# val_sparse = bed.read_sparse((1, slicer[1])) +# assert np.allclose( +# ref_val_float[[1], slicer[1]], val_sparse.toarray(), equal_nan=True +# ) + + +# def test_cloud_shape(shared_datadir): +# with open_bed_cloud(shared_datadir / "plink_sim_10s_100v_10pmiss.bed") as bed: +# assert bed.shape == (10, 100) + + +# # def test_cloud_zero_files(tmp_path): +# # for force_python_only, format in [(False, "csc"), (True, "csr")]: +# # for iid_count in [3, 0]: +# # for sid_count in [0, 5]: +# # for dtype in [np.int8, np.float32, np.float64]: +# # val = np.zeros((iid_count, sid_count), dtype=dtype) +# # if iid_count * sid_count > 0: +# # val[0, 0] = 2 +# # val[0, 1] = -127 if np.dtype(dtype) == np.int8 else np.nan +# # filename = str(tmp_path / "zero_files.bed") + +# # # Write +# # to_bed(filename, val, force_python_only=force_python_only) + +# # # Read +# # with open_bed_cloud(filename) as bed2: +# # val2 = bed2.read(dtype=dtype) +# # assert np.allclose(val, val2, equal_nan=True) +# # val_sparse = bed2.read_sparse(dtype=dtype, format=format) +# # assert np.allclose(val, val_sparse.toarray(), equal_nan=True) +# # properties2 = bed2.properties +# # for prop in properties2.values(): +# # assert len(prop) in {iid_count, sid_count} + +# # # Change properties and write again +# # if iid_count > 0: +# # properties2["iid"][0] = "iidx" +# # if sid_count > 0: +# # properties2["sid"][0] = "sidx" +# # to_bed( +# # filename, +# # val2, +# # properties=properties2, +# # force_python_only=force_python_only, +# # ) + +# # # Read again +# # with open_bed_cloud(filename) as bed3: +# # val3 = bed3.read(dtype=dtype) +# # assert np.allclose(val, val3, equal_nan=True) +# # val_sparse = bed3.read_sparse(dtype=dtype, format=format) +# # assert np.allclose(val, val_sparse.toarray(), equal_nan=True) +# # properties3 = bed3.properties +# # for key2, value_list2 in properties2.items(): +# # value_list3 = properties3[key2] +# # assert np.array_equal(value_list2, value_list3) + + +# def test_cloud_iid_sid_count(shared_datadir): +# iid_count_ref, sid_count_ref = open_bed_cloud( +# shared_datadir / "plink_sim_10s_100v_10pmiss.bed" +# ).shape +# assert (iid_count_ref, sid_count_ref) == open_bed_cloud( +# shared_datadir / "plink_sim_10s_100v_10pmiss.bed", iid_count=iid_count_ref +# ).shape +# assert (iid_count_ref, sid_count_ref) == open_bed_cloud( +# shared_datadir / "plink_sim_10s_100v_10pmiss.bed", sid_count=sid_count_ref +# ).shape +# assert (iid_count_ref, sid_count_ref) == open_bed_cloud( +# shared_datadir / "plink_sim_10s_100v_10pmiss.bed", +# iid_count=iid_count_ref, +# sid_count=sid_count_ref, +# ).shape + + +# def test_cloud_sample_file(): +# from bed_reader import open_bed_cloud, sample_file + +# file_name = sample_file("small.bed") +# with open_bed_cloud(file_name) as bed: +# print(bed.iid) +# print(bed.sid) +# print(bed.read()) +# print(bed.read_sparse()) + + +# # def test_cloud_coverage2(shared_datadir, tmp_path): +# # with open_bed_cloud( +# # shared_datadir / "plink_sim_10s_100v_10pmiss.bed", properties={"iid": None} +# # ) as bed: +# # assert bed.iid is None +# # with 
pytest.raises(ValueError): +# # open_bed_cloud( +# # shared_datadir / "plink_sim_10s_100v_10pmiss.bed", +# # properties={"iid": [1, 2, 3], "mother": [1, 2]}, +# # ) +# # val = np.zeros((3, 5))[::2] +# # assert not val.flags["C_CONTIGUOUS"] and not val.flags["F_CONTIGUOUS"] +# # with pytest.raises(ValueError): +# # to_bed(tmp_path / "ignore", val) +# # val = np.zeros((3, 5), dtype=np.str_) +# # with pytest.raises(ValueError): +# # to_bed(tmp_path / "ignore", val) + + +# # def test_cloud_coverage3(shared_datadir, tmp_path): +# # with pytest.warns(RuntimeWarning, match="invalid value encountered in cast"): +# # with open_bed_cloud( +# # shared_datadir / "small.bed", properties={"sex": [1.0, np.nan, 1.0, 2.0]} +# # ) as bed: +# # assert np.array_equal(bed.sex, np.array([1, 0, 1, 2])) + +# # with open_bed_cloud( +# # shared_datadir / "small.bed", +# # properties={"cm_position": [1000.0, np.nan, 2000.0, 3000.0]}, +# # ) as bed: +# # assert np.array_equal(bed.cm_position, np.array([1000, 0, 2000, 3000])) + +# # list = [1.0, 0, np.nan, 0] +# # output_file = tmp_path / "1d.bed" +# # with pytest.raises(ValueError): +# # to_bed(output_file, list) +# # to_bed(output_file, np.array([list], dtype=np.float16)) +# # with open_bed_cloud(output_file) as bed: +# # assert np.allclose(bed.read(), [list], equal_nan=True) +# # assert np.allclose(bed.read_sparse().toarray(), [list], equal_nan=True) + + +# # def test_cloud_nones(shared_datadir, tmp_path): +# # properties = { +# # "father": None, +# # "mother": None, +# # "sex": None, +# # "pheno": None, +# # "allele_1": None, +# # "allele_2": None, +# # } + +# # with open_bed_cloud(shared_datadir / "small.bed", properties=properties) as bed: +# # assert np.array_equal(bed.iid, ["iid1", "iid2", "iid3"]) +# # assert bed.father is None + +# # val = [[1.0, 0, np.nan, 0], [2, 0, np.nan, 2], [0, 1, 2, 0]] +# # out_file = tmp_path / "testnones.bed" +# # to_bed(out_file, val, properties=properties) + + +# # def test_cloud_fam_bim_filepath(shared_datadir, tmp_path): +# # with open_bed_cloud(shared_datadir / "small.bed") as bed: +# # val = bed.read() +# # properties = bed.properties +# # output_file = tmp_path / "small.deb" +# # fam_file = tmp_path / "small.maf" +# # bim_file = tmp_path / "small.mib" +# # to_bed( +# # output_file, +# # val, +# # properties=properties, +# # fam_filepath=fam_file, +# # bim_filepath=bim_file, +# # ) +# # assert output_file.exists() and fam_file.exists() and bim_file.exists() + +# # with open_bed_cloud( +# # output_file, fam_filepath=fam_file, bim_filepath=bim_file +# # ) as deb: +# # val2 = deb.read() +# # assert np.allclose(val, val2, equal_nan=True) +# # val_sparse = deb.read_sparse() +# # assert np.allclose(val, val_sparse.toarray(), equal_nan=True) +# # properties2 = deb.properties +# # for key in properties: +# # np.array_equal(properties[key], properties2[key]) + + +# # def test_cloud_write_nan_properties(shared_datadir, tmp_path): +# # with open_bed_cloud(shared_datadir / "small.bed") as bed: +# # val = bed.read() +# # properties = bed.properties +# # chrom = bed.chromosome.copy() +# # chrom[bed.chromosome == "Y"] = 0 +# # chrom = np.array(chrom, dtype="float") +# # chrom2 = chrom.copy() +# # chrom2[chrom2 == 0] = np.nan +# # cm_p = bed.cm_position.copy() +# # cm_p[cm_p < 3000] = 0 +# # cm_p2 = cm_p.copy() +# # cm_p2[cm_p == 0] = np.nan +# # properties["chromosome"] = chrom2 +# # properties["cm_position"] = cm_p2 + +# # output_file = tmp_path / "nan.bed" +# # to_bed(output_file, val, properties=properties) + +# # with 
open_bed_cloud(output_file) as bed2: +# # assert np.array_equal(bed2.chromosome, ["1.0", "1.0", "5.0", "0"]) +# # assert np.array_equal(bed2.cm_position, cm_p) + +# # with open_bed_cloud( +# # shared_datadir / "small.bed", +# # properties={"chromosome": chrom2, "cm_position": cm_p2}, +# # ) as bed3: +# # assert np.array_equal(bed3.chromosome, ["1.0", "1.0", "5.0", "0"]) +# # assert np.array_equal(bed3.cm_position, cm_p) + + +# def test_cloud_env(shared_datadir): +# if platform.system() == "Darwin": +# return + +# key = "MKL_NUM_THREADS" +# original_val = os.environ.get(key) +# try: +# os.environ[key] = "1" +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# _ = bed.read(np.s_[:100, :100]) +# _ = bed.read_sparse(np.s_[:100, :100]) +# os.environ[key] = "10" +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# _ = bed.read(np.s_[:100, :100]) +# _ = bed.read_sparse(np.s_[:100, :100]) +# os.environ[key] = "BADVALUE" +# with pytest.raises(ValueError): +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# _ = bed.read(np.s_[:100, :100]) +# with pytest.raises(ValueError): +# with open_bed_cloud(shared_datadir / "some_missing.bed") as bed: +# _ = bed.read_sparse(np.s_[:100, :100]) +# finally: +# if original_val is None: +# if key in os.environ: +# del os.environ[key] +# else: +# os.environ[key] = original_val + + +# def test_cloud_bed_reading_example(): +# import numpy as np + +# from bed_reader import open_bed_cloud, sample_file + +# file_name = sample_file("small.bed") +# with open_bed_cloud(file_name, count_A1=False) as bed: +# val = bed.read(index=np.s_[:, :3], dtype="int8", order="C", num_threads=1) +# print(val.shape) + + +# def test_cloud_sparse(): +# import numpy as np + +# from bed_reader import open_bed_cloud, sample_file + +# file_name = sample_file("small.bed") +# with open_bed_cloud(file_name, count_A1=False) as bed: +# val_sparse = bed.read_sparse(index=np.s_[:, :3], dtype="int8") +# print(val_sparse.shape) + + +# def test_cloud_convert_to_dtype(): +# from bed_reader._open_bed import _convert_to_dtype + +# input = [ +# [["a", "b", "c"], ["a", "b", "c"], None, None], +# [["1.0", "2.0", "3.0"], ["1.0", "2.0", "3.0"], [1, 2, 3], [1.0, 2.0, 3.0]], +# [["1.0", "2.0", "3.5"], ["1.0", "2.0", "3.5"], None, [1.0, 2.0, 3.5]], +# [["1", "2", "3"], ["1", "2", "3"], [1, 2, 3], [1.0, 2.0, 3.0]], +# [["1", "A", "3"], ["1", "A", "3"], None, None], +# ] +# # convert all to np.array +# input = [ +# [np.array(inner) if inner is not None else None for inner in outer] +# for outer in input +# ] + +# for ori, exp_str, exp_int, exp_float in input: +# for dtype, exp in ( +# [np.str_, exp_str], +# [np.int32, exp_int], +# [ +# np.float32, +# exp_float, +# ], +# ): +# try: +# actual = _convert_to_dtype(ori, dtype) +# assert np.array_equal(actual, exp) +# except ValueError as e: +# print(e) +# assert exp is None + + +# if __name__ == "__main__": +# logging.basicConfig(level=logging.INFO) + +# shared_datadir = Path(r"D:\OneDrive\programs\bed-reader\bed_reader\tests\data") +# tmp_path = Path(r"m:/deldir/tests") +# # test_bed_reading_example() +# # test_zero_files(tmp_path) +# # test_index(shared_datadir) +# # test_c_reader_bed(shared_datadir) +# # test_read1(shared_datadir) +# pytest.main([__file__]) diff --git a/src/bed_cloud.rs b/src/bed_cloud.rs index 217f59f..aa2e5da 100644 --- a/src/bed_cloud.rs +++ b/src/bed_cloud.rs @@ -1573,7 +1573,7 @@ where /// /// This returns a struct with 12 fields. Each field is a ndarray. 
/// The struct will always be new, but the 12 ndarrays will be
-    /// shared with this [`Bed`](struct.BedCloud.html).
+    /// shared with this [`BedCloud`](struct.BedCloud.html).
     ///
     /// If needed, the metadata will be read from the .fam and/or .bim files.
     /// ```
diff --git a/src/python_module.rs b/src/python_module.rs
index 8e6cb8d..4fc2347 100644
--- a/src/python_module.rs
+++ b/src/python_module.rs
@@ -1,19 +1,15 @@
 #![cfg(feature = "extension-module")]
 
-use std::time::Duration;
-
-use num_traits::sign;
 use numpy::{PyArray1, PyArray2, PyArray3};
 use object_store::{local::LocalFileSystem, path::Path as StorePath};
-use pyo3::Py;
 use pyo3::{
     exceptions::PyIOError,
     exceptions::PyIndexError,
     exceptions::PyValueError,
     prelude::{pymodule, PyModule, PyResult, Python},
-    pyfunction, PyAny, PyErr, PyObject,
+    PyErr,
 };
-use tokio::time::sleep;
+use tokio::runtime; // CMK use pyo3_asyncio::tokio::future_into_py
 
 use crate::{
@@ -155,71 +151,114 @@ fn bed_reader(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
         Ok(())
     }
 
-    #[allow(unused_variables)]
     #[pyfn(m)]
     #[allow(clippy::too_many_arguments)]
-    fn read_cloud_i8<'a>(
-        py: Python<'a>,
-        // // cmk currently only supports LocalFileSystem "cloud" storage
-        filename: &'a str,
+    fn read_cloud_i8(
+        filename: &str,
         iid_count: usize,
         sid_count: usize,
         is_a1_counted: bool,
         iid_index: &PyArray1<isize>,
         sid_index: &PyArray1<isize>,
-        val: Py<PyArray2<i8>>, // py: Python,
-        // val: &PyArray2<i8>,
-        max_concurrent_requests: usize,
-        max_chunk_size: usize,
+        val: &PyArray2<i8>,
         num_threads: usize,
-        //
-    ) -> PyResult<&'a PyAny> {
+    ) -> Result<(), PyErr> {
         let iid_index = iid_index.readonly();
         let sid_index = sid_index.readonly();
         let ii = &iid_index.as_slice()?;
         let si = &sid_index.as_slice()?;
 
-        let store_path: StorePath = filename.into();
-        let mut bed_cloud = BedCloud::builder((LocalFileSystem::new(), store_path))
-            .skip_early_check()
-            .iid_count(iid_count)
-            .sid_count(sid_count)
-            .build_no_check()?;
-        let bed_cloud = bed_cloud;
-
-        let read_options = ReadOptions::<i8>::builder()
-            .iid_index(*ii)
-            .sid_index(*si)
-            .is_a1_counted(is_a1_counted)
-            .max_concurrent_requests(max_concurrent_requests)
-            .max_chunk_size(max_chunk_size)
-            .num_threads(num_threads)
-            .build()?;
+        let mut val = val.readwrite();
+        let mut val = val.as_array_mut();
 
-        pyo3_asyncio::tokio::future_into_py(py, async move {
-            Python::with_gil(|py| {
-                let val = val.as_ref(py);
-                let mut val = val.readwrite();
-                let mut val = val.as_array_mut();
-
-                let f = bed_cloud.read_and_fill_with_options_no_mut(
-                    iid_count,
-                    sid_count,
-                    &mut val.view_mut(),
-                    &read_options,
-                );
-                f
-                // let r = f.await;
-                // r?;
-                // Ok(())
-            });
-
-            // sleep(Duration::from_secs(1)).await;
+        let rt = runtime::Runtime::new().unwrap();
+
+        rt.block_on(async {
+            let store_path = StorePath::from_filesystem_path(filename)
+                .map_err(|e| Box::new(BedErrorPlus::from(e)))?;
+            let mut bed_cloud = BedCloud::builder((LocalFileSystem::new(), store_path))
+                .iid_count(iid_count)
+                .sid_count(sid_count)
+                .build()
+                .await?;
+
+            ReadOptions::builder()
+                .iid_index(*ii)
+                .sid_index(*si)
+                .is_a1_counted(is_a1_counted)
+                .num_threads(num_threads)
+                .read_and_fill_cloud(&mut bed_cloud, &mut val.view_mut())
+                .await?;
 
             Ok(())
         })
     }
 
+    // #[allow(unused_variables)]
+    // #[pyfn(m)]
+    // #[allow(clippy::too_many_arguments)]
+    // fn read_cloud_i8<'a>(
+    //     py: Python<'a>,
+    //     // // cmk currently only supports LocalFileSystem "cloud" storage
+    //     filename: &'a str,
+    //     iid_count: usize,
+    //     sid_count: usize,
+    //     is_a1_counted: bool,
+    //     iid_index: &PyArray1<isize>,
+    //     sid_index: &PyArray1<isize>,
+    //     val: Py<PyArray2<i8>>, // py: Python,
+    //     // val: &PyArray2<i8>,
+    //     max_concurrent_requests: usize,
+    //     max_chunk_size: usize,
+    //     num_threads: usize,
+    //     //
+    // ) -> PyResult<&'a PyAny> {
+    //     let iid_index = iid_index.readonly();
+    //     let sid_index = sid_index.readonly();
+    //     let ii = &iid_index.as_slice()?;
+    //     let si = &sid_index.as_slice()?;
+
+    //     let store_path: StorePath = filename.into();
+    //     let mut bed_cloud = BedCloud::builder((LocalFileSystem::new(), store_path))
+    //         .skip_early_check()
+    //         .iid_count(iid_count)
+    //         .sid_count(sid_count)
+    //         .build_no_check()?;
+    //     let bed_cloud = bed_cloud;
+
+    //     let read_options = ReadOptions::<i8>::builder()
+    //         .iid_index(*ii)
+    //         .sid_index(*si)
+    //         .is_a1_counted(is_a1_counted)
+    //         .max_concurrent_requests(max_concurrent_requests)
+    //         .max_chunk_size(max_chunk_size)
+    //         .num_threads(num_threads)
+    //         .build()?;
+
+    //     pyo3_asyncio::tokio::future_into_py(py, async move {
+    //         Python::with_gil(|py| {
+    //             let val = val.as_ref(py);
+    //             let mut val = val.readwrite();
+    //             let mut val = val.as_array_mut();
+
+    //             let f = bed_cloud.read_and_fill_with_options_no_mut(
+    //                 iid_count,
+    //                 sid_count,
+    //                 &mut val.view_mut(),
+    //                 &read_options,
+    //             );
+    //             f
+    //             // let r = f.await;
+    //             // r?;
+    //             // Ok(())
+    //         });
+
+    //         // sleep(Duration::from_secs(1)).await;
+
+    //         Ok(())
+    //     })
+    // }
+
     // #[allow(unused_variables)]
     // #[pyfn(m)]
     // #[allow(clippy::too_many_arguments)]
@@ -283,7 +322,7 @@ fn bed_reader(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
         num_threads: usize,
     ) -> Result<(), PyErr> {
         let mut val = val.readwrite();
-        let mut val = val.as_array_mut();
+        let val = val.as_array_mut();
 
         WriteOptions::builder(filename)
             .is_a1_counted(is_a1_counted)
@@ -303,7 +342,7 @@ fn bed_reader(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
         num_threads: usize,
     ) -> Result<(), PyErr> {
         let mut val = val.readwrite();
-        let mut val = val.as_array_mut();
+        let val = val.as_array_mut();
 
         WriteOptions::builder(filename)
             .is_a1_counted(is_a1_counted)
@@ -323,7 +362,7 @@ fn bed_reader(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
         num_threads: usize,
     ) -> Result<(), PyErr> {
         let mut val = val.readwrite();
-        let mut val = val.as_array_mut();
+        let val = val.as_array_mut();
 
         WriteOptions::builder(filename)
             .is_a1_counted(is_a1_counted)
diff --git a/tests/tests_api_cloud.rs b/tests/tests_api_cloud.rs
index 4792552..f170417 100644
--- a/tests/tests_api_cloud.rs
+++ b/tests/tests_api_cloud.rs
@@ -2015,7 +2015,7 @@ async fn cloud_metadata_bed() -> Result<(), Box<BedErrorPlus>> {
 #[tokio::test]
 async fn cloud_read_and_fill_with_options() -> Result<(), Box<BedErrorPlus>> {
     use bed_reader::assert_eq_nan;
-    use bed_reader::{BedCloud, ReadOptions};
+    use bed_reader::ReadOptions;
     use ndarray as nd;
 
     // Read the SNPs indexed by 2.
     let object_path = sample_bed_object_path("small.bed")?;
@@ -2556,6 +2556,7 @@ async fn object_path_extension() -> Result<(), Box<BedErrorPlus>> {
     Ok(())
 }
 
+// cmk requires aws credentials
 #[tokio::test]
 async fn s3() -> Result<(), Box<BedErrorPlus>> {
     // cmk too many unwraps
@@ -2576,3 +2577,18 @@ async fn s3() -> Result<(), Box<BedErrorPlus>> {
     assert_eq!(object_path.size().await?, 1_250_003);
     Ok(())
 }
+
+// 'C:\\Users\\carlk\\AppData\\Local\\bed_reader\\bed_reader\\Cache\\small.bed'
+
+#[tokio::test]
+async fn windows_cloud() -> Result<(), Box<BedErrorPlus>> {
+    let object_store = Arc::new(LocalFileSystem::new());
+    let file_name = "C:\\Users\\carlk\\AppData\\Local\\bed_reader\\bed_reader\\Cache\\small.bed";
+    let store_path = StorePath::from_filesystem_path(file_name).map_err(BedErrorPlus::from)?;
+
+    let mut bed_cloud = BedCloud::new((object_store, store_path)).await?;
+    let val = bed_cloud.read::<f64>().await?;
+    println!("{val:?}");
+
+    Ok(())
+}
diff --git a/useful.bat b/useful.bat
index 04a7f7d..a6ab6ab 100644
--- a/useful.bat
+++ b/useful.bat
@@ -1,5 +1,6 @@
 # test async in python:
 pytest bed_reader\tests\test_open_bed_cloud.py -k read1
+pytest bed_reader\tests\test_open_bed.py -k test_zero_files
 
 pip# test Rust
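
For reference, a minimal sketch (an illustration, not part of the patch) of driving the new synchronous `read_cloud_i8` binding directly from Python. It assumes the export added in this patch and the 3 x 4 `small.bed` sample used throughout the tests; the index arrays use `np.intp` to match the Rust `isize` indices, and `val` is filled in place:

    import numpy as np
    from bed_reader import read_cloud_i8, sample_file

    file_name = sample_file("small.bed")  # downloads/caches the 3 x 4 sample
    iid_index = np.arange(3, dtype=np.intp)  # every individual (Rust isize)
    sid_index = np.arange(4, dtype=np.intp)  # every SNP
    val = np.empty((3, 4), dtype=np.int8, order="F")  # output buffer, filled in place

    read_cloud_i8(
        str(file_name),
        iid_count=3,
        sid_count=4,
        is_a1_counted=True,
        iid_index=iid_index,
        sid_index=sid_index,
        val=val,
        num_threads=1,
    )
    print(val)  # missing genotypes read as -127 for int8

Because the wrapper builds its own tokio runtime and uses `block_on`, the call blocks until the read completes; the Python side needs no event loop or `await`.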