diff --git a/pastis/io/read/all_data.py b/pastis/io/read/all_data.py index 6b440586..5f5ace5a 100644 --- a/pastis/io/read/all_data.py +++ b/pastis/io/read/all_data.py @@ -10,11 +10,19 @@ def _get_lengths(lengths): """Load chromosome lengths from file, or reformat lengths object. """ - if isinstance(lengths, str) and os.path.exists(lengths): - lengths = load_lengths(lengths) - elif lengths is not None and (isinstance(lengths, list) or isinstance(lengths, np.ndarray)): - if len(lengths) == 1 and isinstance(lengths[0], str) and os.path.exists(lengths[0]): - lengths = load_lengths(lengths[0]) + if lengths is not None: + if (isinstance(lengths, list) or isinstance(lengths, np.ndarray)) \ + and len(lengths) == 1: + lengths = lengths[0] + if isinstance(lengths, str): + if os.path.exists(lengths): + lengths = load_lengths(lengths) + else: + raise ValueError("Path to lengths does not exist.") + if isinstance(lengths, np.int64): + lengths = lengths.item() + if isinstance(lengths, int): + lengths = [lengths] lengths = np.array(lengths).astype(int) return lengths @@ -24,12 +32,18 @@ def _get_chrom(chrom, lengths): """ lengths = _get_lengths(lengths) - if isinstance(chrom, str) and os.path.exists(chrom): - chrom = np.array(np.genfromtxt(chrom, dtype='str')).reshape(-1) - elif chrom is not None and (isinstance(chrom, list) or isinstance(chrom, np.ndarray)): - if len(chrom) == 1 and isinstance(chrom[0], str) and os.path.exists(chrom[0]): - chrom = np.array(np.genfromtxt(chrom[0], dtype='str')).reshape(-1) - chrom = np.array(chrom) + if chrom is not None: + if (isinstance(chrom, list) or isinstance(chrom, np.ndarray)) \ + and len(chrom) == 1: + chrom = chrom[0] + if isinstance(chrom, str): + if os.path.exists(chrom): + chrom = np.genfromtxt(chrom, dtype='str') + else: + raise ValueError("Path to chrom does not exist.") + if isinstance(chrom, np.ndarray): + chrom = [chrom].item() + chrom = np.array(chrom).reshape(-1) else: chrom = np.array(['num%d' % i for i in range(1, len(lengths) + 1)]) return chrom @@ -60,8 +74,33 @@ def _get_counts(counts, lengths): return output +def _get_bias(bias): + """Load bias from file, or reformat bias object. + """ + + if bias is not None: + if (isinstance(bias, list) or isinstance(bias, np.ndarray)) \ + and len(bias) == 1: + bias = bias[0] + if isinstance(bias, str): + if os.path.exists(bias): + if bias.endswith(".npy"): + bias = np.load(bias) + else: + bias = np.loadtxt(bias) + else: + raise ValueError("Path to bias vector does not exist.") + if isinstance(bias, np.int64): + bias = bias.item() + if isinstance(bias, int): + bias = [bias] + bias = np.array(bias).astype(float) + return bias + + def load_data(counts, lengths_full, ploidy, chrom_full=None, - chrom_subset=None, exclude_zeros=False, struct_true=None): + chrom_subset=None, exclude_zeros=False, struct_true=None, + bias=None): """Load all input data from files, and/or reformat data objects. If files are provided, load data from files. Also reformats data objects. @@ -81,6 +120,8 @@ def load_data(counts, lengths_full, ploidy, chrom_full=None, chrom_subset : list of str, optional Label for each chromosome to be excised from the full data; labels of chromosomes for which inference should be performed. + bias : str, optional + The path to the bias vector to normalize the counts with. Returns ------- @@ -106,6 +147,7 @@ def load_data(counts, lengths_full, ploidy, chrom_full=None, lengths_full = _get_lengths(lengths_full) chrom_full = _get_chrom(chrom_full, lengths_full) counts = _get_counts(counts, lengths_full) + bias = _get_bias(bias) if struct_true is not None and isinstance(struct_true, str): struct_true = np.loadtxt(struct_true) @@ -115,4 +157,4 @@ def load_data(counts, lengths_full, ploidy, chrom_full=None, chrom_full=chrom_full, chrom_subset=chrom_subset, exclude_zeros=exclude_zeros, struct_true=struct_true) - return counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true + return counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true, bias diff --git a/pastis/optimization/counts.py b/pastis/optimization/counts.py index be531dc2..e60bf367 100644 --- a/pastis/optimization/counts.py +++ b/pastis/optimization/counts.py @@ -295,7 +295,8 @@ def check_counts(counts, lengths, ploidy, exclude_zeros=False, def preprocess_counts(counts_raw, lengths, ploidy, multiscale_factor, normalize, filter_threshold, beta=None, fullres_torm=None, excluded_counts=None, mixture_coefs=None, - exclude_zeros=False, input_weight=None, verbose=True): + exclude_zeros=False, input_weight=None, verbose=True, + bias=None): """Check counts, reformat, reduce resolution, filter, and compute bias. Preprocessing options include reducing resolution, computing bias (if @@ -329,6 +330,8 @@ def preprocess_counts(counts_raw, lengths, ploidy, multiscale_factor, normalize, therefore be removed. There should be one array per counts matrix. excluded_counts : {"inter", "intra"}, optional Whether to exclude inter- or intra-chromosomal counts from optimization. + bias: str, optional + The path to the bias vector to normalize the counts data with. Returns ------- @@ -344,7 +347,7 @@ def preprocess_counts(counts_raw, lengths, ploidy, multiscale_factor, normalize, counts_raw, lengths=lengths, ploidy=ploidy, multiscale_factor=multiscale_factor, normalize=normalize, filter_threshold=filter_threshold, exclude_zeros=exclude_zeros, - verbose=verbose) + verbose=verbose, bias=bias) lengths_lowres = decrease_lengths_res(lengths, multiscale_factor) @@ -389,7 +392,7 @@ def _percent_nan_beads(counts): def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1, normalize=True, filter_threshold=0.04, exclude_zeros=True, - verbose=True): + verbose=True, bias=None): """Copy counts, check matrix, reduce resolution, filter, and compute bias. """ @@ -502,10 +505,11 @@ def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1, individual_counts_torms = individual_counts_torms | torm counts = sparse.coo_matrix(counts) counts_dict[counts_type] = counts - + + verbose=True # Optionally normalize counts - bias = None - if normalize: + if normalize and bias is None: + # If we have to calculate the bias if verbose: print('COMPUTING BIAS: all counts together', flush=True) bias = ICE_normalization( @@ -513,6 +517,7 @@ def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1, list(counts_dict.values()), lengths=lengths_lowres, ploidy=ploidy, exclude_zeros=True), max_iter=300, output_bias=True)[1].flatten() + if bias is not None: # In each counts matrix, zero out counts for which bias is NaN for counts_type, counts in counts_dict.items(): initial_zero_beads = find_beads_to_remove( @@ -533,7 +538,6 @@ def _prep_counts(counts_list, lengths, ploidy=1, multiscale_factor=1, print(' removing %d additional beads from %s' % (torm.sum() - initial_zero_beads, counts_type), flush=True) - output_counts = check_counts( list(counts_dict.values()), lengths=lengths_lowres, ploidy=ploidy, exclude_zeros=exclude_zeros) diff --git a/pastis/optimization/initialization.py b/pastis/optimization/initialization.py index b2984e77..dca257e5 100644 --- a/pastis/optimization/initialization.py +++ b/pastis/optimization/initialization.py @@ -31,7 +31,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state, ua_beta = ua_counts.beta if ua_beta is not None: ua_beta *= multiscale_factor ** 2 - + struct = estimate_X( ua_counts._counts.astype(float), alpha=-3. if alpha is None else alpha, beta=ua_beta, verbose=False, @@ -42,6 +42,7 @@ def _initialize_struct_mds(counts, lengths, ploidy, alpha, bias, random_state, struct = struct.reshape(-1, 3) torm = find_beads_to_remove(counts, struct.shape[0]) + struct[torm] = np.nan return struct diff --git a/pastis/optimization/mds.py b/pastis/optimization/mds.py index 1d4446b2..3b399c6f 100644 --- a/pastis/optimization/mds.py +++ b/pastis/optimization/mds.py @@ -127,8 +127,9 @@ def estimate_X(counts, alpha=-3., beta=1., ini=None, The structure as an ndarray of shape (n, 3). """ + n = counts.shape[0] - + random_state = check_random_state(random_state) if ini is None or ini == "random": ini = 1 - 2 * random_state.rand(n * 3) diff --git a/pastis/optimization/pastis_algorithms.py b/pastis/optimization/pastis_algorithms.py index 48a1cd8c..7d73893a 100644 --- a/pastis/optimization/pastis_algorithms.py +++ b/pastis/optimization/pastis_algorithms.py @@ -58,7 +58,7 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0, struct_draft_fullres=None, callback_freq=None, callback_function=None, reorienter=None, alpha_true=None, struct_true=None, input_weight=None, exclude_zeros=False, - null=False, mixture_coefs=None, verbose=True): + null=False, mixture_coefs=None, verbose=True, bias=None): """Infer draft 3D structures with PASTIS via Poisson model. """ @@ -92,7 +92,8 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0, counts_raw=counts_raw, lengths=lengths, ploidy=ploidy, normalize=normalize, filter_threshold=filter_threshold, multiscale_factor=1, exclude_zeros=exclude_zeros, beta=beta, - input_weight=input_weight, verbose=False, mixture_coefs=mixture_coefs) + input_weight=input_weight, verbose=False, mixture_coefs=mixture_coefs, + bias=bias) beta = [c.beta for c in counts if c.sum() != 0] alpha_ = alpha @@ -117,7 +118,7 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0, reorienter=reorienter, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, null=null, mixture_coefs=mixture_coefs, - verbose=verbose) + verbose=verbose, bias=bias) if not infer_var_fullres['converged']: return struct_draft_fullres, alpha_, beta_, hsc_r, False if alpha is not None: @@ -176,7 +177,7 @@ def _infer_draft(counts_raw, lengths, ploidy, outdir=None, alpha=None, seed=0, reorienter=reorienter, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, null=null, - mixture_coefs=mixture_coefs, verbose=verbose) + mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) if not infer_var_lowres['converged']: return struct_draft_fullres, alpha_, beta_, hsc_r, False hsc_r = distance_between_homologs( @@ -205,7 +206,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0, struct_draft_fullres=None, draft=False, simple_diploid=False, callback_freq=None, callback_function=None, reorienter=None, alpha_true=None, struct_true=None, input_weight=None, - exclude_zeros=False, null=False, mixture_coefs=None, verbose=True): + exclude_zeros=False, null=False, mixture_coefs=None, verbose=True, + bias=None): """Infer 3D structures with PASTIS via Poisson model. Optimize 3D structure from Hi-C contact counts data for diploid @@ -298,6 +300,8 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0, For diploid organisms: whether this optimization is inferring a "simple diploid" structure in which homologs are assumed to be identical and completely overlapping with one another. + bias: str, optional + The path to the bias vector to normalize the counts data with. Returns ------- @@ -364,7 +368,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0, reorienter=reorienter, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, null=null, mixture_coefs=mixture_coefs, - verbose=verbose) + verbose=verbose, bias=bias) if not draft_converged: return None, {'alpha': alpha_, 'beta': beta_, 'seed': seed, 'converged': draft_converged} @@ -408,7 +412,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0, filter_threshold=filter_threshold, multiscale_factor=multiscale_factor, exclude_zeros=exclude_zeros, beta=beta_, input_weight=input_weight, verbose=verbose, fullres_torm=fullres_torm, - excluded_counts=excluded_counts, mixture_coefs=mixture_coefs) + excluded_counts=excluded_counts, mixture_coefs=mixture_coefs, bias=bias) if verbose: print('BETA: %s' % ', '.join( ['%s=%.3g' % (c.ambiguity, c.beta) for c in counts if c.sum() != 0]), @@ -578,7 +582,7 @@ def infer(counts_raw, lengths, ploidy, outdir='', alpha=None, seed=0, callback_freq=callback_freq, reorienter=reorienter, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, - null=null, mixture_coefs=mixture_coefs, verbose=verbose) + null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) if not infer_var['converged']: return struct_, infer_var if reorienter is not None and reorienter.reorient: @@ -605,7 +609,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None, piecewise_step1_accuracy=1, alpha_true=None, struct_true=None, init='mds', input_weight=None, exclude_zeros=False, null=False, mixture_coefs=None, - verbose=True): + verbose=True, bias=None): """Infer 3D structures with PASTIS via Poisson model. Infer 3D structure from Hi-C contact counts data for haploid or diploid @@ -681,6 +685,8 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None, homolog-separating constraint specificying the expected mean inter- homolog count for each chromosome, scaled by beta and biases. If not supplied, `mhs_k` will be estimated from the counts data. + bias : str, optional + The path to the bias vector to normalize the counts with Returns ------- @@ -706,10 +712,11 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None, callback_freq = {'print': print_freq, 'history': history_freq, 'save': save_freq} - counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true = load_data( + counts, lengths_subset, chrom_subset, lengths_full, chrom_full, struct_true, bias = load_data( counts=counts, lengths_full=lengths_full, ploidy=ploidy, chrom_full=chrom_full, chrom_subset=chrom_subset, - exclude_zeros=exclude_zeros, struct_true=struct_true) + exclude_zeros=exclude_zeros, struct_true=struct_true, + bias=bias) outdir = _output_subdir( outdir=outdir, chrom_full=chrom_full, chrom_subset=chrom_subset, @@ -731,7 +738,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None, callback_function=callback_function, callback_freq=callback_freq, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, - null=null, mixture_coefs=mixture_coefs, verbose=verbose) + null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) else: from .piecewise_whole_genome import infer_piecewise @@ -757,7 +764,7 @@ def pastis_poisson(counts, lengths, ploidy, outdir='', chromosomes=None, piecewise_step1_accuracy=piecewise_step1_accuracy, alpha_true=alpha_true, struct_true=struct_true, init=init, input_weight=input_weight, exclude_zeros=exclude_zeros, null=null, - mixture_coefs=mixture_coefs, verbose=verbose) + mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) if verbose: if infer_var['converged']: diff --git a/pastis/optimization/piecewise_whole_genome.py b/pastis/optimization/piecewise_whole_genome.py index 4d89a0e2..f92d89d7 100644 --- a/pastis/optimization/piecewise_whole_genome.py +++ b/pastis/optimization/piecewise_whole_genome.py @@ -410,7 +410,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, piecewise_step1_accuracy=1, alpha_true=None, struct_true=None, init='msd', input_weight=None, exclude_zeros=False, null=False, - mixture_coefs=None, verbose=True): + mixture_coefs=None, verbose=True, bias=None): """Infer whole genome 3D structures piecewise, first inferring chromosomes. """ @@ -457,7 +457,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, callback_function=callback_function, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, null=null, mixture_coefs=mixture_coefs, - verbose=verbose) + verbose=verbose, bias=bias) if not draft_converged: return None, {'alpha': alpha_, 'beta': beta_, 'seed': seed, 'converged': draft_converged} @@ -473,7 +473,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, normalize=normalize, filter_threshold=filter_threshold, multiscale_factor=1, exclude_zeros=exclude_zeros, beta=beta, input_weight=input_weight, verbose=verbose, - mixture_coefs=mixture_coefs) + mixture_coefs=mixture_coefs, bias=bias) else: fullres_torm_for_multiscale = None @@ -519,7 +519,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, callback_freq=callback_freq, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, - null=null, mixture_coefs=mixture_coefs, verbose=verbose) + null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) if not infer_var['converged']: return struct_, infer_var @@ -599,7 +599,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, callback_freq=callback_freq, alpha_true=alpha_true, struct_true=chrom_struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, null=null, - mixture_coefs=mixture_coefs, verbose=verbose) + mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) if not infer_var['converged']: return struct_, infer_var @@ -630,7 +630,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, 'save': 0}, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, - null=null, mixture_coefs=mixture_coefs, verbose=False) + null=null, mixture_coefs=mixture_coefs, verbose=False, bias=bias) # Optionally rotate & translate previously inferred chromosomes if piecewise_opt_orient: @@ -669,7 +669,7 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, callback_freq=callback_freq, reorienter=reorienter, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, - null=null, mixture_coefs=mixture_coefs, verbose=verbose) + null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) if not infer_var['converged']: return struct_, infer_var @@ -699,6 +699,6 @@ def infer_piecewise(counts_raw, outdir, lengths, ploidy, chromosomes, alpha, callback_function=callback_function, callback_freq=callback_freq, alpha_true=alpha_true, struct_true=struct_true, input_weight=input_weight, exclude_zeros=exclude_zeros, - null=null, mixture_coefs=mixture_coefs, verbose=verbose) + null=null, mixture_coefs=mixture_coefs, verbose=verbose, bias=bias) return struct_, infer_var diff --git a/pastis/optimization/utils.py b/pastis/optimization/utils.py index efc4bf63..0bee9c91 100644 --- a/pastis/optimization/utils.py +++ b/pastis/optimization/utils.py @@ -155,6 +155,7 @@ def compute_wish_distances(counts, alpha=-3., beta=1., bias=None): ------- wish_distances """ + if beta == 0: raise ValueError("beta cannot be equal to 0.") counts = counts.copy() @@ -170,5 +171,4 @@ def compute_wish_distances(counts, alpha=-3., beta=1., bias=None): else: wish_distances = counts.copy() / beta wish_distances[wish_distances != 0] **= 1. / alpha - return wish_distances diff --git a/pastis/script/pastis-poisson b/pastis/script/pastis-poisson index 7017bb6d..071e0403 100644 --- a/pastis/script/pastis-poisson +++ b/pastis/script/pastis-poisson @@ -51,6 +51,8 @@ parser.add_argument("--multiscale_rounds", default=1, type=int, " should be inferred during multiscale optimization." " Values of 1 or 0 disable multiscale" " optimization.") +parser.add_argument("--bias", default=None, type=str, + help="Path to the bias vector for normalization.") # Optimization convergence parser.add_argument("--max_iter", default=30000, type=int,