diff --git a/2.0/include/pgenlib_misc.cc b/2.0/include/pgenlib_misc.cc index 0f14d67b..16508c8b 100644 --- a/2.0/include/pgenlib_misc.cc +++ b/2.0/include/pgenlib_misc.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -1108,10 +1108,14 @@ void DifflistCountSubsetFreqs(const uintptr_t* __restrict sample_include, const #ifdef USE_SSE2 static_assert(kPglNypTransposeBatch == S_CAST(uint32_t, kNypsPerCacheline), "TransposeNypblock64() needs to be updated."); +# ifdef CACHELINE64 void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) { // buf0 and buf1 must each be vector-aligned and have size 16k // Tried using previous AVX2 small-buffer approach, but that was a bit // worse... though maybe it should be revisited? + + // Each input row has 256 nyps, across 8 words. + // First word of each row goes into first buf0 row, etc. const uint32_t buf0_row_ct = DivUp(write_batch_size, 32); { // Fold the first 3 shuffles into the ingestion loop. @@ -1143,18 +1147,18 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui const VecW* buf0_read_iter = R_CAST(const VecW*, buf0); __m128i* write_iter0 = R_CAST(__m128i*, buf1); const uint32_t buf0_row_clwidth = DivUp(read_batch_size, 8); -# ifdef USE_SHUFFLE8 +# ifdef USE_SHUFFLE8 const VecW gather_u32s = vecw_setr8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); -# else +# else const VecW m16 = VCONST_W(kMask0000FFFF); -# endif +# endif for (uint32_t bidx = 0; bidx != buf0_row_ct; ++bidx) { __m128i* write_iter1 = &(write_iter0[32]); __m128i* write_iter2 = &(write_iter1[32]); __m128i* write_iter3 = &(write_iter2[32]); for (uint32_t clidx = 0; clidx != buf0_row_clwidth; ++clidx) { -# ifdef USE_AVX2 +# ifdef USE_AVX2 VecW loader0 = buf0_read_iter[clidx * 2]; VecW loader1 = buf0_read_iter[clidx * 2 + 1]; loader0 = vecw_shuffle8(loader0, gather_u32s); @@ -1176,17 +1180,17 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui write_iter1[clidx] = _mm256_castsi256_si128(final2367); write_iter2[clidx] = _mm256_extracti128_si256(final0145, 1); write_iter3[clidx] = _mm256_extracti128_si256(final2367, 1); -# else +# else VecW loader0 = buf0_read_iter[clidx * 4]; VecW loader1 = buf0_read_iter[clidx * 4 + 1]; VecW loader2 = buf0_read_iter[clidx * 4 + 2]; VecW loader3 = buf0_read_iter[clidx * 4 + 3]; -# ifdef USE_SHUFFLE8 +# ifdef USE_SHUFFLE8 loader0 = vecw_shuffle8(loader0, gather_u32s); loader1 = vecw_shuffle8(loader1, gather_u32s); loader2 = vecw_shuffle8(loader2, gather_u32s); loader3 = vecw_shuffle8(loader3, gather_u32s); -# else +# else VecW tmp_lo = vecw_unpacklo16(loader0, loader1); VecW tmp_hi = vecw_unpackhi16(loader0, loader1); loader0 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16); @@ -1195,7 +1199,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui tmp_hi = vecw_unpackhi16(loader2, loader3); loader2 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16); loader3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16); -# endif +# endif // (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3) // + (0,4) ... (31,7) // -> (0,0) ... (7,3) (0,4) ... (7,7) (8,0) ... (15,7) @@ -1207,7 +1211,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui write_iter1[clidx] = WToVec(vecw_unpackhi64(lo_0_15, hi_0_15)); write_iter2[clidx] = WToVec(vecw_unpacklo64(lo_16_31, hi_16_31)); write_iter3[clidx] = WToVec(vecw_unpackhi64(lo_16_31, hi_16_31)); -# endif +# endif } buf0_read_iter = &(buf0_read_iter[2048 / kBytesPerVec]); write_iter0 = &(write_iter3[32]); @@ -1278,6 +1282,141 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui target_iter0[vidx] = vecw_movemask(target_0123); } } +# else +# ifndef CACHELINE128 +# error "CACHELINE64 or CACHELINE128 expected." +# endif +// assumes USE_SHUFFLE8, !USE_AVX2 +void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) { + // buf0 and buf1 must each be vector-aligned and have size 64k + // Each input row has 512 nyps, across 16 words. + // First word of each row goes into first buf0 row, etc. + const uint32_t buf0_row_ct = DivUp(write_batch_size, 32); + { + // Fold the first 4 shuffles into the ingestion loop. + const uintptr_t* initial_read_iter = read_iter; + const uintptr_t* initial_read_end = &(initial_read_iter[buf0_row_ct]); + uintptr_t* initial_target_iter = R_CAST(uintptr_t*, buf0); + const uint32_t read_batch_rem = kNypsPerCacheline - read_batch_size; + for (; initial_read_iter != initial_read_end; ++initial_read_iter) { + const uintptr_t* read_iter_tmp = initial_read_iter; + for (uint32_t ujj = 0; ujj != read_batch_size; ++ujj) { + *initial_target_iter++ = *read_iter_tmp; + read_iter_tmp = &(read_iter_tmp[read_ul_stride]); + } + ZeroWArr(read_batch_rem, initial_target_iter); + initial_target_iter = &(initial_target_iter[read_batch_rem]); + } + } + + // First buf0 row now corresponds to a 512x32 nyp matrix (512 * 8 bytes) that + // we wish to transpose. We split this into eight 512x4 matrices. + // (ARMv8 doesn't have efficient movemask, so this should be better than four + // 512x8 matrices.) + // This is nearly identical to the middle step in TransposeBitblock64(). + { + const VecW* buf0_read_iter = R_CAST(const VecW*, buf0); + uintptr_t* write_iter0 = R_CAST(uintptr_t*, buf1); + const VecW gather_u16s = vecw_setr8(0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15); + const uint32_t buf0_row_b64width = DivUp(read_batch_size, 8); + for (uint32_t ridx = 0; ridx != buf0_row_ct; ++ridx) { + uintptr_t* write_iter1 = &(write_iter0[64]); + uintptr_t* write_iter2 = &(write_iter1[64]); + uintptr_t* write_iter3 = &(write_iter2[64]); + uintptr_t* write_iter4 = &(write_iter3[64]); + uintptr_t* write_iter5 = &(write_iter4[64]); + uintptr_t* write_iter6 = &(write_iter5[64]); + uintptr_t* write_iter7 = &(write_iter6[64]); + for (uint32_t b64idx = 0; b64idx != buf0_row_b64width; ++b64idx) { + VecW loader0 = buf0_read_iter[b64idx * 4]; + VecW loader1 = buf0_read_iter[b64idx * 4 + 1]; + VecW loader2 = buf0_read_iter[b64idx * 4 + 2]; + VecW loader3 = buf0_read_iter[b64idx * 4 + 3]; + loader0 = vecw_shuffle8(loader0, gather_u16s); + loader1 = vecw_shuffle8(loader1, gather_u16s); + loader2 = vecw_shuffle8(loader2, gather_u16s); + loader3 = vecw_shuffle8(loader3, gather_u16s); + const VecW lo_0123 = vecw_unpacklo16(loader0, loader1); + const VecW lo_4567 = vecw_unpackhi16(loader0, loader1); + const VecW hi_0123 = vecw_unpacklo16(loader2, loader3); + const VecW hi_4567 = vecw_unpackhi16(loader2, loader3); + + const VecW final01 = vecw_unpacklo32(lo_0123, hi_0123); + const VecW final23 = vecw_unpackhi32(lo_0123, hi_0123); + const VecW final45 = vecw_unpacklo32(lo_4567, hi_4567); + const VecW final67 = vecw_unpackhi32(lo_4567, hi_4567); + write_iter0[b64idx] = vecw_extract64_0(final01); + write_iter1[b64idx] = vecw_extract64_1(final01); + write_iter2[b64idx] = vecw_extract64_0(final23); + write_iter3[b64idx] = vecw_extract64_1(final23); + write_iter4[b64idx] = vecw_extract64_0(final45); + write_iter5[b64idx] = vecw_extract64_1(final45); + write_iter6[b64idx] = vecw_extract64_0(final67); + write_iter7[b64idx] = vecw_extract64_1(final67); + } + buf0_read_iter = &(buf0_read_iter[256]); + write_iter0 = &(write_iter7[64]); + } + } + + // 8 -> 2 + // This is similar to the main TransposeNybbleblock() loop. + const VecW* source_iter = R_CAST(VecW*, buf1); + const VecW m2 = VCONST_W(kMask3333); + const VecW m4 = VCONST_W(kMask0F0F); + const VecW m8 = VCONST_W(kMask00FF); + const VecW gather_even = vecw_setr8(0, 2, 4, 6, 8, 10, 12, 14, + -1, -1, -1, -1, -1, -1, -1, -1); + // Take advantage of current function contract. + const uint32_t buf1_row_ct = (write_batch_size + 3) / 4; + + const uint32_t fourword_ct = DivUp(read_batch_size, 32); + uintptr_t* target_iter0 = write_iter; + for (uint32_t ridx = 0; ridx != buf1_row_ct; ++ridx) { + uintptr_t* target_iter1 = &(target_iter0[write_ul_stride]); + uintptr_t* target_iter2 = &(target_iter1[write_ul_stride]); + uintptr_t* target_iter3 = &(target_iter2[write_ul_stride]); + for (uint32_t dvidx = 0; dvidx != fourword_ct; ++dvidx) { + const VecW loader0 = source_iter[dvidx * 2]; + const VecW loader1 = source_iter[dvidx * 2 + 1]; + + VecW even_nyps0 = loader0 & m2; + VecW even_nyps1 = loader1 & m2; + VecW odd_nyps0 = vecw_srli(loader0, 2) & m2; + VecW odd_nyps1 = vecw_srli(loader1, 2) & m2; + even_nyps0 = even_nyps0 | vecw_srli(even_nyps0, 6); + even_nyps1 = even_nyps1 | vecw_srli(even_nyps1, 6); + odd_nyps0 = odd_nyps0 | vecw_srli(odd_nyps0, 6); + odd_nyps1 = odd_nyps1 | vecw_srli(odd_nyps1, 6); + // Low four bits of even_nyps{0,1}[0], [2], ..., [14] are destined for + // target_iter0; high four bits of those bytes are destined for + // target_iter2. + const VecW even_nyps = vecw_gather_even(even_nyps0, even_nyps1, m8); + const VecW odd_nyps = vecw_gather_even(odd_nyps0, odd_nyps1, m8); + + VecW mod0_nyps = even_nyps & m4; + VecW mod1_nyps = odd_nyps & m4; + VecW mod2_nyps = vecw_srli(even_nyps, 4) & m4; + VecW mod3_nyps = vecw_srli(odd_nyps, 4) & m4; + mod0_nyps = mod0_nyps | vecw_srli(mod0_nyps, 4); + mod1_nyps = mod1_nyps | vecw_srli(mod1_nyps, 4); + mod2_nyps = mod2_nyps | vecw_srli(mod2_nyps, 4); + mod3_nyps = mod3_nyps | vecw_srli(mod3_nyps, 4); + mod0_nyps = vecw_shuffle8(mod0_nyps, gather_even); + mod1_nyps = vecw_shuffle8(mod1_nyps, gather_even); + mod2_nyps = vecw_shuffle8(mod2_nyps, gather_even); + mod3_nyps = vecw_shuffle8(mod3_nyps, gather_even); + target_iter0[dvidx] = vecw_extract64_0(mod0_nyps); + target_iter1[dvidx] = vecw_extract64_0(mod1_nyps); + target_iter2[dvidx] = vecw_extract64_0(mod2_nyps); + target_iter3[dvidx] = vecw_extract64_0(mod3_nyps); + } + source_iter = &(source_iter[32]); + target_iter0 = &(target_iter3[write_ul_stride]); + } +} +# endif #else // !USE_SSE2 # ifdef __LP64__ static_assert(kWordsPerVec == 1, "TransposeNypblock64() needs to be updated."); diff --git a/2.0/include/pgenlib_misc.h b/2.0/include/pgenlib_misc.h index 00bb604e..2835c33e 100644 --- a/2.0/include/pgenlib_misc.h +++ b/2.0/include/pgenlib_misc.h @@ -1,7 +1,7 @@ #ifndef __PGENLIB_MISC_H__ #define __PGENLIB_MISC_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -79,7 +79,7 @@ // 10000 * major + 100 * minor + patch // Exception to CONSTI32, since we want the preprocessor to have access to this // value. Named with all caps as a consequence. -#define PGENLIB_INTERNAL_VERNUM 1912 +#define PGENLIB_INTERNAL_VERNUM 1913 #ifdef __cplusplus namespace plink2 { @@ -337,6 +337,9 @@ HEADER_INLINE uint64_t GetVint64Unsafe(const unsigned char** buf_iterp) { } } +// TODO: make this work properly with kCacheline == 128, then fix other +// transpose functions, etc. + // main batch size CONSTI32(kPglNypTransposeBatch, kNypsPerCacheline); @@ -355,7 +358,8 @@ void TransposeNypblock32(const uintptr_t* read_iter, uint32_t read_ul_stride, ui #endif CONSTI32(kPglNypTransposeBufwords, kPglNypTransposeBufbytes / kBytesPerWord); -// - up to 256x256; vecaligned_buf must have size 16k (64-bit) or 32k (32-bit) +// - single block is up to 256x256 (CACHELINE64) or 512x512 (CACHELINE128) +// - vecaligned_buf must have size 32k (CACHELINE64) or 128k (CACHELINE128) // - does NOT zero out trailing bits, because main application is ind-major-bed // <-> plink2 format conversion, where the zeroing would be undone... // - important: write_iter must be allocated up to at least diff --git a/2.0/include/pgenlib_read.cc b/2.0/include/pgenlib_read.cc index 39650817..82b44247 100644 --- a/2.0/include/pgenlib_read.cc +++ b/2.0/include/pgenlib_read.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -2948,7 +2948,7 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl return kPglRetSuccess; } -PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { +PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { if (!sample_ct) { *difflist_common_geno_ptr = UINT32_MAX; return kPglRetSuccess; @@ -5228,6 +5228,37 @@ PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t return reterr; } +PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { + if (!sample_ct) { + *difflist_common_geno_ptr = UINT32_MAX; + return kPglRetSuccess; + } + const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx); + const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype); + if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) { + PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); + if (unlikely(reterr)) { + return reterr; + } + if (allele_idx) { + if (*difflist_common_geno_ptr == UINT32_MAX) { + GenovecInvertUnsafe(sample_ct, allele_invcountvec); + } else { + const uint32_t orig_common_geno = *difflist_common_geno_ptr; + GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno); + if (orig_common_geno != 3) { + *difflist_common_geno_ptr = 2 - orig_common_geno; + } + } + } + return kPglRetSuccess; + } + *difflist_common_geno_ptr = UINT32_MAX; + PglErr reterr = Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, nullptr, nullptr, nullptr, allele_invcountvec, nullptr); + GenovecInvertUnsafe(sample_ct, allele_invcountvec); + return reterr; +} + // Assumes allele_idx0 < allele_idx1, and allele_idx0 < 2. Rotates hardcalls // such that, if no multiallelic hardcalls are present, 0 = 0/0, 1 = 0/1, // 2 = 1/1, and 3 = anything else. diff --git a/2.0/include/pgenlib_read.h b/2.0/include/pgenlib_read.h index f6740b61..08de1353 100644 --- a/2.0/include/pgenlib_read.h +++ b/2.0/include/pgenlib_read.h @@ -1,7 +1,7 @@ #ifndef __PGENLIB_READ_H__ #define __PGENLIB_READ_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -545,7 +545,12 @@ PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex p // Note that the returned difflist_len can be much larger than // max_simple_difflist_len when the variant is LD-encoded; it's bounded by // 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor). -PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); +// (probable todo: this interface has... rather sharp edges, even relative to +// the rest of this low-level library. Maybe it shouldn't be deleted, but it +// would be better if there was a function that took a max_difflist_len +// parameter, and it was safe for difflist_sample_ids to only be allocated up +// to that length.) +PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); // genocounts[0] = # hom ref, [1] = # het ref, [2] = two alts, [3] = missing PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts); @@ -578,6 +583,14 @@ HEADER_INLINE PglErr PgrGetInv1(const uintptr_t* __restrict sample_include, PgrS return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec); } +PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); + +HEADER_INLINE PglErr PgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { + PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m); + const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts); + return IMPLPgrGetInv1DifflistOrGenovec(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, allele_idx, pgrp, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); +} + PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec); HEADER_INLINE PglErr PgrGet2(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec) { diff --git a/2.0/include/pgenlib_write.cc b/2.0/include/pgenlib_write.cc index d52f4b33..839d3147 100644 --- a/2.0/include/pgenlib_write.cc +++ b/2.0/include/pgenlib_write.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/pgenlib_write.h b/2.0/include/pgenlib_write.h index d7b0d737..c7507c3b 100644 --- a/2.0/include/pgenlib_write.h +++ b/2.0/include/pgenlib_write.h @@ -1,7 +1,7 @@ #ifndef __PGENLIB_WRITE_H__ #define __PGENLIB_WRITE_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_base.cc b/2.0/include/plink2_base.cc index 905287f5..98f598bc 100644 --- a/2.0/include/plink2_base.cc +++ b/2.0/include/plink2_base.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_base.h b/2.0/include/plink2_base.h index 3e672612..0d3a12ee 100644 --- a/2.0/include/plink2_base.h +++ b/2.0/include/plink2_base.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_BASE_H__ #define __PLINK2_BASE_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -106,7 +106,7 @@ // 10000 * major + 100 * minor + patch // Exception to CONSTI32, since we want the preprocessor to have access // to this value. Named with all caps as a consequence. -#define PLINK2_BASE_VERNUM 811 +#define PLINK2_BASE_VERNUM 812 #define _FILE_OFFSET_BITS 64 @@ -943,6 +943,8 @@ HEADER_INLINE VecI8 veci8_set1(char cc) { return VecToI8( _mm256_set1_epi8(cc)); } +// TODO: on ARM, replace most movemask uses: +// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon HEADER_INLINE uint32_t vecw_movemask(VecW vv) { return _mm256_movemask_epi8(WToVec(vv)); } @@ -1606,6 +1608,7 @@ HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) { return R_CAST(uintptr_t, _mm_movepi64_pi64(WToVec(vv))); } +// compiler recognizes this on ARMv8 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) { const __m128i v0 = _mm_srli_si128(WToVec(vv), 8); return R_CAST(uintptr_t, _mm_movepi64_pi64(v0)); @@ -1900,7 +1903,14 @@ CONSTI32(kInt16PerVec, kBytesPerVec / 2); CONSTI32(kFloatPerFVec, kBytesPerFVec / 4); CONSTI32(kDoublePerDVec, kBytesPerDVec / 8); +#if defined(__APPLE__) && defined(__LP64__) && !defined(__x86_64__) +// TODO: make this 128 once that stops breaking code +# define CACHELINE128 +CONSTI32(kCacheline, 128); +#else +# define CACHELINE64 CONSTI32(kCacheline, 64); +#endif CONSTI32(kBitsPerCacheline, kCacheline * CHAR_BIT); CONSTI32(kNypsPerCacheline, kCacheline * 4); @@ -1923,7 +1933,11 @@ CONSTI32(kMaxBytesPerIO, 0x7ffff000); // Maximum size of "dynamically" allocated line load buffer. (This is the // limit that applies to .vcf and similar files.) Inconvenient to go higher // since fgets() takes a int32_t size argument. +#if defined(__APPLE__) && defined(__LP64__) && !defined(__x86_64__) +CONSTI32(kMaxLongLine, 0x7fffff80); +#else CONSTI32(kMaxLongLine, 0x7fffffc0); +#endif static_assert(!(kMaxLongLine % kCacheline), "kMaxLongLine must be a multiple of kCacheline."); #ifdef __APPLE__ diff --git a/2.0/include/plink2_bgzf.cc b/2.0/include/plink2_bgzf.cc index 08539cb5..d1807bb6 100644 --- a/2.0/include/plink2_bgzf.cc +++ b/2.0/include/plink2_bgzf.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_bgzf.h b/2.0/include/plink2_bgzf.h index 48e13975..0d1ea958 100644 --- a/2.0/include/plink2_bgzf.h +++ b/2.0/include/plink2_bgzf.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_BGZF_H__ #define __PLINK2_BGZF_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_bitmap.cc b/2.0/include/plink2_bitmap.cc index e983f456..6da347a5 100644 --- a/2.0/include/plink2_bitmap.cc +++ b/2.0/include/plink2_bitmap.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_bitmap.h b/2.0/include/plink2_bitmap.h index f19defaf..a81fd98e 100644 --- a/2.0/include/plink2_bitmap.h +++ b/2.0/include/plink2_bitmap.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_BITMAP_H__ #define __PLINK2_BITMAP_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_bits.cc b/2.0/include/plink2_bits.cc index 5f4403e6..08ef9de5 100644 --- a/2.0/include/plink2_bits.cc +++ b/2.0/include/plink2_bits.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -2043,8 +2043,8 @@ void Expand1bitTo16(const void* __restrict bytearr, uint32_t input_bit_ct, uint3 #ifdef USE_SSE2 static_assert(kPglBitTransposeBatch == S_CAST(uint32_t, kBitsPerCacheline), "TransposeBitblock64() needs to be updated."); void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_row_ct, uint32_t write_row_ct, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1) { - // We need to perform the equivalent of 9 shuffles (assuming a full-size - // 512x512 bitblock). + // We need to perform the equivalent of 9-10 shuffles (assuming a full-size + // 512x512 or 1024x1024 bitblock). // The first shuffles are performed by the ingestion loop: we write the first // word from every row to buf0, then the second word from every row, etc., // yielding @@ -2052,8 +2052,8 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u // (0,64) ... (0,127) (1,64) ... (1,127) (2,64) ... (511,127) // ... // (0,448) ... (0,511) (1,448) ... (1,511) (2,448) ... (511,511) - // in terms of the original bit positions. - // Since each input row has 8 words, this amounts to 3 shuffles. + // in terms of the original bit positions when kCacheline==64. + // Since each input row has 8-16 words, this amounts to 3-4 shuffles. // // The second step writes // (0,0) (0,1) ... (0,7) (1,0) (1,1) ... (1,7) ... (511,7) @@ -2063,15 +2063,16 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u // to buf1, performing the equivalent of 3 shuffles, and the third step // finishes the transpose using movemask. // - // buf0 and buf1 must both be 32KiB vector-aligned buffers. + // buf0 and buf1 must both be 32KiB vector-aligned buffers when + // kCacheline==64, and 128KiB when kCacheline==128. const uint32_t buf0_row_ct = DivUp(write_row_ct, 64); { uintptr_t* buf0_ul = DowncastVecWToW(buf0); - const uint32_t zfill_ct = (-read_row_ct) & 63; - for (uint32_t bidx = 0; bidx != buf0_row_ct; ++bidx) { - const uintptr_t* read_iter_tmp = &(read_iter[bidx]); - uintptr_t* buf0_row_start = &(buf0_ul[512 * bidx]); + const uint32_t zfill_ct = (-read_row_ct) & (kBitsPerWord - 1); + for (uint32_t ridx = 0; ridx != buf0_row_ct; ++ridx) { + const uintptr_t* read_iter_tmp = &(read_iter[ridx]); + uintptr_t* buf0_row_start = &(buf0_ul[kPglBitTransposeBatch * ridx]); for (uint32_t uii = 0; uii != read_row_ct; ++uii) { buf0_row_start[uii] = *read_iter_tmp; read_iter_tmp = &(read_iter_tmp[read_ul_stride]); @@ -2084,8 +2085,7 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u ZeroWArr(zfill_ct, &(buf0_row_start[read_row_ct])); } } - // Each width-unit corresponds to 64 input rows. - const uint32_t buf_row_xwidth = DivUp(read_row_ct, 64); + const uint32_t write_word_width = DivUp(read_row_ct, 64); { const VecW* buf0_read_iter = buf0; uintptr_t* write_iter0 = DowncastVecWToW(buf1); @@ -2099,19 +2099,19 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u # else const VecW m8 = VCONST_W(kMask00FF); # endif - const uint32_t buf0_row_clwidth = buf_row_xwidth * 8; - for (uint32_t bidx = 0; bidx != buf0_row_ct; ++bidx) { - uintptr_t* write_iter1 = &(write_iter0[64]); - uintptr_t* write_iter2 = &(write_iter1[64]); - uintptr_t* write_iter3 = &(write_iter2[64]); - uintptr_t* write_iter4 = &(write_iter3[64]); - uintptr_t* write_iter5 = &(write_iter4[64]); - uintptr_t* write_iter6 = &(write_iter5[64]); - uintptr_t* write_iter7 = &(write_iter6[64]); - for (uint32_t clidx = 0; clidx != buf0_row_clwidth; ++clidx) { + const uint32_t buf0_row_b64width = write_word_width * 8; + for (uint32_t ridx = 0; ridx != buf0_row_ct; ++ridx) { + uintptr_t* write_iter1 = &(write_iter0[kCacheline]); + uintptr_t* write_iter2 = &(write_iter1[kCacheline]); + uintptr_t* write_iter3 = &(write_iter2[kCacheline]); + uintptr_t* write_iter4 = &(write_iter3[kCacheline]); + uintptr_t* write_iter5 = &(write_iter4[kCacheline]); + uintptr_t* write_iter6 = &(write_iter5[kCacheline]); + uintptr_t* write_iter7 = &(write_iter6[kCacheline]); + for (uint32_t b64idx = 0; b64idx != buf0_row_b64width; ++b64idx) { # ifdef USE_AVX2 - VecW loader0 = buf0_read_iter[clidx * 2]; - VecW loader1 = buf0_read_iter[clidx * 2 + 1]; + VecW loader0 = buf0_read_iter[b64idx * 2]; + VecW loader1 = buf0_read_iter[b64idx * 2 + 1]; // (0,0) (0,1) ... (0,7) (1,0) (1,1) ... (1,7) (2,0) ... (3,7) // -> (0,0) (1,0) (0,1) (1,1) (0,2) .... (1,7) (2,0) (3,0) (2,1) ... loader0 = vecw_shuffle8(loader0, gather_u16s); @@ -2125,19 +2125,19 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u vec_hi = vecw_shuffle8(vec_hi, gather_u32s); const VecW final0145 = vecw_unpacklo32(vec_lo, vec_hi); const VecW final2367 = vecw_unpackhi32(vec_lo, vec_hi); - write_iter0[clidx] = vecw_extract64_0(final0145); - write_iter1[clidx] = vecw_extract64_1(final0145); - write_iter2[clidx] = vecw_extract64_0(final2367); - write_iter3[clidx] = vecw_extract64_1(final2367); - write_iter4[clidx] = vecw_extract64_2(final0145); - write_iter5[clidx] = vecw_extract64_3(final0145); - write_iter6[clidx] = vecw_extract64_2(final2367); - write_iter7[clidx] = vecw_extract64_3(final2367); + write_iter0[b64idx] = vecw_extract64_0(final0145); + write_iter1[b64idx] = vecw_extract64_1(final0145); + write_iter2[b64idx] = vecw_extract64_0(final2367); + write_iter3[b64idx] = vecw_extract64_1(final2367); + write_iter4[b64idx] = vecw_extract64_2(final0145); + write_iter5[b64idx] = vecw_extract64_3(final0145); + write_iter6[b64idx] = vecw_extract64_2(final2367); + write_iter7[b64idx] = vecw_extract64_3(final2367); # else // !USE_AVX2 - VecW loader0 = buf0_read_iter[clidx * 4]; - VecW loader1 = buf0_read_iter[clidx * 4 + 1]; - VecW loader2 = buf0_read_iter[clidx * 4 + 2]; - VecW loader3 = buf0_read_iter[clidx * 4 + 3]; + VecW loader0 = buf0_read_iter[b64idx * 4]; + VecW loader1 = buf0_read_iter[b64idx * 4 + 1]; + VecW loader2 = buf0_read_iter[b64idx * 4 + 2]; + VecW loader3 = buf0_read_iter[b64idx * 4 + 3]; // (0,0) (0,1) ... (0,7) (1,0) (1,1) ... (1,7) // -> (0,0) (1,0) (0,1) (1,1) (0,2) ... (1,7) # ifdef USE_SHUFFLE8 @@ -2166,26 +2166,26 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u VecW final23 = vecw_unpackhi32(lo_0123, hi_0123); VecW final45 = vecw_unpacklo32(lo_4567, hi_4567); VecW final67 = vecw_unpackhi32(lo_4567, hi_4567); - write_iter0[clidx] = vecw_extract64_0(final01); - write_iter1[clidx] = vecw_extract64_1(final01); - write_iter2[clidx] = vecw_extract64_0(final23); - write_iter3[clidx] = vecw_extract64_1(final23); - write_iter4[clidx] = vecw_extract64_0(final45); - write_iter5[clidx] = vecw_extract64_1(final45); - write_iter6[clidx] = vecw_extract64_0(final67); - write_iter7[clidx] = vecw_extract64_1(final67); + write_iter0[b64idx] = vecw_extract64_0(final01); + write_iter1[b64idx] = vecw_extract64_1(final01); + write_iter2[b64idx] = vecw_extract64_0(final23); + write_iter3[b64idx] = vecw_extract64_1(final23); + write_iter4[b64idx] = vecw_extract64_0(final45); + write_iter5[b64idx] = vecw_extract64_1(final45); + write_iter6[b64idx] = vecw_extract64_0(final67); + write_iter7[b64idx] = vecw_extract64_1(final67); # endif // !USE_AVX2 } - buf0_read_iter = &(buf0_read_iter[512 / kWordsPerVec]); - write_iter0 = &(write_iter7[64]); + buf0_read_iter = &(buf0_read_iter[kPglBitTransposeBatch / kWordsPerVec]); + write_iter0 = &(write_iter7[kCacheline]); } } const VecW* buf1_read_iter = buf1; const uint32_t write_v8ui_stride = kVec8thUintPerWord * write_ul_stride; const uint32_t buf1_fullrow_ct = write_row_ct / 8; - const uint32_t buf1_row_vecwidth = buf_row_xwidth * (8 / kWordsPerVec); + const uint32_t buf1_row_vecwidth = write_word_width * (8 / kWordsPerVec); Vec8thUint* write_iter0 = DowncastWToV8(write_iter); - for (uint32_t bidx = 0; bidx != buf1_fullrow_ct; ++bidx) { + for (uint32_t ridx = 0; ridx != buf1_fullrow_ct; ++ridx) { Vec8thUint* write_iter1 = &(write_iter0[write_v8ui_stride]); Vec8thUint* write_iter2 = &(write_iter1[write_v8ui_stride]); Vec8thUint* write_iter3 = &(write_iter2[write_v8ui_stride]); @@ -2211,7 +2211,7 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u loader = vecw_slli(loader, 1); write_iter0[vidx] = vecw_movemask(loader); } - buf1_read_iter = &(buf1_read_iter[64 / kWordsPerVec]); + buf1_read_iter = &(buf1_read_iter[kCacheline / kWordsPerVec]); write_iter0 = &(write_iter7[write_v8ui_stride]); } const uint32_t row_ct_rem = write_row_ct % 8; @@ -2384,9 +2384,10 @@ void TransposeBitblock32(const uintptr_t* read_iter, uintptr_t read_ul_stride, u #ifdef USE_SSE2 void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, VecW* vecaligned_buf) { // Very similar to TransposeNypblock64() in pgenlib_internal. - // vecaligned_buf must be vector-aligned and have size 8k + // vecaligned_buf must be vector-aligned and have size 8k if kCacheline==64, + // 32k if kCacheline==128 const uint32_t buf_row_ct = DivUp(write_batch_size, 8); - // fold the first 4 shuffles into the initial ingestion loop + // fold the first 4-5 shuffles into the initial ingestion loop const uint32_t* initial_read_iter = DowncastKWToU32(read_iter); const uint32_t* initial_read_end = &(initial_read_iter[buf_row_ct]); uint32_t* initial_target_iter = DowncastVecWToU32(vecaligned_buf); @@ -2409,7 +2410,7 @@ void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, u const VecW* source_iter = vecaligned_buf; const VecW m4 = VCONST_W(kMask0F0F); const uint32_t buf_fullrow_ct = write_batch_size / 8; - const uint32_t eightword_ct = DivUp(read_batch_size, 16); + const uint32_t b64width = DivUp(read_batch_size, 16); uintptr_t* target_iter0 = write_iter; uint32_t cur_dst_row_ct = 8; # ifdef USE_SHUFFLE8 @@ -2436,9 +2437,9 @@ void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, u uintptr_t* target_iter5 = &(target_iter4[write_ul_stride]); uintptr_t* target_iter6 = &(target_iter5[write_ul_stride]); uintptr_t* target_iter7 = &(target_iter6[write_ul_stride]); - for (uint32_t dvidx = 0; dvidx != eightword_ct; ++dvidx) { - const VecW loader0 = source_iter[dvidx * 2]; - const VecW loader1 = source_iter[dvidx * 2 + 1]; + for (uint32_t b64idx = 0; b64idx != b64width; ++b64idx) { + const VecW loader0 = source_iter[b64idx * 2]; + const VecW loader1 = source_iter[b64idx * 2 + 1]; VecW even_nybbles0 = loader0 & m4; VecW odd_nybbles0 = vecw_and_notfirst(m4, loader0); VecW even_nybbles1 = loader1 & m4; @@ -2486,28 +2487,28 @@ void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, u // tried using _mm_stream_si64 here, that totally sucked switch (cur_dst_row_ct) { case 8: - target_iter7[dvidx] = vecw_extract64_3(target_odd); + target_iter7[b64idx] = vecw_extract64_3(target_odd); // fall through case 7: - target_iter6[dvidx] = vecw_extract64_3(target_even); + target_iter6[b64idx] = vecw_extract64_3(target_even); // fall through case 6: - target_iter5[dvidx] = vecw_extract64_2(target_odd); + target_iter5[b64idx] = vecw_extract64_2(target_odd); // fall through case 5: - target_iter4[dvidx] = vecw_extract64_2(target_even); + target_iter4[b64idx] = vecw_extract64_2(target_even); // fall through case 4: - target_iter3[dvidx] = vecw_extract64_1(target_odd); + target_iter3[b64idx] = vecw_extract64_1(target_odd); // fall through case 3: - target_iter2[dvidx] = vecw_extract64_1(target_even); + target_iter2[b64idx] = vecw_extract64_1(target_even); // fall through case 2: - target_iter1[dvidx] = vecw_extract64_0(target_odd); + target_iter1[b64idx] = vecw_extract64_0(target_odd); // fall through default: - target_iter0[dvidx] = vecw_extract64_0(target_even); + target_iter0[b64idx] = vecw_extract64_0(target_even); } } source_iter = &(source_iter[(4 * kPglNybbleTransposeBatch) / kBytesPerVec]); @@ -2528,11 +2529,11 @@ void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, u uintptr_t* target_iter5 = &(target_iter4[write_ul_stride]); uintptr_t* target_iter6 = &(target_iter5[write_ul_stride]); uintptr_t* target_iter7 = &(target_iter6[write_ul_stride]); - for (uint32_t qvidx = 0; qvidx != eightword_ct; ++qvidx) { - const VecW loader0 = source_iter[qvidx * 4]; - const VecW loader1 = source_iter[qvidx * 4 + 1]; - const VecW loader2 = source_iter[qvidx * 4 + 2]; - const VecW loader3 = source_iter[qvidx * 4 + 3]; + for (uint32_t b64idx = 0; b64idx != b64width; ++b64idx) { + const VecW loader0 = source_iter[b64idx * 4]; + const VecW loader1 = source_iter[b64idx * 4 + 1]; + const VecW loader2 = source_iter[b64idx * 4 + 2]; + const VecW loader3 = source_iter[b64idx * 4 + 3]; VecW even_nybbles0 = loader0 & m4; VecW odd_nybbles0 = vecw_and_notfirst(m4, loader0); VecW even_nybbles1 = loader1 & m4; @@ -2606,28 +2607,28 @@ void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, u const VecW final57 = vecw_unpackhi32(odd_lo, odd_hi); switch (cur_dst_row_ct) { case 8: - target_iter7[qvidx] = vecw_extract64_1(final57); + target_iter7[b64idx] = vecw_extract64_1(final57); // fall through case 7: - target_iter6[qvidx] = vecw_extract64_1(final46); + target_iter6[b64idx] = vecw_extract64_1(final46); // fall through case 6: - target_iter5[qvidx] = vecw_extract64_0(final57); + target_iter5[b64idx] = vecw_extract64_0(final57); // fall through case 5: - target_iter4[qvidx] = vecw_extract64_0(final46); + target_iter4[b64idx] = vecw_extract64_0(final46); // fall through case 4: - target_iter3[qvidx] = vecw_extract64_1(final13); + target_iter3[b64idx] = vecw_extract64_1(final13); // fall through case 3: - target_iter2[qvidx] = vecw_extract64_1(final02); + target_iter2[b64idx] = vecw_extract64_1(final02); // fall through case 2: - target_iter1[qvidx] = vecw_extract64_0(final13); + target_iter1[b64idx] = vecw_extract64_0(final13); // fall through default: - target_iter0[qvidx] = vecw_extract64_0(final02); + target_iter0[b64idx] = vecw_extract64_0(final02); } } source_iter = &(source_iter[(4 * kPglNybbleTransposeBatch) / kBytesPerVec]); diff --git a/2.0/include/plink2_bits.h b/2.0/include/plink2_bits.h index 7560d436..e81e17e0 100644 --- a/2.0/include/plink2_bits.h +++ b/2.0/include/plink2_bits.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_BITS_H__ #define __PLINK2_BITS_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -514,7 +514,8 @@ uintptr_t PopcountBytesMasked(const void* bitarr, const uintptr_t* mask_arr, uin // TransposeNypblock(), which is more plink-specific, is in pgenlib_misc CONSTI32(kPglBitTransposeBatch, kBitsPerCacheline); CONSTI32(kPglBitTransposeWords, kWordsPerCacheline); -// * Up to 512x512; vecaligned_buf must have size 64k +// * Up to 512x512 (CACHELINE64) or 1024x1024 (CACHELINE128) +// * vecaligned_buf must have size 64k (CACHELINE64) or 256k (CACHELINE128) // * write_iter must be allocated up to at least // RoundUpPow2(write_batch_size, 2) rows // * We use pointers with different types to read from and write to buf0/buf1, @@ -549,8 +550,9 @@ CONSTI32(kPglNybbleTransposeWords, kWordsPerCacheline); CONSTI32(kPglNybbleTransposeBufbytes, (kPglNybbleTransposeBatch * kPglNybbleTransposeBatch) / 2); -// up to 128x128; vecaligned_buf must have size 8k -// now ok for write_iter to not be padded when write_batch_size odd +// * Up to 128x128 (CACHELINE64) or 256x256 (CACHELINE128) +// * vecaligned_buf must have size 8k (CACHELINE64) or 32k (CACHELINE128) +// * Now ok for write_iter to not be padded when write_batch_size odd void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, VecW* vecaligned_buf); #ifdef USE_SSE2 diff --git a/2.0/include/plink2_fmath.cc b/2.0/include/plink2_fmath.cc index 797092ca..d4badc0b 100644 --- a/2.0/include/plink2_fmath.cc +++ b/2.0/include/plink2_fmath.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_fmath.h b/2.0/include/plink2_fmath.h index 25ebbea2..70ffbbc3 100644 --- a/2.0/include/plink2_fmath.h +++ b/2.0/include/plink2_fmath.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_FMATH_H__ #define __PLINK2_FMATH_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_stats.cc b/2.0/include/plink2_stats.cc index f5387aeb..a2acbf6c 100644 --- a/2.0/include/plink2_stats.cc +++ b/2.0/include/plink2_stats.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_stats.h b/2.0/include/plink2_stats.h index 982fd012..1d0c692d 100644 --- a/2.0/include/plink2_stats.h +++ b/2.0/include/plink2_stats.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_STATS_H__ #define __PLINK2_STATS_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_string.cc b/2.0/include/plink2_string.cc index 15d844b6..a323df04 100644 --- a/2.0/include/plink2_string.cc +++ b/2.0/include/plink2_string.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_string.h b/2.0/include/plink2_string.h index 4223b578..ae656e20 100644 --- a/2.0/include/plink2_string.h +++ b/2.0/include/plink2_string.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_STRING_H__ #define __PLINK2_STRING_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_text.cc b/2.0/include/plink2_text.cc index 060595d3..c3894ae2 100644 --- a/2.0/include/plink2_text.cc +++ b/2.0/include/plink2_text.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_text.h b/2.0/include/plink2_text.h index 3c88285a..d2c66307 100644 --- a/2.0/include/plink2_text.h +++ b/2.0/include/plink2_text.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_TEXT_H__ #define __PLINK2_TEXT_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_thread.cc b/2.0/include/plink2_thread.cc index 6f457574..765e86d2 100644 --- a/2.0/include/plink2_thread.cc +++ b/2.0/include/plink2_thread.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_thread.h b/2.0/include/plink2_thread.h index c3d00ddb..3bb8f198 100644 --- a/2.0/include/plink2_thread.h +++ b/2.0/include/plink2_thread.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_THREAD_H__ #define __PLINK2_THREAD_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_zstfile.cc b/2.0/include/plink2_zstfile.cc index bba8192a..495b141c 100644 --- a/2.0/include/plink2_zstfile.cc +++ b/2.0/include/plink2_zstfile.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/include/plink2_zstfile.h b/2.0/include/plink2_zstfile.h index 1b94f612..7b3668e5 100644 --- a/2.0/include/plink2_zstfile.h +++ b/2.0/include/plink2_zstfile.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_ZSTFILE_H__ #define __PLINK2_ZSTFILE_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/pgenlib_ffi_support.h b/2.0/pgenlib_ffi_support.h index 7c81de6b..ba8ae631 100644 --- a/2.0/pgenlib_ffi_support.h +++ b/2.0/pgenlib_ffi_support.h @@ -1,7 +1,7 @@ #ifndef __PGENLIB_FFI_SUPPORT_H__ #define __PGENLIB_FFI_SUPPORT_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2.cc b/2.0/plink2.cc index 4b629c45..b9ed4751 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it @@ -72,10 +72,10 @@ static const char ver_str[] = "PLINK v2.00a6" #elif defined(USE_AOCL) " AMD" #endif - " (12 Dec 2023)"; + " (1 Jan 2024)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same - "" + " " #ifdef NOLAPACK #elif defined(LAPACK_ILP64) @@ -101,16 +101,14 @@ static const char ver_str2[] = #endif " www.cog-genomics.org/plink/2.0/\n" - "(C) 2005-2023 Shaun Purcell, Christopher Chang GNU General Public License v3\n"; + "(C) 2005-2024 Shaun Purcell, Christopher Chang GNU General Public License v3\n"; static const char errstr_append[] = "For more info, try \"" PROG_NAME_STR " --help \" or \"" PROG_NAME_STR " --help | more\".\n"; #ifndef NOLAPACK -static const char notestr_null_calc2[] = "Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,\n--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise, --ld,\n--sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,\n--write-samples, --write-snplist, --make-grm-list, --pca, --glm, --adjust-file,\n--gwas-ssf, --clump, --score-list, --variant-score, --genotyping-rate,\n--pgen-info, --validate, and --zst-decompress.\n\n\"" PROG_NAME_STR " --help | more\" describes all functions.\n"; -// static const char notestr_null_calc2[] = "Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,\n--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise,\n--r2-phased, --sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,\n--write-samples, --write-snplist, --make-grm-list, --pca, --glm, --adjust-file,\n--gwas-ssf, --clump, --score-list, --variant-score, --genotyping-rate,\n--pgen-info, --validate, and --zst-decompress.\n\n\"" PROG_NAME_STR " --help | more\" describes all functions.\n"; +static const char notestr_null_calc2[] = "Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,\n--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise,\n--r2-phased, --sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,\n--write-samples, --write-snplist, --make-grm-list, --pca, --glm, --adjust-file,\n--gwas-ssf, --clump, --score-list, --variant-score, --genotyping-rate,\n--pgen-info, --validate, and --zst-decompress.\n\n\"" PROG_NAME_STR " --help | more\" describes all functions.\n"; #else // no --pca -static const char notestr_null_calc2[] = "Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,\n--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise, --ld,\n--sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,\n--write-samples, --write-snplist, --make-grm-list, --glm, --adjust-file,\n--gwas-ssf, --clump, --score-list, --variant-score, --genotyping-rate,\n--pgen-info, --validate, and --zst-decompress.\n\n\"" PROG_NAME_STR " --help | more\" describes all functions.\n"; -// static const char notestr_null_calc2[] = "Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,\n--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise,\n--r2-phased, --sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,\n--write-samples, --write-snplist, --make-grm-list, --glm, --adjust-file,\n--gwas-ssf, --clump, --score-list, --variant-score, --genotyping-rate,\n--pgen-info, --validate, and --zst-decompress.\n\n\"" PROG_NAME_STR " --help | more\" describes all functions.\n"; +static const char notestr_null_calc2[] = "Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,\n--sample-counts, --missing, --hardy, --het, --fst, --indep-pairwise,\n--r2-phased, --sample-diff, --make-king, --king-cutoff, --pmerge, --pgen-diff,\n--write-samples, --write-snplist, --make-grm-list, --glm, --adjust-file,\n--gwas-ssf, --clump, --score-list, --variant-score, --genotyping-rate,\n--pgen-info, --validate, and --zst-decompress.\n\n\"" PROG_NAME_STR " --help | more\" describes all functions.\n"; #endif // multiallelics-already-joined + terminating null diff --git a/2.0/plink2_adjust.cc b/2.0/plink2_adjust.cc index 15e69250..f80d7b84 100644 --- a/2.0/plink2_adjust.cc +++ b/2.0/plink2_adjust.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_adjust.h b/2.0/plink2_adjust.h index 81e6d12a..16c8cae7 100644 --- a/2.0/plink2_adjust.h +++ b/2.0/plink2_adjust.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_ADJUST_H__ #define __PLINK2_ADJUST_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_cmdline.cc b/2.0/plink2_cmdline.cc index 86a45267..46732b55 100644 --- a/2.0/plink2_cmdline.cc +++ b/2.0/plink2_cmdline.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_cmdline.h b/2.0/plink2_cmdline.h index 2f691d7f..5d2dd317 100644 --- a/2.0/plink2_cmdline.h +++ b/2.0/plink2_cmdline.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_CMDLINE_H__ #define __PLINK2_CMDLINE_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_common.cc b/2.0/plink2_common.cc index 569a6b66..73dfc663 100644 --- a/2.0/plink2_common.cc +++ b/2.0/plink2_common.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_common.h b/2.0/plink2_common.h index f0ae9ed0..7bf004ac 100644 --- a/2.0/plink2_common.h +++ b/2.0/plink2_common.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_COMMON_H__ #define __PLINK2_COMMON_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_compress_stream.cc b/2.0/plink2_compress_stream.cc index b4d96f87..d56dee2a 100644 --- a/2.0/plink2_compress_stream.cc +++ b/2.0/plink2_compress_stream.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_compress_stream.h b/2.0/plink2_compress_stream.h index c1457dba..87a6f48a 100644 --- a/2.0/plink2_compress_stream.h +++ b/2.0/plink2_compress_stream.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_COMPRESS_STREAM_H__ #define __PLINK2_COMPRESS_STREAM_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_data.cc b/2.0/plink2_data.cc index 68dd2640..d5a0b341 100644 --- a/2.0/plink2_data.cc +++ b/2.0/plink2_data.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_data.h b/2.0/plink2_data.h index c0cd5664..f9159f6f 100644 --- a/2.0/plink2_data.h +++ b/2.0/plink2_data.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_DATA_H__ #define __PLINK2_DATA_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_decompress.cc b/2.0/plink2_decompress.cc index c496a3ec..4f25828d 100644 --- a/2.0/plink2_decompress.cc +++ b/2.0/plink2_decompress.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_decompress.h b/2.0/plink2_decompress.h index e0690db2..8af99448 100644 --- a/2.0/plink2_decompress.h +++ b/2.0/plink2_decompress.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_DECOMPRESS_H__ #define __PLINK2_DECOMPRESS_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_export.cc b/2.0/plink2_export.cc index 0814b7c9..b7041151 100644 --- a/2.0/plink2_export.cc +++ b/2.0/plink2_export.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_export.h b/2.0/plink2_export.h index 566f7f40..8eea5983 100644 --- a/2.0/plink2_export.h +++ b/2.0/plink2_export.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_EXPORT_H__ #define __PLINK2_EXPORT_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_export_legacy.cc b/2.0/plink2_export_legacy.cc index 4d763327..4e51a138 100644 --- a/2.0/plink2_export_legacy.cc +++ b/2.0/plink2_export_legacy.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_export_legacy.h b/2.0/plink2_export_legacy.h index b13feb38..0fbce61b 100644 --- a/2.0/plink2_export_legacy.h +++ b/2.0/plink2_export_legacy.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_EXPORT_LEGACY_H__ #define __PLINK2_EXPORT_LEGACY_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_fasta.cc b/2.0/plink2_fasta.cc index ebe4e397..21ef854c 100644 --- a/2.0/plink2_fasta.cc +++ b/2.0/plink2_fasta.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_fasta.h b/2.0/plink2_fasta.h index a695f878..5ada8f8c 100644 --- a/2.0/plink2_fasta.h +++ b/2.0/plink2_fasta.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_FASTA_H__ #define __PLINK2_FASTA_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_filter.cc b/2.0/plink2_filter.cc index 8c05b785..23f6505e 100644 --- a/2.0/plink2_filter.cc +++ b/2.0/plink2_filter.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_filter.h b/2.0/plink2_filter.h index 10e17591..5f0238a3 100644 --- a/2.0/plink2_filter.h +++ b/2.0/plink2_filter.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_FILTER_H__ #define __PLINK2_FILTER_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm.cc b/2.0/plink2_glm.cc index 54a76edc..fa042087 100644 --- a/2.0/plink2_glm.cc +++ b/2.0/plink2_glm.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm.h b/2.0/plink2_glm.h index 8f9e36a3..4487086d 100644 --- a/2.0/plink2_glm.h +++ b/2.0/plink2_glm.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_GLM_H__ #define __PLINK2_GLM_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm_linear.cc b/2.0/plink2_glm_linear.cc index faf27bd1..29b7a88b 100644 --- a/2.0/plink2_glm_linear.cc +++ b/2.0/plink2_glm_linear.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm_linear.h b/2.0/plink2_glm_linear.h index 6af9e759..b065c831 100644 --- a/2.0/plink2_glm_linear.h +++ b/2.0/plink2_glm_linear.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_GLM_LINEAR_H__ #define __PLINK2_GLM_LINEAR_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm_logistic.cc b/2.0/plink2_glm_logistic.cc index 18dc95cc..ea26a49a 100644 --- a/2.0/plink2_glm_logistic.cc +++ b/2.0/plink2_glm_logistic.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm_logistic.h b/2.0/plink2_glm_logistic.h index c53961e9..4c9014f8 100644 --- a/2.0/plink2_glm_logistic.h +++ b/2.0/plink2_glm_logistic.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_GLM_LOGISTIC_H__ #define __PLINK2_GLM_LOGISTIC_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm_shared.cc b/2.0/plink2_glm_shared.cc index bf1d2824..76f78f00 100644 --- a/2.0/plink2_glm_shared.cc +++ b/2.0/plink2_glm_shared.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_glm_shared.h b/2.0/plink2_glm_shared.h index 7b705410..f2190067 100644 --- a/2.0/plink2_glm_shared.h +++ b/2.0/plink2_glm_shared.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_GLM_SHARED_H__ #define __PLINK2_GLM_SHARED_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_help.cc b/2.0/plink2_help.cc index 5003118e..5b1898c9 100644 --- a/2.0/plink2_help.cc +++ b/2.0/plink2_help.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_help.h b/2.0/plink2_help.h index 4d86eeb0..feff240b 100644 --- a/2.0/plink2_help.h +++ b/2.0/plink2_help.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_HELP_H__ #define __PLINK2_HELP_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_import.cc b/2.0/plink2_import.cc index d3f6f7c1..28948abd 100644 --- a/2.0/plink2_import.cc +++ b/2.0/plink2_import.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_import.h b/2.0/plink2_import.h index 063a6d4f..21ee7a21 100644 --- a/2.0/plink2_import.h +++ b/2.0/plink2_import.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_IMPORT_H__ #define __PLINK2_IMPORT_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_import_legacy.cc b/2.0/plink2_import_legacy.cc index b888dc04..04ad39c2 100644 --- a/2.0/plink2_import_legacy.cc +++ b/2.0/plink2_import_legacy.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_import_legacy.h b/2.0/plink2_import_legacy.h index 537e8274..b2377418 100644 --- a/2.0/plink2_import_legacy.h +++ b/2.0/plink2_import_legacy.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_IMPORT_LEGACY_H__ #define __PLINK2_IMPORT_LEGACY_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_ld.cc b/2.0/plink2_ld.cc index f0f4f799..f8b59159 100644 --- a/2.0/plink2_ld.cc +++ b/2.0/plink2_ld.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it @@ -5598,31 +5598,32 @@ ENUM_U31_DEF_END(ClumpJobType); // Unpacked representations: // 1. If not loading dosage: -// a. If not loading phase: {one_bitvec, two_bitvec, nm_bitvec}, -// founder_ctaw words each, i.e. 3 * bitvec_byte_ct -// uint32_t nmaj_ct, ssq?, nm_ct, x_male_nm_ct?, -// x_male_nmaj_ct?, x_male_ssq? for another -// RoundUpPow2({8|12|16|24}, kBytesPerVec) +// a. If not loading phase: +// uint32_t is_sparse, nm_ct, nmaj_ct, ssq: +// RoundUpPow2(16, kBytesPerVec) +// {one_bitvec, two_bitvec, nm_bitvec}: founder_ctaw words each, total +// 3 * bitvec_byte_ct +// uint32_t x_male_nm_ct, x_male_nmaj_ct, x_male_ssq: if chrX, another +// RoundUpPow2(12, kBytesPerVec) +// (sparse representation guaranteed to be no larger than dense) // Male-specific values placed in the back so that the unpacker doesn't // need to distinguish between no-x-male-stats and // x-male-stats-not-needed. // b. If loading phase, also need phasepresent and phaseinfo, founder_ctaw // words each, for a total of 5 * bitvec_byte_ct + -// RoundUpPow2({8|12|16|24}, kBytesPerVec) +// RoundUpPow2(16, kBytesPerVec) + RoundUpPow2(0|12, kBytesPerVec) // 2. If loading dosage: -// a. If not loading phase: {dosage_vec, dosage_het} founder_dosagev_ct -// vectors each; plus nm_bitvec, i.e. -// 2 * dosagevec_byte_ct + bitvec_byte_ct -// uint64_t nmaj_dosage, nmaj_dosage_ssq?, -// uint32_t nm_ct, x_male_nm_ct?, -// uint64_t x_male_nmaj_dosage?, -// x_male_nmaj_dosage_ssq? for another -// RoundUpPow2({12|20|24|40}, kBytesPerVec) +// a. If not loading phase: +// {dosage_vec, dosage_het}: founder_dosagev_ct vectors each; plus +// nm_bitvec, i.e. 2 * dosagevec_byte_ct + bitvec_byte_ct +// uint64_t nmaj_dosage, nmaj_dosage_ssq?, uint32_t nm_ct, +// x_male_nm_ct?, uint64_t x_male_nmaj_dosage?, +// x_male_nmaj_dosage_ssq?: RoundUpPow2({12|20|24|40}, kBytesPerVec) // b. If loading phase, also need main_dphase_deltas, for a total of // 3 * dosagevec_byte_ct + bitvec_byte_ct + RoundUpPow2({12|20|24|40}, // kBytesPerVec) // -// Probable todo: Unpack the index variant in a way that allows r^2 to be +// Possible todo: Unpack the index variant in a way that allows r^2 to be // computed efficiently against other variants *without* unpacking them. // In the HighmemUnpack step, the main thread wants to prepare the next copy of @@ -5688,6 +5689,9 @@ typedef struct ClumpCtxStruct { // load_dosage. unsigned char* unpacked_variants; + // In unphased case, one of these per thread to enable sparse-optimization. + uintptr_t** raregeno_bufs; + uint32_t** difflist_sample_id_bufs; // chrX workspaces // nm_buf: max(founder_nonmale_ct, founder_male_ct) bits // invmask_buf: max(founder_nonmale_ct, founder_male_ct) uint16s @@ -5720,19 +5724,16 @@ typedef struct ClumpCtxStruct { // phase_type and load_dosage are not read from ctx, since they can change as // we try to extend an island group along a single chromosome. uintptr_t UnpackedByteStride(const ClumpCtx* ctx, R2PhaseType phase_type, uint32_t x_exists, uint32_t load_dosage) { - uintptr_t trail_byte_ct; - if (load_dosage) { - trail_byte_ct = (1 + (phase_type == kR2PhaseTypeUnphased)) * sizeof(int64_t) + sizeof(int32_t); - } else { - trail_byte_ct = (2 + (phase_type == kR2PhaseTypeUnphased)) * sizeof(int32_t); - } - trail_byte_ct <<= x_exists; - trail_byte_ct = RoundUpPow2(trail_byte_ct, kBytesPerVec); const uintptr_t bitvec_byte_ct = ctx->bitvec_byte_ct; - if (load_dosage) { - return (1 + phase_type) * ctx->dosagevec_byte_ct + bitvec_byte_ct + trail_byte_ct; + if (!load_dosage) { + uintptr_t stride = RoundUpPow2(16, kBytesPerVec) + (3 + 2 * (phase_type == kR2PhaseTypePresent)) * bitvec_byte_ct; + if (x_exists) { + stride += RoundUpPow2((2 + (phase_type == kR2PhaseTypeUnphased)) * sizeof(int32_t), kBytesPerVec); + } + return stride; } - return (3 + 2 * (phase_type == kR2PhaseTypePresent)) * bitvec_byte_ct + trail_byte_ct; + const uintptr_t trail_byte_ct = RoundUpPow2(((1 + (phase_type == kR2PhaseTypeUnphased)) * sizeof(int64_t) + sizeof(int32_t)) << x_exists, kBytesPerVec); + return (1 + phase_type) * ctx->dosagevec_byte_ct + bitvec_byte_ct + trail_byte_ct; } // does not check multiallelic fields @@ -5756,9 +5757,11 @@ void ClumpPgenVariantIncr(uintptr_t byte_ct, PgenVariant* pgvp) { // x_male_nmaj_ct, x_male_ssq, and x_male_nm_ct only filled if // founder_male_collapsed is non-null. -void LdUnpackNondosage(const PgenVariant* pgvp, const uintptr_t* founder_male_collapsed, uint32_t sample_ct, R2PhaseType phase_type, unsigned char* dst_iter) { +void LdUnpackNondosageDense(const PgenVariant* pgvp, const uintptr_t* founder_male_collapsed, uint32_t sample_ct, R2PhaseType phase_type, unsigned char* dst_iter) { const uintptr_t sample_ctaw = BitCtToAlignedWordCt(sample_ct); - uintptr_t* dst_witer = R_CAST(uintptr_t*, dst_iter); + uint32_t* dst_u32 = R_CAST(uint32_t*, dst_iter); + dst_u32[0] = 0; // is_sparse + uintptr_t* dst_witer = R_CAST(uintptr_t*, &(dst_iter[RoundUpPow2(16, kBytesPerVec)])); uintptr_t* one_bitvec = dst_witer; dst_witer = &(dst_witer[sample_ctaw]); uintptr_t* two_bitvec = dst_witer; @@ -5773,11 +5776,11 @@ void LdUnpackNondosage(const PgenVariant* pgvp, const uintptr_t* founder_male_co phaseinfo = dst_witer; dst_witer = &(dst_witer[sample_ctaw]); } - uint32_t* dst_u32iter = R_CAST(uint32_t*, dst_witer); - uint32_t* nmaj_ct_ptr = dst_u32iter++; GenoarrSplit12Nm(pgvp->genovec, sample_ct, one_bitvec, two_bitvec, nm_bitvec); const uint32_t sample_ctl = BitCtToWordCt(sample_ct); - *nmaj_ct_ptr = GenoBitvecSum(one_bitvec, two_bitvec, sample_ctl); + dst_u32[1] = PopcountWords(nm_bitvec, sample_ctl); + dst_u32[2] = GenoBitvecSum(one_bitvec, two_bitvec, sample_ctl); + dst_u32[3] = 0; // defensive if (phase_type == kR2PhaseTypePresent) { if (!pgvp->phasepresent_ct) { ZeroWArr(sample_ctl, phasepresent); @@ -5786,27 +5789,112 @@ void LdUnpackNondosage(const PgenVariant* pgvp, const uintptr_t* founder_male_co memcpy(phaseinfo, pgvp->phaseinfo, sample_ctl * sizeof(intptr_t)); } } else if (phase_type == kR2PhaseTypeUnphased) { - uint32_t* ssq_ptr = dst_u32iter++; - *ssq_ptr = (*nmaj_ct_ptr) + 2 * PopcountWords(two_bitvec, sample_ctl); + dst_u32[3] = dst_u32[2] + 2 * PopcountWords(two_bitvec, sample_ctl); } - uint32_t* nm_ct_ptr = dst_u32iter; - *nm_ct_ptr = PopcountWords(nm_bitvec, sample_ctl); if (founder_male_collapsed) { - ++dst_u32iter; + uint32_t* dst_x_u32 = R_CAST(uint32_t*, dst_witer); // x_male_nm_ct - *dst_u32iter++ = PopcountWordsIntersect(founder_male_collapsed, nm_bitvec, sample_ctl); - - uint32_t* x_male_nmaj_ct_ptr = dst_u32iter; - *x_male_nmaj_ct_ptr = GenoBitvecSumSubset(founder_male_collapsed, one_bitvec, two_bitvec, sample_ctl); + dst_x_u32[0] = PopcountWordsIntersect(founder_male_collapsed, nm_bitvec, sample_ctl); + dst_x_u32[1] = GenoBitvecSumSubset(founder_male_collapsed, one_bitvec, two_bitvec, sample_ctl); if (phase_type == kR2PhaseTypeUnphased) { // x_male_ssq - dst_u32iter[1] = (*x_male_nmaj_ct_ptr) + 2 * PopcountWordsIntersect(founder_male_collapsed, two_bitvec, sample_ctl); + dst_x_u32[2] = dst_x_u32[1] + 2 * PopcountWordsIntersect(founder_male_collapsed, two_bitvec, sample_ctl); + } + } +} + +// Ok if trailing bits of raregeno aren't clear. +void LdUnpackNondosageSparse(const uintptr_t* raregeno, const uint32_t* difflist_sample_ids, const uintptr_t* founder_male_collapsed, uint32_t sample_ct, uint32_t male_ct, uint32_t difflist_common_geno, uint32_t difflist_len, unsigned char* dst_iter) { + uint32_t* dst_u32 = R_CAST(uint32_t*, dst_iter); + dst_u32[0] = 1; // is_sparse + dst_u32[4] = difflist_common_geno; + dst_u32[5] = difflist_len; + uint32_t nm_ct = sample_ct; + uint32_t nmaj_ct = 0; + uint32_t ssq = 0; + if (difflist_common_geno) { + if (difflist_common_geno == 3) { + nm_ct = difflist_len; + } else { + const uint32_t two_ct = sample_ct - difflist_len; + nmaj_ct = 2 * two_ct; + ssq = 4 * two_ct; + } + } + uintptr_t* raregeno_dst = R_CAST(uintptr_t*, &(dst_iter[RoundUpPow2((6 + difflist_len) * sizeof(int32_t), kBytesPerVec)])); + const uint32_t word_ct = NypCtToWordCt(difflist_len); + if (difflist_len) { + memcpy(&(dst_u32[6]), difflist_sample_ids, difflist_len * sizeof(int32_t)); + memcpy(raregeno_dst, raregeno, word_ct * sizeof(intptr_t)); + ZeroTrailingNyps(difflist_len, raregeno_dst); + STD_ARRAY_DECL(uint32_t, 4, genocounts); + GenoarrCountFreqsUnsafe(raregeno_dst, difflist_len, genocounts); + nm_ct -= genocounts[3]; + nmaj_ct += genocounts[1] + 2 * genocounts[2]; + ssq += genocounts[1] + 4 * genocounts[2]; + } + dst_u32[1] = nm_ct; + dst_u32[2] = nmaj_ct; + dst_u32[3] = ssq; + if (founder_male_collapsed) { + uint32_t x_male_nm_ct = male_ct; + uint32_t x_male_nmaj_ct = 0; + uint32_t x_male_ssq = 0; + if (difflist_common_geno) { + if (difflist_common_geno == 3) { + x_male_nm_ct = 0; + } else { + x_male_nmaj_ct = male_ct * 2; + x_male_ssq = male_ct * 4; + } + } + if (difflist_len) { + const uint32_t word_ct_m1 = word_ct - 1; + uint32_t genocounts[4]; + ZeroU32Arr(4, genocounts); + uint32_t loop_len = kBitsPerWordD2; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(difflist_len, kBitsPerWordD2); + } + const uint32_t* cur_difflist_sample_ids = &(difflist_sample_ids[widx * kBitsPerWordD2]); + uintptr_t raregeno_word = raregeno[widx]; + for (uint32_t uii = 0; uii != loop_len; ++uii) { + const uint32_t sample_idx = cur_difflist_sample_ids[uii]; + if (IsSet(founder_male_collapsed, sample_idx)) { + const uintptr_t cur_geno = raregeno_word & 3; + genocounts[cur_geno] += 1; + } + raregeno_word = raregeno_word >> 2; + } + } + if (difflist_common_geno != 3) { + x_male_nm_ct -= genocounts[3]; + } else { + x_male_nm_ct = genocounts[0] + genocounts[1] + genocounts[2]; + } + if (difflist_common_geno != 2) { + x_male_nmaj_ct = genocounts[1] + 2 * genocounts[2]; + x_male_ssq = genocounts[1] + 4 * genocounts[2]; + } else { + const uint32_t zero_or_missing_ct = genocounts[0] + genocounts[3]; + x_male_nmaj_ct -= genocounts[1] + 2 * zero_or_missing_ct; + x_male_ssq -= genocounts[1] + 4 * zero_or_missing_ct; + } } + const uint32_t sample_ctaw = BitCtToAlignedWordCt(sample_ct); + uint32_t* dst_x_u32 = R_CAST(uint32_t*, &(raregeno_dst[sample_ctaw])); + dst_x_u32[0] = x_male_nm_ct; + dst_x_u32[1] = x_male_nmaj_ct; + dst_x_u32[2] = x_male_ssq; } } static inline uint32_t LdNondosageTrailAlignedByteCt(R2PhaseType phase_type, uint32_t x_exists) { - return RoundUpPow2(((2 + (phase_type == kR2PhaseTypeUnphased)) << x_exists) * sizeof(int32_t), kBytesPerVec); + return x_exists * RoundUpPow2((2 + (phase_type == kR2PhaseTypeUnphased)) * sizeof(int32_t), kBytesPerVec); } static inline uint32_t LdDosageTrailAlignedByteCt(R2PhaseType phase_type, uint32_t x_exists) { @@ -5873,15 +5961,33 @@ void LdUnpackDosage(const PgenVariant* pgvp, const uintptr_t* founder_male_colla } } -typedef struct R2NondosageVariantStruct { +typedef struct R2NondosageDenseStruct { const uintptr_t* one_bitvec; const uintptr_t* two_bitvec; const uintptr_t* nm_bitvec; const uintptr_t* phasepresent; // may be uninitialized const uintptr_t* phaseinfo; // may be uninitialized - uint32_t nmaj_ct; - uint32_t ssq; // may be uninitialized +} R2NondosageDense; + +typedef struct R2NondosageSparseStruct { + uint32_t difflist_common_geno; + uint32_t difflist_len; + const uint32_t* difflist_sample_ids; + const uintptr_t* raregeno; + // probable todo: support phase +} R2NondosageSparse; + +typedef union { + R2NondosageDense d; + R2NondosageSparse s; +} R2NondosagePayload; + +typedef struct R2NondosageVariantStruct { + uint32_t is_sparse; uint32_t nm_ct; + uint32_t nmaj_ct; + uint32_t ssq; // set to 0 unless unphased calc + R2NondosagePayload p; uint32_t x_male_nm_ct; // may be uninitialized uint32_t x_male_nmaj_ct; // may be uninitialized uint32_t x_male_ssq; // may be uninitialized @@ -5912,35 +6018,51 @@ typedef union { } R2Variant; void FillR2Nondosage(const unsigned char* src_iter, uint32_t sample_ct, R2PhaseType phase_type, uint32_t is_x, R2NondosageVariant* ndp) { - // See LdUnpackNondosage(). + // See LdUnpackNondosageDense(). const uint32_t sample_ctaw = BitCtToAlignedWordCt(sample_ct); - const uintptr_t* src_witer = R_CAST(const uintptr_t*, src_iter); - ndp->one_bitvec = src_witer; + const uint32_t* src_u32 = R_CAST(const uint32_t*, src_iter); + const uint32_t is_sparse = src_u32[0]; + ndp->is_sparse = is_sparse; + ndp->nm_ct = src_u32[1]; + ndp->nmaj_ct = src_u32[2]; + ndp->ssq = src_u32[3]; + if (is_sparse) { + R2NondosageSparse* ndsp = &(ndp->p.s); + ndsp->difflist_common_geno = src_u32[4]; + const uint32_t difflist_len = src_u32[5]; + ndsp->difflist_len = difflist_len; + ndsp->difflist_sample_ids = &(src_u32[6]); + const uintptr_t* raregeno = R_CAST(const uintptr_t*, &(src_iter[RoundUpPow2(sizeof(int32_t) * (6 + difflist_len), kBytesPerVec)])); + ndsp->raregeno = raregeno; + if (is_x) { + const uint32_t* src_x_u32 = R_CAST(const uint32_t*, &(raregeno[sample_ctaw])); + ndp->x_male_nm_ct = src_x_u32[0]; + ndp->x_male_nmaj_ct = src_x_u32[1]; + ndp->x_male_ssq = src_x_u32[2]; + } + return; + } + const uintptr_t* src_witer = R_CAST(const uintptr_t*, &(src_iter[RoundUpPow2(16, kBytesPerVec)])); + R2NondosageDense* nddp = &(ndp->p.d); + nddp->one_bitvec = src_witer; src_witer = &(src_witer[sample_ctaw]); - ndp->two_bitvec = src_witer; + nddp->two_bitvec = src_witer; src_witer = &(src_witer[sample_ctaw]); - ndp->nm_bitvec = src_witer; + nddp->nm_bitvec = src_witer; src_witer = &(src_witer[sample_ctaw]); if (phase_type == kR2PhaseTypePresent) { - ndp->phasepresent = src_witer; + nddp->phasepresent = src_witer; src_witer = &(src_witer[sample_ctaw]); - ndp->phaseinfo = src_witer; + nddp->phaseinfo = src_witer; src_witer = &(src_witer[sample_ctaw]); } - const uint32_t* final_u32_iter = R_CAST(const uint32_t*, src_witer); - ndp->nmaj_ct = *final_u32_iter++; - if (phase_type == kR2PhaseTypeUnphased) { - ndp->ssq = *final_u32_iter++; - } - ndp->nm_ct = *final_u32_iter; - if (!is_x) { - return; - } - ++final_u32_iter; - ndp->x_male_nm_ct = *final_u32_iter++; - ndp->x_male_nmaj_ct = *final_u32_iter; - if (phase_type == kR2PhaseTypeUnphased) { - ndp->x_male_ssq = final_u32_iter[1]; + if (is_x) { + const uint32_t* src_x_u32 = R_CAST(const uint32_t*, src_witer); + ndp->x_male_nm_ct = src_x_u32[0]; + ndp->x_male_nmaj_ct = src_x_u32[1]; + if (phase_type == kR2PhaseTypeUnphased) { + ndp->x_male_ssq = src_x_u32[2]; + } } } @@ -6013,6 +6135,7 @@ void ClumpHighmemUnpack(uintptr_t tidx, uint32_t parity, ClumpCtx* ctx) { } const uintptr_t* founder_info = ctx->founder_info; const uint32_t* founder_info_cumulative_popcounts = ctx->founder_info_cumulative_popcounts; + const uint32_t founder_male_ct = ctx->founder_male_ct; const uint32_t is_x = ctx->is_x; const uintptr_t* founder_male_collapsed = is_x? ctx->founder_male_collapsed : nullptr; const Dosage* male_dosage_invmask = is_x? ctx->male_dosage_invmask : nullptr; @@ -6026,6 +6149,14 @@ void ClumpHighmemUnpack(uintptr_t tidx, uint32_t parity, ClumpCtx* ctx) { const R2PhaseType phase_type = S_CAST(R2PhaseType, ctx->phase_type); const uint32_t load_dosage = ctx->load_dosage; const uintptr_t allele_idx_start = IdxToUidxW(observed_alleles, observed_alleles_cumulative_popcounts_w, ctx->allele_widx_start, ctx->allele_widx_end, oaidx); + // todo: try tuning this number + const uint32_t max_simple_difflist_len = founder_ct / 64; + uintptr_t* raregeno = nullptr; + uint32_t* difflist_sample_ids = nullptr; + if (phase_type == kR2PhaseTypeUnphased) { + raregeno = ctx->raregeno_bufs[tidx]; + difflist_sample_ids = ctx->difflist_sample_id_bufs[tidx]; + } uintptr_t allele_idx_base; uintptr_t cur_bits; BitIter1Start(observed_alleles, allele_idx_start, &allele_idx_base, &cur_bits); @@ -6055,18 +6186,34 @@ void ClumpHighmemUnpack(uintptr_t tidx, uint32_t parity, ClumpCtx* ctx) { } LdUnpackDosage(&pgv, founder_male_collapsed, male_dosage_invmask, founder_ct, phase_type, write_iter); } else { - if (phase_type == kR2PhaseTypePresent) { - reterr = PgrGetInv1P(founder_info, pssi, founder_ct, variant_uidx, aidx, pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + if ((phase_type == kR2PhaseTypeUnphased) && (!is_y)) { + uint32_t difflist_common_geno; + uint32_t difflist_len; + reterr = PgrGetInv1DifflistOrGenovec(founder_info, pssi, founder_ct, max_simple_difflist_len, variant_uidx, aidx, pgrp, pgv.genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len); + if (unlikely(reterr)) { + goto ClumpHighmemUnpack_err; + } + if (difflist_common_geno != UINT32_MAX) { + if (difflist_len <= max_simple_difflist_len) { + LdUnpackNondosageSparse(raregeno, difflist_sample_ids, founder_male_collapsed, founder_ct, founder_male_ct, difflist_common_geno, difflist_len, write_iter); + continue; + } + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, founder_ct, difflist_len, pgv.genovec); + } } else { - reterr = PgrGetInv1(founder_info, pssi, founder_ct, variant_uidx, aidx, pgrp, pgv.genovec); - } - if (unlikely(reterr)) { - goto ClumpHighmemUnpack_err; - } - if (is_y) { - InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + if (phase_type == kR2PhaseTypePresent) { + reterr = PgrGetInv1P(founder_info, pssi, founder_ct, variant_uidx, aidx, pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + } else { + reterr = PgrGetInv1(founder_info, pssi, founder_ct, variant_uidx, aidx, pgrp, pgv.genovec); + } + if (unlikely(reterr)) { + goto ClumpHighmemUnpack_err; + } + if (is_y) { + InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + } } - LdUnpackNondosage(&pgv, founder_male_collapsed, founder_ct, phase_type, write_iter); + LdUnpackNondosageDense(&pgv, founder_male_collapsed, founder_ct, phase_type, write_iter); } } return; @@ -6091,12 +6238,181 @@ static inline uint32_t GenoBitvecUnphasedDotprodSubset(const uintptr_t* subset_m return 2 * half_hom_part + hethet_ct; } +uint32_t ComputeR2NondosageUnphased1SparseStats(const R2NondosageVariant* densevp0, const R2NondosageVariant* sparsevp1, uint32_t* nmaj_ct0_ptr, uint32_t* nmaj_ct1_ptr, uint32_t* ssq0_ptr, uint32_t* ssq1_ptr, uint32_t* dotprod_ptr) { + const uint32_t difflist_common_geno = sparsevp1->p.s.difflist_common_geno; + const uint32_t difflist_len = sparsevp1->p.s.difflist_len; + const uint32_t* difflist_sample_ids = sparsevp1->p.s.difflist_sample_ids; + uint32_t nmaj_ct0 = 0; + uint32_t ssq0 = 0; + uint32_t dotprod = 0; + uint32_t valid_obs_ct = 0; + if (difflist_common_geno != 3) { + nmaj_ct0 = densevp0->nmaj_ct; + ssq0 = densevp0->ssq; + dotprod = difflist_common_geno * nmaj_ct0; + valid_obs_ct = densevp0->nm_ct; + } + uint32_t nmaj_ct1 = sparsevp1->nmaj_ct; + uint32_t ssq1 = sparsevp1->ssq; + if (difflist_len) { + // genovec would be more convenient than this representation here, but that + // shouldn't be a big deal. + const uintptr_t* nm_bitvec0 = densevp0->p.d.nm_bitvec; + const uintptr_t* one_bitvec0 = densevp0->p.d.one_bitvec; + const uintptr_t* two_bitvec0 = densevp0->p.d.two_bitvec; + const uintptr_t* raregeno = sparsevp1->p.s.raregeno; + const uint32_t word_ct_m1 = (difflist_len - 1) / kBitsPerWordD2; + uint32_t joint_counts[16]; // low bits = dense geno, high bits = sparse + ZeroU32Arr(16, joint_counts); + uint32_t loop_len = kBitsPerWordD2; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(difflist_len, kBitsPerWordD2); + } + const uint32_t* cur_difflist_sample_ids = &(difflist_sample_ids[widx * kBitsPerWordD2]); + uintptr_t raregeno_word = raregeno[widx]; + for (uint32_t uii = 0; uii != loop_len; ++uii) { + const uintptr_t cur_sparse_geno = raregeno_word & 3; + const uint32_t sample_idx = cur_difflist_sample_ids[uii]; + const uint32_t sample_widx = sample_idx / kBitsPerWord; + const uint32_t sample_idx_lowbits = sample_idx % kBitsPerWord; + const uintptr_t nm_bit = (nm_bitvec0[sample_widx] >> sample_idx_lowbits) & 1; + const uintptr_t one_bit = (one_bitvec0[sample_widx] >> sample_idx_lowbits) & 1; + // "& 3" takes care of this mask + const uintptr_t two_bit_unmasked = two_bitvec0[sample_widx] >> sample_idx_lowbits; + const uintptr_t cur_dense_geno = (nm_bit + one_bit + two_bit_unmasked * 2 - 1) & 3; + joint_counts[cur_dense_geno + cur_sparse_geno * 4] += 1; + raregeno_word = raregeno_word >> 2; + } + } + nmaj_ct1 -= joint_counts[7] + 2 * joint_counts[11]; + ssq1 -= joint_counts[7] + 4 * joint_counts[11]; + if (difflist_common_geno != 3) { + nmaj_ct0 -= joint_counts[13] + 2 * joint_counts[14]; + ssq0 -= joint_counts[13] + 4 * joint_counts[14]; + const uint32_t sparse_missing_dense_nm_ct = joint_counts[12] + joint_counts[13] + joint_counts[14]; + valid_obs_ct -= sparse_missing_dense_nm_ct; + } else { + const uint32_t dense_one_ct = joint_counts[1] + joint_counts[5] + joint_counts[9]; + const uint32_t dense_two_ct = joint_counts[2] + joint_counts[6] + joint_counts[10]; + nmaj_ct0 = dense_one_ct + 2 * dense_two_ct; + ssq0 = dense_one_ct + 4 * dense_two_ct; + const uint32_t dense_missing_sparse_nm_ct = joint_counts[3] + joint_counts[7] + joint_counts[11]; + valid_obs_ct = difflist_len - dense_missing_sparse_nm_ct; + } + if (difflist_common_geno != 2) { + dotprod = joint_counts[5] + 2 * (joint_counts[6] + joint_counts[9]) + 4 * joint_counts[10]; + } else { + dotprod -= joint_counts[5] + 2 * (joint_counts[1] + joint_counts[6] + joint_counts[13]) + 4 * (joint_counts[2] + joint_counts[14]); + } + } + *nmaj_ct0_ptr = nmaj_ct0; + *nmaj_ct1_ptr = nmaj_ct1; + *ssq0_ptr = ssq0; + *ssq1_ptr = ssq1; + *dotprod_ptr = dotprod; + return valid_obs_ct; +} + +uint32_t ComputeR2NondosageUnphased2SparseStats(const R2NondosageVariant* ndp0, const R2NondosageVariant* ndp1, uint32_t sample_ct, uint32_t* nmaj_ct0_ptr, uint32_t* nmaj_ct1_ptr, uint32_t* ssq0_ptr, uint32_t* ssq1_ptr, uint32_t* dotprod_ptr) { + const uint32_t difflist_common_geno0 = ndp0->p.s.difflist_common_geno; + const uint32_t difflist_common_geno1 = ndp1->p.s.difflist_common_geno; + const uint32_t difflist_len0 = ndp0->p.s.difflist_len; + const uint32_t difflist_len1 = ndp1->p.s.difflist_len; + const uint32_t* difflist_sample_ids0 = ndp0->p.s.difflist_sample_ids; + const uint32_t* difflist_sample_ids1 = ndp1->p.s.difflist_sample_ids; + const uintptr_t* raregeno0 = ndp0->p.s.raregeno; + const uintptr_t* raregeno1 = ndp1->p.s.raregeno; + uint32_t joint_counts[16]; // low bits = ndp0, high bits = ndp1 + ZeroU32Arr(16, joint_counts); + uint32_t difflist_idx1 = 0; + uint32_t sample_idx1 = UINT32_MAX; + uintptr_t raregeno_word1 = 0; + if (difflist_len1) { + sample_idx1 = difflist_sample_ids1[0]; + raregeno_word1 = raregeno1[0]; + } + uintptr_t raregeno_word0 = 0; + for (uint32_t difflist_idx0 = 0; ; ++difflist_idx0) { + uint32_t sample_idx0 = UINT32_MAX; + if (difflist_idx0 < difflist_len0) { + sample_idx0 = difflist_sample_ids0[difflist_idx0]; + raregeno_word0 = raregeno_word0 >> 2; + if (!(difflist_idx0 % kBitsPerWordD2)) { + raregeno_word0 = raregeno0[difflist_idx0 / kBitsPerWordD2]; + } + } + while (sample_idx1 < sample_idx0) { + const uintptr_t cur_geno1 = raregeno_word1 & 3; + joint_counts[difflist_common_geno0 + cur_geno1 * 4] += 1; + sample_idx1 = UINT32_MAX; + if (++difflist_idx1 < difflist_len1) { + sample_idx1 = difflist_sample_ids1[difflist_idx1]; + raregeno_word1 = raregeno_word1 >> 2; + if (!(difflist_idx1 % kBitsPerWordD2)) { + raregeno_word1 = raregeno1[difflist_idx1 / kBitsPerWordD2]; + } + } + } + const uintptr_t cur_geno0 = raregeno_word0 & 3; + if (sample_idx0 != sample_idx1) { + joint_counts[cur_geno0 + difflist_common_geno1 * 4] += 1; + continue; + } + if (sample_idx0 == UINT32_MAX) { + break; + } + const uintptr_t cur_geno1 = raregeno_word1 & 3; + joint_counts[cur_geno0 + cur_geno1 * 4] += 1; + sample_idx1 = UINT32_MAX; + if (++difflist_idx1 < difflist_len1) { + sample_idx1 = difflist_sample_ids1[difflist_idx1]; + raregeno_word1 = raregeno_word1 >> 2; + if (!(difflist_idx1 % kBitsPerWordD2)) { + raregeno_word1 = raregeno1[difflist_idx1 / kBitsPerWordD2]; + } + } + } + *nmaj_ct0_ptr = ndp0->nmaj_ct - joint_counts[13] - 2 * joint_counts[14]; + *nmaj_ct1_ptr = ndp1->nmaj_ct - joint_counts[7] - 2 * joint_counts[11]; + *ssq0_ptr = ndp0->ssq - joint_counts[13] - 4 * joint_counts[14]; + *ssq1_ptr = ndp1->ssq - joint_counts[7] - 4 * joint_counts[11]; + uint32_t dotprod = joint_counts[5] + 2 * (joint_counts[6] + joint_counts[9]) + 4 * joint_counts[10]; + if ((difflist_common_geno0 == 2) && (difflist_common_geno1 == 2)) { + const uint32_t sparse0_not1_ct = joint_counts[8] + joint_counts[9] + joint_counts[11]; + dotprod += 4 * (sample_ct - sparse0_not1_ct - difflist_len1); + } + *dotprod_ptr = dotprod; + uint32_t valid_obs_ct; + if (difflist_common_geno1 != 3) { + const uint32_t missing1_nm0_ct = joint_counts[12] + joint_counts[13] + joint_counts[14]; + valid_obs_ct = ndp0->nm_ct - missing1_nm0_ct; + } else { + const uint32_t missing0_nm1_ct = joint_counts[3] + joint_counts[7] + joint_counts[11]; + valid_obs_ct = difflist_len1 - missing0_nm1_ct; + } + return valid_obs_ct; +} + // Main return value is valid_obs_ct. On valid_obs_ct=0, other return values -// are not filled. (Same is true for the next three functions.) +// may not be filled. (Same is true for the next three functions.) uint32_t ComputeR2NondosageUnphasedStats(const R2NondosageVariant* ndp0, const R2NondosageVariant* ndp1, uint32_t sample_ct, uint32_t* nmaj_ct0_ptr, uint32_t* nmaj_ct1_ptr, uint32_t* ssq0_ptr, uint32_t* ssq1_ptr, uint32_t* dotprod_ptr) { + if (ndp0->is_sparse) { + if (ndp1->is_sparse) { + return ComputeR2NondosageUnphased2SparseStats(ndp0, ndp1, sample_ct, nmaj_ct0_ptr, nmaj_ct1_ptr, ssq0_ptr, ssq1_ptr, dotprod_ptr); + } else { + return ComputeR2NondosageUnphased1SparseStats(ndp1, ndp0, nmaj_ct1_ptr, nmaj_ct0_ptr, ssq1_ptr, ssq0_ptr, dotprod_ptr); + } + } + if (ndp1->is_sparse) { + return ComputeR2NondosageUnphased1SparseStats(ndp0, ndp1, nmaj_ct0_ptr, nmaj_ct1_ptr, ssq0_ptr, ssq1_ptr, dotprod_ptr); + } const uint32_t sample_ctl = BitCtToWordCt(sample_ct); - const uintptr_t* nm_bitvec0 = ndp0->nm_bitvec; - const uintptr_t* nm_bitvec1 = ndp1->nm_bitvec; + const uintptr_t* nm_bitvec0 = ndp0->p.d.nm_bitvec; + const uintptr_t* nm_bitvec1 = ndp1->p.d.nm_bitvec; const uint32_t nm_ct0 = ndp0->nm_ct; const uint32_t nm_ct1 = ndp1->nm_ct; uint32_t valid_obs_ct; @@ -6108,8 +6424,8 @@ uint32_t ComputeR2NondosageUnphasedStats(const R2NondosageVariant* ndp0, const R } else { valid_obs_ct = MINV(nm_ct0, nm_ct1); } - const uintptr_t* one_bitvec0 = ndp0->one_bitvec; - const uintptr_t* two_bitvec0 = ndp0->two_bitvec; + const uintptr_t* one_bitvec0 = ndp0->p.d.one_bitvec; + const uintptr_t* two_bitvec0 = ndp0->p.d.two_bitvec; if (nm_ct0 == valid_obs_ct) { *nmaj_ct0_ptr = ndp0->nmaj_ct; *ssq0_ptr = ndp0->ssq; @@ -6119,8 +6435,8 @@ uint32_t ComputeR2NondosageUnphasedStats(const R2NondosageVariant* ndp0, const R // 0, 1, 4 instead of 0, 1, 2 *ssq0_ptr = nmaj_ct0 + 2 * PopcountWordsIntersect(nm_bitvec1, two_bitvec0, sample_ctl); } - const uintptr_t* one_bitvec1 = ndp1->one_bitvec; - const uintptr_t* two_bitvec1 = ndp1->two_bitvec; + const uintptr_t* one_bitvec1 = ndp1->p.d.one_bitvec; + const uintptr_t* two_bitvec1 = ndp1->p.d.two_bitvec; if (nm_ct1 == valid_obs_ct) { *nmaj_ct1_ptr = ndp1->nmaj_ct; *ssq1_ptr = ndp1->ssq; @@ -6137,8 +6453,8 @@ uint32_t ComputeR2NondosagePhasedStats(const R2NondosageVariant* ndp0, const R2N // See HardcallPhasedR2Stats(). Probable todo: make function names more // systematic. const uint32_t sample_ctl = BitCtToWordCt(sample_ct); - const uintptr_t* nm_bitvec0 = ndp0->nm_bitvec; - const uintptr_t* nm_bitvec1 = ndp1->nm_bitvec; + const uintptr_t* nm_bitvec0 = ndp0->p.d.nm_bitvec; + const uintptr_t* nm_bitvec1 = ndp1->p.d.nm_bitvec; const uint32_t nm_ct0 = ndp0->nm_ct; const uint32_t nm_ct1 = ndp1->nm_ct; uint32_t valid_obs_ct; @@ -6150,14 +6466,14 @@ uint32_t ComputeR2NondosagePhasedStats(const R2NondosageVariant* ndp0, const R2N } else { valid_obs_ct = MINV(nm_ct0, nm_ct1); } - const uintptr_t* one_bitvec0 = ndp0->one_bitvec; - const uintptr_t* two_bitvec0 = ndp0->two_bitvec; + const uintptr_t* one_bitvec0 = ndp0->p.d.one_bitvec; + const uintptr_t* two_bitvec0 = ndp0->p.d.two_bitvec; uint32_t nmaj_ct0 = ndp0->nmaj_ct; if (nm_ct0 != valid_obs_ct) { nmaj_ct0 = GenoBitvecSumSubset(nm_bitvec1, one_bitvec0, two_bitvec0, sample_ctl); } - const uintptr_t* one_bitvec1 = ndp1->one_bitvec; - const uintptr_t* two_bitvec1 = ndp1->two_bitvec; + const uintptr_t* one_bitvec1 = ndp1->p.d.one_bitvec; + const uintptr_t* two_bitvec1 = ndp1->p.d.two_bitvec; uint32_t nmaj_ct1 = ndp1->nmaj_ct; if (nm_ct1 != valid_obs_ct) { nmaj_ct1 = GenoBitvecSumSubset(nm_bitvec0, one_bitvec1, two_bitvec1, sample_ctl); @@ -6167,7 +6483,7 @@ uint32_t ComputeR2NondosagePhasedStats(const R2NondosageVariant* ndp0, const R2N GenoBitvecPhasedDotprod(one_bitvec0, two_bitvec0, one_bitvec1, two_bitvec1, sample_ctl, &known_dotprod, &unknown_hethet_ct); if ((phase_type == kR2PhaseTypePresent) && (unknown_hethet_ct != 0)) { // don't bother with no-phase-here optimization for now - HardcallPhasedR2Refine(ndp0->phasepresent, ndp0->phaseinfo, ndp1->phasepresent, ndp1->phaseinfo, sample_ctl, &known_dotprod, &unknown_hethet_ct); + HardcallPhasedR2Refine(ndp0->p.d.phasepresent, ndp0->p.d.phaseinfo, ndp1->p.d.phasepresent, ndp1->p.d.phaseinfo, sample_ctl, &known_dotprod, &unknown_hethet_ct); } nmajsums_d[0] = u31tod(nmaj_ct0); nmajsums_d[1] = u31tod(nmaj_ct1); @@ -6325,12 +6641,180 @@ double ComputeR2(const R2Variant* r2vp0, const R2Variant* r2vp1, uint32_t sample return results[0]; } +uint32_t ComputeR2NondosageUnphased1SparseSubsetStats(const R2NondosageVariant* densevp0, const R2NondosageVariant* sparsevp1, const uintptr_t* sample_include, uint32_t subsetted_nm_ct0, uint32_t* nmaj_ct0_ptr, uint32_t* nmaj_ct1_ptr, uint32_t* ssq0_ptr, uint32_t* ssq1_ptr, uint32_t* dotprod_ptr) { + const uint32_t difflist_common_geno = sparsevp1->p.s.difflist_common_geno; + const uint32_t difflist_len = sparsevp1->p.s.difflist_len; + const uint32_t* difflist_sample_ids = sparsevp1->p.s.difflist_sample_ids; + uint32_t nmaj_ct0 = 0; + uint32_t ssq0 = 0; + uint32_t dotprod = 0; + uint32_t valid_obs_ct = 0; + if (difflist_common_geno != 3) { + nmaj_ct0 = *nmaj_ct0_ptr; + ssq0 = *ssq0_ptr; + dotprod = difflist_common_geno * nmaj_ct0; + valid_obs_ct = subsetted_nm_ct0; + } + if (difflist_len) { + const uintptr_t* nm_bitvec0 = densevp0->p.d.nm_bitvec; + const uintptr_t* one_bitvec0 = densevp0->p.d.one_bitvec; + const uintptr_t* two_bitvec0 = densevp0->p.d.two_bitvec; + const uintptr_t* raregeno = sparsevp1->p.s.raregeno; + const uint32_t word_ct_m1 = (difflist_len - 1) / kBitsPerWordD2; + uint32_t joint_counts[16]; // low bits = dense geno, high bits = sparse + ZeroU32Arr(16, joint_counts); + uint32_t loop_len = kBitsPerWordD2; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(difflist_len, kBitsPerWordD2); + } + const uint32_t* cur_difflist_sample_ids = &(difflist_sample_ids[widx * kBitsPerWordD2]); + uintptr_t raregeno_word = raregeno[widx]; + for (uint32_t uii = 0; uii != loop_len; ++uii) { + const uint32_t sample_idx = cur_difflist_sample_ids[uii]; + const uint32_t sample_widx = sample_idx / kBitsPerWord; + const uint32_t sample_idx_lowbits = sample_idx % kBitsPerWord; + if ((sample_include[sample_widx] >> sample_idx_lowbits) & 1) { + const uintptr_t cur_sparse_geno = raregeno_word & 3; + const uintptr_t nm_bit = (nm_bitvec0[sample_widx] >> sample_idx_lowbits) & 1; + const uintptr_t one_bit = (one_bitvec0[sample_widx] >> sample_idx_lowbits) & 1; + // "& 3" takes care of this mask + const uintptr_t two_bit_unmasked = two_bitvec0[sample_widx] >> sample_idx_lowbits; + const uintptr_t cur_dense_geno = (nm_bit + one_bit + two_bit_unmasked * 2 - 1) & 3; + joint_counts[cur_dense_geno + cur_sparse_geno * 4] += 1; + } + raregeno_word = raregeno_word >> 2; + } + } + *nmaj_ct1_ptr -= joint_counts[7] + 2 * joint_counts[11]; + *ssq1_ptr -= joint_counts[7] + 4 * joint_counts[11]; + if (difflist_common_geno != 3) { + nmaj_ct0 -= joint_counts[13] + 2 * joint_counts[14]; + ssq0 -= joint_counts[13] + 4 * joint_counts[14]; + const uint32_t sparse_missing_dense_nm_ct = joint_counts[12] + joint_counts[13] + joint_counts[14]; + valid_obs_ct -= sparse_missing_dense_nm_ct; + } else { + const uint32_t dense_one_ct = joint_counts[1] + joint_counts[5] + joint_counts[9]; + const uint32_t dense_two_ct = joint_counts[2] + joint_counts[6] + joint_counts[10]; + nmaj_ct0 = dense_one_ct + 2 * dense_two_ct; + ssq0 = dense_one_ct + 4 * dense_two_ct; + const uint32_t dense_zero_ct = joint_counts[0] + joint_counts[4] + joint_counts[8]; + valid_obs_ct = dense_zero_ct + dense_one_ct + dense_two_ct; + } + if (difflist_common_geno != 2) { + dotprod = joint_counts[5] + 2 * (joint_counts[6] + joint_counts[9]) + 4 * joint_counts[10]; + } else { + dotprod -= joint_counts[5] + 2 * (joint_counts[1] + joint_counts[6] + joint_counts[13]) + 4 * (joint_counts[2] + joint_counts[14]); + } + } + *nmaj_ct0_ptr = nmaj_ct0; + *ssq0_ptr = ssq0; + *dotprod_ptr = dotprod; + return valid_obs_ct; +} + +uint32_t ComputeR2NondosageUnphased2SparseSubsetStats(const R2NondosageVariant* ndp0, const R2NondosageVariant* ndp1, const uintptr_t* sample_include, uint32_t sample_ct, uint32_t subsetted_nm_ct0, uint32_t* nmaj_ct0_ptr, uint32_t* nmaj_ct1_ptr, uint32_t* ssq0_ptr, uint32_t* ssq1_ptr, uint32_t* dotprod_ptr) { + const uint32_t difflist_common_geno0 = ndp0->p.s.difflist_common_geno; + const uint32_t difflist_common_geno1 = ndp1->p.s.difflist_common_geno; + const uint32_t difflist_len0 = ndp0->p.s.difflist_len; + const uint32_t difflist_len1 = ndp1->p.s.difflist_len; + const uint32_t* difflist_sample_ids0 = ndp0->p.s.difflist_sample_ids; + const uint32_t* difflist_sample_ids1 = ndp1->p.s.difflist_sample_ids; + const uintptr_t* raregeno0 = ndp0->p.s.raregeno; + const uintptr_t* raregeno1 = ndp1->p.s.raregeno; + uint32_t joint_counts[16]; // low bits = ndp0, high bits = ndp1 + ZeroU32Arr(16, joint_counts); + uint32_t difflist_idx1 = 0; + uint32_t sample_idx1 = UINT32_MAX; + uintptr_t raregeno_word1 = 0; + if (difflist_len1) { + sample_idx1 = difflist_sample_ids1[0]; + raregeno_word1 = raregeno1[0]; + } + uintptr_t raregeno_word0 = 0; + for (uint32_t difflist_idx0 = 0; ; ++difflist_idx0) { + uint32_t sample_idx0 = UINT32_MAX; + if (difflist_idx0 < difflist_len0) { + sample_idx0 = difflist_sample_ids0[difflist_idx0]; + raregeno_word0 = raregeno_word0 >> 2; + if (!(difflist_idx0 % kBitsPerWordD2)) { + raregeno_word0 = raregeno0[difflist_idx0 / kBitsPerWordD2]; + } + } + while (sample_idx1 < sample_idx0) { + const uintptr_t cur_geno1 = raregeno_word1 & 3; + joint_counts[difflist_common_geno0 + cur_geno1 * 4] += IsSet(sample_include, sample_idx1); + sample_idx1 = UINT32_MAX; + if (++difflist_idx1 < difflist_len1) { + sample_idx1 = difflist_sample_ids1[difflist_idx1]; + raregeno_word1 = raregeno_word1 >> 2; + if (!(difflist_idx1 % kBitsPerWordD2)) { + raregeno_word1 = raregeno1[difflist_idx1 / kBitsPerWordD2]; + } + } + } + if (sample_idx0 == UINT32_MAX) { + break; + } + const uint32_t is_set = IsSet(sample_include, sample_idx0); + const uintptr_t cur_geno0 = raregeno_word0 & 3; + if (sample_idx0 != sample_idx1) { + joint_counts[cur_geno0 + difflist_common_geno1 * 4] += is_set; + continue; + } + const uintptr_t cur_geno1 = raregeno_word1 & 3; + joint_counts[cur_geno0 + cur_geno1 * 4] += is_set; + sample_idx1 = UINT32_MAX; + if (++difflist_idx1 < difflist_len1) { + sample_idx1 = difflist_sample_ids1[difflist_idx1]; + raregeno_word1 = raregeno_word1 >> 2; + if (!(difflist_idx1 % kBitsPerWordD2)) { + raregeno_word1 = raregeno1[difflist_idx1 / kBitsPerWordD2]; + } + } + } + *nmaj_ct0_ptr -= joint_counts[13] + 2 * joint_counts[14]; + *nmaj_ct1_ptr -= joint_counts[7] + 2 * joint_counts[11]; + *ssq0_ptr -= joint_counts[13] + 4 * joint_counts[14]; + *ssq1_ptr -= joint_counts[7] + 4 * joint_counts[11]; + uint32_t dotprod = joint_counts[5] + 2 * (joint_counts[6] + joint_counts[9]) + 4 * joint_counts[10]; + if ((difflist_common_geno0 == 2) && (difflist_common_geno1 == 2)) { + uint32_t sparse_union_ct = 0; + for (uint32_t uii = 0; uii != 16; ++uii) { + sparse_union_ct += joint_counts[uii]; + } + dotprod += 4 * (sample_ct - sparse_union_ct); + } + *dotprod_ptr = dotprod; + uint32_t valid_obs_ct; + if (difflist_common_geno1 != 3) { + const uint32_t missing1_nm0_ct = joint_counts[12] + joint_counts[13] + joint_counts[14]; + valid_obs_ct = subsetted_nm_ct0 - missing1_nm0_ct; + } else { + valid_obs_ct = joint_counts[0] + joint_counts[1] + joint_counts[2] + joint_counts[4] + joint_counts[5] + joint_counts[6] + joint_counts[8] + joint_counts[9] + joint_counts[10]; + } + return valid_obs_ct; +} + // nmaj_ct0, nmaj_ct1, ssq0, and ssq1 assumed to be initialized to precomputed // subsetted values. uint32_t ComputeR2NondosageUnphasedSubsetStats(const R2NondosageVariant* ndp0, const R2NondosageVariant* ndp1, const uintptr_t* sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t subsetted_nm_ct0, uint32_t subsetted_nm_ct1, uint32_t* nmaj_ct0_ptr, uint32_t* nmaj_ct1_ptr, uint32_t* ssq0_ptr, uint32_t* ssq1_ptr, uint32_t* dotprod_ptr, uintptr_t* cur_nm_buf) { + if (ndp0->is_sparse) { + if (ndp1->is_sparse) { + return ComputeR2NondosageUnphased2SparseSubsetStats(ndp0, ndp1, sample_include, sample_ct, subsetted_nm_ct0, nmaj_ct0_ptr, nmaj_ct1_ptr, ssq0_ptr, ssq1_ptr, dotprod_ptr); + } else { + return ComputeR2NondosageUnphased1SparseSubsetStats(ndp1, ndp0, sample_include, subsetted_nm_ct1, nmaj_ct1_ptr, nmaj_ct0_ptr, ssq1_ptr, ssq0_ptr, dotprod_ptr); + } + } + if (ndp1->is_sparse) { + return ComputeR2NondosageUnphased1SparseSubsetStats(ndp0, ndp1, sample_include, subsetted_nm_ct0, nmaj_ct0_ptr, nmaj_ct1_ptr, ssq0_ptr, ssq1_ptr, dotprod_ptr); + } const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct); - const uintptr_t* nm_bitvec0 = ndp0->nm_bitvec; - const uintptr_t* nm_bitvec1 = ndp1->nm_bitvec; + const uintptr_t* nm_bitvec0 = ndp0->p.d.nm_bitvec; + const uintptr_t* nm_bitvec1 = ndp1->p.d.nm_bitvec; const uintptr_t* cur_nm; uint32_t valid_obs_ct; if (subsetted_nm_ct0 == sample_ct) { @@ -6359,8 +6843,8 @@ uint32_t ComputeR2NondosageUnphasedSubsetStats(const R2NondosageVariant* ndp0, c } cur_nm = cur_nm_buf; } - const uintptr_t* one_bitvec0 = ndp0->one_bitvec; - const uintptr_t* two_bitvec0 = ndp0->two_bitvec; + const uintptr_t* one_bitvec0 = ndp0->p.d.one_bitvec; + const uintptr_t* two_bitvec0 = ndp0->p.d.two_bitvec; if (subsetted_nm_ct0 != valid_obs_ct) { const uint32_t nmaj_ct0 = GenoBitvecSumSubset(cur_nm, one_bitvec0, two_bitvec0, raw_sample_ctl); *nmaj_ct0_ptr = nmaj_ct0; @@ -6368,8 +6852,8 @@ uint32_t ComputeR2NondosageUnphasedSubsetStats(const R2NondosageVariant* ndp0, c *ssq0_ptr = nmaj_ct0 + 2 * PopcountWordsIntersect(cur_nm, two_bitvec0, raw_sample_ctl); } - const uintptr_t* one_bitvec1 = ndp1->one_bitvec; - const uintptr_t* two_bitvec1 = ndp1->two_bitvec; + const uintptr_t* one_bitvec1 = ndp1->p.d.one_bitvec; + const uintptr_t* two_bitvec1 = ndp1->p.d.two_bitvec; if (subsetted_nm_ct1 != valid_obs_ct) { const uint32_t nmaj_ct1 = GenoBitvecSumSubset(cur_nm, one_bitvec1, two_bitvec1, raw_sample_ctl); *nmaj_ct1_ptr = nmaj_ct1; @@ -6381,8 +6865,8 @@ uint32_t ComputeR2NondosageUnphasedSubsetStats(const R2NondosageVariant* ndp0, c uint32_t ComputeR2NondosagePhasedSubsetStats(const R2NondosageVariant* ndp0, const R2NondosageVariant* ndp1, const uintptr_t* sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t subsetted_nm_ct0, uint32_t subsetted_nm_ct1, uint32_t subsetted_nmaj_ct0, uint32_t subsetted_nmaj_ct1, R2PhaseType phase_type, double* nmajsums_d, double* known_dotprod_d_ptr, double* unknown_hethet_d_ptr, uintptr_t* cur_nm_buf) { const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct); - const uintptr_t* nm_bitvec0 = ndp0->nm_bitvec; - const uintptr_t* nm_bitvec1 = ndp1->nm_bitvec; + const uintptr_t* nm_bitvec0 = ndp0->p.d.nm_bitvec; + const uintptr_t* nm_bitvec1 = ndp1->p.d.nm_bitvec; const uintptr_t* cur_nm; uint32_t valid_obs_ct; if (subsetted_nm_ct0 == sample_ct) { @@ -6410,15 +6894,15 @@ uint32_t ComputeR2NondosagePhasedSubsetStats(const R2NondosageVariant* ndp0, con } cur_nm = cur_nm_buf; } - const uintptr_t* one_bitvec0 = ndp0->one_bitvec; - const uintptr_t* two_bitvec0 = ndp0->two_bitvec; + const uintptr_t* one_bitvec0 = ndp0->p.d.one_bitvec; + const uintptr_t* two_bitvec0 = ndp0->p.d.two_bitvec; uint32_t nmaj_ct0 = subsetted_nmaj_ct0; if (subsetted_nm_ct0 != valid_obs_ct) { nmaj_ct0 = GenoBitvecSumSubset(cur_nm, one_bitvec0, two_bitvec0, raw_sample_ctl); } - const uintptr_t* one_bitvec1 = ndp1->one_bitvec; - const uintptr_t* two_bitvec1 = ndp1->two_bitvec; + const uintptr_t* one_bitvec1 = ndp1->p.d.one_bitvec; + const uintptr_t* two_bitvec1 = ndp1->p.d.two_bitvec; uint32_t nmaj_ct1 = subsetted_nmaj_ct1; if (subsetted_nm_ct1 != valid_obs_ct) { nmaj_ct1 = GenoBitvecSumSubset(cur_nm, one_bitvec1, two_bitvec1, raw_sample_ctl); @@ -6429,7 +6913,7 @@ uint32_t ComputeR2NondosagePhasedSubsetStats(const R2NondosageVariant* ndp0, con GenoBitvecPhasedDotprodSubset(cur_nm, one_bitvec0, two_bitvec0, one_bitvec1, two_bitvec1, raw_sample_ctl, &known_dotprod, &unknown_hethet_ct); if ((phase_type == kR2PhaseTypePresent) && (unknown_hethet_ct != 0)) { // don't bother with no-phase-here optimization for now - HardcallPhasedR2RefineSubset(cur_nm, ndp0->phasepresent, ndp0->phaseinfo, ndp1->phasepresent, ndp1->phaseinfo, raw_sample_ctl, &known_dotprod, &unknown_hethet_ct); + HardcallPhasedR2RefineSubset(cur_nm, ndp0->p.d.phasepresent, ndp0->p.d.phaseinfo, ndp1->p.d.phasepresent, ndp1->p.d.phaseinfo, raw_sample_ctl, &known_dotprod, &unknown_hethet_ct); } nmajsums_d[0] = u31tod(nmaj_ct0); nmajsums_d[1] = u31tod(nmaj_ct1); @@ -6534,7 +7018,22 @@ double ComputeXR2(const R2Variant* r2vp0, const R2Variant* r2vp1, const uintptr_ uint32_t male_ssq0 = ndp0->x_male_ssq; uint32_t male_ssq1 = ndp1->x_male_ssq; uint32_t male_dotprod; + /* + printf("male_nm_ct0: %u\n", ndp0->x_male_nm_ct); + printf("male_nm_ct1: %u\n", ndp1->x_male_nm_ct); + printf("male_nmaj_ct0: %u\n", male_nmaj_ct0); + printf("male_nmaj_ct1: %u\n", male_nmaj_ct1); + printf("male_ssq0: %u\n", male_ssq0); + printf("male_ssq1: %u\n", male_ssq1); + */ male_obs_ct = ComputeR2NondosageUnphasedSubsetStats(ndp0, ndp1, founder_male_collapsed, sample_ct, male_ct, ndp0->x_male_nm_ct, ndp1->x_male_nm_ct, &male_nmaj_ct0, &male_nmaj_ct1, &male_ssq0, &male_ssq1, &male_dotprod, cur_nm_buf); + /* + printf("male_nmaj_ct0: %u\n", male_nmaj_ct0); + printf("male_nmaj_ct1: %u\n", male_nmaj_ct1); + printf("male_ssq0: %u\n", male_ssq0); + printf("male_ssq1: %u\n", male_ssq1); + printf("male_dotprod: %u\n", male_dotprod); + */ const double weighted_obs_ct = u31tod(valid_obs_ct) - male_downwt * u31tod(male_obs_ct); const double weighted_nmaj_ct0 = u31tod(nmaj_ct0) - male_downwt * u31tod(male_nmaj_ct0); @@ -6791,7 +7290,7 @@ void ClumpLowmemR2(uintptr_t tidx, uint32_t thread_ct_p1, uint32_t parity, Clump if (load_dosage) { LdUnpackDosage(&pgv, founder_male_collapsed, male_dosage_invmask, founder_ct, phase_type, unpacked_cur_variant); } else { - LdUnpackNondosage(&pgv, founder_male_collapsed, founder_ct, phase_type, unpacked_cur_variant); + LdUnpackNondosageDense(&pgv, founder_male_collapsed, founder_ct, phase_type, unpacked_cur_variant); } R2Variant cur_r2v; FillR2V(unpacked_cur_variant, founder_ct, phase_type, is_x, load_dosage, &cur_r2v); @@ -7590,6 +8089,16 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c bigstack_alloc_wp(calc_thread_ct + 1, &(ctx.ld_idx_found)))) { goto ClumpReports_ret_NOMEM; } + ctx.raregeno_bufs = nullptr; + ctx.difflist_sample_id_bufs = nullptr; + ctx.chrx_workspaces = nullptr; + const uint32_t phased_r2 = !(flags & kfClumpUnphased); + if (!phased_r2) { + if (unlikely(bigstack_alloc_wp(calc_thread_ct, &ctx.raregeno_bufs) || + bigstack_alloc_u32p(calc_thread_ct, &ctx.difflist_sample_id_bufs))) { + goto ClumpReports_ret_NOMEM; + } + } if (x_exists) { if (unlikely(bigstack_alloc_wp(calc_thread_ct + 1, &ctx.chrx_workspaces))) { goto ClumpReports_ret_NOMEM; @@ -7600,7 +8109,6 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c // PgfiMultiread use case that we fork PgenMtLoadInit()'s computation here // instead of modifying that function. - const uint32_t phased_r2 = !(flags & kfClumpUnphased); const uint32_t all_haploid = IsSet(cip->haploid_mask, 0); const uint32_t check_phase = phased_r2 && (!all_haploid) && (pgfip->gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent)); PgenGlobalFlags effective_gflags = pgfip->gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent); @@ -7619,7 +8127,7 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c unpacked_byte_stride = dosagevec_byte_ct * (1 + phased_r2 + check_phase) + bitvec_byte_ct + dosage_trail_byte_ct; } else { const uintptr_t nondosage_trail_byte_ct = LdNondosageTrailAlignedByteCt(S_CAST(R2PhaseType, phased_r2), x_exists); - unpacked_byte_stride = bitvec_byte_ct * (3 + 2 * check_phase) + nondosage_trail_byte_ct; + unpacked_byte_stride = RoundUpPow2(16, kBytesPerVec) + bitvec_byte_ct * (3 + 2 * check_phase) + nondosage_trail_byte_ct; } const uintptr_t unpacked_byte_stride_cachealign = RoundUpPow2(unpacked_byte_stride, kCacheline); const uintptr_t pgr_struct_alloc = RoundUpPow2(sizeof(PgenReader), kCacheline); @@ -7631,6 +8139,8 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c // memory writes. I'm guessing that is also a reasonable unpacked_variants // shard size to aim for. const uint32_t min_pgv_per_thread = 1 + 4194303 / pgv_byte_stride; + uintptr_t sparse_alloc = 0; + uint32_t raregeno_word_ct = 0; uintptr_t chrx_alloc = 0; if (x_exists) { const uint32_t larger_half = MAXV(founder_male_ct, founder_nonmale_ct); @@ -7642,6 +8152,13 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c const uintptr_t phasepresent_alloc = check_phase * min_u32_alloc; const uintptr_t dosage_ct_alloc = check_dosage * min_u32_alloc; const uintptr_t dphase_ct_alloc = dosage_ct_alloc * check_phase; + if (!phased_r2) { + const uint32_t max_returned_difflist_len = 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor); + const uintptr_t raregeno_vec_ct = DivUp(max_returned_difflist_len, kNypsPerVec); + const uintptr_t difflist_sample_id_vec_ct = DivUp(max_returned_difflist_len, kInt32PerVec); + sparse_alloc = RoundUpPow2((raregeno_vec_ct + difflist_sample_id_vec_ct) * kBytesPerVec, kCacheline); + raregeno_word_ct = raregeno_vec_ct * kWordsPerVec; + } uintptr_t bytes_avail = bigstack_left(); const uintptr_t more_base_alloc = pgr_struct_alloc + pgr_alloc_cacheline_ct * kCacheline + 2 * unpacked_byte_stride_cachealign + min_ld_idx_found_alloc + phasepresent_alloc + dosage_ct_alloc + dphase_ct_alloc + chrx_alloc; @@ -7650,7 +8167,7 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c } bytes_avail -= more_base_alloc; - const uintptr_t per_thread_target_alloc = 2 * min_pgv_per_thread * pgv_byte_stride + unpacked_byte_stride_cachealign + pgr_struct_alloc + pgr_alloc_cacheline_ct * kCacheline + min_ld_idx_found_alloc + chrx_alloc; + const uintptr_t per_thread_target_alloc = 2 * min_pgv_per_thread * pgv_byte_stride + unpacked_byte_stride_cachealign + pgr_struct_alloc + pgr_alloc_cacheline_ct * kCacheline + min_ld_idx_found_alloc + sparse_alloc + chrx_alloc; if (bytes_avail < per_thread_target_alloc * calc_thread_ct) { calc_thread_ct = bytes_avail / per_thread_target_alloc; if (unlikely(!calc_thread_ct)) { @@ -7676,6 +8193,11 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c // shouldn't be possible for this to fail PgrInit(nullptr, 0, pgfip, ctx.pgr_ptrs[tidx], pgr_alloc); + if (!phased_r2) { + uintptr_t* raregeno_buf = S_CAST(uintptr_t*, bigstack_end_alloc_raw(sparse_alloc)); + ctx.raregeno_bufs[tidx] = raregeno_buf; + ctx.difflist_sample_id_bufs[tidx] = R_CAST(uint32_t*, &(raregeno_buf[raregeno_word_ct])); + } if (x_exists) { ctx.chrx_workspaces[tidx] = S_CAST(uintptr_t*, bigstack_end_alloc_raw(chrx_alloc)); } @@ -8187,7 +8709,7 @@ PglErr ClumpReports(const uintptr_t* orig_variant_include, const ChrInfo* cip, c if (load_dosage) { LdUnpackDosage(&pgv, cur_founder_male_collapsed, cur_male_dosage_invmask, founder_ct, phase_type, lowmem_unpacked_variants); } else { - LdUnpackNondosage(&pgv, cur_founder_male_collapsed, founder_ct, phase_type, lowmem_unpacked_variants); + LdUnpackNondosageDense(&pgv, cur_founder_male_collapsed, founder_ct, phase_type, lowmem_unpacked_variants); } index_variant_needed = 0; } @@ -9166,6 +9688,17 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con const R2PhaseType phase_type = GetR2PhaseType(phased_calc, check_phase); const uint32_t check_dosage = (effective_gflags / kfPgenGlobalDosagePresent) & 1; + uintptr_t* raregeno = nullptr; + uint32_t* difflist_sample_ids = nullptr; + const uint32_t max_simple_difflist_len = founder_ct / 64; + if (!phased_calc) { + const uint32_t max_returned_difflist_len = 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor); + if (unlikely(bigstack_alloc_w(NypCtToWordCt(max_returned_difflist_len), &raregeno) || + bigstack_alloc_u32(max_returned_difflist_len, &difflist_sample_ids))) { + goto VcorMatrix_ret_NOMEM; + } + } + // create abbreviated chromosome-view for use by worker threads. // they may need to know where each chromosome starts/ends (use phase or // not?), and which chromosome is X; that's it @@ -9297,7 +9830,7 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con unpacked_variant_byte_stride = dosagevec_byte_ct * (1 + phased_calc + check_phase) + bitvec_byte_ct + dosage_trail_byte_ct; } else { const uintptr_t nondosage_trail_byte_ct = LdNondosageTrailAlignedByteCt(S_CAST(R2PhaseType, phased_calc), x_exists); - unpacked_variant_byte_stride = bitvec_byte_ct * (3 + 2 * check_phase) + nondosage_trail_byte_ct; + unpacked_variant_byte_stride = RoundUpPow2(16, kBytesPerVec) + bitvec_byte_ct * (3 + 2 * check_phase) + nondosage_trail_byte_ct; } uint32_t* founder_info_cumulative_popcounts; PgenVariant pgv; @@ -9449,7 +9982,7 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con row_window_size = row_variant_idx_stop - cur_row_variant_idx_start; } unsigned char* row_load_iter = ctx.unpacked_row_variants[row_parity]; - for (uint32_t vidx_offset = 0; vidx_offset != row_window_size; ++vidx_offset) { + for (uint32_t vidx_offset = 0; vidx_offset != row_window_size; ++vidx_offset, row_load_iter = &(row_load_iter[unpacked_variant_byte_stride])) { const uint32_t variant_uidx = BitIter1(variant_include, &row_variant_uidx_base, &row_cur_bits); if (variant_uidx >= row_chr_end) { do { @@ -9466,6 +9999,7 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con } } const AlleleCode aidx = maj_alleles[variant_uidx]; + const uint32_t is_y = (variant_uidx < y_end) && (variant_uidx >= y_start); if (check_dosage) { if (row_read_phase) { reterr = PgrGetInv1Dp(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, &pgv); @@ -9475,25 +10009,40 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con if (unlikely(reterr)) { goto VcorMatrix_ret_PGR_FAIL; } - if ((variant_uidx < y_end) && (variant_uidx >= y_start)) { + if (is_y) { InterleavedSetMissingCleardosage(founder_female_collapsed, founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec, &pgv.dosage_ct, pgv.dosage_present, pgv.dosage_main); } LdUnpackDosage(&pgv, founder_male_collapsed, male_dosage_invmask, founder_ct, phase_type, row_load_iter); } else { - if (row_read_phase) { - reterr = PgrGetInv1P(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + if ((!phased_calc) && (!is_y)) { + uint32_t difflist_common_geno; + uint32_t difflist_len; + reterr = PgrGetInv1DifflistOrGenovec(founder_info, pssi, founder_ct, max_simple_difflist_len, variant_uidx, aidx, simple_pgrp, pgv.genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len); + if (unlikely(reterr)) { + goto VcorMatrix_ret_PGR_FAIL; + } + if (difflist_common_geno != UINT32_MAX) { + if (difflist_len <= max_simple_difflist_len) { + LdUnpackNondosageSparse(raregeno, difflist_sample_ids, founder_male_collapsed, founder_ct, founder_male_ct, difflist_common_geno, difflist_len, row_load_iter); + continue; + } + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, founder_ct, difflist_len, pgv.genovec); + } } else { - reterr = PgrGetInv1(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec); - } - if (unlikely(reterr)) { - goto VcorMatrix_ret_PGR_FAIL; - } - if ((variant_uidx < y_end) && (variant_uidx >= y_start)) { - InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + if (row_read_phase) { + reterr = PgrGetInv1P(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + } else { + reterr = PgrGetInv1(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec); + } + if (unlikely(reterr)) { + goto VcorMatrix_ret_PGR_FAIL; + } + if (is_y) { + InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + } } - LdUnpackNondosage(&pgv, founder_male_collapsed, founder_ct, phase_type, row_load_iter); + LdUnpackNondosageDense(&pgv, founder_male_collapsed, founder_ct, phase_type, row_load_iter); } - row_load_iter = &(row_load_iter[unpacked_variant_byte_stride]); } const uint32_t cur_row_variant_idx_stop = cur_row_variant_idx_start + row_window_size; if (triangle_calc) { @@ -9513,7 +10062,7 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con } unsigned char* col_load_iter = ctx.unpacked_col_variants[col_parity]; // possible todo: don't duplicate already-unpacked row variants - for (uint32_t vidx_offset = 0; vidx_offset != col_window_size; ++vidx_offset) { + for (uint32_t vidx_offset = 0; vidx_offset != col_window_size; ++vidx_offset, col_load_iter = &(col_load_iter[unpacked_variant_byte_stride])) { const uint32_t variant_uidx = BitIter1(variant_include, &col_variant_uidx_base, &col_cur_bits); if (variant_uidx >= col_chr_end) { do { @@ -9530,6 +10079,7 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con } } const AlleleCode aidx = maj_alleles[variant_uidx]; + const uint32_t is_y = (variant_uidx < y_end) && (variant_uidx >= y_start); if (check_dosage) { if (col_read_phase) { reterr = PgrGetInv1Dp(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, &pgv); @@ -9539,25 +10089,40 @@ PglErr VcorMatrix(const uintptr_t* orig_variant_include, const ChrInfo* cip, con if (unlikely(reterr)) { goto VcorMatrix_ret_PGR_FAIL; } - if ((variant_uidx < y_end) && (variant_uidx >= y_start)) { + if (is_y) { InterleavedSetMissingCleardosage(founder_female_collapsed, founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec, &pgv.dosage_ct, pgv.dosage_present, pgv.dosage_main); } LdUnpackDosage(&pgv, founder_male_collapsed, male_dosage_invmask, founder_ct, phase_type, col_load_iter); } else { - if (col_read_phase) { - reterr = PgrGetInv1P(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + if ((!phased_calc) && (!is_y)) { + uint32_t difflist_common_geno; + uint32_t difflist_len; + reterr = PgrGetInv1DifflistOrGenovec(founder_info, pssi, founder_ct, max_simple_difflist_len, variant_uidx, aidx, simple_pgrp, pgv.genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len); + if (unlikely(reterr)) { + goto VcorMatrix_ret_PGR_FAIL; + } + if (difflist_common_geno != UINT32_MAX) { + if (difflist_len <= max_simple_difflist_len) { + LdUnpackNondosageSparse(raregeno, difflist_sample_ids, founder_male_collapsed, founder_ct, founder_male_ct, difflist_common_geno, difflist_len, col_load_iter); + continue; + } + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, founder_ct, difflist_len, pgv.genovec); + } } else { - reterr = PgrGetInv1(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec); - } - if (unlikely(reterr)) { - goto VcorMatrix_ret_PGR_FAIL; - } - if ((variant_uidx < y_end) && (variant_uidx >= y_start)) { - InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + if (col_read_phase) { + reterr = PgrGetInv1P(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + } else { + reterr = PgrGetInv1(founder_info, pssi, founder_ct, variant_uidx, aidx, simple_pgrp, pgv.genovec); + } + if (unlikely(reterr)) { + goto VcorMatrix_ret_PGR_FAIL; + } + if (is_y) { + InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + } } - LdUnpackNondosage(&pgv, founder_male_collapsed, founder_ct, phase_type, col_load_iter); + LdUnpackNondosageDense(&pgv, founder_male_collapsed, founder_ct, phase_type, col_load_iter); } - col_load_iter = &(col_load_iter[unpacked_variant_byte_stride]); } const uint32_t cur_col_variant_idx_stop = cur_col_variant_idx_start + col_window_size; job_done = next_job_done; @@ -9951,12 +10516,14 @@ THREAD_FUNC_DECL VcorTableWriteThread(void* raw_arg) { ++row_chr_fo_idx; row_chr_end = cip->chr_fo_vidx_start[row_chr_fo_idx + 1]; } while (row_variant_uidx >= row_chr_end); - const uint32_t row_chr_idx = cip->chr_file_order[row_chr_fo_idx]; - char* chr_name_end = chrtoa(cip, row_chr_idx, row_chr_buf); - *chr_name_end = '\t'; - row_chr_blen = 1 + S_CAST(uintptr_t, chr_name_end - row_chr_buf); - // row_chr_buf == col_chr_buf except in inter_chr case - col_chr_blen = row_chr_blen; + if (row_chr_buf) { + const uint32_t row_chr_idx = cip->chr_file_order[row_chr_fo_idx]; + char* chr_name_end = chrtoa(cip, row_chr_idx, row_chr_buf); + *chr_name_end = '\t'; + row_chr_blen = 1 + S_CAST(uintptr_t, chr_name_end - row_chr_buf); + // row_chr_buf == col_chr_buf except in inter_chr case + col_chr_blen = row_chr_blen; + } } uintptr_t row_allele_idx_offset_base = row_variant_uidx * 2; if (allele_idx_offsets) { @@ -10005,10 +10572,12 @@ THREAD_FUNC_DECL VcorTableWriteThread(void* raw_arg) { ++col_chr_fo_idx; col_chr_end = cip->chr_fo_vidx_start[col_chr_fo_idx + 1]; } while (col_variant_uidx >= col_chr_end); - const uint32_t col_chr_idx = cip->chr_file_order[col_chr_fo_idx]; - char* chr_name_end = chrtoa(cip, col_chr_idx, col_chr_buf); - *chr_name_end = '\t'; - col_chr_blen = 1 + S_CAST(uintptr_t, chr_name_end - col_chr_buf); + if (col_chr_buf) { + const uint32_t col_chr_idx = cip->chr_file_order[col_chr_fo_idx]; + char* chr_name_end = chrtoa(cip, col_chr_idx, col_chr_buf); + *chr_name_end = '\t'; + col_chr_blen = 1 + S_CAST(uintptr_t, chr_name_end - col_chr_buf); + } } if (chr_col) { cswritep = memcpya(cswritep, row_chr_buf, row_chr_blen); @@ -10536,6 +11105,17 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons const R2PhaseType phase_type = GetR2PhaseType(phased_calc, check_phase); const uint32_t check_dosage = (effective_gflags / kfPgenGlobalDosagePresent) & 1; + uintptr_t* raregeno = nullptr; + uint32_t* difflist_sample_ids = nullptr; + const uint32_t max_simple_difflist_len = founder_ct / 64; + if (!phased_calc) { + const uint32_t max_returned_difflist_len = 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor); + if (unlikely(bigstack_alloc_w(NypCtToWordCt(max_returned_difflist_len), &raregeno) || + bigstack_alloc_u32(max_returned_difflist_len, &difflist_sample_ids))) { + goto VcorTable_ret_NOMEM; + } + } + const uintptr_t* founder_male_collapsed = nullptr; const Dosage* male_dosage_invmask = nullptr; ctx.founder_nonmale_collapsed = nullptr; @@ -10629,7 +11209,7 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons unpacked_variant_byte_stride = dosagevec_byte_ct * (1 + phased_calc + check_phase) + bitvec_byte_ct + dosage_trail_byte_ct; } else { const uintptr_t nondosage_trail_byte_ct = LdNondosageTrailAlignedByteCt(S_CAST(R2PhaseType, phased_calc), x_exists); - unpacked_variant_byte_stride = bitvec_byte_ct * (3 + 2 * check_phase) + nondosage_trail_byte_ct; + unpacked_variant_byte_stride = RoundUpPow2(16, kBytesPerVec) + bitvec_byte_ct * (3 + 2 * check_phase) + nondosage_trail_byte_ct; } uint32_t* founder_info_cumulative_popcounts; PgenVariant pgv; @@ -10836,7 +11416,7 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons uint32_t row_offset = 0; // in inter-chr case, should save col-window size // if row-subset, start at variant_idx=0 instead of at diagonal - for (; row_offset != row_offset_limit; ++row_offset) { + for (; row_offset != row_offset_limit; ++row_offset, row_load_iter = &(row_load_iter[unpacked_variant_byte_stride])) { const uint32_t row_variant_uidx = BitIter1(row_variant_include, &row_variant_uidx_base, &row_cur_bits); if (row_variant_uidx >= row_chr_end) { do { @@ -10889,6 +11469,7 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons write_idx = next_write_idx; const AlleleCode aidx = maj_alleles[row_variant_uidx]; + const uint32_t is_y = (row_variant_uidx < y_end) && (row_variant_uidx >= y_start); if (check_dosage) { if (row_read_phase) { reterr = PgrGetInv1Dp(founder_info, pssi, founder_ct, row_variant_uidx, aidx, simple_pgrp, &pgv); @@ -10898,25 +11479,40 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons if (unlikely(reterr)) { goto VcorTable_ret_PGR_FAIL; } - if ((row_variant_uidx < y_end) && (row_variant_uidx >= y_start)) { + if (is_y) { InterleavedSetMissingCleardosage(founder_female_collapsed, founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec, &pgv.dosage_ct, pgv.dosage_present, pgv.dosage_main); } LdUnpackDosage(&pgv, founder_male_collapsed, male_dosage_invmask, founder_ct, phase_type, row_load_iter); } else { - if (row_read_phase) { - reterr = PgrGetInv1P(founder_info, pssi, founder_ct, row_variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + if ((!phased_calc) && (!is_y)) { + uint32_t difflist_common_geno; + uint32_t difflist_len; + reterr = PgrGetInv1DifflistOrGenovec(founder_info, pssi, founder_ct, max_simple_difflist_len, row_variant_uidx, aidx, simple_pgrp, pgv.genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len); + if (unlikely(reterr)) { + goto VcorTable_ret_PGR_FAIL; + } + if (difflist_common_geno != UINT32_MAX) { + if (difflist_len <= max_simple_difflist_len) { + LdUnpackNondosageSparse(raregeno, difflist_sample_ids, founder_male_collapsed, founder_ct, founder_male_ct, difflist_common_geno, difflist_len, row_load_iter); + continue; + } + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, founder_ct, difflist_len, pgv.genovec); + } } else { - reterr = PgrGetInv1(founder_info, pssi, founder_ct, row_variant_uidx, aidx, simple_pgrp, pgv.genovec); - } - if (unlikely(reterr)) { - goto VcorTable_ret_PGR_FAIL; - } - if ((row_variant_uidx < y_end) && (row_variant_uidx >= y_start)) { - InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + if (row_read_phase) { + reterr = PgrGetInv1P(founder_info, pssi, founder_ct, row_variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + } else { + reterr = PgrGetInv1(founder_info, pssi, founder_ct, row_variant_uidx, aidx, simple_pgrp, pgv.genovec); + } + if (unlikely(reterr)) { + goto VcorTable_ret_PGR_FAIL; + } + if (is_y) { + InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + } } - LdUnpackNondosage(&pgv, founder_male_collapsed, founder_ct, phase_type, row_load_iter); + LdUnpackNondosageDense(&pgv, founder_male_collapsed, founder_ct, phase_type, row_load_iter); } - row_load_iter = &(row_load_iter[unpacked_variant_byte_stride]); } row_window_size = row_offset; write_idx_starts[row_window_size] = write_idx; @@ -10991,6 +11587,7 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons col_uvidxs[col_offset] = col_uvidx++; col_chr_idxs[col_offset] = col_chr_idx; const AlleleCode aidx = maj_alleles[col_variant_uidx]; + const uint32_t is_y = (col_variant_uidx < y_end) && (col_variant_uidx >= y_start); if (check_dosage) { if (col_read_phase) { reterr = PgrGetInv1Dp(founder_info, pssi, founder_ct, col_variant_uidx, aidx, simple_pgrp, &pgv); @@ -11000,24 +11597,41 @@ PglErr VcorTable(const uintptr_t* orig_variant_include, const ChrInfo* cip, cons if (unlikely(reterr)) { goto VcorTable_ret_PGR_FAIL; } - if ((col_variant_uidx < y_end) && (col_variant_uidx >= y_start)) { + if (is_y) { InterleavedSetMissingCleardosage(founder_female_collapsed, founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec, &pgv.dosage_ct, pgv.dosage_present, pgv.dosage_main); } LdUnpackDosage(&pgv, founder_male_collapsed, male_dosage_invmask, founder_ct, phase_type, col_load_iter); } else { - if (col_read_phase) { - reterr = PgrGetInv1P(founder_info, pssi, founder_ct, col_variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + if ((!phased_calc) && (!is_y)) { + uint32_t difflist_common_geno; + uint32_t difflist_len; + reterr = PgrGetInv1DifflistOrGenovec(founder_info, pssi, founder_ct, max_simple_difflist_len, col_variant_uidx, aidx, simple_pgrp, pgv.genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len); + if (unlikely(reterr)) { + goto VcorTable_ret_PGR_FAIL; + } + if (difflist_common_geno != UINT32_MAX) { + if (difflist_len <= max_simple_difflist_len) { + LdUnpackNondosageSparse(raregeno, difflist_sample_ids, founder_male_collapsed, founder_ct, founder_male_ct, difflist_common_geno, difflist_len, col_load_iter); + goto VcorTable_col_finish; + } + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, founder_ct, difflist_len, pgv.genovec); + } } else { - reterr = PgrGetInv1(founder_info, pssi, founder_ct, col_variant_uidx, aidx, simple_pgrp, pgv.genovec); - } - if (unlikely(reterr)) { - goto VcorTable_ret_PGR_FAIL; - } - if ((col_variant_uidx < y_end) && (col_variant_uidx >= y_start)) { - InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + if (col_read_phase) { + reterr = PgrGetInv1P(founder_info, pssi, founder_ct, col_variant_uidx, aidx, simple_pgrp, pgv.genovec, pgv.phasepresent, pgv.phaseinfo, &pgv.phasepresent_ct); + } else { + reterr = PgrGetInv1(founder_info, pssi, founder_ct, col_variant_uidx, aidx, simple_pgrp, pgv.genovec); + } + if (unlikely(reterr)) { + goto VcorTable_ret_PGR_FAIL; + } + if (is_y) { + InterleavedSetMissing(founder_female_collapsed_interleaved, founder_ctv2, pgv.genovec); + } } - LdUnpackNondosage(&pgv, founder_male_collapsed, founder_ct, phase_type, col_load_iter); + LdUnpackNondosageDense(&pgv, founder_male_collapsed, founder_ct, phase_type, col_load_iter); } + VcorTable_col_finish: col_load_iter = &(col_load_iter[unpacked_variant_byte_stride]); } ctx.col_window_starts[col_parity] = col_window_start; diff --git a/2.0/plink2_ld.h b/2.0/plink2_ld.h index 0eabd4b4..5151e01c 100644 --- a/2.0/plink2_ld.h +++ b/2.0/plink2_ld.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_LD_H__ #define __PLINK2_LD_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_matrix.h b/2.0/plink2_matrix.h index 6b6871b2..74a549c3 100644 --- a/2.0/plink2_matrix.h +++ b/2.0/plink2_matrix.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_MATRIX_H__ #define __PLINK2_MATRIX_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_matrix_calc.cc b/2.0/plink2_matrix_calc.cc index 59bef7ac..968d1366 100644 --- a/2.0/plink2_matrix_calc.cc +++ b/2.0/plink2_matrix_calc.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_matrix_calc.h b/2.0/plink2_matrix_calc.h index 0e62f5af..a83f157c 100644 --- a/2.0/plink2_matrix_calc.h +++ b/2.0/plink2_matrix_calc.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_MATRIX_CALC_H__ #define __PLINK2_MATRIX_CALC_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_merge.cc b/2.0/plink2_merge.cc index f301bdf7..b23d7dbf 100644 --- a/2.0/plink2_merge.cc +++ b/2.0/plink2_merge.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_merge.h b/2.0/plink2_merge.h index 79d7057a..c5feb772 100644 --- a/2.0/plink2_merge.h +++ b/2.0/plink2_merge.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_MERGE_H__ #define __PLINK2_MERGE_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_misc.cc b/2.0/plink2_misc.cc index de14cf12..11db7d3f 100644 --- a/2.0/plink2_misc.cc +++ b/2.0/plink2_misc.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_misc.h b/2.0/plink2_misc.h index cfa6c643..14b1cba1 100644 --- a/2.0/plink2_misc.h +++ b/2.0/plink2_misc.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_MISC_H__ #define __PLINK2_MISC_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_psam.cc b/2.0/plink2_psam.cc index 977e2b25..5882d346 100644 --- a/2.0/plink2_psam.cc +++ b/2.0/plink2_psam.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_psam.h b/2.0/plink2_psam.h index 1f55eb8e..780ccb38 100644 --- a/2.0/plink2_psam.h +++ b/2.0/plink2_psam.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_PSAM_H__ #define __PLINK2_PSAM_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_pvar.cc b/2.0/plink2_pvar.cc index 4ff16d9a..2499d4f8 100644 --- a/2.0/plink2_pvar.cc +++ b/2.0/plink2_pvar.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_pvar.h b/2.0/plink2_pvar.h index e970f5ca..dd14ddf7 100644 --- a/2.0/plink2_pvar.h +++ b/2.0/plink2_pvar.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_PVAR_H__ #define __PLINK2_PVAR_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_random.cc b/2.0/plink2_random.cc index 52187a7d..d350f614 100644 --- a/2.0/plink2_random.cc +++ b/2.0/plink2_random.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_random.h b/2.0/plink2_random.h index 5e42a1f1..cdbff01b 100644 --- a/2.0/plink2_random.h +++ b/2.0/plink2_random.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_RANDOM_H__ #define __PLINK2_RANDOM_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_set.cc b/2.0/plink2_set.cc index 025bf075..0a18a864 100644 --- a/2.0/plink2_set.cc +++ b/2.0/plink2_set.cc @@ -1,4 +1,4 @@ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/plink2_set.h b/2.0/plink2_set.h index 20b818bd..7d6b2745 100644 --- a/2.0/plink2_set.h +++ b/2.0/plink2_set.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_SET_H__ #define __PLINK2_SET_H__ -// This file is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This file is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This program is free software: you can redistribute it and/or modify it diff --git a/2.0/pvar_ffi_support.cc b/2.0/pvar_ffi_support.cc index 82e47184..2c344124 100644 --- a/2.0/pvar_ffi_support.cc +++ b/2.0/pvar_ffi_support.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it diff --git a/2.0/pvar_ffi_support.h b/2.0/pvar_ffi_support.h index 7f4b151e..1b4cf04b 100644 --- a/2.0/pvar_ffi_support.h +++ b/2.0/pvar_ffi_support.h @@ -1,7 +1,7 @@ #ifndef __PVAR_FFI_SUPPORT_H__ #define __PVAR_FFI_SUPPORT_H__ -// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell, +// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it