unphased-r2 hardcall sparse-optimization, fix no-chrom-col segfault, 2023->2024
chrchang committed Jan 1, 2024
1 parent da1e8df commit be228d8
Showing 81 changed files with 1,166 additions and 350 deletions.
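Context for the "unphased-r2 hardcall sparse-optimization" in the commit title: LD (r²) computation on hardcalls can exploit the difflist representation returned by the new PgrGetDifflistOrGenovec/IMPLPgrGetInv1DifflistOrGenovec entry points, so per-variant work scales with the number of samples that deviate from the common genotype rather than with total sample count. Below is a minimal, hypothetical dense reference for the quantity being computed; the function name and the 0/1/2 coding with 3 = missing are illustrative assumptions, not the library's API. A difflist-aware version would seed the same sums from the common genotype and then visit only the listed exceptions.

// Hypothetical reference sketch (not PLINK 2 code): Pearson r^2 between two
// hardcall vectors coded 0/1/2 with 3 = missing, skipping samples that are
// missing at either variant.
#include <cstdint>
#include <cstdio>
#include <vector>

double UnphasedHardcallR2(const std::vector<uint8_t>& g1,
                          const std::vector<uint8_t>& g2) {
  // Accumulate joint sums over samples nonmissing at both variants.
  uint64_t n = 0, s1 = 0, s2 = 0, s11 = 0, s22 = 0, s12 = 0;
  for (size_t i = 0; i != g1.size(); ++i) {
    const uint32_t a = g1[i];
    const uint32_t b = g2[i];
    if ((a == 3) || (b == 3)) {
      continue;  // 3 = missing hardcall
    }
    ++n;
    s1 += a;
    s2 += b;
    s11 += a * a;
    s22 += b * b;
    s12 += a * b;
  }
  if (!n) {
    return 0.0;
  }
  const double dn = static_cast<double>(n);
  const double cov = s12 - (s1 * static_cast<double>(s2)) / dn;
  const double var1 = s11 - (s1 * static_cast<double>(s1)) / dn;
  const double var2 = s22 - (s2 * static_cast<double>(s2)) / dn;
  if ((var1 <= 0.0) || (var2 <= 0.0)) {
    return 0.0;
  }
  return (cov * cov) / (var1 * var2);
}

int main() {
  const std::vector<uint8_t> g1 = {0, 1, 2, 2, 0, 3, 1};
  const std::vector<uint8_t> g2 = {0, 1, 2, 1, 0, 2, 3};
  std::printf("r^2 = %g\n", UnphasedHardcallR2(g1, g2));
  return 0;
}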
159 changes: 149 additions & 10 deletions 2.0/include/pgenlib_misc.cc
@@ -1,4 +1,4 @@
// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -1108,10 +1108,14 @@ void DifflistCountSubsetFreqs(const uintptr_t* __restrict sample_include, const

#ifdef USE_SSE2
static_assert(kPglNypTransposeBatch == S_CAST(uint32_t, kNypsPerCacheline), "TransposeNypblock64() needs to be updated.");
# ifdef CACHELINE64
void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) {
// buf0 and buf1 must each be vector-aligned and have size 16k
// Tried using previous AVX2 small-buffer approach, but that was a bit
// worse... though maybe it should be revisited?

// Each input row has 256 nyps, across 8 words.
// First word of each row goes into first buf0 row, etc.
const uint32_t buf0_row_ct = DivUp(write_batch_size, 32);
{
// Fold the first 3 shuffles into the ingestion loop.
@@ -1143,18 +1147,18 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
const VecW* buf0_read_iter = R_CAST(const VecW*, buf0);
__m128i* write_iter0 = R_CAST(__m128i*, buf1);
const uint32_t buf0_row_clwidth = DivUp(read_batch_size, 8);
# ifdef USE_SHUFFLE8
# ifdef USE_SHUFFLE8
const VecW gather_u32s = vecw_setr8(0, 1, 8, 9, 2, 3, 10, 11,
4, 5, 12, 13, 6, 7, 14, 15);
# else
# else
const VecW m16 = VCONST_W(kMask0000FFFF);
# endif
# endif
for (uint32_t bidx = 0; bidx != buf0_row_ct; ++bidx) {
__m128i* write_iter1 = &(write_iter0[32]);
__m128i* write_iter2 = &(write_iter1[32]);
__m128i* write_iter3 = &(write_iter2[32]);
for (uint32_t clidx = 0; clidx != buf0_row_clwidth; ++clidx) {
# ifdef USE_AVX2
# ifdef USE_AVX2
VecW loader0 = buf0_read_iter[clidx * 2];
VecW loader1 = buf0_read_iter[clidx * 2 + 1];
loader0 = vecw_shuffle8(loader0, gather_u32s);
@@ -1176,17 +1180,17 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
write_iter1[clidx] = _mm256_castsi256_si128(final2367);
write_iter2[clidx] = _mm256_extracti128_si256(final0145, 1);
write_iter3[clidx] = _mm256_extracti128_si256(final2367, 1);
# else
# else
VecW loader0 = buf0_read_iter[clidx * 4];
VecW loader1 = buf0_read_iter[clidx * 4 + 1];
VecW loader2 = buf0_read_iter[clidx * 4 + 2];
VecW loader3 = buf0_read_iter[clidx * 4 + 3];
# ifdef USE_SHUFFLE8
# ifdef USE_SHUFFLE8
loader0 = vecw_shuffle8(loader0, gather_u32s);
loader1 = vecw_shuffle8(loader1, gather_u32s);
loader2 = vecw_shuffle8(loader2, gather_u32s);
loader3 = vecw_shuffle8(loader3, gather_u32s);
# else
# else
VecW tmp_lo = vecw_unpacklo16(loader0, loader1);
VecW tmp_hi = vecw_unpackhi16(loader0, loader1);
loader0 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
@@ -1195,7 +1199,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
tmp_hi = vecw_unpackhi16(loader2, loader3);
loader2 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
loader3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
# endif
# endif
// (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3)
// + (0,4) ... (31,7)
// -> (0,0) ... (7,3) (0,4) ... (7,7) (8,0) ... (15,7)
@@ -1207,7 +1211,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
write_iter1[clidx] = WToVec(vecw_unpackhi64(lo_0_15, hi_0_15));
write_iter2[clidx] = WToVec(vecw_unpacklo64(lo_16_31, hi_16_31));
write_iter3[clidx] = WToVec(vecw_unpackhi64(lo_16_31, hi_16_31));
# endif
# endif
}
buf0_read_iter = &(buf0_read_iter[2048 / kBytesPerVec]);
write_iter0 = &(write_iter3[32]);
@@ -1278,6 +1282,141 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
target_iter0[vidx] = vecw_movemask(target_0123);
}
}
# else
# ifndef CACHELINE128
# error "CACHELINE64 or CACHELINE128 expected."
# endif
// assumes USE_SHUFFLE8, !USE_AVX2
void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) {
// buf0 and buf1 must each be vector-aligned and have size 64k
// Each input row has 512 nyps, across 16 words.
// First word of each row goes into first buf0 row, etc.
const uint32_t buf0_row_ct = DivUp(write_batch_size, 32);
{
// Fold the first 4 shuffles into the ingestion loop.
const uintptr_t* initial_read_iter = read_iter;
const uintptr_t* initial_read_end = &(initial_read_iter[buf0_row_ct]);
uintptr_t* initial_target_iter = R_CAST(uintptr_t*, buf0);
const uint32_t read_batch_rem = kNypsPerCacheline - read_batch_size;
for (; initial_read_iter != initial_read_end; ++initial_read_iter) {
const uintptr_t* read_iter_tmp = initial_read_iter;
for (uint32_t ujj = 0; ujj != read_batch_size; ++ujj) {
*initial_target_iter++ = *read_iter_tmp;
read_iter_tmp = &(read_iter_tmp[read_ul_stride]);
}
ZeroWArr(read_batch_rem, initial_target_iter);
initial_target_iter = &(initial_target_iter[read_batch_rem]);
}
}

// First buf0 row now corresponds to a 512x32 nyp matrix (512 * 8 bytes) that
// we wish to transpose. We split this into eight 512x4 matrices.
// (ARMv8 doesn't have efficient movemask, so this should be better than four
// 512x8 matrices.)
// This is nearly identical to the middle step in TransposeBitblock64().
{
const VecW* buf0_read_iter = R_CAST(const VecW*, buf0);
uintptr_t* write_iter0 = R_CAST(uintptr_t*, buf1);
const VecW gather_u16s = vecw_setr8(0, 8, 1, 9, 2, 10, 3, 11,
4, 12, 5, 13, 6, 14, 7, 15);
const uint32_t buf0_row_b64width = DivUp(read_batch_size, 8);
for (uint32_t ridx = 0; ridx != buf0_row_ct; ++ridx) {
uintptr_t* write_iter1 = &(write_iter0[64]);
uintptr_t* write_iter2 = &(write_iter1[64]);
uintptr_t* write_iter3 = &(write_iter2[64]);
uintptr_t* write_iter4 = &(write_iter3[64]);
uintptr_t* write_iter5 = &(write_iter4[64]);
uintptr_t* write_iter6 = &(write_iter5[64]);
uintptr_t* write_iter7 = &(write_iter6[64]);
for (uint32_t b64idx = 0; b64idx != buf0_row_b64width; ++b64idx) {
VecW loader0 = buf0_read_iter[b64idx * 4];
VecW loader1 = buf0_read_iter[b64idx * 4 + 1];
VecW loader2 = buf0_read_iter[b64idx * 4 + 2];
VecW loader3 = buf0_read_iter[b64idx * 4 + 3];
loader0 = vecw_shuffle8(loader0, gather_u16s);
loader1 = vecw_shuffle8(loader1, gather_u16s);
loader2 = vecw_shuffle8(loader2, gather_u16s);
loader3 = vecw_shuffle8(loader3, gather_u16s);
const VecW lo_0123 = vecw_unpacklo16(loader0, loader1);
const VecW lo_4567 = vecw_unpackhi16(loader0, loader1);
const VecW hi_0123 = vecw_unpacklo16(loader2, loader3);
const VecW hi_4567 = vecw_unpackhi16(loader2, loader3);

const VecW final01 = vecw_unpacklo32(lo_0123, hi_0123);
const VecW final23 = vecw_unpackhi32(lo_0123, hi_0123);
const VecW final45 = vecw_unpacklo32(lo_4567, hi_4567);
const VecW final67 = vecw_unpackhi32(lo_4567, hi_4567);
write_iter0[b64idx] = vecw_extract64_0(final01);
write_iter1[b64idx] = vecw_extract64_1(final01);
write_iter2[b64idx] = vecw_extract64_0(final23);
write_iter3[b64idx] = vecw_extract64_1(final23);
write_iter4[b64idx] = vecw_extract64_0(final45);
write_iter5[b64idx] = vecw_extract64_1(final45);
write_iter6[b64idx] = vecw_extract64_0(final67);
write_iter7[b64idx] = vecw_extract64_1(final67);
}
buf0_read_iter = &(buf0_read_iter[256]);
write_iter0 = &(write_iter7[64]);
}
}

// 8 -> 2
// This is similar to the main TransposeNybbleblock() loop.
const VecW* source_iter = R_CAST(VecW*, buf1);
const VecW m2 = VCONST_W(kMask3333);
const VecW m4 = VCONST_W(kMask0F0F);
const VecW m8 = VCONST_W(kMask00FF);
const VecW gather_even = vecw_setr8(0, 2, 4, 6, 8, 10, 12, 14,
-1, -1, -1, -1, -1, -1, -1, -1);
// Take advantage of current function contract.
const uint32_t buf1_row_ct = (write_batch_size + 3) / 4;

const uint32_t fourword_ct = DivUp(read_batch_size, 32);
uintptr_t* target_iter0 = write_iter;
for (uint32_t ridx = 0; ridx != buf1_row_ct; ++ridx) {
uintptr_t* target_iter1 = &(target_iter0[write_ul_stride]);
uintptr_t* target_iter2 = &(target_iter1[write_ul_stride]);
uintptr_t* target_iter3 = &(target_iter2[write_ul_stride]);
for (uint32_t dvidx = 0; dvidx != fourword_ct; ++dvidx) {
const VecW loader0 = source_iter[dvidx * 2];
const VecW loader1 = source_iter[dvidx * 2 + 1];

VecW even_nyps0 = loader0 & m2;
VecW even_nyps1 = loader1 & m2;
VecW odd_nyps0 = vecw_srli(loader0, 2) & m2;
VecW odd_nyps1 = vecw_srli(loader1, 2) & m2;
even_nyps0 = even_nyps0 | vecw_srli(even_nyps0, 6);
even_nyps1 = even_nyps1 | vecw_srli(even_nyps1, 6);
odd_nyps0 = odd_nyps0 | vecw_srli(odd_nyps0, 6);
odd_nyps1 = odd_nyps1 | vecw_srli(odd_nyps1, 6);
// Low four bits of even_nyps{0,1}[0], [2], ..., [14] are destined for
// target_iter0; high four bits of those bytes are destined for
// target_iter2.
const VecW even_nyps = vecw_gather_even(even_nyps0, even_nyps1, m8);
const VecW odd_nyps = vecw_gather_even(odd_nyps0, odd_nyps1, m8);

VecW mod0_nyps = even_nyps & m4;
VecW mod1_nyps = odd_nyps & m4;
VecW mod2_nyps = vecw_srli(even_nyps, 4) & m4;
VecW mod3_nyps = vecw_srli(odd_nyps, 4) & m4;
mod0_nyps = mod0_nyps | vecw_srli(mod0_nyps, 4);
mod1_nyps = mod1_nyps | vecw_srli(mod1_nyps, 4);
mod2_nyps = mod2_nyps | vecw_srli(mod2_nyps, 4);
mod3_nyps = mod3_nyps | vecw_srli(mod3_nyps, 4);
mod0_nyps = vecw_shuffle8(mod0_nyps, gather_even);
mod1_nyps = vecw_shuffle8(mod1_nyps, gather_even);
mod2_nyps = vecw_shuffle8(mod2_nyps, gather_even);
mod3_nyps = vecw_shuffle8(mod3_nyps, gather_even);
target_iter0[dvidx] = vecw_extract64_0(mod0_nyps);
target_iter1[dvidx] = vecw_extract64_0(mod1_nyps);
target_iter2[dvidx] = vecw_extract64_0(mod2_nyps);
target_iter3[dvidx] = vecw_extract64_0(mod3_nyps);
}
source_iter = &(source_iter[32]);
target_iter0 = &(target_iter3[write_ul_stride]);
}
}
# endif
#else // !USE_SSE2
# ifdef __LP64__
static_assert(kWordsPerVec == 1, "TransposeNypblock64() needs to be updated.");
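For orientation on the new CACHELINE128 path above: a "nyp" is pgenlib's 2-bit unit (four per byte), and TransposeNypblock64() transposes one cacheline-sized block of them — up to 256x256 entries with 64-byte cachelines, 512x512 with 128-byte cachelines — through two staging buffers of word and byte shuffles. The following is a hedged scalar reference for the end result only (simplified strides, no SIMD, not the library's code):

// Illustrative scalar sketch of a nyp-block transpose: out[j][i] = in[i][j].
#include <cassert>
#include <cstdint>
#include <vector>

// 32 nyps (2-bit values) are packed into each 64-bit word.
static inline uint64_t GetNyp(const uint64_t* row, uint32_t col) {
  return (row[col / 32] >> (2 * (col % 32))) & 3;
}

static inline void SetNyp(uint64_t* row, uint32_t col, uint64_t val) {
  row[col / 32] |= val << (2 * (col % 32));
}

// Transpose an r x c nyp matrix; out must be zero-initialized and sized as a
// c x r matrix.  Strides are in 64-bit words.
void TransposeNypsScalar(const uint64_t* in, uint32_t in_stride_words,
                         uint32_t r, uint32_t c,
                         uint64_t* out, uint32_t out_stride_words) {
  for (uint32_t i = 0; i != r; ++i) {
    for (uint32_t j = 0; j != c; ++j) {
      SetNyp(&out[j * out_stride_words], i,
             GetNyp(&in[i * in_stride_words], j));
    }
  }
}

int main() {
  // 2 rows x 3 columns of nyps, one word per row.
  std::vector<uint64_t> in(2, 0), out(3, 0);
  SetNyp(&in[0], 2, 3);  // in[0][2] = 3
  SetNyp(&in[1], 0, 1);  // in[1][0] = 1
  TransposeNypsScalar(in.data(), 1, 2, 3, out.data(), 1);
  assert(GetNyp(&out[2], 0) == 3);  // out[2][0] == in[0][2]
  assert(GetNyp(&out[0], 1) == 1);  // out[0][1] == in[1][0]
  return 0;
}

The SIMD kernels above reach the same result without per-entry bit extraction, by staging whole words in buf0/buf1 and regrouping them with unpack/shuffle instructions.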
10 changes: 7 additions & 3 deletions 2.0/include/pgenlib_misc.h
@@ -1,7 +1,7 @@
#ifndef __PGENLIB_MISC_H__
#define __PGENLIB_MISC_H__

// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -79,7 +79,7 @@
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value. Named with all caps as a consequence.
#define PGENLIB_INTERNAL_VERNUM 1912
#define PGENLIB_INTERNAL_VERNUM 1913

#ifdef __cplusplus
namespace plink2 {
@@ -337,6 +337,9 @@ HEADER_INLINE uint64_t GetVint64Unsafe(const unsigned char** buf_iterp) {
}
}

// TODO: make this work properly with kCacheline == 128, then fix other
// transpose functions, etc.

// main batch size
CONSTI32(kPglNypTransposeBatch, kNypsPerCacheline);

@@ -355,7 +358,8 @@ void TransposeNypblock32(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
#endif
CONSTI32(kPglNypTransposeBufwords, kPglNypTransposeBufbytes / kBytesPerWord);

// - up to 256x256; vecaligned_buf must have size 16k (64-bit) or 32k (32-bit)
// - single block is up to 256x256 (CACHELINE64) or 512x512 (CACHELINE128)
// - vecaligned_buf must have size 32k (CACHELINE64) or 128k (CACHELINE128)
// - does NOT zero out trailing bits, because main application is ind-major-bed
// <-> plink2 format conversion, where the zeroing would be undone...
// - important: write_iter must be allocated up to at least
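The buffer sizes quoted in the updated header comment follow directly from the block dimensions: a B x B block of 2-bit nyps occupies B*B/4 bytes, and vecaligned_buf is assumed to be split into the two staging buffers (buf0, buf1) that the .cc routines take separately. A quick, illustrative check of that arithmetic (not library code):

#include <cstdio>

int main() {
  const unsigned batch_sizes[2] = {256, 512};  // CACHELINE64, CACHELINE128
  for (unsigned i = 0; i != 2; ++i) {
    const unsigned batch = batch_sizes[i];
    const unsigned per_buf_bytes = batch * batch / 4;  // 2 bits per nyp
    std::printf("batch %u: %u KiB per staging buffer, %u KiB total\n",
                batch, per_buf_bytes / 1024, (2 * per_buf_bytes) / 1024);
  }
  return 0;
}
// Prints: batch 256: 16 KiB per staging buffer, 32 KiB total
//         batch 512: 64 KiB per staging buffer, 128 KiB total

This matches the 16k/64k per-buffer sizes documented in pgenlib_misc.cc and the 32k/128k vecaligned_buf sizes in the header comment above.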
35 changes: 33 additions & 2 deletions 2.0/include/pgenlib_read.cc
@@ -1,4 +1,4 @@
// This library is part of PLINK 2.00, copyright (C) 2005-2023 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2024 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -2948,7 +2948,7 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl
return kPglRetSuccess;
}

PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
if (!sample_ct) {
*difflist_common_geno_ptr = UINT32_MAX;
return kPglRetSuccess;
@@ -5228,6 +5228,37 @@ PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t
return reterr;
}

PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
if (!sample_ct) {
*difflist_common_geno_ptr = UINT32_MAX;
return kPglRetSuccess;
}
const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
if (unlikely(reterr)) {
return reterr;
}
if (allele_idx) {
if (*difflist_common_geno_ptr == UINT32_MAX) {
GenovecInvertUnsafe(sample_ct, allele_invcountvec);
} else {
const uint32_t orig_common_geno = *difflist_common_geno_ptr;
GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno);
if (orig_common_geno != 3) {
*difflist_common_geno_ptr = 2 - orig_common_geno;
}
}
}
return kPglRetSuccess;
}
*difflist_common_geno_ptr = UINT32_MAX;
PglErr reterr = Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, nullptr, nullptr, nullptr, allele_invcountvec, nullptr);
GenovecInvertUnsafe(sample_ct, allele_invcountvec);
return reterr;
}

// Assumes allele_idx0 < allele_idx1, and allele_idx0 < 2. Rotates hardcalls
// such that, if no multiallelic hardcalls are present, 0 = 0/0, 1 = 0/1,
// 2 = 1/1, and 3 = anything else.
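The inversion steps in IMPLPgrGetInv1DifflistOrGenovec() rely on the hardcall coding spelled out in the comment just above (0 = 0/0, 1 = 0/1, 2 = 1/1, 3 = anything else): swapping the roles of the two alleles maps 0 <-> 2 while fixing 1 and 3, which is why the difflist branch can simply set the common genotype to 2 - orig_common_geno. Below is a hedged scalar illustration of that mapping on a packed 2-bit word — a sketch of the idea only; GenovecInvertUnsafe() itself may be implemented differently.

// Sketch: flip the high bit of every 2-bit field whose low bit is clear,
// which sends 0 -> 2, 2 -> 0, and leaves 1 and 3 unchanged.
#include <cassert>
#include <cstdint>

constexpr uint64_t kLowBits = 0x5555555555555555ULL;  // low bit of each field

uint64_t InvertGenoWord(uint64_t geno_word) {
  return geno_word ^ (((~geno_word) & kLowBits) << 1);
}

int main() {
  // Pack the codes 0,1,2,3 into the low byte and check the expected mapping.
  const uint64_t packed = 0b11100100;  // fields (low to high): 0, 1, 2, 3
  const uint64_t inverted = InvertGenoWord(packed);
  assert((inverted & 3) == 2);          // 0 -> 2
  assert(((inverted >> 2) & 3) == 1);   // 1 -> 1
  assert(((inverted >> 4) & 3) == 0);   // 2 -> 0
  assert(((inverted >> 6) & 3) == 3);   // 3 -> 3
  return 0;
}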
(diffs for the remaining 78 changed files not shown)
