Commit
a6
chrchang committed Sep 27, 2023
1 parent 45a9a15 commit 15665cf
Showing 25 changed files with 1,524 additions and 852 deletions.
420 changes: 388 additions & 32 deletions 2.0/include/pgenlib_misc.cc

Large diffs are not rendered by default.

36 changes: 31 additions & 5 deletions 2.0/include/pgenlib_misc.h
@@ -379,11 +379,30 @@ void BiallelicDosage16Invert(uint32_t dosage_ct, uint16_t* dosage_main);
// replaces each x with -x
void BiallelicDphase16Invert(uint32_t dphase_ct, int16_t* dphase_delta);

// currently does zero trailing halfword
void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict missingness);
void PackWordsToHalfwordsInvmatch(const uintptr_t* __restrict genoarr, uintptr_t inv_match_word, uint32_t inword_ct, uintptr_t* __restrict dst);

void PackWordsToHalfwordsMismatch(const uintptr_t* __restrict genoarr, uintptr_t mismatch_word, uint32_t inword_ct, uintptr_t* __restrict dst);

// src and dst allowed to be identical; that's why src is not marked const
// despite not being directly written to.
void MaskWordsToHalfwordsInvmatch(const uintptr_t* __restrict genoarr, uintptr_t inv_match_word, uint32_t inword_ct, uintptr_t* src, uintptr_t* dst);
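These three packing helpers are new in this commit; most of the remaining diff rewrites hand-rolled halfword loops in terms of them. The per-word semantics below are a hedged scalar sketch inferred from the call sites later in the diff (the pgenlib_misc.cc implementations are collapsed above); InvmatchWordToHalfword is a hypothetical name used only for illustration, built on the existing PackWordToHalfwordMask5555 helper.

// Sketch only; inferred from call sites, not copied from pgenlib_misc.cc.
// Bit i of the output halfword corresponds to 2-bit entry i of the input word.
//   Invmatch: bit set when (entry ^ inv_match entry) == 0b11, i.e. the entry
//             equals the bitwise complement of inv_match_word's entry:
//               inv_match_word == 0         -> selects 0b11 (missing genotype)
//               inv_match_word == kMaskAAAA -> selects 0b01 (het)
//               inv_match_word == ~k0LU     -> selects 0b00
//   Mismatch: bit set when the entry differs from mismatch_word's entry.
//   MaskWordsToHalfwordsInvmatch: same packing as Invmatch, but each packed
//   halfword is ANDed with the corresponding halfword of src before being
//   stored to dst (src == dst is allowed, per the comment above).
static inline Halfword InvmatchWordToHalfword(uintptr_t geno_word, uintptr_t inv_match_word) {
  uintptr_t ww = geno_word ^ inv_match_word;
  ww = ww & (ww >> 1);  // low bit of each 2-bit pair set iff the pair is 0b11
  return PackWordToHalfwordMask5555(ww);
}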

// Unsafe since it assumes trailing genoarr bits are cleared. But if they are,
// trailing missingness bits will be clear.
HEADER_INLINE void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict missingness) {
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
PackWordsToHalfwordsInvmatch(genoarr, 0, sample_ctl2, missingness);
if (sample_ctl2 % 2) {
Halfword* missingness_alias = DowncastWToHW(missingness);
missingness_alias[sample_ctl2] = 0;
}
}

// currently does not zero trailing halfword
void GenoarrToNonmissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict nonmissingness);
HEADER_INLINE void GenoarrToNonmissing(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict nonmissingness) {
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
PackWordsToHalfwordsMismatch(genoarr, ~k0LU, sample_ctl2, nonmissingness);
ZeroTrailingBits(sample_ct, nonmissingness);
}
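A small usage sketch with hypothetical values (not from the diff): five samples packed two bits each, where only the 0b11 entry is reported as missing.

// Hypothetical example: genotypes {0, 1, 2, 3, 1} for sample_ct == 5.
uintptr_t genoarr[1] = {0x1E4};  // trailing bits already zero, as the Unsafe variant requires
uintptr_t missingness[1];
uintptr_t nonmissingness[1];
GenoarrToMissingnessUnsafe(genoarr, 5, missingness);  // low bits: 0b01000
GenoarrToNonmissing(genoarr, 5, nonmissingness);      // low bits: 0b10111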

void SparseToMissingness(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uint32_t sample_ct, uint32_t difflist_common_geno, uint32_t difflist_len, uintptr_t* __restrict missingness);

@@ -401,7 +420,7 @@ void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __
BoolErr HapsplitMustPhased(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, uint32_t sample_ct, uint32_t phasepresent_ct, uintptr_t* hap_arr, uintptr_t* nm_arr);

// Only 1 haplotype per genotype, rather than 2; het treated as missing.
void HapsplitHaploid(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* hap_arr, uintptr_t* nm_arr);
void HapsplitHaploid(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict hap_arr, uintptr_t* __restrict nm_arr);


// These functions use 16- or 256-element lookup tables to apply functions of
@@ -458,6 +477,13 @@ void InitLookup16x4bx2(void* table16x4bx2);

void InitLookup16x8bx2(void* table16x8bx2);

#ifdef USE_SHUFFLE8
// in bytes
CONSTI32(kLookup256x1bx4Size, 1024 + 2 * kBytesPerVec);
#else
CONSTI32(kLookup256x1bx4Size, 1024);
#endif

void InitLookup256x1bx4(void* table256x1bx4);
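A minimal allocation sketch (assumptions: a plain byte buffer is acceptable here, and any alignment requirement is handled by the caller; that detail is not shown in this diff). The point of the new constant is that, under USE_SHUFFLE8, callers must reserve the padded size rather than a bare 1024 bytes.

// Sketch only: size the 256x1bx4 lookup buffer with the new constant.
unsigned char table256x1bx4[kLookup256x1bx4Size];
InitLookup256x1bx4(table256x1bx4);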

void InitLookup256x2bx4(void* table256x2bx4);
77 changes: 16 additions & 61 deletions 2.0/include/pgenlib_read.cc
@@ -470,6 +470,7 @@ void Expand2bitTo8(const void* __restrict bytearr, uint32_t input_nyp_ct, uint32
// SSE2:
// vec01 contains {0-1, 2-3, 4-5, 6-7, ..., 30-31}
// vec23 contains {32-33, 34-35, 36-37, 38-39, ..., 62-63}
// There's no m4 masking here, so we don't use vecw_lo_and_hi_nybbles.
const VecW vec01 = vecw_unpacklo8(vec_even, vec_odd);
const VecW vec23 = vecw_unpackhi8(vec_even, vec_odd);
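For readers less familiar with the byte-interleave idiom above, here is a scalar model of the SSE2 unpack step (illustration only, not part of the diff; model_unpack8 is a hypothetical name).

// vecw_unpacklo8/vecw_unpackhi8 interleave bytes of the "even" and "odd"
// vectors, restoring original element order e0, o0, e1, o1, ...
static void model_unpack8(const unsigned char even[16], const unsigned char odd[16],
                          unsigned char lo[16], unsigned char hi[16]) {
  for (int i = 0; i < 8; ++i) {
    lo[2 * i] = even[i];
    lo[2 * i + 1] = odd[i];
    hi[2 * i] = even[i + 8];
    hi[2 * i + 1] = odd[i + 8];
  }
}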

@@ -541,28 +542,11 @@ void Expand4bitTo8(const void* __restrict bytearr, uint32_t input_nybble_ct, uin
const VecW mincr = VecUcToW(vecuc_set1(incr));
const VecW m4 = VCONST_W(kMask0F0F);
for (uint32_t vec_idx = 0; vec_idx != input_vec_ct; ++vec_idx) {
VecW cur_vec = vecw_loadu(src_iter);
const VecW cur_vec = vecw_loadu(src_iter);
src_iter = &(src_iter[kBytesPerVec]);
cur_vec = vecw_permute0xd8_if_avx2(cur_vec);
// AVX2:
// vec_even contains {0, 2, 4, ..., 14, 32, 34, ..., 46,
// 16, 18, ..., 30, 48, ... 62}
// vec_odd contains {1, 3, 5, ..., 15, 33, 35, ..., 47,
// 17, 19, ..., 31, 49, ..., 63}
// SSE2:
// vec_even contains {0, 2, 4, ..., 30}
// vec_odd contains {1, 3, 5, ..., 31}
const VecW vec_even = cur_vec & m4;
const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;

// AVX2:
// vec_lo contains {0, 1, ..., 31}
// vec_hi contains {32, 33, ..., 63}
// SSE2:
// vec_lo contains {0, 1, 2, ..., 15}
// vec_hi contains {16, 17, 18, ..., 31}
const VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
const VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
VecW vec_lo;
VecW vec_hi;
vecw_lo_and_hi_nybbles(cur_vec, m4, &vec_lo, &vec_hi);
*dst_iter++ = mincr + vec_lo;
*dst_iter++ = mincr + vec_hi;
}
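The refactor above folds the masking/unpack sequence into vecw_lo_and_hi_nybbles (defined in plink2_base.h further down). A scalar model of what one pass now produces (illustration only; expand4to8_scalar is a hypothetical name):

// Each input byte yields two output bytes -- low nybble first, then high
// nybble, each with incr added -- matching the AVX2/SSE2 comments removed
// from this loop.
static void expand4to8_scalar(const unsigned char* src, uint32_t in_byte_ct,
                              unsigned char incr, unsigned char* dst) {
  for (uint32_t i = 0; i != in_byte_ct; ++i) {
    dst[2 * i] = (src[i] & 0x0f) + incr;
    dst[2 * i + 1] = (src[i] >> 4) + incr;
  }
}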
@@ -2311,7 +2295,7 @@ PglErr ParseOnebitUnsafe(const unsigned char* fread_end, const unsigned char** f
if (read_hw_ct >= 2 * kWordsPerVec) {
const uint32_t read_vec_ct = raw_sample_ct / kBitsPerVec;
const VecW m4 = VCONST_W(kMask0F0F);
# ifdef USE_SSE42
# ifdef USE_SHUFFLE8
// 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85 if the codes
// are 0 and 1
const VecW lookup = {word_base + common_code_delta * 0x1514111005040100LLU,
@@ -2324,11 +2308,10 @@ PglErr ParseOnebitUnsafe(const unsigned char* fread_end, const unsigned char** f
# endif
for (uint32_t vidx = 0; vidx != read_vec_ct; ++vidx) {
const VecW cur_vec = vecw_loadu(&(onebit_main_iter[vidx * kBytesPerVec]));
const VecW vec_even = cur_vec & m4;
const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
# ifdef USE_SSE42
VecW vec_lo;
VecW vec_hi;
vecw_lo_and_hi_nybbles(cur_vec, m4, &vec_lo, &vec_hi);
# ifdef USE_SHUFFLE8
vec_lo = vecw_shuffle8(lookup, vec_lo);
vec_hi = vecw_shuffle8(lookup, vec_hi);
# else
@@ -5084,15 +5067,9 @@ PglErr GetAux1bHets(const unsigned char* fread_end, const uintptr_t* __restrict
return kPglRetSuccess;
}

void SuppressHets00(const uintptr_t* allele_countvec, const uintptr_t* subsetted_all_hets, uint32_t sample_ct, uintptr_t* subsetted_suppressed_hets) {
const Halfword* all_hets_alias = DowncastKWToHW(subsetted_all_hets);
static inline void SuppressHets00(const uintptr_t* allele_countvec, uintptr_t* subsetted_all_hets, uint32_t sample_ct, uintptr_t* subsetted_suppressed_hets) {
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
Halfword* write_alias = DowncastWToHW(subsetted_suppressed_hets);
for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
const Halfword cur_count0 = Pack00ToHalfword(allele_countvec[widx]);
const Halfword cur_hets = all_hets_alias[widx];
write_alias[widx] = cur_count0 & cur_hets;
}
MaskWordsToHalfwordsInvmatch(allele_countvec, ~k0LU, sample_ctl2, subsetted_all_hets, subsetted_suppressed_hets);
}
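For reference, the per-word loop this call replaces (visible in the deletion above) computed the same thing; an inv_match_word of ~k0LU selects 2-bit entries equal to 0b00, i.e. allele count 0.

// Old scalar form, equivalent to the MaskWordsToHalfwordsInvmatch call:
for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
  const Halfword cur_count0 = Pack00ToHalfword(allele_countvec[widx]);
  const Halfword cur_hets = DowncastWToHW(subsetted_all_hets)[widx];
  DowncastWToHW(subsetted_suppressed_hets)[widx] = cur_count0 & cur_hets;
}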

PglErr Get1Multiallelic(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict all_hets, uintptr_t* __restrict allele_countvec, uintptr_t** subsetted_suppressed_hetp) {
@@ -6331,16 +6308,6 @@ PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex
return GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, nullptr, pgvp);
}

void DetectGenoarrHetsHw(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, Halfword* all_hets_hw) {
// requires trailing bits of genoarr to be zeroed out. does not update last
// all_hets[] halfword if raw_sample_ctl2 is odd.
for (uint32_t widx = 0; widx != raw_sample_ctl2; ++widx) {
const uintptr_t cur_word = genoarr[widx];
uintptr_t ww = (~(cur_word >> 1)) & cur_word; // low 1, high 0
all_hets_hw[widx] = PackWordToHalfwordMask5555(ww);
}
}

void PgrDetectGenoarrHetsMultiallelic(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
const Halfword* patch_10_set_alias = DowncastKWToHW(patch_10_set);
const AlleleCode* patch_10_vals_iter = patch_10_vals;
@@ -6647,10 +6614,7 @@ PglErr Get1MP(const uintptr_t* __restrict sample_include, const uint32_t* __rest

// Might want to make this its own function.
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
Halfword* phasepresent_alias = DowncastWToHW(phasepresent);
for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
phasepresent_alias[hwidx] &= Pack01ToHalfword(allele_countvec[hwidx]);
}
MaskWordsToHalfwordsInvmatch(allele_countvec, kMaskAAAA, sample_ctl2, phasepresent, phasepresent);
*phasepresent_ct_ptr = PopcountWords(phasepresent, BitCtToWordCt(sample_ct));

return kPglRetSuccess;
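Note the in-place use here: phasepresent is passed as both src and dst, which is exactly the aliasing case the MaskWordsToHalfwordsInvmatch header comment allows. The deleted loop it replaces was the scalar equivalent:

// Old scalar form: keep a phasepresent bit only where the 2-bit allele count
// is 0b01 (het for the selected allele).
Halfword* phasepresent_alias = DowncastWToHW(phasepresent);
for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
  phasepresent_alias[hwidx] &= Pack01ToHalfword(allele_countvec[hwidx]);
}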
@@ -6706,15 +6670,9 @@ PglErr IMPLPgrGetInv1P(const uintptr_t* __restrict sample_include, const uint32_
return kPglRetSuccess;
}

void SuppressHets11(const uintptr_t* genovec, const uintptr_t* subsetted_all_hets, uint32_t sample_ct, uintptr_t* subsetted_suppressed_hets) {
const Halfword* all_hets_alias = DowncastKWToHW(subsetted_all_hets);
void SuppressHets11(const uintptr_t* genovec, uintptr_t* subsetted_all_hets, uint32_t sample_ct, uintptr_t* subsetted_suppressed_hets) {
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
Halfword* write_alias = DowncastWToHW(subsetted_suppressed_hets);
for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
const Halfword cur_count0 = Pack11ToHalfword(genovec[widx]);
const Halfword cur_hets = all_hets_alias[widx];
write_alias[widx] = cur_count0 & cur_hets;
}
MaskWordsToHalfwordsInvmatch(genovec, 0, sample_ctl2, subsetted_all_hets, subsetted_suppressed_hets);
}

PglErr PgrGet2P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
@@ -6835,10 +6793,7 @@ PglErr PgrGet2P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex
}
if (VrtypeMultiallelicHc(vrtype) && (*phasepresent_ct_ptr)) {
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
Halfword* phasepresent_alias = DowncastWToHW(phasepresent);
for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
phasepresent_alias[hwidx] &= Pack01ToHalfword(genovec[hwidx]);
}
MaskWordsToHalfwordsInvmatch(genovec, kMaskAAAA, sample_ctl2, phasepresent, phasepresent);
*phasepresent_ct_ptr = PopcountWords(phasepresent, BitCtToWordCt(sample_ct));
}
if (invert) {
10 changes: 3 additions & 7 deletions 2.0/include/pgenlib_read.h
@@ -595,21 +595,17 @@ PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex
// low-MAF variants without actually loading the genotype data, since the size
// of the record puts an upper bound on the alt allele frequency.

// requires trailing bits of genoarr to be zeroed out, AND does not update high
// bits of last word if raw_sample_ctl2 is odd.
void DetectGenoarrHetsHw(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, Halfword* __restrict all_hets_hw);

// requires trailing bits of genoarr to be zeroed out.
HEADER_INLINE void PgrDetectGenoarrHetsUnsafe(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, uintptr_t* __restrict all_hets) {
Halfword* all_hets_alias = DowncastWToHW(all_hets);
DetectGenoarrHetsHw(genoarr, raw_sample_ctl2, all_hets_alias);
PackWordsToHalfwordsInvmatch(genoarr, kMaskAAAA, raw_sample_ctl2, all_hets);
if (raw_sample_ctl2 % 2) {
Halfword* all_hets_alias = DowncastWToHW(all_hets);
all_hets_alias[raw_sample_ctl2] = 0;
}
}

HEADER_INLINE void PgrDetectGenoarrHets(const uintptr_t* __restrict genoarr, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
DetectGenoarrHetsHw(genoarr, NypCtToWordCt(raw_sample_ct), DowncastWToHW(all_hets));
PackWordsToHalfwordsInvmatch(genoarr, kMaskAAAA, NypCtToWordCt(raw_sample_ct), all_hets);
ZeroTrailingBits(raw_sample_ct, all_hets);
}
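A short usage sketch (hypothetical buffer sizes, assuming raw_sample_ct <= 64): the non-Unsafe variant clears trailing result bits itself via ZeroTrailingBits, so the result can be popcounted directly.

uintptr_t genoarr[2];   // 2 bits per sample
uintptr_t all_hets[1];  // 1 bit per sample
// ... genoarr filled by a PgrGet* call ...
PgrDetectGenoarrHets(genoarr, raw_sample_ct, all_hets);
const uint32_t het_ct = PopcountWords(all_hets, BitCtToWordCt(raw_sample_ct));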

9 changes: 2 additions & 7 deletions 2.0/include/pgenlib_write.cc
@@ -1552,7 +1552,7 @@ BoolErr PwcAppendMultiallelicMain(const uintptr_t* __restrict genovec, const uin
#ifdef USE_SSE2
const uint32_t fullvec_ct = patch_10_ct / (kBytesPerVec / 2);
if (fullvec_ct) {
# if defined(USE_SSE42) && !defined(USE_AVX2)
# if defined(USE_SHUFFLE8) && !defined(USE_AVX2)
// SSE4.2: _mm_shuffle_epi8() to gather even bytes, parallel
// equality-to-2 check, movemask
// (+126 or <<6 followed by movemask also works)
@@ -1866,13 +1866,8 @@ void PglMultiallelicSparseToDense(const uintptr_t* __restrict genoarr, const uin
if (remap1 < remap0) {
table4[1] = remap1 + remap0 * 256;
if (flipped) {
// See GenoarrToMissingnessUnsafe().
const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
Halfword* flipped_alias = DowncastWToHW(flipped);
for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
const uintptr_t cur_geno_word = genoarr[widx];
flipped_alias[widx] = PackWordToHalfwordMask5555(cur_geno_word & (~(cur_geno_word >> 1)));
}
PackWordsToHalfwordsInvmatch(genoarr, kMaskAAAA, sample_ctl2, flipped);
}
}
// could have an InitLookup16x2bx2 function for this
54 changes: 50 additions & 4 deletions 2.0/include/plink2_base.h
@@ -160,9 +160,19 @@
# else
# include "../simde/x86/sse2.h"
# endif
# ifdef SIMDE_ARM_NEON_A32V8_NATIVE
// For Apple M1, we effectively use SSE2 + constrained _mm_shuffle_epi8().
// - We don't want to use simde's emulated _mm_shuffle_epi8 since it has an
// extra and-with-0x8f step that we never need.
// In the event the and-with-0x8f is actually needed, we'll define
// vec..._x86_shuffle8() helper functions.
// - M1 also doesn't have efficient word-popcount.
# define USE_SHUFFLE8
# endif
# endif
# ifdef __SSE4_2__
# define USE_SSE42
# define USE_SHUFFLE8
# include <smmintrin.h>
# ifdef __AVX2__
# if defined(__BMI__) && defined(__BMI2__) && defined(__LZCNT__)
@@ -1539,11 +1549,18 @@ HEADER_INLINE VecUc vecuc_gather_odd(VecUc src_lo, VecUc src_hi) {
return VecToUc(_mm_packus_epi16(_mm_srli_epi16(UcToVec(src_lo), 8), _mm_srli_epi16(UcToVec(src_hi), 8)));
}

# ifdef USE_SSE42
HEADER_INLINE VecI32 veci32_max(VecI32 v1, VecI32 v2) {
return VecToI32(_mm_max_epi32(I32ToVec(v1), I32ToVec(v2)));
# ifdef USE_SHUFFLE8
# ifdef SIMDE_ARM_NEON_A64V8_NATIVE
// See simde_mm_shuffle_epi8().
// This may need to be written more carefully in the IGNORE_BUNDLED_SIMDE case.
SIMDE_FUNCTION_ATTRIBUTES simde__m128i _mm_shuffle_epi8(simde__m128i a, simde__m128i b) {
simde__m128i_private a_ = simde__m128i_to_private(a);
simde__m128i_private b_ = simde__m128i_to_private(b);
simde__m128i_private r_;
r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, b_.neon_u8);
return simde__m128i_from_private(r_);
}
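If the and-with-0x8f behavior mentioned in the plink2_base.h comment above ever becomes necessary, an x86-exact variant might look like the following. This is a hypothetical sketch, not part of this commit, and the function name is invented.

// Sketch: exact _mm_shuffle_epi8 semantics on NEON.  ANDing the index bytes
// with 0x8f sends x86 "zero this lane" indexes (high bit set) to values >= 16,
// which vqtbl1q_s8 maps to 0, while keeping only the low nybble otherwise.
SIMDE_FUNCTION_ATTRIBUTES simde__m128i x86_exact_shuffle_epi8(simde__m128i a, simde__m128i b) {
  simde__m128i_private a_ = simde__m128i_to_private(a);
  simde__m128i_private b_ = simde__m128i_to_private(b);
  simde__m128i_private r_;
  r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, vandq_u8(b_.neon_u8, vdupq_n_u8(0x8f)));
  return simde__m128i_from_private(r_);
}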

# endif
HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) {
return VecToW(_mm_shuffle_epi8(WToVec(table), WToVec(indexes)));
}
@@ -1555,6 +1572,11 @@ HEADER_INLINE VecU16 vecu16_shuffle8(VecU16 table, VecU16 indexes) {
HEADER_INLINE VecUc vecuc_shuffle8(VecUc table, VecUc indexes) {
return VecToUc(_mm_shuffle_epi8(UcToVec(table), UcToVec(indexes)));
}
# endif
# ifdef USE_SSE42
HEADER_INLINE VecI32 veci32_max(VecI32 v1, VecI32 v2) {
return VecToI32(_mm_max_epi32(I32ToVec(v1), I32ToVec(v2)));
}

HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
return _mm_extract_epi64(WToVec(vv), 0);
@@ -1745,6 +1767,30 @@ HEADER_INLINE VecW VecUcToW(VecUc vv) {
return R_CAST(VecW, vv);
}

HEADER_INLINE void vecw_lo_and_hi_nybbles(VecW cur_vec, VecW m4, VecW* vec_lo_ptr, VecW* vec_hi_ptr) {
// Assumes m4 is VCONST_W(kMask0F0F).
// Returned vec_lo and vec_hi have top nybble of each byte zeroed out.
cur_vec = vecw_permute0xd8_if_avx2(cur_vec);
// AVX2:
// vec_even contains {0, 2, 4, ..., 14, 32, 34, ..., 46,
// 16, 18, ..., 30, 48, ... 62}
// vec_odd contains {1, 3, 5, ..., 15, 33, 35, ..., 47,
// 17, 19, ..., 31, 49, ..., 63}
// SSE2:
// vec_even contains {0, 2, 4, ..., 30}
// vec_odd contains {1, 3, 5, ..., 31}
const VecW vec_even = cur_vec & m4;
const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;

// AVX2:
// vec_lo contains {0, 1, 2, ..., 31}
// vec_hi contains {32, 33, 34, ..., 63}
// SSE2:
// vec_lo contains {0, 1, 2, ..., 15}
// vec_hi contains {16, 17, 18, ..., 31}
*vec_lo_ptr = vecw_unpacklo8(vec_even, vec_odd);
*vec_hi_ptr = vecw_unpackhi8(vec_even, vec_odd);
}
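A scalar model of the SSE2 case (illustration only; the AVX2 permute above just ensures the same overall output ordering): byte i of the input is split into two adjacent output bytes, low nybble first. The helper name is hypothetical.

// Mirrors vecw_lo_and_hi_nybbles for one 16-byte vector on SSE2.
static void lo_and_hi_nybbles_scalar(const unsigned char in[16],
                                     unsigned char lo[16], unsigned char hi[16]) {
  for (int i = 0; i < 16; ++i) {
    unsigned char* out = (i < 8) ? lo : hi;
    out[2 * (i & 7)] = in[i] & 0x0f;
    out[2 * (i & 7) + 1] = in[i] >> 4;
  }
}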
#else // !USE_SSE2
# ifdef __LP64__
CONSTI32(kBytesPerVec, 8);