diff --git a/2.0/include/plink2_base.h b/2.0/include/plink2_base.h index f74ecbdd..ab9601c7 100644 --- a/2.0/include/plink2_base.h +++ b/2.0/include/plink2_base.h @@ -372,6 +372,7 @@ typedef enum kPglRetDecompressFail, // also distinguish this from MalformedInput kPglRetRewindFail, kPglRetGpuFail, + kPglRetInternalUse1, kPglRetSampleMajorBed = 32, kPglRetNomemCustomMsg = 59, kPglRetInternalError = 60, @@ -442,6 +443,7 @@ const PglErr kPglRetDegenerateData = PglErr::ec::kPglRetDegenerateData; const PglErr kPglRetDecompressFail = PglErr::ec::kPglRetDecompressFail; const PglErr kPglRetRewindFail = PglErr::ec::kPglRetRewindFail; const PglErr kPglRetGpuFail = PglErr::ec::kPglRetGpuFail; +const PglErr kPglRetInternalUse1 = PglErr::ec::kPglRetInternalUse1; const PglErr kPglRetSampleMajorBed = PglErr::ec::kPglRetSampleMajorBed; const PglErr kPglRetWarningErrcode = PglErr::ec::kPglRetWarningErrcode; const PglErr kPglRetNomemCustomMsg = PglErr::ec::kPglRetNomemCustomMsg; diff --git a/2.0/include/plink2_bits.h b/2.0/include/plink2_bits.h index 3ee1a3ab..e85997c3 100644 --- a/2.0/include/plink2_bits.h +++ b/2.0/include/plink2_bits.h @@ -517,8 +517,8 @@ CONSTI32(kPglBitTransposeWords, kWordsPerCacheline); // avoid breaking strict-aliasing rules, while the restrict qualifiers should // tell the compiler it doesn't need to be paranoid about writes to one of // the buffers screwing with reads from the other. -#ifdef __LP64__ CONSTI32(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2)); +#ifdef __LP64__ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_row_ct, uint32_t write_row_ct, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1); HEADER_INLINE void TransposeBitblock(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) { @@ -526,7 +526,6 @@ HEADER_INLINE void TransposeBitblock(const uintptr_t* read_iter, uintptr_t read_ } #else // !__LP64__ -CONSTI32(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2)); void TransposeBitblock32(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1); // If this ever needs to be called on an input byte array, read_iter could be diff --git a/2.0/plink2.cc b/2.0/plink2.cc index 5f9f527d..d255be34 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -72,7 +72,7 @@ static const char ver_str[] = "PLINK v2.00a5" #elif defined(USE_AOCL) " AMD" #endif - " (21 Sep 2023)"; + " (22 Sep 2023)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same "" diff --git a/2.0/plink2_export.cc b/2.0/plink2_export.cc index 78d04702..f6c8f01e 100644 --- a/2.0/plink2_export.cc +++ b/2.0/plink2_export.cc @@ -9833,54 +9833,7 @@ void GenovecToNybblesSSE4(const uintptr_t* genovec, uint32_t sample_ct, uint32_t Subst4bitTo8(genovec, lookup, outvec_ct, result_vec); } -void Subst2bitTo8(const void* src_vec, const VecW lookup, uint32_t outvec_ct, void* dst_vec) { - // Similar to Expand2bitTo8(). - const VecW m03 = VCONST_W(kMask0303); - const VecW* src_iter = S_CAST(const VecW*, src_vec); - VecW* dst_iter = S_CAST(VecW*, dst_vec); - VecW* loop_stop = &(dst_iter[RoundDownPow2(outvec_ct - 1, 4) + 1]); - VecW* dst_end = &(dst_iter[outvec_ct]); - while (1) { - VecW cur_vec = *src_iter++; -# ifdef USE_AVX2 - const __m256i midswapped_vec = _mm256_shuffle_epi32(WToVec(cur_vec), 0xd8); - cur_vec = vecw_permute0xd8_if_avx2(VecToW(midswapped_vec)); -# endif - const VecW vec_even = cur_vec; - const VecW vec_odd = vecw_srli(cur_vec, 4); - const VecW vec01 = vecw_unpacklo8(vec_even, vec_odd); - const VecW vec23 = vecw_unpackhi8(vec_even, vec_odd); - const VecW vec01_even = vec01 & m03; - const VecW vec01_odd = vecw_srli(vec01, 2) & m03; - const VecW vec23_even = vec23 & m03; - const VecW vec23_odd = vecw_srli(vec23, 2) & m03; - VecW vec0 = vecw_unpacklo8(vec01_even, vec01_odd); - VecW vec1 = vecw_unpackhi8(vec01_even, vec01_odd); - VecW vec2 = vecw_unpacklo8(vec23_even, vec23_odd); - VecW vec3 = vecw_unpackhi8(vec23_even, vec23_odd); - vec0 = vecw_shuffle8(lookup, vec0); - vec1 = vecw_shuffle8(lookup, vec1); - vec2 = vecw_shuffle8(lookup, vec2); - vec3 = vecw_shuffle8(lookup, vec3); - *dst_iter++ = vec0; - if (dst_iter >= loop_stop) { - if (dst_iter < dst_end) { - *dst_iter++ = vec1; - if (dst_iter < dst_end) { - *dst_iter++ = vec2; - if (dst_iter < dst_end) { - *dst_iter++ = vec3; - } - } - } - return; - } - *dst_iter++ = vec1; - *dst_iter++ = vec2; - *dst_iter++ = vec3; - } -} - +// writes up to vector boundary static inline void NybblesToIupac(const uintptr_t* nybblevec, uint32_t entry_ct, uintptr_t* result_vec) { const VecW lookup = vecw_setr8('.', 'A', 'C', 'M', 'G', 'R', 'S', '.', 'T', 'W', 'Y', '.', 'K', '.', '.', 'N'); const uint32_t invec_ct = NybbleCtToVecCt(entry_ct); @@ -9936,13 +9889,9 @@ void NybblearrLookup256x1bx2(const uintptr_t* nybblearr, const void* table256x1b static const unsigned char kNybblePairToIupac[512] = PAIR_TABLE256('.', 'A', 'C', 'M', 'G', 'R', 'S', '.', 'T', 'W', 'Y', '.', 'K', '.', '.', 'N'); -void NybblesToIupac(const uintptr_t* nybblevec, uint32_t entry_ct, uintptr_t* result_vec) { +static inline void NybblesToIupac(const uintptr_t* nybblevec, uint32_t entry_ct, uintptr_t* result_vec) { NybblearrLookup256x1bx2(nybblevec, kNybblePairToIupac, entry_ct, result_vec); } - -static inline void NybblesToIupacUnsafe(const uintptr_t* nybblevec, uint32_t entry_ct, uintptr_t* result_vec) { - return NybblesToIupac(nybblevec, entry_ct, result_vec); -} #endif typedef struct PhylipReadCtxStruct { @@ -9952,10 +9901,10 @@ typedef struct PhylipReadCtxStruct { const uintptr_t* sample_include; const uint32_t* sample_include_cumulative_popcounts; const unsigned char* allele_nybble_table; -#ifndef USE_SSE42 - // Precomputed 256-entry lookup tables, for use by GenovecToNybblesNoSSE4(). - const uint16_t* genovec_to_nybble_tables[25]; -#endif + // Precomputed 256-entry lookup tables. + // Format depends on phased vs. unphased. No lookup table needed in unphased + // SSE4+ case. + const void* genovec_to_nybble_tables[25]; uint32_t sample_ct; PgenReader** pgr_ptrs; @@ -9987,7 +9936,7 @@ THREAD_FUNC_DECL PhylipReadThread(void* raw_arg) { const uintptr_t* sample_include = ctx->sample_include; const unsigned char* allele_nybble_table = ctx->allele_nybble_table; #ifndef USE_SSE42 - const uint16_t* const* genovec_to_nybble_tables = ctx->genovec_to_nybble_tables; + const uint16_t* const* genovec_to_nybble_tables = R_CAST(const uint16_t* const*, ctx->genovec_to_nybble_tables); #endif const uint32_t read_sample_ct = ctx->sample_ct; const uintptr_t read_sample_ctaw4 = NybbleCtToAlignedWordCt(read_sample_ct); @@ -10103,7 +10052,9 @@ THREAD_FUNC_DECL PhylipPhasedReadThread(void* raw_arg) { const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp); const uintptr_t* sample_include = ctx->sample_include; const unsigned char* allele_nybble_table = ctx->allele_nybble_table; + const uint32_t* const* genovec_to_nybblepair_tables = R_CAST(const uint32_t* const*, ctx->genovec_to_nybble_tables); const uint32_t read_sample_ct = ctx->sample_ct; + const uint32_t read_sample_ctl = BitCtToWordCt(read_sample_ct); const uintptr_t read_sample_ctaw4 = NybbleCtToAlignedWordCt(read_sample_ct); PgenReader* pgrp = ctx->pgr_ptrs[tidx]; PgrSampleSubsetIndex pssi; @@ -10163,11 +10114,72 @@ THREAD_FUNC_DECL PhylipPhasedReadThread(void* raw_arg) { goto PhylipPhasedReadThread_err; } ZeroTrailingNyps(read_sample_ct, genovec); + STD_ARRAY_DECL(uint32_t, 4, genocounts); + GenoarrCountFreqsUnsafe(genovec, read_sample_ct, genocounts); + const uint32_t phasepresent_ct = pgv.phasepresent_ct; + if (unlikely(phasepresent_ct != genocounts[1])) { + // another type of InconsistentInput; use a different code here since + // we want to print a different error message + new_err_info = (S_CAST(uint64_t, variant_uidx) << 32) | S_CAST(uint32_t, kPglRetInternalUse1); + goto PhylipPhasedReadThread_err; + } if (write_sample_nm_cts_iter) { - *write_sample_nm_cts_iter++ = read_sample_ct - GenoarrCountMissingUnsafe(genovec, read_sample_ct); + *write_sample_nm_cts_iter++ = (read_sample_ct - genocounts[3]) * 2; + } + const uint32_t nybble0 = allele_nybbles[0]; + { + const uint32_t nybble1 = allele_nybbles[1]; + const uint32_t allele0_table_idx = ctzu32(nybble0) + 4 * (nybble0 == 15); + const uint32_t allele1_table_idx = ctzu32(nybble1) + 4 * (nybble1 == 15); + GenoarrLookup256x1bx4(genovec, genovec_to_nybblepair_tables[allele0_table_idx + 5 * allele1_table_idx], read_sample_ct, vmaj_readbuf_iter); + } + if (cur_allele_ct > 2) { + unsigned char* nybblepairs = DowncastToUc(vmaj_readbuf_iter); + const uint32_t patch_01_ct = pgv.patch_01_ct; + uint32_t shifted_nybbles[4]; + shifted_nybbles[2] = allele_nybbles[2] << 4; + shifted_nybbles[3] = allele_nybbles[3] << 4; + if (patch_01_ct) { + const uintptr_t* patch_01_set = pgv.patch_01_set; + const AlleleCode* patch_01_vals = pgv.patch_01_vals; + uintptr_t sample_widx = 0; + uintptr_t patch_01_bits = patch_01_set[0]; + for (uint32_t uii = 0; uii != patch_01_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_widx, &patch_01_bits); + const AlleleCode ac = patch_01_vals[uii]; + nybblepairs[sample_idx] = nybble0 | shifted_nybbles[ac]; + } + } + const uint32_t patch_10_ct = pgv.patch_10_ct; + if (patch_10_ct) { + const uintptr_t* patch_10_set = pgv.patch_10_set; + const AlleleCode* patch_10_vals = pgv.patch_10_vals; + uintptr_t sample_widx = 0; + uintptr_t patch_10_bits = patch_10_set[0]; + for (uint32_t uii = 0; uii != patch_10_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_widx, &patch_10_bits); + const AlleleCode ac0 = patch_10_vals[uii * 2]; + const AlleleCode ac1 = patch_10_vals[uii * 2 + 1]; + nybblepairs[sample_idx] = allele_nybbles[ac0] | shifted_nybbles[ac1]; + } + } + } + if (phasepresent_ct) { + for (uint32_t widx = 0; widx != read_sample_ctl; ++widx) { + const uintptr_t phasepresent_word = phasepresent[widx]; + uintptr_t phaseinfo_word = phasepresent_word & phaseinfo[widx]; + if (phaseinfo_word) { + unsigned char* cur_nybblepairs = DowncastToUc(vmaj_readbuf_iter); + cur_nybblepairs = &(cur_nybblepairs[widx * kBitsPerWord]); + do { + const uint32_t sample_idx_lowbits = ctzw(phaseinfo_word); + const uint32_t nybblepair = cur_nybblepairs[sample_idx_lowbits]; + cur_nybblepairs[sample_idx_lowbits] = (nybblepair << 4) | (nybblepair >> 4); + phaseinfo_word &= phaseinfo_word - 1; + } while (phaseinfo_word); + } + } } - // TODO - // GenovecToNybblePairs, etc. vmaj_readbuf_iter = &(vmaj_readbuf_iter[read_sample_ctaw4]); } prev_copy_ct += cur_block_copy_ct; @@ -10187,12 +10199,70 @@ typedef struct PhylipTransposeCtxStruct { const uintptr_t* vmaj_readbuf; VecW** thread_vecaligned_bufs; + uintptr_t** thread_smaj_nybble_bufs; uint32_t sample_batch_size; uintptr_t* smaj_writebufs[2]; } PhylipTransposeCtx; +THREAD_FUNC_DECL PhylipTransposeWriteThread(void* raw_arg) { + ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg); + const uintptr_t tidx = arg->tidx; + PhylipTransposeCtx* ctx = S_CAST(PhylipTransposeCtx*, arg->sharedp->context); + + const uint32_t variant_ct = ctx->variant_ct; + const uintptr_t variant_batch_ct = DivUp(variant_ct, kPglNybbleTransposeBatch); + const uintptr_t write_word_stride = RoundUpPow2(variant_ct, kCacheline) / kBytesPerWord; + const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp); + const uintptr_t variant_batch_idx_start = (S_CAST(uint64_t, tidx) * variant_batch_ct) / calc_thread_ct; + VecW* vecaligned_buf = ctx->thread_vecaligned_bufs[tidx]; + uintptr_t* smaj_nybble_buf = ctx->thread_smaj_nybble_bufs[tidx]; + uintptr_t variant_batch_idx_full_end = ((S_CAST(uint64_t, tidx) + 1) * variant_batch_ct) / calc_thread_ct; + uint32_t variant_idx_end; + if (tidx + 1 < calc_thread_ct) { + variant_idx_end = variant_batch_idx_full_end * kPglNybbleTransposeBatch; + } else { + variant_idx_end = variant_ct; + if (variant_ct % kPglNybbleTransposeBatch) { + --variant_batch_idx_full_end; + } + } + const uint32_t sample_ct = ctx->write_sample_ct; + const uintptr_t sample_ctaw4 = NybbleCtToAlignedWordCt(sample_ct); + const uintptr_t* vmaj_readbuf = ctx->vmaj_readbuf; + uint32_t sample_widx = 0; + uint32_t parity = 0; + do { + const uint32_t sample_batch_size = ctx->sample_batch_size; + const uintptr_t* vmaj_readbuf_iter = &(vmaj_readbuf[variant_batch_idx_start * kPglNybbleTransposeBatch * sample_ctaw4 + sample_widx]); + uintptr_t* smaj_writebuf_start = &(ctx->smaj_writebufs[parity][variant_batch_idx_start * kPglNybbleTransposeWords]); + uintptr_t* smaj_writebuf_iter = smaj_writebuf_start; + uint32_t variant_batch_size = kPglNybbleTransposeBatch; + for (uintptr_t variant_batch_idx = variant_batch_idx_start; ; ++variant_batch_idx) { + if (variant_batch_idx >= variant_batch_idx_full_end) { + if (variant_batch_idx * kPglNybbleTransposeBatch >= variant_idx_end) { + break; + } + variant_batch_size = variant_idx_end - variant_batch_idx * kPglNybbleTransposeBatch; + } + TransposeNybbleblock(vmaj_readbuf_iter, sample_ctaw4, kPglNybbleTransposeWords, variant_batch_size, sample_batch_size, smaj_nybble_buf, vecaligned_buf); + const uintptr_t* nybble_read_iter = smaj_nybble_buf; + uintptr_t* ascii_write_iter = smaj_writebuf_iter; + for (uint32_t uii = 0; uii != sample_ct; ++uii) { + NybblesToIupac(nybble_read_iter, variant_batch_size, ascii_write_iter); + nybble_read_iter = &(nybble_read_iter[kPglNybbleTransposeWords]); + ascii_write_iter = &(ascii_write_iter[write_word_stride]); + } + smaj_writebuf_iter = &(smaj_writebuf_iter[kPglNybbleTransposeWords * 2]); + vmaj_readbuf_iter = &(vmaj_readbuf_iter[variant_batch_size * sample_ctaw4]); + } + parity = 1 - parity; + sample_widx += sample_batch_size / kBitsPerWordD4; + } while (!THREAD_BLOCK_FINISH(arg)); + THREAD_RETURN; +} + PglErr ExportPhylip(__attribute__((unused)) const uintptr_t* orig_sample_include, __attribute__((unused)) const SampleIdInfo* siip, __attribute__((unused)) const uintptr_t* sex_male_collapsed, __attribute__((unused)) const uintptr_t* variant_include, __attribute__((unused)) const ChrInfo* cip, __attribute__((unused)) const uint32_t* variant_bps, __attribute__((unused)) const uintptr_t* allele_idx_offsets, __attribute__((unused)) const char* const* allele_storage, __attribute__((unused)) uint32_t raw_sample_ct, __attribute__((unused)) uint32_t sample_ct, __attribute__((unused)) uint32_t raw_variant_ct, __attribute__((unused)) uint32_t variant_ct, __attribute__((unused)) uint32_t export_phased, __attribute__((unused)) uint32_t export_used_sites, __attribute__((unused)) uint32_t max_thread_ct, __attribute__((unused)) uintptr_t pgr_alloc_cacheline_ct, __attribute__((unused)) PgenFileInfo* pgfip, __attribute__((unused)) char* outname, __attribute__((unused)) char* outname_end) { unsigned char* bigstack_mark = g_bigstack_base; FILE* outfile = nullptr; diff --git a/2.0/plink2_import.cc b/2.0/plink2_import.cc index 6f79979e..c6e931a8 100644 --- a/2.0/plink2_import.cc +++ b/2.0/plink2_import.cc @@ -9587,7 +9587,6 @@ PglErr OxSampleToPsam(const char* samplename, const char* const_fid, const char* const uint32_t col_uidx = BitIter1(col_first_pass_remaining, &col_uidx_base, &cur_bits); linebuf_iter = NextTokenMult(token_end, col_uidx - prev_col_uidx); if (unlikely(!linebuf_iter)) { - printf("fail 2 %u %u %u\n", col_uidx, prev_col_uidx, uii); goto OxSampleToPsam_ret_MISSING_TOKENS; } prev_col_uidx = col_uidx;