diff --git a/2.0/include/pgenlib_misc.cc b/2.0/include/pgenlib_misc.cc index 2dfbe949..1a172e42 100644 --- a/2.0/include/pgenlib_misc.cc +++ b/2.0/include/pgenlib_misc.cc @@ -3049,6 +3049,71 @@ uintptr_t PglComputeMaxAlleleCt(const uintptr_t* allele_idx_offsets, uint32_t va return max_allele_ct; } +// Ok for nybble_vvec to be unaligned. +uint32_t CountNybbleVec(const unsigned char* nybble_vvec_biter, uintptr_t nybble_word, uint32_t vec_ct) { + const VecW m0 = vecw_setzero(); + const VecW alld15 = VCONST_W(kMask1111); + const VecW m4 = VCONST_W(kMask0F0F); + const VecW xor_vvec = vecw_set1(nybble_word); + VecW prev_sad_result = vecw_setzero(); + VecW acc = vecw_setzero(); + uintptr_t cur_incr = 15; + for (; ; vec_ct -= cur_incr) { + if (vec_ct < 15) { + if (!vec_ct) { + acc = acc + prev_sad_result; + return HsumW(acc); + } + cur_incr = vec_ct; + } + VecW inner_acc = vecw_setzero(); + const unsigned char* nybble_vvec_stop = &(nybble_vvec_biter[cur_incr * kBytesPerVec]); + do { + VecW loader = vecw_loadu(nybble_vvec_biter) ^ xor_vvec; + nybble_vvec_biter += kBytesPerVec; + // DetectAllZeroNybbles() followed by right-shift-3 is the same number of + // operations, can see if that's any faster in practice + loader = vecw_srli(loader, 1) | loader; + loader = vecw_srli(loader, 2) | loader; + inner_acc = inner_acc + vecw_and_notfirst(loader, alld15); + } while (nybble_vvec_biter < nybble_vvec_stop); + inner_acc = (inner_acc & m4) + (vecw_srli(inner_acc, 4) & m4); + acc = acc + prev_sad_result; + prev_sad_result = vecw_bytesum(inner_acc, m0); + } +} + +uint32_t CountNybble(const void* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct) { + const unsigned char* nybblearr_uc = S_CAST(const unsigned char*, nybblearr); + const uint32_t fullword_ct = nybble_ct / kBitsPerWordD4; + uint32_t tot = CountNybbleVec(nybblearr_uc, nybble_word, fullword_ct / kWordsPerVec); +#ifdef __LP64__ + for (uint32_t word_idx = RoundDownPow2(fullword_ct, kWordsPerVec); word_idx != fullword_ct; ++word_idx) { + uintptr_t cur_word; + CopyFromUnalignedOffsetW(&cur_word, nybblearr_uc, word_idx); + cur_word ^= nybble_word; + cur_word = cur_word | (cur_word >> 1); + cur_word = cur_word | (cur_word >> 2); + tot += Popcount0001Word((~cur_word) & kMask1111); + } +#endif + const uint32_t trailing_nybble_ct = nybble_ct % kBitsPerWordD4; + if (trailing_nybble_ct) { + const uint32_t trailing_byte_ct = DivUp(trailing_nybble_ct, (CHAR_BIT / 4)); + uintptr_t cur_word = SubwordLoad(&(nybblearr_uc[fullword_ct * kBytesPerWord]), trailing_byte_ct) ^ nybble_word; + cur_word = cur_word | (cur_word >> 1); + cur_word = cur_word | (cur_word >> 2); + cur_word = bzhi((~cur_word) & kMask1111, trailing_nybble_ct * 4); +#if defined(USE_SSE42) || !defined(__LP64__) + tot += Popcount0001Word(cur_word); +#else + // minor optimization, can't overflow + tot += (cur_word * kMask1111) >> 60; +#endif + } + return tot; +} + #ifdef __cplusplus } // namespace plink2 #endif diff --git a/2.0/include/pgenlib_misc.h b/2.0/include/pgenlib_misc.h index 7da62613..c64e4696 100644 --- a/2.0/include/pgenlib_misc.h +++ b/2.0/include/pgenlib_misc.h @@ -670,6 +670,8 @@ HEADER_INLINE AlleleCode GetAidx(const uintptr_t* allele_idx_offsets, uint32_t v return allele_idx - allele_idx_offsets[variant_uidx]; } +uint32_t CountNybble(const void* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct); + // The actual format: // 1. 2 magic bytes 0x6c 0x1b. // diff --git a/2.0/include/pgenlib_read.cc b/2.0/include/pgenlib_read.cc index 05a1e246..2a6da517 100644 --- a/2.0/include/pgenlib_read.cc +++ b/2.0/include/pgenlib_read.cc @@ -3650,70 +3650,6 @@ uint32_t CountNypSubset(const uintptr_t* __restrict nypvec, const uintptr_t* __r } */ -// Ok for nybble_vvec to be unaligned. -uint32_t CountNybbleVec(const unsigned char* nybble_vvec_biter, uintptr_t nybble_word, uint32_t vec_ct) { - const VecW m0 = vecw_setzero(); - const VecW alld15 = VCONST_W(kMask1111); - const VecW m4 = VCONST_W(kMask0F0F); - const VecW xor_vvec = vecw_set1(nybble_word); - VecW prev_sad_result = vecw_setzero(); - VecW acc = vecw_setzero(); - uintptr_t cur_incr = 15; - for (; ; vec_ct -= cur_incr) { - if (vec_ct < 15) { - if (!vec_ct) { - acc = acc + prev_sad_result; - return HsumW(acc); - } - cur_incr = vec_ct; - } - VecW inner_acc = vecw_setzero(); - const unsigned char* nybble_vvec_stop = &(nybble_vvec_biter[cur_incr * kBytesPerVec]); - do { - VecW loader = vecw_loadu(nybble_vvec_biter) ^ xor_vvec; - nybble_vvec_biter += kBytesPerVec; - // DetectAllZeroNybbles() followed by right-shift-3 is the same number of - // operations, can see if that's any faster in practice - loader = vecw_srli(loader, 1) | loader; - loader = vecw_srli(loader, 2) | loader; - inner_acc = inner_acc + vecw_and_notfirst(loader, alld15); - } while (nybble_vvec_biter < nybble_vvec_stop); - inner_acc = (inner_acc & m4) + (vecw_srli(inner_acc, 4) & m4); - acc = acc + prev_sad_result; - prev_sad_result = vecw_bytesum(inner_acc, m0); - } -} - -uint32_t CountNybble(const unsigned char* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct) { - const uint32_t fullword_ct = nybble_ct / kBitsPerWordD4; - uint32_t tot = CountNybbleVec(nybblearr, nybble_word, fullword_ct / kWordsPerVec); -#ifdef __LP64__ - for (uint32_t word_idx = RoundDownPow2(fullword_ct, kWordsPerVec); word_idx != fullword_ct; ++word_idx) { - uintptr_t cur_word; - CopyFromUnalignedOffsetW(&cur_word, nybblearr, word_idx); - cur_word ^= nybble_word; - cur_word = cur_word | (cur_word >> 1); - cur_word = cur_word | (cur_word >> 2); - tot += Popcount0001Word((~cur_word) & kMask1111); - } -#endif - const uint32_t trailing_nybble_ct = nybble_ct % kBitsPerWordD4; - if (trailing_nybble_ct) { - const uint32_t trailing_byte_ct = DivUp(trailing_nybble_ct, (CHAR_BIT / 4)); - uintptr_t cur_word = SubwordLoad(&(nybblearr[fullword_ct * kBytesPerWord]), trailing_byte_ct) ^ nybble_word; - cur_word = cur_word | (cur_word >> 1); - cur_word = cur_word | (cur_word >> 2); - cur_word = bzhi((~cur_word) & kMask1111, trailing_nybble_ct * 4); -#if defined(USE_SSE42) || !defined(__LP64__) - tot += Popcount0001Word(cur_word); -#else - // minor optimization, can't overflow - tot += (cur_word * kMask1111) >> 60; -#endif - } - return tot; -} - // similar to ParseAndSaveDifflist() PglErr ParseAndSaveDeltalist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict deltalist, uint32_t* __restrict deltalist_len_ptr) { const unsigned char* group_info_iter; diff --git a/2.0/plink2.cc b/2.0/plink2.cc index d255be34..715c5cbe 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -72,7 +72,7 @@ static const char ver_str[] = "PLINK v2.00a5" #elif defined(USE_AOCL) " AMD" #endif - " (22 Sep 2023)"; + " (23 Sep 2023)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same "" diff --git a/2.0/plink2_export.cc b/2.0/plink2_export.cc index 6b400ea5..17800c92 100644 --- a/2.0/plink2_export.cc +++ b/2.0/plink2_export.cc @@ -9837,8 +9837,8 @@ void GenovecToNybblesSSE4(const uintptr_t* genovec, uint32_t sample_ct, uint32_t // writes up to vector boundary static inline void NybblesToIupac(const uintptr_t* nybblevec, uint32_t entry_ct, uintptr_t* result_vec) { const VecW lookup = vecw_setr8('.', 'A', 'C', 'M', 'G', 'R', 'S', '.', 'T', 'W', 'Y', '.', 'K', '.', '.', 'N'); - const uint32_t invec_ct = NybbleCtToVecCt(entry_ct); - Subst4bitTo8(nybblevec, lookup, invec_ct, result_vec); + const uint32_t outvec_ct = DivUp(entry_ct, kBytesPerVec); + Subst4bitTo8(nybblevec, lookup, outvec_ct, result_vec); } #else // some of this may be moved to pgenlib_misc @@ -10027,21 +10027,23 @@ THREAD_FUNC_DECL PhylipReadThread(void* raw_arg) { goto PhylipReadThread_err; } ZeroTrailingNyps(read_sample_ct, genovec); - if (write_sample_nm_cts_iter) { - *write_sample_nm_cts_iter++ = read_sample_ct - GenoarrCountMissingUnsafe(genovec, read_sample_ct); - } + const uint32_t nybble0 = allele_nybbles[0]; + const uint32_t nybble1 = allele_nybbles[1]; #ifdef USE_SSE42 - GenovecToNybblesSSE4(genovec, read_sample_ct, allele_nybbles[0], allele_nybbles[1], vmaj_readbuf_iter); + GenovecToNybblesSSE4(genovec, read_sample_ct, nybble0, nybble1, vmaj_readbuf_iter); #else - GenovecToNybblesNoSSE4(genovec_to_nybble_tables, genovec, read_sample_ct, allele_nybbles[0], allele_nybbles[1], vmaj_readbuf_iter); + GenovecToNybblesNoSSE4(genovec_to_nybble_tables, genovec, read_sample_ct, nybble0, nybble1, vmaj_readbuf_iter); #endif + uint32_t missing_allele_present = (nybble0 == 15) || (nybble1 == 15); if (cur_allele_ct > 2) { const uint32_t patch_01_ct = pgv.patch_01_ct; if (patch_01_ct) { uint32_t patch_01_nybbles[4]; - patch_01_nybbles[2] = allele_nybbles[0] | allele_nybbles[2]; + patch_01_nybbles[2] = nybble0 | allele_nybbles[2]; + missing_allele_present |= (allele_nybbles[2] == 15); if (cur_allele_ct == 4) { - patch_01_nybbles[3] = allele_nybbles[0] | allele_nybbles[3]; + patch_01_nybbles[3] = nybble0 | allele_nybbles[3]; + missing_allele_present |= (allele_nybbles[3] == 15); } const uintptr_t* patch_01_set = pgv.patch_01_set; const AlleleCode* patch_01_vals = pgv.patch_01_vals; @@ -10067,6 +10069,20 @@ THREAD_FUNC_DECL PhylipReadThread(void* raw_arg) { } } } + if (write_sample_nm_cts_iter) { + uint32_t missing_ct; + if (missing_allele_present) { + // REF=N or ALT=N is possible (see e.g. chrM POS=3107 in the original + // 1000 Genomes phase 3 variant calls). They should also be counted + // as missing. + // (This is intentionally different from vcf2phylip.py's behavior.) + missing_ct = CountNybble(vmaj_readbuf_iter, ~k0LU, read_sample_ct); + } else { + missing_ct = GenoarrCountMissingUnsafe(genovec, read_sample_ct); + } + *write_sample_nm_cts_iter += read_sample_ct - missing_ct; + ++write_sample_nm_cts_iter; + } vmaj_readbuf_iter = &(vmaj_readbuf_iter[read_sample_ctaw4]); } prev_copy_ct += cur_block_copy_ct; @@ -10158,24 +10174,26 @@ THREAD_FUNC_DECL PhylipPhasedReadThread(void* raw_arg) { ZeroTrailingNyps(read_sample_ct, genovec); STD_ARRAY_DECL(uint32_t, 4, genocounts); GenoarrCountFreqsUnsafe(genovec, read_sample_ct, genocounts); - if (write_sample_nm_cts_iter) { - *write_sample_nm_cts_iter++ = (read_sample_ct - genocounts[3]) * 2; - } const uint32_t nybble0 = allele_nybbles[0]; + const uint32_t nybble1 = allele_nybbles[1]; { - const uint32_t nybble1 = allele_nybbles[1]; const uint32_t allele0_table_idx = ctzu32(nybble0) + 4 * (nybble0 == 15); const uint32_t allele1_table_idx = ctzu32(nybble1) + 4 * (nybble1 == 15); const uint32_t table_idx = allele0_table_idx + 5 * allele1_table_idx; GenoarrLookup256x1bx4(genovec, genovec_to_nybblepair_tables[table_idx], read_sample_ct, vmaj_readbuf_iter); } uint32_t het_ct = genocounts[1]; + uint32_t missing_allele_present = (nybble0 == 15) || (nybble1 == 15); if (cur_allele_ct > 2) { unsigned char* nybblepairs = DowncastToUc(vmaj_readbuf_iter); const uint32_t patch_01_ct = pgv.patch_01_ct; uint32_t shifted_nybbles[4]; shifted_nybbles[2] = allele_nybbles[2] << 4; - shifted_nybbles[3] = allele_nybbles[3] << 4; + missing_allele_present |= (allele_nybbles[2] == 15); + if (cur_allele_ct == 4) { + shifted_nybbles[3] = allele_nybbles[3] << 4; + missing_allele_present |= (allele_nybbles[3] == 15); + } if (patch_01_ct) { const uintptr_t* patch_01_set = pgv.patch_01_set; const AlleleCode* patch_01_vals = pgv.patch_01_vals; @@ -10202,10 +10220,19 @@ THREAD_FUNC_DECL PhylipPhasedReadThread(void* raw_arg) { } } } + if (write_sample_nm_cts_iter) { + uint32_t missing_ct; + if (missing_allele_present) { + missing_ct = CountNybble(vmaj_readbuf_iter, ~k0LU, read_sample_ct * 2); + } else { + missing_ct = genocounts[3] * 2; + } + *write_sample_nm_cts_iter += read_sample_ct * 2 - missing_ct; + ++write_sample_nm_cts_iter; + } const uint32_t phasepresent_ct = pgv.phasepresent_ct; if (unlikely(phasepresent_ct != het_ct)) { - // another type of InconsistentInput; use a different code here since - // we want to print a different error message + // another type of InconsistentInput new_err_info = (S_CAST(uint64_t, variant_uidx) << 32) | S_CAST(uint32_t, kPglRetInternalUse1); goto PhylipPhasedReadThread_err; } @@ -10400,7 +10427,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si read_ctx.allele_nybble_table[S_CAST(unsigned char, input_missing_geno_char)] = 15; read_ctx.write_sample_nm_cts = nullptr; if (export_used_sites) { - if (unlikely(bigstack_alloc_u32(variant_ct, &read_ctx.write_sample_nm_cts))) { + if (unlikely(bigstack_calloc_u32(variant_ct, &read_ctx.write_sample_nm_cts))) { goto ExportPhylip_ret_NOMEM; } } @@ -10536,9 +10563,10 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si if (sample_uidx_start) { ClearBitsNz(0, sample_uidx_start, sample_include); } + const uint32_t read_sample_idx_offset = pass_idx * read_sample_ct; uint32_t sample_uidx_end; if (pass_idx + 1 == pass_ct) { - read_sample_ct = sample_ct - pass_idx * read_sample_ct; + read_sample_ct = sample_ct - read_sample_idx_offset; read_sample_ctaw4 = NybbleCtToAlignedWordCt(read_sample_ct); sample_uidx_end = raw_sample_ct; } else { @@ -10623,9 +10651,6 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si fputs("\b\b\b\b\b\b\b\b\b\b\b\b\bwriting... 0%", stdout); fflush(stdout); pct = 0; - uintptr_t sample_uidx_base; - uintptr_t sample_include_bits; - BitIter1Start(sample_include, sample_uidx_start, &sample_uidx_base, &sample_include_bits); uint32_t flush_write_sample_idx = 0; next_print_idx = write_sample_ct / 100; for (uint32_t flush_write_sample_idx_end = 0; ; ) { @@ -10641,7 +10666,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si if (flush_write_sample_idx_end) { const uintptr_t* ascii_iupac_iter = transpose_ctx.smaj_writebufs[1 - parity]; for (; flush_write_sample_idx != flush_write_sample_idx_end; ++flush_write_sample_idx) { - const uint32_t orig_sample_idx = flush_write_sample_idx >> export_phased; + const uint32_t orig_sample_idx = read_sample_idx_offset + (flush_write_sample_idx >> export_phased); char* write_iter = strcpya(textbuf_line_start, &(exported_sample_ids[orig_sample_idx * max_exported_sample_id_blen])); if (export_phased) { *write_iter++ = id_delim; @@ -10703,6 +10728,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si uint32_t chr_fo_idx = UINT32_MAX; uint32_t chr_end = 0; uint32_t chr_blen = 0; + uint32_t min_ct = UINT32_MAX; char* chr_buf = &(writebuf_flush[kMaxIdSlen + 384]); uintptr_t variant_uidx_base = 0; uintptr_t variant_include_bits = variant_include[0]; @@ -10720,7 +10746,11 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si } write_iter = memcpya(write_iter, chr_buf, chr_blen); write_iter = u32toa_x(variant_bps[variant_uidx], '\t', write_iter); - write_iter = u32toa(write_sample_nm_cts[variant_idx], write_iter); + const uint32_t cur_ct = write_sample_nm_cts[variant_idx]; + if (cur_ct < min_ct) { + min_ct = cur_ct; + } + write_iter = u32toa(cur_ct, write_iter); AppendBinaryEoln(&write_iter); if (unlikely(fwrite_ck(writebuf_flush, outfile, &write_iter))) { goto ExportPhylip_ret_WRITE_FAIL; @@ -10730,6 +10760,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si goto ExportPhylip_ret_WRITE_FAIL; } logprintfww("--export phylip%s: %s written.\n", export_phased? "-phased" : "", outname); + logprintf("Minimum non-N sample count across loci: %u/%u\n", min_ct, sample_ct << export_phased); } } while (0) { @@ -10843,8 +10874,7 @@ PglErr Exportf(const uintptr_t* sample_include, const PedigreeIdInfo* piip, cons allele_storage = subst_allele_storage; } if (flags & (kfExportfTypemask - kfExportfIndMajorBed - kfExportfVcf - kfExportfBcf - kfExportfOxGen - kfExportfBgen11 - kfExportfBgen12 - kfExportfBgen13 - kfExportfHaps - kfExportfHapsLegend - kfExportfAv - kfExportfA - kfExportfAD - kfExportfTped - kfExportfPed - kfExportfPhylip - kfExportfPhylipPhased - kfExportfCompound)) { - logerrputs("Error: Only VCF, BCF, oxford, bgen-1.x, haps, hapslegend, A, AD, Av, ped, tped,\ncompound-genotypes, and ind-major-bed output have been implemented so far.\n"); - // logerrputs("Error: Only VCF, BCF, oxford, bgen-1.x, haps, hapslegend, A, AD, Av, ped, tped,\ncompound-genotypes, phylip, phylip-phased, and ind-major-bed output have been\nimplemented so far.\n"); + logerrputs("Error: Only VCF, BCF, oxford, bgen-1.x, haps, hapslegend, A, AD, Av, ped, tped,\ncompound-genotypes, phylip, phylip-phased, and ind-major-bed output have been\nimplemented so far.\n"); reterr = kPglRetNotYetSupported; goto Exportf_ret_1; } diff --git a/2.0/plink2_help.cc b/2.0/plink2_help.cc index b71c9100..cdf06aa6 100644 --- a/2.0/plink2_help.cc +++ b/2.0/plink2_help.cc @@ -438,7 +438,7 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) { " Create a new fileset with all filters applied. The following output\n" " formats are supported:\n" " (actually, only A, AD, Av, bcf, bgen-1.x, haps, hapslegend, ind-major-bed,\n" -" oxford, ped, tped, and vcf are implemented for now)\n" +" oxford, ped, phylip, phylip-phased, tped, and vcf are implemented for now)\n" " * '23': 23andMe 4-column format. This can only be used on a single\n" " sample's data (--keep may be handy), and does not support\n" " multicharacter allele codes.\n"