Skip to content

Commit

Permalink
--export phylip sse4 bugfix, other tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
chrchang committed Sep 23, 2023
1 parent 2c2dd01 commit 45a9a15
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 91 deletions.
65 changes: 65 additions & 0 deletions 2.0/include/pgenlib_misc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3049,6 +3049,71 @@ uintptr_t PglComputeMaxAlleleCt(const uintptr_t* allele_idx_offsets, uint32_t va
return max_allele_ct;
}

// Ok for nybble_vvec to be unaligned.
uint32_t CountNybbleVec(const unsigned char* nybble_vvec_biter, uintptr_t nybble_word, uint32_t vec_ct) {
const VecW m0 = vecw_setzero();
const VecW alld15 = VCONST_W(kMask1111);
const VecW m4 = VCONST_W(kMask0F0F);
const VecW xor_vvec = vecw_set1(nybble_word);
VecW prev_sad_result = vecw_setzero();
VecW acc = vecw_setzero();
uintptr_t cur_incr = 15;
for (; ; vec_ct -= cur_incr) {
if (vec_ct < 15) {
if (!vec_ct) {
acc = acc + prev_sad_result;
return HsumW(acc);
}
cur_incr = vec_ct;
}
VecW inner_acc = vecw_setzero();
const unsigned char* nybble_vvec_stop = &(nybble_vvec_biter[cur_incr * kBytesPerVec]);
do {
VecW loader = vecw_loadu(nybble_vvec_biter) ^ xor_vvec;
nybble_vvec_biter += kBytesPerVec;
// DetectAllZeroNybbles() followed by right-shift-3 is the same number of
// operations, can see if that's any faster in practice
loader = vecw_srli(loader, 1) | loader;
loader = vecw_srli(loader, 2) | loader;
inner_acc = inner_acc + vecw_and_notfirst(loader, alld15);
} while (nybble_vvec_biter < nybble_vvec_stop);
inner_acc = (inner_acc & m4) + (vecw_srli(inner_acc, 4) & m4);
acc = acc + prev_sad_result;
prev_sad_result = vecw_bytesum(inner_acc, m0);
}
}

uint32_t CountNybble(const void* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct) {
const unsigned char* nybblearr_uc = S_CAST(const unsigned char*, nybblearr);
const uint32_t fullword_ct = nybble_ct / kBitsPerWordD4;
uint32_t tot = CountNybbleVec(nybblearr_uc, nybble_word, fullword_ct / kWordsPerVec);
#ifdef __LP64__
for (uint32_t word_idx = RoundDownPow2(fullword_ct, kWordsPerVec); word_idx != fullword_ct; ++word_idx) {
uintptr_t cur_word;
CopyFromUnalignedOffsetW(&cur_word, nybblearr_uc, word_idx);
cur_word ^= nybble_word;
cur_word = cur_word | (cur_word >> 1);
cur_word = cur_word | (cur_word >> 2);
tot += Popcount0001Word((~cur_word) & kMask1111);
}
#endif
const uint32_t trailing_nybble_ct = nybble_ct % kBitsPerWordD4;
if (trailing_nybble_ct) {
const uint32_t trailing_byte_ct = DivUp(trailing_nybble_ct, (CHAR_BIT / 4));
uintptr_t cur_word = SubwordLoad(&(nybblearr_uc[fullword_ct * kBytesPerWord]), trailing_byte_ct) ^ nybble_word;
cur_word = cur_word | (cur_word >> 1);
cur_word = cur_word | (cur_word >> 2);
cur_word = bzhi((~cur_word) & kMask1111, trailing_nybble_ct * 4);
#if defined(USE_SSE42) || !defined(__LP64__)
tot += Popcount0001Word(cur_word);
#else
// minor optimization, can't overflow
tot += (cur_word * kMask1111) >> 60;
#endif
}
return tot;
}

#ifdef __cplusplus
} // namespace plink2
#endif
2 changes: 2 additions & 0 deletions 2.0/include/pgenlib_misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,8 @@ HEADER_INLINE AlleleCode GetAidx(const uintptr_t* allele_idx_offsets, uint32_t v
return allele_idx - allele_idx_offsets[variant_uidx];
}

uint32_t CountNybble(const void* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct);

// The actual format:
// 1. 2 magic bytes 0x6c 0x1b.
//
Expand Down
64 changes: 0 additions & 64 deletions 2.0/include/pgenlib_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3650,70 +3650,6 @@ uint32_t CountNypSubset(const uintptr_t* __restrict nypvec, const uintptr_t* __r
}
*/

// Ok for nybble_vvec to be unaligned.
uint32_t CountNybbleVec(const unsigned char* nybble_vvec_biter, uintptr_t nybble_word, uint32_t vec_ct) {
const VecW m0 = vecw_setzero();
const VecW alld15 = VCONST_W(kMask1111);
const VecW m4 = VCONST_W(kMask0F0F);
const VecW xor_vvec = vecw_set1(nybble_word);
VecW prev_sad_result = vecw_setzero();
VecW acc = vecw_setzero();
uintptr_t cur_incr = 15;
for (; ; vec_ct -= cur_incr) {
if (vec_ct < 15) {
if (!vec_ct) {
acc = acc + prev_sad_result;
return HsumW(acc);
}
cur_incr = vec_ct;
}
VecW inner_acc = vecw_setzero();
const unsigned char* nybble_vvec_stop = &(nybble_vvec_biter[cur_incr * kBytesPerVec]);
do {
VecW loader = vecw_loadu(nybble_vvec_biter) ^ xor_vvec;
nybble_vvec_biter += kBytesPerVec;
// DetectAllZeroNybbles() followed by right-shift-3 is the same number of
// operations, can see if that's any faster in practice
loader = vecw_srli(loader, 1) | loader;
loader = vecw_srli(loader, 2) | loader;
inner_acc = inner_acc + vecw_and_notfirst(loader, alld15);
} while (nybble_vvec_biter < nybble_vvec_stop);
inner_acc = (inner_acc & m4) + (vecw_srli(inner_acc, 4) & m4);
acc = acc + prev_sad_result;
prev_sad_result = vecw_bytesum(inner_acc, m0);
}
}

uint32_t CountNybble(const unsigned char* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct) {
const uint32_t fullword_ct = nybble_ct / kBitsPerWordD4;
uint32_t tot = CountNybbleVec(nybblearr, nybble_word, fullword_ct / kWordsPerVec);
#ifdef __LP64__
for (uint32_t word_idx = RoundDownPow2(fullword_ct, kWordsPerVec); word_idx != fullword_ct; ++word_idx) {
uintptr_t cur_word;
CopyFromUnalignedOffsetW(&cur_word, nybblearr, word_idx);
cur_word ^= nybble_word;
cur_word = cur_word | (cur_word >> 1);
cur_word = cur_word | (cur_word >> 2);
tot += Popcount0001Word((~cur_word) & kMask1111);
}
#endif
const uint32_t trailing_nybble_ct = nybble_ct % kBitsPerWordD4;
if (trailing_nybble_ct) {
const uint32_t trailing_byte_ct = DivUp(trailing_nybble_ct, (CHAR_BIT / 4));
uintptr_t cur_word = SubwordLoad(&(nybblearr[fullword_ct * kBytesPerWord]), trailing_byte_ct) ^ nybble_word;
cur_word = cur_word | (cur_word >> 1);
cur_word = cur_word | (cur_word >> 2);
cur_word = bzhi((~cur_word) & kMask1111, trailing_nybble_ct * 4);
#if defined(USE_SSE42) || !defined(__LP64__)
tot += Popcount0001Word(cur_word);
#else
// minor optimization, can't overflow
tot += (cur_word * kMask1111) >> 60;
#endif
}
return tot;
}

// similar to ParseAndSaveDifflist()
PglErr ParseAndSaveDeltalist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict deltalist, uint32_t* __restrict deltalist_len_ptr) {
const unsigned char* group_info_iter;
Expand Down
2 changes: 1 addition & 1 deletion 2.0/plink2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ static const char ver_str[] = "PLINK v2.00a5"
#elif defined(USE_AOCL)
" AMD"
#endif
" (22 Sep 2023)";
" (23 Sep 2023)";
static const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
""
Expand Down
80 changes: 55 additions & 25 deletions 2.0/plink2_export.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9837,8 +9837,8 @@ void GenovecToNybblesSSE4(const uintptr_t* genovec, uint32_t sample_ct, uint32_t
// writes up to vector boundary
static inline void NybblesToIupac(const uintptr_t* nybblevec, uint32_t entry_ct, uintptr_t* result_vec) {
const VecW lookup = vecw_setr8('.', 'A', 'C', 'M', 'G', 'R', 'S', '.', 'T', 'W', 'Y', '.', 'K', '.', '.', 'N');
const uint32_t invec_ct = NybbleCtToVecCt(entry_ct);
Subst4bitTo8(nybblevec, lookup, invec_ct, result_vec);
const uint32_t outvec_ct = DivUp(entry_ct, kBytesPerVec);
Subst4bitTo8(nybblevec, lookup, outvec_ct, result_vec);
}
#else
// some of this may be moved to pgenlib_misc
Expand Down Expand Up @@ -10027,21 +10027,23 @@ THREAD_FUNC_DECL PhylipReadThread(void* raw_arg) {
goto PhylipReadThread_err;
}
ZeroTrailingNyps(read_sample_ct, genovec);
if (write_sample_nm_cts_iter) {
*write_sample_nm_cts_iter++ = read_sample_ct - GenoarrCountMissingUnsafe(genovec, read_sample_ct);
}
const uint32_t nybble0 = allele_nybbles[0];
const uint32_t nybble1 = allele_nybbles[1];
#ifdef USE_SSE42
GenovecToNybblesSSE4(genovec, read_sample_ct, allele_nybbles[0], allele_nybbles[1], vmaj_readbuf_iter);
GenovecToNybblesSSE4(genovec, read_sample_ct, nybble0, nybble1, vmaj_readbuf_iter);
#else
GenovecToNybblesNoSSE4(genovec_to_nybble_tables, genovec, read_sample_ct, allele_nybbles[0], allele_nybbles[1], vmaj_readbuf_iter);
GenovecToNybblesNoSSE4(genovec_to_nybble_tables, genovec, read_sample_ct, nybble0, nybble1, vmaj_readbuf_iter);
#endif
uint32_t missing_allele_present = (nybble0 == 15) || (nybble1 == 15);
if (cur_allele_ct > 2) {
const uint32_t patch_01_ct = pgv.patch_01_ct;
if (patch_01_ct) {
uint32_t patch_01_nybbles[4];
patch_01_nybbles[2] = allele_nybbles[0] | allele_nybbles[2];
patch_01_nybbles[2] = nybble0 | allele_nybbles[2];
missing_allele_present |= (allele_nybbles[2] == 15);
if (cur_allele_ct == 4) {
patch_01_nybbles[3] = allele_nybbles[0] | allele_nybbles[3];
patch_01_nybbles[3] = nybble0 | allele_nybbles[3];
missing_allele_present |= (allele_nybbles[3] == 15);
}
const uintptr_t* patch_01_set = pgv.patch_01_set;
const AlleleCode* patch_01_vals = pgv.patch_01_vals;
Expand All @@ -10067,6 +10069,20 @@ THREAD_FUNC_DECL PhylipReadThread(void* raw_arg) {
}
}
}
if (write_sample_nm_cts_iter) {
uint32_t missing_ct;
if (missing_allele_present) {
// REF=N or ALT=N is possible (see e.g. chrM POS=3107 in the original
// 1000 Genomes phase 3 variant calls). They should also be counted
// as missing.
// (This is intentionally different from vcf2phylip.py's behavior.)
missing_ct = CountNybble(vmaj_readbuf_iter, ~k0LU, read_sample_ct);
} else {
missing_ct = GenoarrCountMissingUnsafe(genovec, read_sample_ct);
}
*write_sample_nm_cts_iter += read_sample_ct - missing_ct;
++write_sample_nm_cts_iter;
}
vmaj_readbuf_iter = &(vmaj_readbuf_iter[read_sample_ctaw4]);
}
prev_copy_ct += cur_block_copy_ct;
Expand Down Expand Up @@ -10158,24 +10174,26 @@ THREAD_FUNC_DECL PhylipPhasedReadThread(void* raw_arg) {
ZeroTrailingNyps(read_sample_ct, genovec);
STD_ARRAY_DECL(uint32_t, 4, genocounts);
GenoarrCountFreqsUnsafe(genovec, read_sample_ct, genocounts);
if (write_sample_nm_cts_iter) {
*write_sample_nm_cts_iter++ = (read_sample_ct - genocounts[3]) * 2;
}
const uint32_t nybble0 = allele_nybbles[0];
const uint32_t nybble1 = allele_nybbles[1];
{
const uint32_t nybble1 = allele_nybbles[1];
const uint32_t allele0_table_idx = ctzu32(nybble0) + 4 * (nybble0 == 15);
const uint32_t allele1_table_idx = ctzu32(nybble1) + 4 * (nybble1 == 15);
const uint32_t table_idx = allele0_table_idx + 5 * allele1_table_idx;
GenoarrLookup256x1bx4(genovec, genovec_to_nybblepair_tables[table_idx], read_sample_ct, vmaj_readbuf_iter);
}
uint32_t het_ct = genocounts[1];
uint32_t missing_allele_present = (nybble0 == 15) || (nybble1 == 15);
if (cur_allele_ct > 2) {
unsigned char* nybblepairs = DowncastToUc(vmaj_readbuf_iter);
const uint32_t patch_01_ct = pgv.patch_01_ct;
uint32_t shifted_nybbles[4];
shifted_nybbles[2] = allele_nybbles[2] << 4;
shifted_nybbles[3] = allele_nybbles[3] << 4;
missing_allele_present |= (allele_nybbles[2] == 15);
if (cur_allele_ct == 4) {
shifted_nybbles[3] = allele_nybbles[3] << 4;
missing_allele_present |= (allele_nybbles[3] == 15);
}
if (patch_01_ct) {
const uintptr_t* patch_01_set = pgv.patch_01_set;
const AlleleCode* patch_01_vals = pgv.patch_01_vals;
Expand All @@ -10202,10 +10220,19 @@ THREAD_FUNC_DECL PhylipPhasedReadThread(void* raw_arg) {
}
}
}
if (write_sample_nm_cts_iter) {
uint32_t missing_ct;
if (missing_allele_present) {
missing_ct = CountNybble(vmaj_readbuf_iter, ~k0LU, read_sample_ct * 2);
} else {
missing_ct = genocounts[3] * 2;
}
*write_sample_nm_cts_iter += read_sample_ct * 2 - missing_ct;
++write_sample_nm_cts_iter;
}
const uint32_t phasepresent_ct = pgv.phasepresent_ct;
if (unlikely(phasepresent_ct != het_ct)) {
// another type of InconsistentInput; use a different code here since
// we want to print a different error message
// another type of InconsistentInput
new_err_info = (S_CAST(uint64_t, variant_uidx) << 32) | S_CAST(uint32_t, kPglRetInternalUse1);
goto PhylipPhasedReadThread_err;
}
Expand Down Expand Up @@ -10400,7 +10427,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
read_ctx.allele_nybble_table[S_CAST(unsigned char, input_missing_geno_char)] = 15;
read_ctx.write_sample_nm_cts = nullptr;
if (export_used_sites) {
if (unlikely(bigstack_alloc_u32(variant_ct, &read_ctx.write_sample_nm_cts))) {
if (unlikely(bigstack_calloc_u32(variant_ct, &read_ctx.write_sample_nm_cts))) {
goto ExportPhylip_ret_NOMEM;
}
}
Expand Down Expand Up @@ -10536,9 +10563,10 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
if (sample_uidx_start) {
ClearBitsNz(0, sample_uidx_start, sample_include);
}
const uint32_t read_sample_idx_offset = pass_idx * read_sample_ct;
uint32_t sample_uidx_end;
if (pass_idx + 1 == pass_ct) {
read_sample_ct = sample_ct - pass_idx * read_sample_ct;
read_sample_ct = sample_ct - read_sample_idx_offset;
read_sample_ctaw4 = NybbleCtToAlignedWordCt(read_sample_ct);
sample_uidx_end = raw_sample_ct;
} else {
Expand Down Expand Up @@ -10623,9 +10651,6 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
fputs("\b\b\b\b\b\b\b\b\b\b\b\b\bwriting... 0%", stdout);
fflush(stdout);
pct = 0;
uintptr_t sample_uidx_base;
uintptr_t sample_include_bits;
BitIter1Start(sample_include, sample_uidx_start, &sample_uidx_base, &sample_include_bits);
uint32_t flush_write_sample_idx = 0;
next_print_idx = write_sample_ct / 100;
for (uint32_t flush_write_sample_idx_end = 0; ; ) {
Expand All @@ -10641,7 +10666,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
if (flush_write_sample_idx_end) {
const uintptr_t* ascii_iupac_iter = transpose_ctx.smaj_writebufs[1 - parity];
for (; flush_write_sample_idx != flush_write_sample_idx_end; ++flush_write_sample_idx) {
const uint32_t orig_sample_idx = flush_write_sample_idx >> export_phased;
const uint32_t orig_sample_idx = read_sample_idx_offset + (flush_write_sample_idx >> export_phased);
char* write_iter = strcpya(textbuf_line_start, &(exported_sample_ids[orig_sample_idx * max_exported_sample_id_blen]));
if (export_phased) {
*write_iter++ = id_delim;
Expand Down Expand Up @@ -10703,6 +10728,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
uint32_t chr_fo_idx = UINT32_MAX;
uint32_t chr_end = 0;
uint32_t chr_blen = 0;
uint32_t min_ct = UINT32_MAX;
char* chr_buf = &(writebuf_flush[kMaxIdSlen + 384]);
uintptr_t variant_uidx_base = 0;
uintptr_t variant_include_bits = variant_include[0];
Expand All @@ -10720,7 +10746,11 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
}
write_iter = memcpya(write_iter, chr_buf, chr_blen);
write_iter = u32toa_x(variant_bps[variant_uidx], '\t', write_iter);
write_iter = u32toa(write_sample_nm_cts[variant_idx], write_iter);
const uint32_t cur_ct = write_sample_nm_cts[variant_idx];
if (cur_ct < min_ct) {
min_ct = cur_ct;
}
write_iter = u32toa(cur_ct, write_iter);
AppendBinaryEoln(&write_iter);
if (unlikely(fwrite_ck(writebuf_flush, outfile, &write_iter))) {
goto ExportPhylip_ret_WRITE_FAIL;
Expand All @@ -10730,6 +10760,7 @@ PglErr ExportPhylip(const uintptr_t* orig_sample_include, const SampleIdInfo* si
goto ExportPhylip_ret_WRITE_FAIL;
}
logprintfww("--export phylip%s: %s written.\n", export_phased? "-phased" : "", outname);
logprintf("Minimum non-N sample count across loci: %u/%u\n", min_ct, sample_ct << export_phased);
}
}
while (0) {
Expand Down Expand Up @@ -10843,8 +10874,7 @@ PglErr Exportf(const uintptr_t* sample_include, const PedigreeIdInfo* piip, cons
allele_storage = subst_allele_storage;
}
if (flags & (kfExportfTypemask - kfExportfIndMajorBed - kfExportfVcf - kfExportfBcf - kfExportfOxGen - kfExportfBgen11 - kfExportfBgen12 - kfExportfBgen13 - kfExportfHaps - kfExportfHapsLegend - kfExportfAv - kfExportfA - kfExportfAD - kfExportfTped - kfExportfPed - kfExportfPhylip - kfExportfPhylipPhased - kfExportfCompound)) {
logerrputs("Error: Only VCF, BCF, oxford, bgen-1.x, haps, hapslegend, A, AD, Av, ped, tped,\ncompound-genotypes, and ind-major-bed output have been implemented so far.\n");
// logerrputs("Error: Only VCF, BCF, oxford, bgen-1.x, haps, hapslegend, A, AD, Av, ped, tped,\ncompound-genotypes, phylip, phylip-phased, and ind-major-bed output have been\nimplemented so far.\n");
logerrputs("Error: Only VCF, BCF, oxford, bgen-1.x, haps, hapslegend, A, AD, Av, ped, tped,\ncompound-genotypes, phylip, phylip-phased, and ind-major-bed output have been\nimplemented so far.\n");
reterr = kPglRetNotYetSupported;
goto Exportf_ret_1;
}
Expand Down
2 changes: 1 addition & 1 deletion 2.0/plink2_help.cc
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ PglErr DispHelp(const char* const* argvk, uint32_t param_ct) {
" Create a new fileset with all filters applied. The following output\n"
" formats are supported:\n"
" (actually, only A, AD, Av, bcf, bgen-1.x, haps, hapslegend, ind-major-bed,\n"
" oxford, ped, tped, and vcf are implemented for now)\n"
" oxford, ped, phylip, phylip-phased, tped, and vcf are implemented for now)\n"
" * '23': 23andMe 4-column format. This can only be used on a single\n"
" sample's data (--keep may be handy), and does not support\n"
" multicharacter allele codes.\n"
Expand Down

0 comments on commit 45a9a15

Please sign in to comment.