Skip to content

Commit

Permalink
improve --pmerge[-list] --set-all-var-ids overflow message
Browse files — browse the repository at this point in the history
  • Loading branch information
chrchang committed Sep 28, 2023
1 parent 5478ad1 commit b7ffe5b
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 27 deletions.
14 changes: 8 additions & 6 deletions 2.0/plink2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1070,12 +1070,9 @@ PglErr Plink2Core(const Plink2Cmdline* pcp, MakePlink2Flags make_plink2_flags, c
}
if (pcp->misc_flags & kfMiscRealRefAlleles) {
if (unlikely(nonref_flags && (!AllBitsAreOne(nonref_flags, raw_variant_ct)))) {
// technically a lie, it's okay if a .bed is first converted to .pgen
// without this flag, and then the user remembers the existence of
// --real-ref-alleles later. but to reduce the ease of
// foot-shooting, we don't allow this to clobber arbitrary
// nonref_flags arrays.
logerrputs("Error: --real-ref-alleles must be used on a plink1 fileset.\n");
// To reduce the ease of foot-shooting, we don't allow this to
// clobber arbitrary nonref_flags arrays.
logerrputs("Error: --real-ref-alleles must be used on a plink1 or similar fileset.\n");
goto Plink2Core_ret_INCONSISTENT_INPUT;
}

Expand Down Expand Up @@ -9597,6 +9594,10 @@ int main(int argc, char** argv) {
goto main_ret_INVALID_CMDLINE_A;
}
pc.misc_flags |= kfMiscRealRefAlleles;
// --real-ref-alleles applies to --pmerge[-list] input.
if (!(pc.command_flags1 & kfCommand1Pmerge)) {
pc.dependency_flags |= kfFilterAllReq;
}
goto main_param_zero;
} else if (strequal_k_unsafe(flagname_p2, "emove")) {
if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 0x7fffffff))) {
Expand Down Expand Up @@ -11509,6 +11510,7 @@ int main(int argc, char** argv) {
}
reterr = PgenInfoStandalone(pgenname, pc.pginame);
} else {
// --real-ref-alleles is an exception since it applies to merge.
if (unlikely(pc.dependency_flags && (!(pc.command_flags1 & (~kfCommand1Pmerge))))) {
logerrputs("Error: Basic file conversions do not support regular filter or transform\noperations. Rerun your command with --make-bed/--make-[b]pgen.\n");
goto main_ret_INVALID_CMDLINE;
Expand Down
34 changes: 24 additions & 10 deletions 2.0/plink2_data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3764,7 +3764,7 @@ PglErr WriteBimSplit(const char* outname, const uintptr_t* variant_include, cons

const VaridTemplate* cur_varid_templatep = nullptr;
const char* varid_token_start = nullptr; // for vid-split
uint32_t allele_overflow_seen = 0;
uint32_t max_allele_overflow_slen = 0;
uint32_t chr_fo_idx = UINT32_MAX;
uint32_t chr_end = 0;
uint32_t chr_buf_blen = 0;
Expand Down Expand Up @@ -3832,14 +3832,14 @@ PglErr WriteBimSplit(const char* outname, const uintptr_t* variant_include, cons
// Always true in --set-all-var-ids case. True in
// --set-missing-var-ids case when vid-split unspecified, or split
// failed.
cswritep = VaridTemplateWrite(cur_varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &allele_overflow_seen, cswritep);
cswritep = VaridTemplateWrite(cur_varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &max_allele_overflow_slen, cswritep);
*cswritep++ = '\t';
} else if (varid_token_start) {
const char* varid_token_end = strchrnul(varid_token_start, ';');
// If substring matches missing code and --set-missing-var-ids is
// specified, we replace it.
if (varid_templatep && (S_CAST(uintptr_t, varid_token_end - varid_token_start) == missing_varid_slen) && memequal(varid_token_start, missing_varid_match, missing_varid_slen)) {
cswritep = VaridTemplateWrite(varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &allele_overflow_seen, cswritep);
cswritep = VaridTemplateWrite(varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &max_allele_overflow_slen, cswritep);
} else {
cswritep = memcpya(cswritep, varid_token_start, varid_token_end - varid_token_start);
}
Expand Down Expand Up @@ -3870,9 +3870,16 @@ PglErr WriteBimSplit(const char* outname, const uintptr_t* variant_include, cons
if (unlikely(CswriteCloseNull(&css, cswritep))) {
goto WriteBimSplit_ret_WRITE_FAIL;
}
if (unlikely(allele_overflow_seen && (!(misc_flags & (kfMiscNewVarIdOverflowMissing | kfMiscNewVarIdOverflowTruncate))))) {
if (unlikely(max_allele_overflow_slen && (!(misc_flags & (kfMiscNewVarIdOverflowMissing | kfMiscNewVarIdOverflowTruncate))))) {
logputs("\n");
logerrprintfww("Error: Allele code(s) too long for --set-%s-var-ids. (--new-id-max-allele-len may be helpful.)\n", (misc_flags & kfMiscSetMissingVarIds)? "missing" : "all");
logerrprintf("Error: Allele code(s) too long for --set-%s-var-ids.\n", (misc_flags & kfMiscSetMissingVarIds)? "missing" : "all");
// Not a precise bound, but in practice this should print the more useful
// message >99% of the time.
if (max_allele_overflow_slen < kMaxIdSlen / 2) {
logerrprintfww("The longest observed allele code in this dataset has length %u. If you're fine with the corresponding ID length, rerun with \"--new-id-max-allele-len %u\" added to your command line.\n", max_allele_overflow_slen, max_allele_overflow_slen);
} else {
logerrprintfww("The longest observed allele code in this dataset has length %u. We recommend deciding on a length-limit, and then adding \"--new-id-max-allele-len <limit> missing\" to your command line to cause variants with longer allele codes to be assigned '.' IDs. (You can then process just those variants with another script, if necessary.)\n", max_allele_overflow_slen);
}
goto WriteBimSplit_ret_INCONSISTENT_INPUT;
}
}
Expand Down Expand Up @@ -4181,7 +4188,7 @@ PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, con
const uint32_t varid_split = (make_plink2_flags / kfMakePlink2VaridSemicolon) & 1;
const uint32_t varid_dup_nosplit = varid_dup && (!varid_split);
const uint32_t split_just_snps = ((make_plink2_flags & (kfMakePlink2MSplitBase * 3)) == kfMakePlink2MSplitSnps);
uint32_t allele_overflow_seen = 0;
uint32_t max_allele_overflow_slen = 0;
uint32_t trs_variant_uidx = 0;
uintptr_t variant_uidx_base = 0;
uintptr_t cur_bits = variant_include[0];
Expand Down Expand Up @@ -4362,14 +4369,14 @@ PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, con
// Always true in --set-all-var-ids case. True in
// --set-missing-var-ids case when vid-split unspecified, or split
// failed.
cswritep = VaridTemplateWrite(cur_varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &allele_overflow_seen, cswritep);
cswritep = VaridTemplateWrite(cur_varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &max_allele_overflow_slen, cswritep);
*cswritep++ = '\t';
} else if (varid_token_start) {
const char* varid_token_end = strchrnul(varid_token_start, ';');
// If substring matches missing code and --set-missing-var-ids is
// specified, we replace it.
if (varid_templatep && (S_CAST(uintptr_t, varid_token_end - varid_token_start) == missing_varid_slen) && memequal(varid_token_start, missing_varid_match, missing_varid_slen)) {
cswritep = VaridTemplateWrite(varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &allele_overflow_seen, cswritep);
cswritep = VaridTemplateWrite(varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, &max_allele_overflow_slen, cswritep);
} else {
cswritep = memcpya(cswritep, varid_token_start, varid_token_end - varid_token_start);
}
Expand Down Expand Up @@ -4495,9 +4502,16 @@ PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, con
putc_unlocked('\b', stdout);
}
fputs("\b\b", stdout);
if (unlikely(allele_overflow_seen && (!(misc_flags & (kfMiscNewVarIdOverflowMissing | kfMiscNewVarIdOverflowTruncate))))) {
if (unlikely(max_allele_overflow_slen && (!(misc_flags & (kfMiscNewVarIdOverflowMissing | kfMiscNewVarIdOverflowTruncate))))) {
logputs("\n");
logerrprintfww("Error: Allele code(s) too long for --set-%s-var-ids. (--new-id-max-allele-len may be helpful.)\n", (misc_flags & kfMiscSetMissingVarIds)? "missing" : "all");
logerrprintf("Error: Allele code(s) too long for --set-%s-var-ids.\n", (misc_flags & kfMiscSetMissingVarIds)? "missing" : "all");
// Not a precise bound, but in practice this should print the more useful
// message >99% of the time.
if (max_allele_overflow_slen < kMaxIdSlen / 2) {
logerrprintfww("The longest observed allele code in this dataset has length %u. If you're fine with the corresponding ID length, rerun with \"--new-id-max-allele-len %u\" added to your command line.\n", max_allele_overflow_slen, max_allele_overflow_slen);
} else {
logerrprintfww("The longest observed allele code in this dataset has length %u. We recommend deciding on a length-limit, and then adding \"--new-id-max-allele-len <limit> missing\" to your command line to cause variants with longer allele codes to be assigned '.' IDs. (You can then process just those variants with another script, if necessary.)\n", max_allele_overflow_slen);
}
goto WritePvarSplit_ret_INCONSISTENT_INPUT;
}
}
Expand Down
16 changes: 12 additions & 4 deletions 2.0/plink2_merge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2115,7 +2115,7 @@ PglErr ScanPvarsAndMergeHeader(const PmergeInfo* pmip, const char* missing_varid
uint32_t info_pr_nonflag_exists = 0;
uint32_t max_xheader_line_blen = 0;
uint32_t at_least_one_info_exists = 0;
uint32_t allele_overflow_seen = 0;
uint32_t max_allele_overflow_slen = 0;
uintptr_t info_conflict_ct = 0;
for (uintptr_t fileset_idx = 0; fileset_idx != fileset_ct; ++fileset_idx) {
PmergeInputFilesetLl* cur_fileset = *filesets_iterp;
Expand Down Expand Up @@ -2667,7 +2667,7 @@ PglErr ScanPvarsAndMergeHeader(const PmergeInfo* pmip, const char* missing_varid
}
}
variant_id = variant_id_buf;
char* variant_id_end = VaridTemplateWrite(cur_varid_templatep, token_ptrs[2], alt_start, cur_bp, ref_slen, extra_alt_ct, alt_slen, &allele_overflow_seen, variant_id);
char* variant_id_end = VaridTemplateWrite(cur_varid_templatep, token_ptrs[2], alt_start, cur_bp, ref_slen, extra_alt_ct, alt_slen, &max_allele_overflow_slen, variant_id);
id_slen = variant_id_end - variant_id;
}
if (unlikely(id_slen > kMaxIdSlen)) {
Expand Down Expand Up @@ -3125,8 +3125,16 @@ PglErr ScanPvarsAndMergeHeader(const PmergeInfo* pmip, const char* missing_varid
logerrputs("Error: Chromosomes are not in a consistent order. Retry --pmerge[-list] after\nusing --make-pgen/--make-bed + --sort-vars to sort your variants in a\nconsistent manner.\n");
goto ScanPvarsAndMergeHeader_ret_INCONSISTENT_INPUT;
}
if (unlikely(allele_overflow_seen && (!(misc_flags & (kfMiscNewVarIdOverflowMissing | kfMiscNewVarIdOverflowTruncate))))) {
logerrprintfww("Error: Allele code(s) too long for --set-%s-var-ids. (--new-id-max-allele-len may be helpful.)\n", (misc_flags & kfMiscSetMissingVarIds)? "missing" : "all");
if (unlikely(max_allele_overflow_slen && (!(misc_flags & (kfMiscNewVarIdOverflowMissing | kfMiscNewVarIdOverflowTruncate))))) {
logputs("\n");
logerrprintf("Error: Allele code(s) too long for --set-%s-var-ids.\n", (misc_flags & kfMiscSetMissingVarIds)? "missing" : "all");
// Not a precise bound, but in practice this should print the more useful
// message >99% of the time.
if (max_allele_overflow_slen < kMaxIdSlen / 2) {
logerrprintfww("The longest observed allele code across these datasets has length %u. If you're fine with the corresponding ID length, rerun with \"--new-id-max-allele-len %u\" added to your command line.\n", max_allele_overflow_slen, max_allele_overflow_slen);
} else {
logerrprintfww("The longest observed allele code across these datasets has length %u. We recommend deciding on a length-limit, and then adding \"--new-id-max-allele-len <limit> missing\" to your command line to cause variants with longer allele codes to be assigned '.' IDs. (You can then process just those variants with another script, if necessary.)\n", max_allele_overflow_slen);
}
goto ScanPvarsAndMergeHeader_ret_INCONSISTENT_INPUT;
}
cip->chr_ct = chr_ct;
Expand Down
16 changes: 10 additions & 6 deletions 2.0/plink2_pvar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ BoolErr VaridTemplateApply(unsigned char* tmp_alloc_base, const VaridTemplate* v

// Exported variant of VaridTemplateApply() which appends to a buffer.
// Probable todo: pull out the common parts of the functions.
char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, uint32_t* allele_overflow_seenp, char* dst) {
char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, uint32_t* max_overflow_slenp, char* dst) {
uint32_t insert_slens[4];
const uint32_t alleles_needed = vtp->alleles_needed;
const uint32_t new_id_max_allele_slen = vtp->new_id_max_allele_slen;
Expand All @@ -391,13 +391,13 @@ char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const
insert_slens[1] = id_slen;
id_slen += vtp->base_len;
uint32_t ref_slen = 0;
uint32_t cur_overflow = 0;
uint32_t cur_max_overflow_slen = 0;
const char* tmp_allele_ptrs[2];
if (alleles_needed & 1) {
ref_slen = ref_token_slen;
if (ref_slen > new_id_max_allele_slen) {
cur_max_overflow_slen = ref_slen;
ref_slen = new_id_max_allele_slen;
cur_overflow = 1;
}
insert_slens[2] = ref_slen;
id_slen += ref_slen;
Expand All @@ -411,8 +411,10 @@ char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const
alt1_slen = AdvToDelim(alt1_start, ',') - alt1_start;
}
if (alt1_slen > new_id_max_allele_slen) {
if (alt1_slen > cur_max_overflow_slen) {
cur_max_overflow_slen = alt1_slen;
}
alt1_slen = new_id_max_allele_slen;
cur_overflow = 1;
}
id_slen += alt1_slen;
if (alleles_needed <= 3) {
Expand All @@ -438,8 +440,10 @@ char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const
tmp_allele_ptrs[0] = alt1_start;
}
}
if (cur_overflow) {
*allele_overflow_seenp = 1;
if (cur_max_overflow_slen) {
if (cur_max_overflow_slen > *max_overflow_slenp) {
*max_overflow_slenp = cur_max_overflow_slen;
}
const uint32_t overflow_substitute_blen = vtp->overflow_substitute_blen;
if (overflow_substitute_blen) {
return memcpya(dst, vtp->missing_id_match, overflow_substitute_blen);
Expand Down
2 changes: 1 addition & 1 deletion 2.0/plink2_pvar.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ void VaridTemplateInit(const char* varid_template_str, const char* missing_id_ma

BoolErr VaridInitAll(unsigned char* arena_end, const char* varid_template_str, const char* varid_multi_template_str, const char* varid_multi_nonsnp_template_str, MiscFlags misc_flags, uint32_t new_variant_id_max_allele_slen, unsigned char** arena_basep, const char** missing_varid_matchp, char** chr_output_name_bufp, VaridTemplate** varid_templatepp, VaridTemplate** varid_multi_templatepp, VaridTemplate** varid_multi_nonsnp_templatepp, uint32_t* missing_varid_blenp, uint32_t* missing_varid_match_slenp);

char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, uint32_t* allele_overflow_seenp, char* dst);
char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, uint32_t* max_overflow_slenp, char* dst);

// These functions assume info_token[-1] is safe to read.
// They may set info_token[info_slen] to \0, since they need to use strstr().
Expand Down

0 comments on commit b7ffe5b

Please sign in to comment.