-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbgen2igd.cpp
245 lines (218 loc) · 11.2 KB
/
bgen2igd.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#include <iostream>
#include <sstream>
#include <algorithm>
#include "bgen/bstring.h"
#include "picovcf/picovcf.hpp"
extern "C" {
#include <bgen/bgen.h>
}
using namespace picovcf;
// See https://www.chg.ox.ac.uk/~gav/bgen_format/spec/latest.html
static bool colexicographicPairCmp(const std::pair<size_t, size_t>& pair1,
const std::pair<size_t, size_t>& pair2) {
if (pair1.second == pair2.second) {
return pair1.first < pair2.first;
}
return pair1.second < pair2.second;
}
// Convert a BGEN file to an IGD file. Supports phased and unphased, though there are
// some ploidy restrictions on the unphased data.
inline void bgenToIGD(const std::string& bgenFilename,
const std::string& outFilename,
std::string description = "",
bool verbose = false) {
bgen_file* bgenFile = bgen_file_open(bgenFilename.c_str());
PICOVCF_ASSERT_OR_MALFORMED(nullptr != bgenFile, "Invalid bgen file " << bgenFilename);
const uint32_t numIndividuals = bgen_file_nsamples(bgenFile);
PICOVCF_ASSERT_OR_MALFORMED(numIndividuals > 0, "No samples in BGEN file");
const uint32_t numVariants = bgen_file_nvariants(bgenFile);
PICOVCF_ASSERT_OR_MALFORMED(numVariants > 0, "No variants in BGEN file");
std::stringstream ssMetaFile;
ssMetaFile << bgenFilename << ".meta";
std::string metaFilename = ssMetaFile.str();
bgen_metafile* metaFile = bgen_metafile_create(bgenFile, metaFilename.c_str(), 1, 0);
PICOVCF_ASSERT_OR_MALFORMED(nullptr != metaFile, "Failed to create .meta file " << metaFilename);
const bgen_partition* partition = bgen_metafile_read_partition(metaFile, 0);
PICOVCF_ASSERT_OR_MALFORMED(nullptr != partition, "Failed to load partition 0");
bgen_metafile_close(metaFile);
metaFile = nullptr;
PICOVCF_RELEASE_ASSERT(bgen_partition_nvariants(partition) == numVariants);
const bgen_variant* const firstVariant = bgen_partition_get_variant(partition, 0);
bgen_genotype* genotype = bgen_file_open_genotype(bgenFile, firstVariant->genotype_offset);
PICOVCF_RELEASE_ASSERT(genotype != nullptr);
const uint8_t minPloidy = bgen_genotype_min_ploidy(genotype);
const uint8_t maxPloidy = bgen_genotype_max_ploidy(genotype);
PICOVCF_ASSERT_OR_MALFORMED(minPloidy == maxPloidy, "bgen2igd only support single-ploidy files, currently");
const size_t ploidy = static_cast<size_t>(maxPloidy);
const bool isPhased = bgen_genotype_phased(genotype);
std::vector<std::string> variantIds;
std::ofstream outFile(outFilename, std::ios::binary);
IGDWriter writer(ploidy, numIndividuals, isPhased);
writer.writeHeader(outFile, bgenFilename, description);
for (size_t i = 0; i < bgen_partition_nvariants(partition); i++) {
const bgen_variant* const variant = bgen_partition_get_variant(partition, i);
PICOVCF_RELEASE_ASSERT(variant != nullptr);
bgen_genotype* genotype = bgen_file_open_genotype(bgenFile, variant->genotype_offset);
PICOVCF_RELEASE_ASSERT(genotype != nullptr);
{
const uint8_t _minPloidy = bgen_genotype_min_ploidy(genotype);
const uint8_t _maxPloidy = bgen_genotype_max_ploidy(genotype);
PICOVCF_ASSERT_OR_MALFORMED(_minPloidy == _maxPloidy, "bgen2igd only support single-ploidy files, currently");
PICOVCF_ASSERT_OR_MALFORMED(_minPloidy == ploidy, "bgen2igd only support single-ploidy files, currently");
}
PICOVCF_ASSERT_OR_MALFORMED(isPhased == bgen_genotype_phased(genotype), "bgen2igd only support same phasedness throughout file, currently");
const size_t probsPerIndividual = bgen_genotype_ncombs(genotype);
const size_t numProbabilities = probsPerIndividual * numIndividuals;
PICOVCF_RELEASE_ASSERT(numProbabilities >= numIndividuals);
std::vector<double> probabilities(numProbabilities);
PICOVCF_ASSERT_OR_MALFORMED(0 == bgen_genotype_read(genotype, probabilities.data()),
"Failed to read genotype for variant from BGEN");
const size_t numAlleles = static_cast<size_t>(bgen_genotype_nalleles(genotype));
PICOVCF_ASSERT_OR_MALFORMED(0 != numAlleles, "No alleles for BGEN genotype");
IGDSampleList missingData;
const size_t numSampleLists = isPhased ? numAlleles : (numAlleles * ploidy);
std::vector<IGDSampleList> variantGtData(numSampleLists);
// This restriction is just because enumerating the order of genotype probabilities gets
// annoying for ploidy > 2, and we haven't had need for such data (yet).
PICOVCF_ASSERT_OR_MALFORMED(isPhased || ploidy <= 2, "bgen2igd only support haploid or diploid data, for unphased inputs");
SampleT sampleIndex = 0;
if (isPhased) {
const size_t numSamples = numIndividuals * ploidy;
for (size_t sampleId = 0; sampleId < numSamples; sampleId++) {
const size_t baseOffset = (numAlleles * sampleId);
double maxProb = 0.0;
size_t maxAllele = 0;
double sumProb = 0.0;
if (bgen_genotype_missing(genotype, sampleId / ploidy)) {
missingData.push_back(sampleId);
} else {
for (size_t k = 0; k < numAlleles; k++) {
const size_t idx = baseOffset + k;
const double prob = probabilities.at(idx);
sumProb += prob;
if (prob > maxProb) {
maxProb = prob;
maxAllele = k;
}
}
// The BGEN spec leaves out the k'th allele, but the library we use recovers it, so
// this should be true.
PICOVCF_RELEASE_ASSERT(sumProb >= 0.99999 && sumProb <= 1.00001);
// We assume that the allele at position 0 is the reference allele.
if (maxAllele != 0) {
variantGtData.at(maxAllele).push_back(sampleId);
}
}
}
} else {
// There has to be an easier way to do this. We could do this more generically than
// for just haploid/diploid, but it's not needed right now.
std::vector<std::pair<size_t, size_t>> order;
for (size_t k = 0; k < numAlleles; k++) {
if (ploidy == 2) {
for (size_t l = k; l < numAlleles; l++) {
order.emplace_back(k, l);
}
} else {
PICOVCF_RELEASE_ASSERT(ploidy == 1);
order.emplace_back(k, 0);
}
}
std::sort(order.begin(), order.end(), colexicographicPairCmp);
PICOVCF_RELEASE_ASSERT(order.size() == probsPerIndividual);
for (size_t indivId = 0; indivId < numIndividuals; indivId++) {
const size_t baseOffset = (probsPerIndividual * indivId);
if (bgen_genotype_missing(genotype, indivId)) {
missingData.push_back(indivId);
} else {
double maxProb = 0.0;
ssize_t maxAllele1 = 0;
ssize_t maxAllele2 = 0;
double sumProb = 0.0;
for (size_t k = 0; k < order.size(); k++) {
const auto& hapPair = order[k];
const size_t idx = baseOffset + k;
const double prob = probabilities.at(idx);
sumProb += prob;
if (prob > maxProb) {
maxProb = prob;
maxAllele1 = hapPair.first;
maxAllele2 = hapPair.second;
}
}
// The BGEN spec leaves out the k'th allele, but the library we use recovers it, so
// this should be true.
PICOVCF_RELEASE_ASSERT(sumProb >= 0.99999 && sumProb <= 1.00001);
// We assume that the allele at position 0 is the reference allele.
if (ploidy == 1) {
if (maxAllele1 != 0) {
variantGtData.at(maxAllele1).push_back(indivId);
}
} else {
if (maxAllele1 == maxAllele2) {
if (maxAllele1 != 0) {
variantGtData.at(maxAllele1 + numAlleles).push_back(indivId);
}
} else {
if (maxAllele1 != 0) {
variantGtData.at(maxAllele1).push_back(indivId);
}
if (maxAllele2 != 0) {
variantGtData.at(maxAllele2).push_back(indivId);
}
}
}
}
}
}
std::string refAllele(bgen_string_data(variant->allele_ids[0]),
bgen_string_length(variant->allele_ids[0]));
const auto position = variant->position;
for (size_t i = 0; i < numAlleles; i++) {
if (i == 0) {
PICOVCF_RELEASE_ASSERT(variantGtData[i].empty());
continue;
}
std::string alt(bgen_string_data(variant->allele_ids[i]),
bgen_string_length(variant->allele_ids[i]));
const uint8_t numCopies = isPhased ? 0 : 1;
writer.writeVariantSamples(outFile, position, refAllele, alt, variantGtData[i], false, numCopies);
if (!isPhased && ploidy > 1) {
const size_t copies2Index = i + numAlleles;
const auto& sampleList = variantGtData[copies2Index];
if (!sampleList.empty()) {
writer.writeVariantSamples(outFile,
position,
refAllele,
alt,
sampleList,
false,
/*numCopies=*/2);
}
}
}
if (!missingData.empty()) {
writer.writeVariantSamples(outFile, position, refAllele, "", missingData, true);
}
bgen_genotype_close(genotype);
genotype = nullptr;
}
writer.writeIndex(outFile);
writer.writeVariantInfo(outFile);
outFile.seekp(0);
writer.writeHeader(outFile, bgenFilename, description);
if (verbose) {
std::cout << "Wrote " << writer.m_totalCount << " total variants" << std::endl;
std::cout << "Of which " << writer.m_sparseCount << " were written sparsely" << std::endl;
}
}
int main(int argc, char *argv[]) {
if (argc < 3) {
std::cerr << "Usage: bgen2igd <bgen file> <output IGD file>" << std::endl;
return 1;
}
std::string inFile = argv[1];
std::string outFile = argv[2];
bgenToIGD(inFile, outFile, "", true);
return 0;
}