Updated ivar-to-vcf, cleaning columns and saving not-merged variants
Shettland committed Jun 21, 2024
1 parent 13a698c commit bcf3010
Showing 1 changed file: bin/ivar_variants_to_vcf.py, with 67 additions and 19 deletions.
@@ -317,7 +317,7 @@ def split_by_codon(self, same_codon_rows):
for row in rowset.itertuples():
alt_pos = [x for x in range(3) if row.REF_CODON[x] != row.ALT_CODON[x]]
if len(alt_pos) > 1:
print("Found 2 conflicting variants in position ", row.POS)
print("Conflicting variants in position %s. Skipped" % row.POS)
continue
alt_pos = alt_pos[0]
first_index = row.Index if first_index == None else first_index
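The comprehension above pinpoints which of the three codon bases differs between REF_CODON and ALT_CODON; rows where more than one base changed are reported as conflicting and skipped. A minimal sketch of that check, using made-up codons:

ref_codon, alt_codon = "ACT", "AGT"  # made-up example codons

# Positions (0-2) where the reference and alternate codons disagree,
# as computed in split_by_codon()
alt_pos = [x for x in range(3) if ref_codon[x] != alt_codon[x]]
print(alt_pos)  # [1]: only the middle base differs; len > 1 would be skipped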
@@ -338,17 +338,26 @@ def exclude_af_outliers(self, consec_rows, af_threshold):
consec_rows (pd.DataFrame): Consecutive rows aimed to be merged
af_threshold (float): Allele Frequency threshold used to exclude outliers
Returns: clean_consec_rows (pd.DataFrame): Consecutive rows without AF outliers
Returns:
clean_consec_rows (pd.DataFrame): Consecutive rows without AF outliers
"""
if len(consec_rows) <= 1:
print("Cannot determine AF outlier with less than 2 rows. Skipped")
return consec_rows

consec_rows["AF"] = consec_rows["FILENAME"].str.split(":").str[8]
all_afs = consec_rows["AF"].astype(float)
af_median = all_afs.median()

consec_rows["AF"] = np.where(
abs(all_afs - af_median) <= af_threshold, True, False
)
consec_rows = consec_rows[consec_rows["AF"] == True]
clean_consec_rows = consec_rows.drop("AF", axis=1)

if len(consec_rows) == 2:
if np.diff(all_afs)[0] <= af_threshold:
consec_rows["AF"] = False
else:
consec_rows["AF"] = np.where(
abs(all_afs - af_median) <= af_threshold, True, False
)
clean_consec_rows = consec_rows[consec_rows["AF"] == True]
clean_consec_rows = clean_consec_rows.drop("AF", axis=1)
return clean_consec_rows
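For three or more rows, the branch above keeps only rows whose allele frequency (the ninth colon-separated field of the FILENAME sample column) lies within af_threshold of the group median. A standalone sketch of that masking step, with made-up values:

import pandas as pd

rows = pd.DataFrame({"POS": [101, 102, 103], "AF": [0.91, 0.89, 0.45]})
af_threshold = 0.25

# Keep rows whose AF sits within af_threshold of the median;
# POS 103 is dropped as an outlier (|0.45 - 0.89| > 0.25)
af_median = rows["AF"].median()
clean_rows = rows[(rows["AF"] - af_median).abs() <= af_threshold]
print(clean_rows["POS"].tolist())  # [101, 102]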

def merge_rows(self, consec_rows):
@@ -394,7 +403,9 @@ def handle_dup_rows(self, row_set):
merged_rowlist = []
for indexlist in index_product_list:
consec_rows = row_set.loc[indexlist, :]
clean_rows = self.exclude_af_outliers(consec_rows, self.merge_af_threshold)
clean_rows = self.exclude_af_outliers(
consec_rows.copy(), self.merge_af_threshold
)
if self.find_consecutive(clean_rows).empty:
continue
merged_row = self.merge_rows(clean_rows)
@@ -404,7 +415,21 @@ def handle_dup_rows(self, row_set):
dropped_rowlist = [x for x in merged_rowlist if x != row]
if any(row[4] == rowdrop[4] for rowdrop in dropped_rowlist):
merged_rowlist.remove(row)
if not clean_rows.equals(consec_rows):
outlier_rows = self.get_rows_diff(consec_rows, clean_rows)
else:
outlier_rows = pd.DataFrame()
if not outlier_rows.empty:
outlier_rows_list = outlier_rows.values.tolist()
merged_rowlist.extend(outlier_rows_list)
return merged_rowlist

def get_rows_diff(self, consec_rows, clean_rows):
diff_rows = consec_rows.merge(clean_rows.drop_duplicates(),
on=list(clean_rows.columns), how='left', indicator=True)
diff_rows = diff_rows[diff_rows['_merge'] == "left_only"]
diff_rows = diff_rows.drop("_merge", axis=1)
return diff_rows
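The new get_rows_diff() helper is the usual pandas anti-join idiom: a left merge with indicator=True, keeping only rows flagged left_only, i.e. rows of consec_rows that did not survive outlier filtering. The same pattern on toy frames:

import pandas as pd

left = pd.DataFrame({"POS": [1, 2, 3], "ALT": ["A", "C", "G"]})
kept = left.iloc[:2]  # pretend the POS=3 row was filtered out

# Anti-join: rows of `left` that are absent from `kept`
diff = left.merge(kept.drop_duplicates(), on=list(kept.columns),
                  how="left", indicator=True)
diff = diff[diff["_merge"] == "left_only"].drop("_merge", axis=1)
print(diff["POS"].tolist())  # [3]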

def process_vcf_df(self, vcf_df):
"""Merge rows with consecutive SNPs that passed all filters and without NAs
@@ -414,6 +439,18 @@ def process_vcf_df(self, vcf_df):
Returns:
processed_vcf_df: dataframe with consecutive variants merged
"""

def include_rows(vcf_df, first_index, rows_to_merge):
indexes_to_merge = [
x for x in range(first_index, first_index + len(rows_to_merge))
]
for index, row in zip(indexes_to_merge, rows_to_merge):
try:
vcf_df.loc[index] = row
except ValueError:
print(f"Invalid row found: {str(row)}. Skipped")
return vcf_df

clean_vcf_df = vcf_df[vcf_df["INFO"] == "TYPE=SNP"]
clean_vcf_df = clean_vcf_df[clean_vcf_df["FILTER"] == "PASS"].dropna()
consecutive_df = self.find_consecutive(clean_vcf_df)
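The nested include_rows() helper writes each recovered row to a fresh integer label with .loc, starting one past the current tail index (set by the caller), and skips rows whose length does not match the column count, since pandas raises ValueError for those. A rough sketch of the same append pattern on toy data:

import pandas as pd

vcf_df = pd.DataFrame({"POS": [10, 11], "ALT": ["A", "T"]})
rows_to_merge = [[12, "G"], [13, "C"]]

# Append after the current tail index, as include_rows() does
first_index = vcf_df.tail(1).index[0] + 1
for index, row in zip(range(first_index, first_index + len(rows_to_merge)),
                      rows_to_merge):
    try:
        vcf_df.loc[index] = row
    except ValueError:
        print(f"Invalid row found: {str(row)}. Skipped")
print(vcf_df["POS"].tolist())  # [10, 11, 12, 13]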
@@ -422,27 +459,34 @@ def process_vcf_df(self, vcf_df):
splitted_groups = self.split_non_consecutive(consecutive_df)
same_codon_consecutive = self.get_same_codon(splitted_groups)
split_rows_dict = self.split_by_codon(same_codon_consecutive)
for first_index, row_set in sorted(split_rows_dict.items()):
for _, row_set in sorted(split_rows_dict.items()):
first_index = vcf_df.tail(1).index[0] + 1
vcf_df = vcf_df.drop(row_set.Index)
# Redundant "Index" column is generated by Itertuples in split_by_codon()
row_set = row_set.drop(["Index"], axis=1)
if self.find_duplicates(row_set):
rows_to_merge = self.handle_dup_rows(row_set)
indexes_to_merge = [
x for x in range(first_index, first_index + len(rows_to_merge))
]
for index, row in zip(indexes_to_merge, rows_to_merge):
vcf_df.loc[index] = row
rows_to_merge = self.handle_dup_rows(row_set.copy())
vcf_df = include_rows(vcf_df, first_index, rows_to_merge)
else:
clean_rows = self.exclude_af_outliers(row_set, self.merge_af_threshold)
clean_rows = self.exclude_af_outliers(
row_set.copy(), self.merge_af_threshold
)
if not row_set.equals(clean_rows):
outlier_rows = self.get_rows_diff(row_set.copy(), clean_rows)
else:
outlier_rows = pd.DataFrame()
if not outlier_rows.empty:
rows_to_merge = outlier_rows.values.tolist()
vcf_df = include_rows(vcf_df, first_index, rows_to_merge)
first_index = first_index + len(rows_to_merge) + 1
if self.find_consecutive(clean_rows).empty:
rows_to_merge = clean_rows.values.tolist()
vcf_df = include_rows(vcf_df, first_index, rows_to_merge)
continue
rows_to_merge = self.merge_rows(clean_rows)
vcf_df.loc[first_index] = rows_to_merge
vcf_df = vcf_df.sort_index().reset_index(drop=True)
vcf_df = vcf_df.sort_values("POS")
processed_vcf_df = vcf_df.drop(["REF_CODON", "ALT_CODON"], axis=1)
processed_vcf_df = vcf_df.sort_values("POS")
return processed_vcf_df
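Two details of the rewritten tail are worth noting: rows added by include_rows() keep their new tail labels, and the frame is now ordered with sort_values("POS") rather than sort_index(), so recovered rows still come out in genomic order. find_consecutive() itself is outside this diff; purely for intuition, a hypothetical sketch of selecting runs of adjacent POS values (an assumption, not the script's actual code):

import pandas as pd

df = pd.DataFrame({"POS": [5, 100, 101, 102, 300]})

# Keep rows whose POS is adjacent to the previous or the next row
adjacent = (df["POS"].diff() == 1) | (df["POS"].diff(-1) == -1)
print(df[adjacent]["POS"].tolist())  # [100, 101, 102]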

def get_vcf_header(self):
@@ -509,6 +553,10 @@ def write_vcf(self):
vcf_table = self.initiate_vcf_df()
if not self.ignore_merge:
vcf_table = self.process_vcf_df(vcf_table)
try:
vcf_table = vcf_table.drop(["REF_CODON", "ALT_CODON"], axis=1)
except KeyError:
pass
# Workaround because itertuples cannot handle special characters in column names
vcf_table = vcf_table.rename(
columns={"REGION": "#CHROM", "FILENAME": self.filename}
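The try/except KeyError around the codon-column drop makes the cleanup safe whether or not process_vcf_df already removed REF_CODON and ALT_CODON. An equivalent, more compact pandas idiom (a sketch of an alternative, not what the commit uses):

import pandas as pd

vcf_table = pd.DataFrame(
    {"POS": [1], "REF_CODON": ["ACT"], "ALT_CODON": ["AGT"]}
)

# drop(..., errors="ignore") skips columns that are already gone,
# mirroring the try/except KeyError above
vcf_table = vcf_table.drop(["REF_CODON", "ALT_CODON"], axis=1, errors="ignore")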
