Updated ivar-to-vcf, cleaning columns and saving not-merged variants
Shettland committed Jun 21, 2024
1 parent 13a698c commit bcf3010
Showing 1 changed file: bin/ivar_variants_to_vcf.py, with 67 additions and 19 deletions.
@@ -317,7 +317,7 @@ def split_by_codon(self, same_codon_rows):
for row in rowset.itertuples():
alt_pos = [x for x in range(3) if row.REF_CODON[x] != row.ALT_CODON[x]]
if len(alt_pos) > 1:
print("Found 2 conflicting variants in position ", row.POS)
print("Conflicting variants in position %s. Skipped" % row.POS)
continue
alt_pos = alt_pos[0]
first_index = row.Index if first_index == None else first_index
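The comprehension above pinpoints which of the three codon bases differs between REF_CODON and ALT_CODON; rows where more than one base changed are reported as conflicting and skipped. A minimal sketch of that check, using made-up codons:

ref_codon, alt_codon = "ACT", "AGT"  # made-up example codons

# Positions (0-2) where the reference and alternate codons disagree,
# as computed in split_by_codon()
alt_pos = [x for x in range(3) if ref_codon[x] != alt_codon[x]]
print(alt_pos)  # [1]: only the middle base differs; len > 1 would be skipped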
@@ -338,17 +338,26 @@ def exclude_af_outliers(self, consec_rows, af_threshold):
consec_rows (pd.DataFrame): Consecutive rows aimed to be merged
af_threshold (float): Allele Frequency threshold used to exclude outliers
Returns: clean_consec_rows (pd.DataFrame): Consecutive rows without AF outliers
Returns:
clean_consec_rows (pd.DataFrame): Consecutive rows without AF outliers
"""
if len(consec_rows) <= 1:
print("Cannot determine AF outlier with less than 2 rows. Skipped")
return consec_rows

consec_rows["AF"] = consec_rows["FILENAME"].str.split(":").str[8]
all_afs = consec_rows["AF"].astype(float)
af_median = all_afs.median()

consec_rows["AF"] = np.where(
abs(all_afs - af_median) <= af_threshold, True, False
)
consec_rows = consec_rows[consec_rows["AF"] == True]
clean_consec_rows = consec_rows.drop("AF", axis=1)

if len(consec_rows) == 2:
if np.diff(all_afs)[0] <= af_threshold:
consec_rows["AF"] = False
else:
consec_rows["AF"] = np.where(
abs(all_afs - af_median) <= af_threshold, True, False
)
clean_consec_rows = consec_rows[consec_rows["AF"] == True]
clean_consec_rows = clean_consec_rows.drop("AF", axis=1)
return clean_consec_rows
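For three or more rows, the branch above keeps only rows whose allele frequency (the ninth colon-separated field of the FILENAME sample column) lies within af_threshold of the group median. A standalone sketch of that masking step, with made-up values:

import pandas as pd

rows = pd.DataFrame({"POS": [101, 102, 103], "AF": [0.91, 0.89, 0.45]})
af_threshold = 0.25

# Keep rows whose AF sits within af_threshold of the median;
# POS 103 is dropped as an outlier (|0.45 - 0.89| > 0.25)
af_median = rows["AF"].median()
clean_rows = rows[(rows["AF"] - af_median).abs() <= af_threshold]
print(clean_rows["POS"].tolist())  # [101, 102]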

def merge_rows(self, consec_rows):
@@ -394,7 +403,9 @@ def handle_dup_rows(self, row_set):
merged_rowlist = []
for indexlist in index_product_list:
consec_rows = row_set.loc[indexlist, :]
clean_rows = self.exclude_af_outliers(consec_rows, self.merge_af_threshold)
clean_rows = self.exclude_af_outliers(
consec_rows.copy(), self.merge_af_threshold
)
if self.find_consecutive(clean_rows).empty:
continue
merged_row = self.merge_rows(clean_rows)
@@ -404,7 +415,21 @@ def handle_dup_rows(self, row_set):
dropped_rowlist = [x for x in merged_rowlist if x != row]
if any(row[4] == rowdrop[4] for rowdrop in dropped_rowlist):
merged_rowlist.remove(row)
if not clean_rows.equals(consec_rows):
outlier_rows = self.get_rows_diff(consec_rows, clean_rows)
else:
outlier_rows = pd.DataFrame()
if not outlier_rows.empty:
outlier_rows_list = outlier_rows.values.tolist()
merged_rowlist.extend(outlier_rows_list)
return merged_rowlist

def get_rows_diff(self, consec_rows, clean_rows):
diff_rows = consec_rows.merge(clean_rows.drop_duplicates(),
on=list(clean_rows.columns), how='left', indicator=True)
diff_rows = diff_rows[diff_rows['_merge'] == "left_only"]
diff_rows = diff_rows.drop("_merge", axis=1)
return diff_rows
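The new get_rows_diff() helper is the usual pandas anti-join idiom: a left merge with indicator=True, keeping only rows flagged left_only, i.e. rows of consec_rows that did not survive outlier filtering. The same pattern on toy frames:

import pandas as pd

left = pd.DataFrame({"POS": [1, 2, 3], "ALT": ["A", "C", "G"]})
kept = left.iloc[:2]  # pretend the POS=3 row was filtered out

# Anti-join: rows of `left` that are absent from `kept`
diff = left.merge(kept.drop_duplicates(), on=list(kept.columns),
                  how="left", indicator=True)
diff = diff[diff["_merge"] == "left_only"].drop("_merge", axis=1)
print(diff["POS"].tolist())  # [3]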

def process_vcf_df(self, vcf_df):
"""Merge rows with consecutive SNPs that passed all filters and without NAs
@@ -414,6 +439,18 @@ def process_vcf_df(self, vcf_df):
Returns:
processed_vcf_df: dataframe with consecutive variants merged
"""

def include_rows(vcf_df, first_index, rows_to_merge):
indexes_to_merge = [
x for x in range(first_index, first_index + len(rows_to_merge))
]
for index, row in zip(indexes_to_merge, rows_to_merge):
try:
vcf_df.loc[index] = row
except ValueError:
print(f"Invalid row found: {str(row)}. Skipped")
return vcf_df

clean_vcf_df = vcf_df[vcf_df["INFO"] == "TYPE=SNP"]
clean_vcf_df = clean_vcf_df[clean_vcf_df["FILTER"] == "PASS"].dropna()
consecutive_df = self.find_consecutive(clean_vcf_df)
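The nested include_rows() helper writes each recovered row to a fresh integer label with .loc, starting one past the current tail index (set by the caller), and skips rows whose length does not match the column count, since pandas raises ValueError for those. A rough sketch of the same append pattern on toy data:

import pandas as pd

vcf_df = pd.DataFrame({"POS": [10, 11], "ALT": ["A", "T"]})
rows_to_merge = [[12, "G"], [13, "C"]]

# Append after the current tail index, as include_rows() does
first_index = vcf_df.tail(1).index[0] + 1
for index, row in zip(range(first_index, first_index + len(rows_to_merge)),
                      rows_to_merge):
    try:
        vcf_df.loc[index] = row
    except ValueError:
        print(f"Invalid row found: {str(row)}. Skipped")
print(vcf_df["POS"].tolist())  # [10, 11, 12, 13]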
@@ -422,27 +459,34 @@ def process_vcf_df(self, vcf_df):
splitted_groups = self.split_non_consecutive(consecutive_df)
same_codon_consecutive = self.get_same_codon(splitted_groups)
split_rows_dict = self.split_by_codon(same_codon_consecutive)
for first_index, row_set in sorted(split_rows_dict.items()):
for _, row_set in sorted(split_rows_dict.items()):
first_index = vcf_df.tail(1).index[0] + 1
vcf_df = vcf_df.drop(row_set.Index)
# Redundant "Index" column is generated by Itertuples in split_by_codon()
row_set = row_set.drop(["Index"], axis=1)
if self.find_duplicates(row_set):
rows_to_merge = self.handle_dup_rows(row_set)
indexes_to_merge = [
x for x in range(first_index, first_index + len(rows_to_merge))
]
for index, row in zip(indexes_to_merge, rows_to_merge):
vcf_df.loc[index] = row
rows_to_merge = self.handle_dup_rows(row_set.copy())
vcf_df = include_rows(vcf_df, first_index, rows_to_merge)
else:
clean_rows = self.exclude_af_outliers(row_set, self.merge_af_threshold)
clean_rows = self.exclude_af_outliers(
row_set.copy(), self.merge_af_threshold
)
if not row_set.equals(clean_rows):
outlier_rows = self.get_rows_diff(row_set.copy(), clean_rows)
else:
outlier_rows = pd.DataFrame()
if not outlier_rows.empty:
rows_to_merge = outlier_rows.values.tolist()
vcf_df = include_rows(vcf_df, first_index, rows_to_merge)
first_index = first_index + len(rows_to_merge) + 1
if self.find_consecutive(clean_rows).empty:
rows_to_merge = clean_rows.values.tolist()
vcf_df = include_rows(vcf_df, first_index, rows_to_merge)
continue
rows_to_merge = self.merge_rows(clean_rows)
vcf_df.loc[first_index] = rows_to_merge
vcf_df = vcf_df.sort_index().reset_index(drop=True)
vcf_df = vcf_df.sort_values("POS")
processed_vcf_df = vcf_df.drop(["REF_CODON", "ALT_CODON"], axis=1)
processed_vcf_df = vcf_df.sort_values("POS")
return processed_vcf_df
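Two details of the rewritten tail are worth noting: rows added by include_rows() keep their new tail labels, and the frame is now ordered with sort_values("POS") rather than sort_index(), so recovered rows still come out in genomic order. find_consecutive() itself is outside this diff; purely for intuition, a hypothetical sketch of selecting runs of adjacent POS values (an assumption, not the script's actual code):

import pandas as pd

df = pd.DataFrame({"POS": [5, 100, 101, 102, 300]})

# Keep rows whose POS is adjacent to the previous or the next row
adjacent = (df["POS"].diff() == 1) | (df["POS"].diff(-1) == -1)
print(df[adjacent]["POS"].tolist())  # [100, 101, 102]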

def get_vcf_header(self):
@@ -509,6 +553,10 @@ def write_vcf(self):
vcf_table = self.initiate_vcf_df()
if not self.ignore_merge:
vcf_table = self.process_vcf_df(vcf_table)
try:
vcf_table = vcf_table.drop(["REF_CODON", "ALT_CODON"], axis=1)
except KeyError:
pass
# Workaround because itertuples cannot handle special characters in column names
vcf_table = vcf_table.rename(
columns={"REGION": "#CHROM", "FILENAME": self.filename}
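The try/except KeyError around the codon-column drop makes the cleanup safe whether or not process_vcf_df already removed REF_CODON and ALT_CODON. An equivalent, more compact pandas idiom (a sketch of an alternative, not what the commit uses):

import pandas as pd

vcf_table = pd.DataFrame(
    {"POS": [1], "REF_CODON": ["ACT"], "ALT_CODON": ["AGT"]}
)

# drop(..., errors="ignore") skips columns that are already gone,
# mirroring the try/except KeyError above
vcf_table = vcf_table.drop(["REF_CODON", "ALT_CODON"], axis=1, errors="ignore")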
