Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding semsim distribution plot analysis #188

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ plotly = "^5.13.0"
seaborn = "^0.12.2"
matplotlib = "^3.7.0"
pyserde = "^0.9.8"
polars = "^0.18.13"

[tool.poetry.dev-dependencies]
pytest = "^7.2.0"
Expand Down
2 changes: 2 additions & 0 deletions src/pheval/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .cli_pheval_utils import (
create_spiked_vcfs_command,
scramble_phenopackets_command,
semsim_comparison_command,
semsim_convert_command,
semsim_scramble_command,
update_phenopackets_command,
Expand Down Expand Up @@ -54,6 +55,7 @@ def pheval_utils():
pheval_utils.add_command(semsim_convert_command)
pheval_utils.add_command(scramble_phenopackets_command)
pheval_utils.add_command(update_phenopackets_command)
pheval_utils.add_command(semsim_comparison_command)
pheval_utils.add_command(create_spiked_vcfs_command)
pheval_utils.add_command(benchmark)
pheval_utils.add_command(benchmark_comparison)
Expand Down
49 changes: 19 additions & 30 deletions src/pheval/cli_pheval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pheval.prepare.create_spiked_vcf import spike_vcfs
from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
from pheval.prepare.update_phenopacket import update_phenopackets
from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
from pheval.utils.semsim_utils import semsim_comparison
from pheval.utils.utils import semsim_convert, semsim_scramble


Expand Down Expand Up @@ -116,18 +116,11 @@ def scramble_phenopackets_command(

@click.command("semsim-comparison")
@click.option(
"--semsim-left",
"-L",
required=True,
metavar="FILE",
help="Path to the first semantic similarity profile.",
)
@click.option(
"--semsim-right",
"-R",
"--input",
"-i",
multiple=True,
required=True,
metavar="FILE",
help="Path to the second semantic similarity profile.",
help="Semsim inputs file",
)
@click.option(
"--score-column",
Expand All @@ -142,42 +135,38 @@ def scramble_phenopackets_command(
"--analysis",
"-a",
required=True,
type=click.Choice(["heatmap", "percentage_diff"], case_sensitive=False),
type=click.Choice(["heatmap", "percentage_diff", "distribution"], case_sensitive=False),
souzadevinicius marked this conversation as resolved.
Show resolved Hide resolved
help="""There are two types of analysis:
heatmap - Generates a heatmap plot that shows the differences between the semantic similarity profiles using the
score column for this purpose. Defaults to "heatmap".
percentage_diff - Calculates the score column percentage difference between the semantic similarity profiles""",
)
@click.option(
"--output",
"-o",
metavar="FILE",
default="percentage_diff.semsim.tsv",
help="Output path for the difference tsv. Defaults to percentage_diff.semsim.tsv",
"--output-folder",
souzadevinicius marked this conversation as resolved.
Show resolved Hide resolved
"-O",
metavar="output_folder",
default=".",
help="Output path folder for the comparisons",
)
def semsim_comparison(
semsim_left: Path,
semsim_right: Path,
def semsim_comparison_command(
input: List[Path],
score_column: str,
analysis: str,
output: Path = "percentage_diff.semsim.tsv",
output_folder: Path,
):
"""Compares two semantic similarity profiles
"""Compares semantic similarity profiles

Args:
semsim-left (Path): File path of the first semantic similarity profile
semsim-right (Path): File path of the second semantic similarity profile
output (Path): Output path for the difference tsv. Defaults to "percentage_diff.semsim.tsv".
input (List[Path]): File paths semantic similarity profiles
output-folder (Path): Output folder path for the comparisons.
score_column (str): Score column that will be computed (e.g. jaccard_similarity)
analysis (str): There are two types of analysis:
heatmap - Generates a heatmap plot that shows the differences between the semantic similarity profiles using the
score column for this purpose. Defaults to "heatmap".
percentage_diff - Calculates the score column percentage difference between the semantic similarity profiles.
distribution - Generate plots comparing semsim score distribution.
"""
if analysis == "heatmap":
return semsim_heatmap_plot(semsim_left, semsim_right, score_column)
if analysis == "percentage_diff":
percentage_diff(semsim_left, semsim_right, score_column, output)
semsim_comparison(input, score_column, analysis, output_folder)


@click.command("update-phenopackets")
Expand Down
128 changes: 105 additions & 23 deletions src/pheval/utils/semsim_utils.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,129 @@
"""
Contains all pheval utility methods
"""
import logging
from itertools import combinations
from pathlib import Path
from typing import List

import numpy
import pandas as pd
import plotly.express as px
import polars as pl
import seaborn as sns
from matplotlib import pyplot as plt

import pheval.utils.file_utils as file_utils

info_log = logging.getLogger("info")

def filter_non_0_score(data: pd.DataFrame, col: str) -> pd.DataFrame:

def semsim_comparison(input: List[Path], score_column: str, analysis: str, output: Path):
    """Runs the requested analysis over a set of semantic similarity profiles

    Args:
        input (List[Path]): File paths of the semantic similarity profiles
        score_column (str): Score column that will be computed (e.g. jaccard_similarity)
        analysis (str): One of "heatmap", "percentage_diff" or "distribution".
            heatmap and percentage_diff are run once for every pair of input files;
            distribution is run once over all input files together.
        output (Path): Output folder passed through to the analyses that write files
    """
    # The distribution plot consumes every file at once, so it must not depend on
    # how many pairs can be formed (a single input file yields no pairs at all).
    if analysis == "distribution":
        semsim_score_distribution_plot(input, score_column, output)
        return
    # dict.fromkeys removes repeated pairs (as the former set() did) while keeping
    # the order in which the files were given, so runs are reproducible.
    for semsim_left, semsim_right in dict.fromkeys(combinations(input, 2)):
        if analysis == "heatmap":
            semsim_heatmap_plot(semsim_left, semsim_right, score_column)
        elif analysis == "percentage_diff":
            percentage_diff(semsim_left, semsim_right, score_column, output)


def filter_non_0_score(data: pl.DataFrame, col: str) -> pd.DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you got tests for all these other semsim util functions?

Copy link
Member Author

@souzadevinicius souzadevinicius Nov 2, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are implemented in the test_cli.py file,
but they still need to be improved.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see that there is a test for the CLI command in test_cli.py; what I meant is, do you have tests for all the smaller methods that you are calling, like the diff_semsim method in semsim_utils.py?

"""Removes rows that have value equal to 0 based on the given column passed by col parameter

Args:
data (pd.DataFrame): Dirty dataframe
data (pl.DataFrame): Dirty dataframe
col (str): Column to be filtered

Returns:
pd.DataFrame: Filtered dataframe
pl.DataFrame: Filtered dataframe
"""
return data[data[col] != 0]
return data.filter(pl.col(col) != 0)


def parse_semsim(df: pl.DataFrame, cols: list) -> pl.DataFrame:
    """Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones

    The score column is the last entry of ``cols``.

    Args:
        df (pl.DataFrame): semantic similarity profile dataframe
        cols (list): list of columns that will be selected on semsim data

    Returns:
        pl.DataFrame: parsed semantic similarity dataframe
    """
    score_column = cols[-1]
    # polars never mutates a frame in place: with_columns returns a new frame, so
    # its result has to be kept. strict=False turns values that cannot be parsed
    # as a float (e.g. the literal "None") into nulls instead of raising.
    parsed = df.with_columns(pl.col(score_column).cast(pl.Float64, strict=False))
    return parsed.drop_nulls(subset=score_column)


def diff_semsim(
    semsim_left: pl.DataFrame,
    semsim_right: pl.DataFrame,
    score_column: str,
    absolute_diff: bool = True,
) -> pl.DataFrame:
    """Calculates score difference between two semantic similarity profiles

    Args:
        semsim_left (pl.DataFrame): first semantic similarity dataframe
        semsim_right (pl.DataFrame): second semantic similarity dataframe
        score_column (str): Score column that will be computed (e.g. jaccard_similarity)
        absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False).
            Defaults to True.

    Returns:
        pl.DataFrame: A dataframe with terms, both scores and their difference in a "diff" column
    """
    # After the join the right-hand score column carries the "_right" suffix.
    right_score = f"{score_column}_right"
    df = semsim_left.join(semsim_right, on=["subject_id", "object_id"], how="outer")
    if absolute_diff:
        diff = pl.col(score_column) - pl.col(right_score)
    else:
        # get_percentage_diff takes two plain numbers, so both scores are packed
        # into a struct and the function is applied to each row.
        diff = pl.struct([score_column, right_score]).apply(
            lambda row: get_percentage_diff(row[score_column], row[right_score])
        )
    df = df.with_columns(diff.alias("diff"))
    return df[["subject_id", "object_id", score_column, right_score, "diff"]]


def semsim_score_distribution_plot(input: List[Path], score_column: str, output: Path):
    """Generates Semsim score distribution plots

    Two images are written to the output folder: ``bars.png`` with one histogram
    per input file stacked vertically, and ``dist.png`` with all files overlaid
    in a single histogram coloured by file.

    Args:
        input (List[Path]): List of semsim input files
        score_column (str): Score column that will be plotted on the graphs (e.g. jaccard_similarity)
        output (Path): Output folder where plots will be saved

    Raises:
        ValueError: If no input file is given
    """
    if not input:
        raise ValueError("At least one semsim input file is required")
    df_list = []
    plt.rcParams["figure.autolayout"] = True
    plt.rcParams["figure.figsize"] = [20, 3.50 * len(input)]
    # squeeze=False always returns a 2-D array of axes; without it a single input
    # file yields a bare Axes object and indexing it below would fail.
    _, axes_grid = plt.subplots(len(input), 1, squeeze=False)
    axes = list(axes_grid[:, 0])
    for idx, i in enumerate(input):
        name = Path(i).stem
        info_log.info(f"Reading {name}")
        df = pl.read_csv(i, separator="\t")
        df = df[["subject_id", "object_id", f"{score_column}"]]
        # Tag every row with its source file so the combined plot can colour by it.
        df = df.with_columns(semsim=pl.lit(name))
        df_list.append(df)
        sns.histplot(df[score_column], bins=20, ax=axes[idx]).set_title(name)
        axes[idx].set_xlabel(score_column)
    # Share the first subplot's y-range so the histograms are visually comparable.
    plt.setp(axes, ylim=axes[0].get_ylim())
    info_log.info("Concatenating data")
    df_concat = pl.concat(df_list)
    info_log.info(f"Saving plot in {output}/bars.png")
    plt.savefig(f"{output}/bars.png")
    plt.clf()
    sns.histplot(
        df_concat,
        x=score_column,
        bins=10,
        multiple="dodge",
        fill=True,
        kde=True,
        alpha=0.5,
        hue="semsim",
    )
    info_log.info(f"Saving plot in {output}/dist.png")
    plt.savefig(f"{output}/dist.png")


def percentage_diff(semsim_left: Path, semsim_right: Path, score_column: str, output: Path):
Expand All @@ -72,8 +135,20 @@ def percentage_diff(semsim_left: Path, semsim_right: Path, score_column: str, ou
score_column (str): Score column that will be computed (e.g. jaccard_similarity)
output (Path): Output path for the difference tsv file
"""
fname_left = Path(semsim_left).stem
fname_right = Path(semsim_right).stem
clean_df = semsim_analysis(semsim_left, semsim_right, score_column, absolute_diff=False)
clean_df.sort_values(by="diff", ascending=False).to_csv(output, sep="\t", index=False)
(
clean_df.drop_nulls("diff")
.sort("diff", descending=True)
.rename(
{
score_column: f"{fname_left}_{score_column}",
f"{score_column}_right": f"{fname_right}_{score_column}",
}
)
.write_csv(f"{output}/{fname_left}-{fname_right}.diff.tsv", separator="\t")
)


def semsim_heatmap_plot(semsim_left: Path, semsim_right: Path, score_column: str):
Expand All @@ -87,6 +162,9 @@ def semsim_heatmap_plot(semsim_left: Path, semsim_right: Path, score_column: str
clean_df = semsim_analysis(semsim_left, semsim_right, score_column)
df = clean_df.pivot(index="subject_id", columns="object_id", values="diff")
fig = px.imshow(df, text_auto=True)
fig.update_layout(
title=f"{Path(semsim_left).stem} - {Path(semsim_right).stem}", xaxis_nticks=36
)
fig.show()


Expand All @@ -107,8 +185,8 @@ def semsim_analysis(
"""
validate_semsim_file_comparison(semsim_left, semsim_right)
cols = ["subject_id", "object_id", score_column]
semsim_left = pd.read_csv(semsim_left, sep="\t")
semsim_right = pd.read_csv(semsim_right, sep="\t")
semsim_left = pl.read_csv(semsim_left, separator="\t")
semsim_right = pl.read_csv(semsim_right, separator="\t")
file_utils.ensure_columns_exists(
cols=cols,
err_message="must exist in semsim dataframes",
Expand All @@ -117,6 +195,8 @@ def semsim_analysis(
semsim_left = parse_semsim(semsim_left, cols)
semsim_right = parse_semsim(semsim_right, cols)
diff_df = diff_semsim(semsim_left, semsim_right, score_column, absolute_diff)
if not absolute_diff:
return diff_df
return filter_non_0_score(diff_df, "diff")


Expand Down Expand Up @@ -145,6 +225,8 @@ def get_percentage_diff(current_number: float, previous_number: float) -> float:
float: percentage difference between two numbers
"""
try:
if not current_number or not previous_number:
return None
if current_number == previous_number:
return "{:.2%}".format(0)
if current_number > previous_number:
Expand Down
18 changes: 9 additions & 9 deletions testdata/semsim/hp-mp.semsim.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
subject_id subject_label subject_source object_id object_label object_source ancestor_id ancestor_label ancestor_source object_information_content subject_information_content ancestor_information_content jaccard_similarity dice_similarity phenodigm_score
HP:0000001 None None HP:0000236 None None HP:0000001 None None None None 0.6926096656076508 0.05263157894736842 None 0.19092705490615916
HP:0000001 None None HP:0000309 None None HP:0000001 None None None None 0.6926096656076508 0.08333333333333333 None 0.24024460895922492
HP:0000001 None None HP:0000322 None None HP:0000001 None None None None 0.6926096656076508 0.034482758620689655 None 0.15454155401543365
HP:0000001 None None HP:0000735 None None HP:0000001 None None None None 0.6926096656076508 0.07142857142857142 None 0.22242328783644721
HP:0000001 None None HP:0000826 None None HP:0000001 None None None None 0.6926096656076508 0.06666666666666667 None 0.21488131074427277
HP:0000001 None None HP:0000853 None None HP:0000001 None None None None 0.6926096656076508 0.08333333333333333 None 0.24024460895922492
HP:0000001 None None HP:0000938 None None HP:0000001 None None None None 0.6926096656076508 0.05555555555555555 None 0.19615890180152568
HP:0000001 None None HP:0001144 None None HP:0000001 None None None None 0.6926096656076508 0.0625 None 0.20805793448094734
HP:0000001 None None HP:0001443 None None HP:0000001 None None None None 0.6926096656076508 0.13 None 0.26317478329195043
HP:0000001 None None HP:0000236 None None HP:0000001 None None None None 0.6926096656076508 0.02 None 0.19092705490615916
HP:0000001 None None HP:0000309 None None HP:0000001 None None None None 0.6926096656076508 0.01 None 0.24024460895922492
HP:0000001 None None HP:0000322 None None HP:0000001 None None None None 0.6926096656076508 0.04 None 0.15454155401543365
HP:0000001 None None HP:0000735 None None HP:0000001 None None None None 0.6926096656076508 0.09 None 0.22242328783644721
HP:0000001 None None HP:0000826 None None HP:0000001 None None None None 0.6926096656076508 0.01 None 0.21488131074427277
HP:0000001 None None HP:0000853 None None HP:0000001 None None None None 0.6926096656076508 0.02 None 0.24024460895922492
HP:0000001 None None HP:0000938 None None HP:0000001 None None None None 0.6926096656076508 0.09 None 0.19615890180152568
HP:0000001 None None HP:0001144 None None HP:0000001 None None None None 0.6926096656076508 0.01 None 0.20805793448094734
HP:0000001 None None HP:0001443 None None HP:0000001 None None None None 0.6926096656076508 0.9 None 0.26317478329195043
Loading