Commit

find clusters in different types of vasculature
pohaoc2 committed Dec 6, 2024
1 parent 4614539 commit 0eadf37
Showing 4 changed files with 341 additions and 0 deletions.
3 changes: 3 additions & 0 deletions sandbox/src/gp.py
@@ -111,6 +111,7 @@ def clean_data(full_data, response):
# Load data
data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
data = pd.read_csv(data_path)
data = data[data["KEY"] == "C_Lava"]
output_names = ["ACTIVITY", "GROWTH", "SYMMETRY"]
features = [
    "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
@@ -173,6 +174,8 @@ def clean_data(full_data, response):


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("X_train, X_test, y_train, y_test")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
30 changes: 30 additions & 0 deletions src/find_clusters/combine_temporal_data.py
@@ -0,0 +1,30 @@
import os
import pandas as pd
import re
from glob import glob


def load_data(file_pattern, suffix_filter):
    all_data = []
    # Find all files matching the pattern
    files = glob(file_pattern)
    for file in files:
        # Skip files that don't match the suffix_filter
        if not file.endswith(suffix_filter):
            continue

        # Extract the time point from the filename
        time_match = re.search(r'C-feature_(\d+\.\d+)_metric', os.path.basename(file))
        if time_match:
            time_point = float(time_match.group(1))  # Extract and convert the time point
        else:
            raise ValueError(f"Time point not found in {file}")

        # Load the data and add the TIME column
        data = pd.read_csv(file)
        data['TIME'] = time_point  # Add the TIME column
        all_data.append(data)

    # Combine all data into a single DataFrame
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data
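
A minimal usage sketch (not part of this commit), reusing the file pattern and suffix that find_clusters.py passes to load_data:

# Hypothetical call; the pattern and suffix mirror those used in find_clusters.py.
pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
combined = load_data(pattern, suffix_filter="_15-04032023.csv")
print(sorted(combined["TIME"].unique()))  # one TIME value per matching file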
223 changes: 223 additions & 0 deletions src/find_clusters/find_best_features.py
@@ -0,0 +1,223 @@
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib as mpl

from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from combine_temporal_data import load_data
import numpy as np

def classify_and_visualize(data,
                           time_point,
                           features,
                           label_column):
    # Encode labels as integers
    data = data.copy()
    data_at_time = data[data['TIME'] == time_point].copy()  # Use .copy() to avoid the warning
    label_encoder = LabelEncoder()
    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])

    X = data_at_time[features]
    y = data_at_time[label_column]

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature selection (ANOVA F-value)
    selector = SelectKBest(score_func=f_classif, k='all')
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Rank features by importance
    feature_scores = pd.DataFrame({
        "Feature": features,
        "Score": selector.scores_
    }).sort_values(by="Score", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_scores)

    # Train an SVM classifier
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(X_train_selected, y_train)
    y_pred = svm.predict(X_test_selected)

    # Evaluate performance
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

    return feature_scores


def visualize_features_response(data, time_point, features, response_name, label_column):
    # Filter data for the specified time point and keep rows with COMPONENTS == 1
    data_at_time = data[data['TIME'] == time_point].copy()
    data_at_time = data_at_time[data_at_time["COMPONENTS"] == 1]

    # Drop columns where at least 20% of the values are +/-inf
    threshold = 0.2
    columns_to_drop = [
        col for col in data_at_time.columns
        if ((data_at_time[col] == np.inf) | (data_at_time[col] == -np.inf)).mean() >= threshold
    ]
    data_at_time = data_at_time.drop(columns=columns_to_drop)

    # Encode labels, then drop any remaining rows containing inf/NaN
    label_encoder = LabelEncoder()
    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])
    data_at_time = data_at_time.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)

    # Prepare standardized features and encoded labels
    X = data_at_time[features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data_at_time[label_column]
    categories = label_encoder.classes_

    # Define a consistent colormap
    cmap = plt.cm.viridis
    norm = mpl.colors.Normalize(vmin=min(y), vmax=max(y))  # Normalize the label encoding
    category_colors = {i: cmap(norm(i)) for i in range(len(categories))}  # Map encoded labels to colors

    # Perform PCA for visualization
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(X)

    pc1_loadings = pca.components_[0]  # First principal component
    feature_importance = pd.DataFrame({
        "Feature": features,
        "Loading": pc1_loadings,
        "Absolute Loading": abs(pc1_loadings)
    })

    # Rank features by absolute loading
    feature_importance = feature_importance.sort_values(by="Absolute Loading", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_importance)

    # reduced_features[:, 1] *= 0
    print("PCA Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)

    # Create a heatmap for feature correlation
    plt.figure(figsize=(12, 10))
    sns.heatmap(data[features].corr(), annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
    plt.title("Feature Correlation Heatmap")
    plt.savefig("feature_correlation_heatmap.png")

    # Create subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 6), gridspec_kw={'width_ratios': [2, 1]})

    # PCA scatter plot
    scatter = axes[0].scatter(reduced_features[:, 0],
                              reduced_features[:, 1],
                              c=y,
                              cmap=cmap,
                              s=50)

    # Add custom colorbar with string labels
    colorbar = fig.colorbar(scatter, ax=axes[0], label="Category")
    colorbar_ticks = range(len(categories))
    colorbar.set_ticks(colorbar_ticks)
    colorbar.set_ticklabels(categories)  # Use string labels for colorbar

    # Add legend to PCA plot
    handles, _ = scatter.legend_elements()
    axes[0].legend(handles, categories, title="Categories")
    axes[0].set_xlabel("PCA Component 1")
    axes[0].set_ylabel("PCA Component 2")
    axes[0].set_title("PCA Visualization of Features")

    # Box and swarm plots for the response distribution
    sns.boxplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], palette=category_colors)
    sns.swarmplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], color='black')

    # Set custom x-ticks using category names
    axes[1].set_xticks(range(len(categories)))  # Set the positions of the ticks
    axes[1].set_xticklabels(categories)  # Set the category names as tick labels
    axes[1].set_xlabel("Category")
    axes[1].set_ylabel(response_name)
    axes[1].set_title(f"{response_name} Distribution for TIME={time_point}")

    # Adjust layout and save the figure
    plt.tight_layout()
    plt.savefig("combined_visualization.png")
    plt.show()

def visualize_vasculature_over_time(data, vasculature_type, features, label_column):
    # Filter data for the selected vasculature type
    data_filtered = data[data[label_column] == vasculature_type].copy()

    # Remove columns with excessive inf/-inf values
    threshold = 0.2
    columns_to_drop = [
        col for col in data_filtered.columns
        if ((data_filtered[col] == np.inf) | (data_filtered[col] == -np.inf)).mean() >= threshold
    ]
    data_filtered = data_filtered.drop(columns=columns_to_drop)

    # Replace inf/-inf with NaN and drop rows with NaN
    data_filtered = data_filtered.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)

    # Encode time points as labels
    time_points = data_filtered['TIME'].unique()
    time_points.sort()  # Ensure time points are sorted

    # Standardize features
    X = data_filtered[features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Perform PCA
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(X)

    # Extract time labels for coloring
    time_labels = data_filtered['TIME'].astype(str)  # Convert to string for color mapping

    # PCA Explained Variance
    print("PCA Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)

    # Feature importance (loadings for PC1)
    pc1_loadings = pca.components_[0]
    feature_importance = pd.DataFrame({
        "Feature": features,
        "Loading": pc1_loadings,
        "Absolute Loading": abs(pc1_loadings)
    }).sort_values(by="Absolute Loading", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_importance)

    # Define colormap for time points
    cmap = plt.cm.viridis
    norm = plt.Normalize(vmin=min(data_filtered['TIME']), vmax=max(data_filtered['TIME']))
    colors = cmap(norm(data_filtered['TIME']))

    # PCA scatter plot
    plt.figure(figsize=(10, 7))
    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=data_filtered['TIME'], cmap=cmap, s=50)
    colorbar = plt.colorbar(scatter, label="Time")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(f"PCA Visualization of {vasculature_type} Over Time")
    plt.savefig(f"pca_{vasculature_type}_over_time.png")



def main():
    # Example usage
    label_column = "KEY"
    features = [
        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
        "AVG_BETWEENNESS", "AVG_CORENESS"
    ]
    # features = [
    #
    # ]

    # Define the pattern to locate the files and filter suffix
    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
    suffix_filter = "_15-04032023.csv"  # Specify the suffix to filter files
    data = load_data(data_path_pattern, suffix_filter)

    time_point = 15.0
    response_name = "ACTIVITY"
    # pca_visualization(data, time_point, features, label_column=label_column)
    # visualize_response(data, time_point, response_name, label_column)
    # visualize_features_response(data, time_point, features, response_name, label_column)
    vasculature_type = "C_Savav"
    visualize_vasculature_over_time(data, vasculature_type, features, label_column)


if __name__ == "__main__":
    main()
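
classify_and_visualize is defined above but never called in main(); a minimal hypothetical call (not part of this commit), reusing the data, features, and label column already set up there:

# Hypothetical usage: rank features by ANOVA F-value and fit the linear SVM at TIME=15.0.
feature_scores = classify_and_visualize(data, time_point=15.0, features=features, label_column="KEY")
print(feature_scores.head())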
85 changes: 85 additions & 0 deletions src/find_clusters/find_clusters.py
@@ -0,0 +1,85 @@
import os
from glob import glob
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
import seaborn as sns
from combine_temporal_data import load_data

def cluster_analysis_with_ground_truth(data, time_point, features, label_column):
    # Filter data for the specified time point
    data_at_time = data[data['TIME'] == time_point].copy()  # Use .copy() to avoid the warning

    # Select only the features for clustering
    feature_data = data_at_time[features]

    # Encode string ground truth labels to integers
    label_encoder = LabelEncoder()
    ground_truth_labels = label_encoder.fit_transform(data_at_time[label_column])

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(feature_data)

    optimal_clusters = len(set(ground_truth_labels))  # Use the number of unique ground truth categories
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(normalized_features)

    data_at_time.loc[:, 'CLUSTER'] = cluster_labels

    # Compare clustering with ground truth
    ari = adjusted_rand_score(ground_truth_labels, cluster_labels)
    nmi = normalized_mutual_info_score(ground_truth_labels, cluster_labels)
    print(f"Adjusted Rand Index (ARI): {ari}")
    print(f"Normalized Mutual Information (NMI): {nmi}")

    # Confusion Matrix
    cm = confusion_matrix(ground_truth_labels, cluster_labels)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=range(optimal_clusters), yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Clusters")
    plt.ylabel("Ground Truth")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")

    # Optional PCA visualization
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(normalized_features)
    plt.figure(figsize=(8, 5))
    plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=ground_truth_labels, cmap='viridis', s=50, label='Ground Truth')
    # plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='plasma', s=20, label='Predicted Clusters', alpha=0.5)
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(f"Clusters vs Ground Truth for TIME={time_point}")
    plt.legend()
    plt.savefig("pca_clusters.png")

    return data_at_time

def main():
    # Full feature set (kept for reference; overridden by the reduced list below)
    features = [
        "KEY", "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
        "AVG_BETWEENNESS", "AVG_CORENESS"
    ]
    label_column = "KEY"
    features = [
        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW"
    ]

    # Define the pattern to locate the files and filter suffix
    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
    suffix_filter = "_15-04032023.csv"  # Specify the suffix to filter files
    data = load_data(data_path_pattern, suffix_filter)
    # print(data.head())  # Show the first few rows of the combined DataFrame
    # print(data['TIME'].unique())  # Display the unique time points

    # Analyze clusters for TIME=0.0 with ground truth comparison
    clustered_data = cluster_analysis_with_ground_truth(data, time_point=0.0, features=features, label_column=label_column)
    print(clustered_data.head())  # View the clustered data


if __name__ == '__main__':
    main()
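
Note that KMeans assigns arbitrary cluster indices, so the confusion matrix above is not guaranteed to have its best matches on the diagonal. A minimal sketch (not part of this commit) of one common way to align cluster labels with the encoded ground truth before plotting, assuming scipy is available; align_cluster_labels is a hypothetical helper:

# Hypothetical helper: relabel clusters so they best match the encoded ground truth.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_cluster_labels(ground_truth_labels, cluster_labels):
    cm = confusion_matrix(ground_truth_labels, cluster_labels)
    # Hungarian algorithm on the negated counts maximizes the matched diagonal
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    return np.array([mapping[c] for c in cluster_labels])

# e.g. confusion_matrix(ground_truth_labels, align_cluster_labels(ground_truth_labels, cluster_labels))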
