Commit

find clusters in different types of vasculature
pohaoc2 committed Dec 6, 2024
1 parent 4614539 commit 0eadf37
Showing 4 changed files with 341 additions and 0 deletions.
3 changes: 3 additions & 0 deletions sandbox/src/gp.py
@@ -111,6 +111,7 @@ def clean_data(full_data, response):
# Load data
data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
data = pd.read_csv(data_path)
data = data[data["KEY"] == "C_Lava"]
output_names = ["ACTIVITY", "GROWTH", "SYMMETRY"]
features = [
    "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
@@ -173,6 +174,8 @@ def clean_data(full_data, response):


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("X_train, X_test, y_train, y_test")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
30 changes: 30 additions & 0 deletions src/find_clusters/combine_temporal_data.py
@@ -0,0 +1,30 @@
import os
import pandas as pd
import re
from glob import glob


def load_data(file_pattern, suffix_filter):
    all_data = []
    # Find all files matching the pattern
    files = glob(file_pattern)
    for file in files:
        # Skip files that don't match the suffix_filter
        if not file.endswith(suffix_filter):
            continue

        # Extract the time point from the filename
        time_match = re.search(r'C-feature_(\d+\.\d+)_metric', os.path.basename(file))
        if time_match:
            time_point = float(time_match.group(1))  # Extract and convert the time point
        else:
            raise ValueError(f"Time point not found in {file}")

        # Load the data and add the TIME column
        data = pd.read_csv(file)
        data['TIME'] = time_point  # Add the TIME column
        all_data.append(data)

    # Combine all data into a single DataFrame
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data
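
A minimal usage sketch (not part of this commit), reusing the file pattern and suffix that find_clusters.py passes to load_data:

# Hypothetical call; the pattern and suffix mirror those used in find_clusters.py.
pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
combined = load_data(pattern, suffix_filter="_15-04032023.csv")
print(sorted(combined["TIME"].unique()))  # one TIME value per matching file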
223 changes: 223 additions & 0 deletions src/find_clusters/find_best_features.py
@@ -0,0 +1,223 @@
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib as mpl

from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from combine_temporal_data import load_data
import numpy as np

def classify_and_visualize(data,
                           time_point,
                           features,
                           label_column):
    # Encode labels as integers
    data = data.copy()
    data_at_time = data[data['TIME'] == time_point].copy()  # Use .copy() to avoid the warning
    label_encoder = LabelEncoder()
    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])

    X = data_at_time[features]
    y = data_at_time[label_column]

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature selection (ANOVA F-value)
    selector = SelectKBest(score_func=f_classif, k='all')
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Rank features by importance
    feature_scores = pd.DataFrame({
        "Feature": features,
        "Score": selector.scores_
    }).sort_values(by="Score", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_scores)

    # Train an SVM classifier
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(X_train_selected, y_train)
    y_pred = svm.predict(X_test_selected)

    # Evaluate performance
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

    return feature_scores


def visualize_features_response(data, time_point, features, response_name, label_column):
    # Filter data for the specified time point and keep rows with COMPONENTS == 1
    data_at_time = data[data['TIME'] == time_point].copy()
    data_at_time = data_at_time[data_at_time["COMPONENTS"] == 1]

    # Drop columns where at least 20% of the values are +/-inf
    threshold = 0.2
    columns_to_drop = [
        col for col in data_at_time.columns
        if ((data_at_time[col] == np.inf) | (data_at_time[col] == -np.inf)).mean() >= threshold
    ]
    data_at_time = data_at_time.drop(columns=columns_to_drop)

    # Encode labels, then drop any remaining rows containing inf/NaN
    label_encoder = LabelEncoder()
    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])
    data_at_time = data_at_time.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)

    # Prepare standardized features and encoded labels
    X = data_at_time[features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data_at_time[label_column]
    categories = label_encoder.classes_

    # Define a consistent colormap
    cmap = plt.cm.viridis
    norm = mpl.colors.Normalize(vmin=min(y), vmax=max(y))  # Normalize the label encoding
    category_colors = {i: cmap(norm(i)) for i in range(len(categories))}  # Map encoded labels to colors

    # Perform PCA for visualization
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(X)

    pc1_loadings = pca.components_[0]  # First principal component
    feature_importance = pd.DataFrame({
        "Feature": features,
        "Loading": pc1_loadings,
        "Absolute Loading": abs(pc1_loadings)
    })

    # Rank features by absolute loading
    feature_importance = feature_importance.sort_values(by="Absolute Loading", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_importance)

    # reduced_features[:, 1] *= 0
    print("PCA Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)

    # Create a heatmap for feature correlation
    plt.figure(figsize=(12, 10))
    sns.heatmap(data[features].corr(), annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
    plt.title("Feature Correlation Heatmap")
    plt.savefig("feature_correlation_heatmap.png")

    # Create subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 6), gridspec_kw={'width_ratios': [2, 1]})

    # PCA scatter plot
    scatter = axes[0].scatter(reduced_features[:, 0],
                              reduced_features[:, 1],
                              c=y,
                              cmap=cmap,
                              s=50)

    # Add custom colorbar with string labels
    colorbar = fig.colorbar(scatter, ax=axes[0], label="Category")
    colorbar_ticks = range(len(categories))
    colorbar.set_ticks(colorbar_ticks)
    colorbar.set_ticklabels(categories)  # Use string labels for colorbar

    # Add legend to PCA plot
    handles, _ = scatter.legend_elements()
    axes[0].legend(handles, categories, title="Categories")
    axes[0].set_xlabel("PCA Component 1")
    axes[0].set_ylabel("PCA Component 2")
    axes[0].set_title("PCA Visualization of Features")

    # Box and swarm plots for the response distribution
    sns.boxplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], palette=category_colors)
    sns.swarmplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], color='black')

    # Set custom x-ticks using category names
    axes[1].set_xticks(range(len(categories)))  # Set the positions of the ticks
    axes[1].set_xticklabels(categories)  # Set the category names as tick labels
    axes[1].set_xlabel("Category")
    axes[1].set_ylabel(response_name)
    axes[1].set_title(f"{response_name} Distribution for TIME={time_point}")

    # Adjust layout and save the figure
    plt.tight_layout()
    plt.savefig("combined_visualization.png")
    plt.show()

def visualize_vasculature_over_time(data, vasculature_type, features, label_column):
    # Filter data for the selected vasculature type
    data_filtered = data[data[label_column] == vasculature_type].copy()

    # Remove columns with excessive inf/-inf values
    threshold = 0.2
    columns_to_drop = [
        col for col in data_filtered.columns
        if ((data_filtered[col] == np.inf) | (data_filtered[col] == -np.inf)).mean() >= threshold
    ]
    data_filtered = data_filtered.drop(columns=columns_to_drop)

    # Replace inf/-inf with NaN and drop rows with NaN
    data_filtered = data_filtered.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)

    # Encode time points as labels
    time_points = data_filtered['TIME'].unique()
    time_points.sort()  # Ensure time points are sorted

    # Standardize features
    X = data_filtered[features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Perform PCA
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(X)

    # Extract time labels for coloring
    time_labels = data_filtered['TIME'].astype(str)  # Convert to string for color mapping

    # PCA Explained Variance
    print("PCA Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)

    # Feature importance (loadings for PC1)
    pc1_loadings = pca.components_[0]
    feature_importance = pd.DataFrame({
        "Feature": features,
        "Loading": pc1_loadings,
        "Absolute Loading": abs(pc1_loadings)
    }).sort_values(by="Absolute Loading", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_importance)

    # Define colormap for time points
    cmap = plt.cm.viridis
    norm = plt.Normalize(vmin=min(data_filtered['TIME']), vmax=max(data_filtered['TIME']))
    colors = cmap(norm(data_filtered['TIME']))

    # PCA scatter plot
    plt.figure(figsize=(10, 7))
    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=data_filtered['TIME'], cmap=cmap, s=50)
    colorbar = plt.colorbar(scatter, label="Time")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(f"PCA Visualization of {vasculature_type} Over Time")
    plt.savefig(f"pca_{vasculature_type}_over_time.png")



def main():
    # Example usage
    label_column = "KEY"
    features = [
        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
        "AVG_BETWEENNESS", "AVG_CORENESS"
    ]
    # features = [
    #
    # ]

    # Define the pattern to locate the files and filter suffix
    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
    suffix_filter = "_15-04032023.csv"  # Specify the suffix to filter files
    data = load_data(data_path_pattern, suffix_filter)

    time_point = 15.0
    response_name = "ACTIVITY"
    # pca_visualization(data, time_point, features, label_column=label_column)
    # visualize_response(data, time_point, response_name, label_column)
    # visualize_features_response(data, time_point, features, response_name, label_column)
    vasculature_type = "C_Savav"
    visualize_vasculature_over_time(data, vasculature_type, features, label_column)


if __name__ == "__main__":
    main()
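
classify_and_visualize is defined above but never called in main(); a minimal hypothetical call (not part of this commit), reusing the data, features, and label column already set up there:

# Hypothetical usage: rank features by ANOVA F-value and fit the linear SVM at TIME=15.0.
feature_scores = classify_and_visualize(data, time_point=15.0, features=features, label_column="KEY")
print(feature_scores.head())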
85 changes: 85 additions & 0 deletions src/find_clusters/find_clusters.py
@@ -0,0 +1,85 @@
import os
from glob import glob
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
import seaborn as sns
from combine_temporal_data import load_data

def cluster_analysis_with_ground_truth(data, time_point, features, label_column):
    # Filter data for the specified time point
    data_at_time = data[data['TIME'] == time_point].copy()  # Use .copy() to avoid the warning

    # Select only the features for clustering
    feature_data = data_at_time[features]

    # Encode string ground truth labels to integers
    label_encoder = LabelEncoder()
    ground_truth_labels = label_encoder.fit_transform(data_at_time[label_column])

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(feature_data)

    optimal_clusters = len(set(ground_truth_labels))  # Use the number of unique ground truth categories
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(normalized_features)

    data_at_time.loc[:, 'CLUSTER'] = cluster_labels

    # Compare clustering with ground truth
    ari = adjusted_rand_score(ground_truth_labels, cluster_labels)
    nmi = normalized_mutual_info_score(ground_truth_labels, cluster_labels)
    print(f"Adjusted Rand Index (ARI): {ari}")
    print(f"Normalized Mutual Information (NMI): {nmi}")

    # Confusion Matrix
    cm = confusion_matrix(ground_truth_labels, cluster_labels)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=range(optimal_clusters), yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Clusters")
    plt.ylabel("Ground Truth")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")

    # Optional PCA visualization
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(normalized_features)
    plt.figure(figsize=(8, 5))
    plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=ground_truth_labels, cmap='viridis', s=50, label='Ground Truth')
    # plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='plasma', s=20, label='Predicted Clusters', alpha=0.5)
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(f"Clusters vs Ground Truth for TIME={time_point}")
    plt.legend()
    plt.savefig("pca_clusters.png")

    return data_at_time

def main():
    # Full feature set (kept for reference; overridden by the reduced list below)
    features = [
        "KEY", "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
        "AVG_BETWEENNESS", "AVG_CORENESS"
    ]
    label_column = "KEY"
    features = [
        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW"
    ]

    # Define the pattern to locate the files and filter suffix
    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
    suffix_filter = "_15-04032023.csv"  # Specify the suffix to filter files
    data = load_data(data_path_pattern, suffix_filter)
    # print(data.head())  # Show the first few rows of the combined DataFrame
    # print(data['TIME'].unique())  # Display the unique time points

    # Analyze clusters for TIME=0.0 with ground truth comparison
    clustered_data = cluster_analysis_with_ground_truth(data, time_point=0.0, features=features, label_column=label_column)
    print(clustered_data.head())  # View the clustered data


if __name__ == '__main__':
    main()
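
Note that KMeans assigns arbitrary cluster indices, so the confusion matrix above is not guaranteed to have its best matches on the diagonal. A minimal sketch (not part of this commit) of one common way to align cluster labels with the encoded ground truth before plotting, assuming scipy is available; align_cluster_labels is a hypothetical helper:

# Hypothetical helper: relabel clusters so they best match the encoded ground truth.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def align_cluster_labels(ground_truth_labels, cluster_labels):
    cm = confusion_matrix(ground_truth_labels, cluster_labels)
    # Hungarian algorithm on the negated counts maximizes the matched diagonal
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = {col: row for row, col in zip(row_ind, col_ind)}
    return np.array([mapping[c] for c in cluster_labels])

# e.g. confusion_matrix(ground_truth_labels, align_cluster_labels(ground_truth_labels, cluster_labels))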
