find clusters in different types of vasculature
Showing 4 changed files with 341 additions and 0 deletions.
combine_temporal_data.py
@@ -0,0 +1,30 @@
import os
import pandas as pd
import re
from glob import glob


def load_data(file_pattern, suffix_filter):
    all_data = []
    # Find all files matching the pattern
    files = glob(file_pattern)
    for file in files:
        # Skip files that don't match the suffix filter
        if not file.endswith(suffix_filter):
            continue

        # Extract the time point from the filename
        time_match = re.search(r'C-feature_(\d+\.\d+)_metric', os.path.basename(file))
        if time_match:
            time_point = float(time_match.group(1))  # Extract and convert the time point
        else:
            raise ValueError(f"Time point not found in {file}")

        # Load the data and add the TIME column
        data = pd.read_csv(file)
        data['TIME'] = time_point
        all_data.append(data)

    # Combine all data into a single DataFrame
    combined_data = pd.concat(all_data, ignore_index=True)
    return combined_data
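For context, a minimal usage sketch of load_data. The pattern and suffix mirror the main() functions in the files below; the ../../data/ARCADE layout is taken from this commit, and the print line is purely illustrative:

    # Combine all C-feature CSVs for one run date into a single DataFrame
    pattern = "../../data/ARCADE/C-feature_*.csv"
    data = load_data(pattern, suffix_filter="_15-04032023.csv")
    print(data['TIME'].unique())  # one TIME value per matched file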
@@ -0,0 +1,223 @@
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib as mpl
import numpy as np

from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from combine_temporal_data import load_data


def classify_and_visualize(data,
                           time_point,
                           features,
                           label_column):
    # Filter to the specified time point and encode labels as integers
    data_at_time = data[data['TIME'] == time_point].copy()  # .copy() avoids SettingWithCopyWarning
    label_encoder = LabelEncoder()
    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])

    X = data_at_time[features]
    y = data_at_time[label_column]

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature selection (ANOVA F-value); k='all' keeps every feature but still computes scores
    selector = SelectKBest(score_func=f_classif, k='all')
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Rank features by importance
    feature_scores = pd.DataFrame({
        "Feature": features,
        "Score": selector.scores_
    }).sort_values(by="Score", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_scores)

    # Train an SVM classifier
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(X_train_selected, y_train)
    y_pred = svm.predict(X_test_selected)

    # Evaluate performance
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

    return feature_scores


def visualize_features_response(data, time_point, features, response_name, label_column):
    # Filter data for the specified time point
    data_at_time = data[data['TIME'] == time_point].copy()
    data_at_time = data_at_time[data_at_time["COMPONENTS"] == 1]

    # Drop columns where 20% or more of the values are +/-inf
    threshold = 0.2
    columns_to_drop = [col for col in data_at_time.columns
                       if ((data_at_time[col] == np.inf) | (data_at_time[col] == -np.inf)).mean() >= threshold]
    data_at_time = data_at_time.drop(columns=columns_to_drop)

    # Encode labels, then drop any remaining rows containing inf/NaN
    label_encoder = LabelEncoder()
    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])
    data_at_time = data_at_time.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    # Standardize the features
    X = data_at_time[features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data_at_time[label_column]
    categories = label_encoder.classes_

    # Define a consistent colormap
    cmap = plt.cm.viridis
    norm = mpl.colors.Normalize(vmin=min(y), vmax=max(y))  # Normalize the label encoding
    category_colors = {i: cmap(norm(i)) for i in range(len(categories))}  # Map encoded labels to colors

    # Perform PCA for visualization
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(X)

    # Rank features by their absolute loading on the first principal component
    pc1_loadings = pca.components_[0]
    feature_importance = pd.DataFrame({
        "Feature": features,
        "Loading": pc1_loadings,
        "Absolute Loading": abs(pc1_loadings)
    }).sort_values(by="Absolute Loading", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_importance)

    print("PCA Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)

    # Heatmap of feature correlations (computed on the full dataset, not just this time point)
    plt.figure(figsize=(12, 10))
    sns.heatmap(data[features].corr(), annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
    plt.title("Feature Correlation Heatmap")
    plt.savefig("feature_correlation_heatmap.png")

    # Create subplots: PCA scatter (left) and response distribution (right)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6), gridspec_kw={'width_ratios': [2, 1]})

    # PCA scatter plot
    scatter = axes[0].scatter(reduced_features[:, 0],
                              reduced_features[:, 1],
                              c=y,
                              cmap=cmap,
                              s=50)

    # Add custom colorbar with string labels
    colorbar = fig.colorbar(scatter, ax=axes[0], label="Category")
    colorbar.set_ticks(range(len(categories)))
    colorbar.set_ticklabels(categories)  # Use string labels for the colorbar

    # Add legend to PCA plot
    handles, _ = scatter.legend_elements()
    axes[0].legend(handles, categories, title="Categories")
    axes[0].set_xlabel("PCA Component 1")
    axes[0].set_ylabel("PCA Component 2")
    axes[0].set_title("PCA Visualization of Features")

    # Box plot with swarm overlay for the response distribution
    sns.boxplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], palette=category_colors)
    sns.swarmplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], color='black')
    axes[1].set_xticks(range(len(categories)))  # Set the positions of the ticks
    axes[1].set_xticklabels(categories)  # Set the category names as tick labels
    axes[1].set_xlabel("Category")
    axes[1].set_ylabel(response_name)
    axes[1].set_title(f"{response_name} Distribution for TIME={time_point}")

    # Adjust layout and save the figure
    plt.tight_layout()
    plt.savefig("combined_visualization.png")
    plt.show()


def visualize_vasculature_over_time(data, vasculature_type, features, label_column):
    # Filter data for the selected vasculature type
    data_filtered = data[data[label_column] == vasculature_type].copy()

    # Remove columns with excessive inf/-inf values (20% or more)
    threshold = 0.2
    columns_to_drop = [col for col in data_filtered.columns
                       if ((data_filtered[col] == np.inf) | (data_filtered[col] == -np.inf)).mean() >= threshold]
    data_filtered = data_filtered.drop(columns=columns_to_drop)

    # Replace remaining inf/-inf with NaN and drop those rows
    data_filtered = data_filtered.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    # Standardize features
    X = data_filtered[features]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Perform PCA
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(X)

    # PCA explained variance
    print("PCA Explained Variance Ratio:")
    print(pca.explained_variance_ratio_)

    # Feature importance (loadings for PC1)
    pc1_loadings = pca.components_[0]
    feature_importance = pd.DataFrame({
        "Feature": features,
        "Loading": pc1_loadings,
        "Absolute Loading": abs(pc1_loadings)
    }).sort_values(by="Absolute Loading", ascending=False)
    print("Feature Importance Rankings:")
    print(feature_importance)

    # PCA scatter plot, colored by time point
    plt.figure(figsize=(10, 7))
    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1],
                          c=data_filtered['TIME'], cmap=plt.cm.viridis, s=50)
    plt.colorbar(scatter, label="Time")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(f"PCA Visualization of {vasculature_type} Over Time")
    plt.savefig(f"pca_{vasculature_type}_over_time.png")


def main():
    # Example usage
    label_column = "KEY"
    features = [
        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
        "AVG_BETWEENNESS", "AVG_CORENESS"
    ]

    # Define the pattern to locate the files and the filter suffix
    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
    suffix_filter = "_15-04032023.csv"  # Only load files with this suffix
    data = load_data(data_path_pattern, suffix_filter)

    time_point = 15.0
    response_name = "ACTIVITY"
    #visualize_features_response(data, time_point, features, response_name, label_column)
    vasculature_type = "C_Savav"
    visualize_vasculature_over_time(data, vasculature_type, features, label_column)


if __name__ == "__main__":
    main()
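Note that classify_and_visualize is defined but never called from main(). A minimal invocation, assuming the same data, features, and KEY label column as above, would look like:

    # Hypothetical call: rank features by ANOVA F-value and fit a linear SVM at TIME=15.0
    scores = classify_and_visualize(data, time_point=15.0, features=features, label_column="KEY")
    print(scores.head())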
@@ -0,0 +1,85 @@
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
from combine_temporal_data import load_data


def cluster_analysis_with_ground_truth(data, time_point, features, label_column):
    # Filter data for the specified time point
    data_at_time = data[data['TIME'] == time_point].copy()  # .copy() avoids SettingWithCopyWarning

    # Select only the features for clustering
    feature_data = data_at_time[features]

    # Encode string ground-truth labels as integers
    label_encoder = LabelEncoder()
    ground_truth_labels = label_encoder.fit_transform(data_at_time[label_column])

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(feature_data)

    # Use the number of unique ground-truth categories as the cluster count
    optimal_clusters = len(set(ground_truth_labels))
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init='auto')
    cluster_labels = kmeans.fit_predict(normalized_features)

    data_at_time.loc[:, 'CLUSTER'] = cluster_labels

    # Compare clustering with ground truth
    ari = adjusted_rand_score(ground_truth_labels, cluster_labels)
    nmi = normalized_mutual_info_score(ground_truth_labels, cluster_labels)
    print(f"Adjusted Rand Index (ARI): {ari}")
    print(f"Normalized Mutual Information (NMI): {nmi}")

    # Confusion matrix: rows are ground-truth categories, columns are cluster IDs
    cm = confusion_matrix(ground_truth_labels, cluster_labels)
    plt.figure(figsize=(8, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=range(optimal_clusters), yticklabels=label_encoder.classes_)
    plt.xlabel("Predicted Clusters")
    plt.ylabel("Ground Truth")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")

    # Optional PCA visualization
    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(normalized_features)
    plt.figure(figsize=(8, 5))
    plt.scatter(reduced_features[:, 0], reduced_features[:, 1],
                c=ground_truth_labels, cmap='viridis', s=50, label='Ground Truth')
    #plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='plasma', s=20, label='Predicted Clusters', alpha=0.5)
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.title(f"Clusters vs Ground Truth for TIME={time_point}")
    plt.legend()
    plt.savefig("pca_clusters.png")

    return data_at_time


def main():
    label_column = "KEY"
    # Full feature set (unused here; note that KEY is the label column, not a feature):
    # "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW", "NODES", "EDGES",
    # "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY", "AVG_SHORTEST_PATH",
    # "AVG_IN_DEGREES", "AVG_OUT_DEGREES", "AVG_DEGREE", "AVG_CLUSTERING",
    # "AVG_CLOSENESS", "AVG_BETWEENNESS", "AVG_CORENESS"
    features = ["RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW"]

    # Define the pattern to locate the files and the filter suffix
    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
    suffix_filter = "_15-04032023.csv"  # Only load files with this suffix
    data = load_data(data_path_pattern, suffix_filter)
    #print(data.head())  # Show the first few rows of the combined DataFrame
    #print(data['TIME'].unique())  # Display the unique time points

    # Analyze clusters for TIME=0.0 with ground-truth comparison
    clustered_data = cluster_analysis_with_ground_truth(data, time_point=0.0, features=features, label_column=label_column)
    print(clustered_data.head())  # View the clustered data


if __name__ == '__main__':
    main()
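A natural follow-up, reusing the function above, is to repeat the ground-truth comparison at every loaded time point. A sketch, assuming main()'s data and features; note that the fixed savefig filenames mean each iteration overwrites the previous figures:

    # Sweep all time points present in the combined data
    for t in sorted(data['TIME'].unique()):
        print(f"--- TIME={t} ---")
        cluster_analysis_with_ground_truth(data, time_point=t, features=features, label_column="KEY")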