diff --git a/sandbox/src/gp.py b/sandbox/src/gp.py
index b8cfbc9..733ea3e 100644
--- a/sandbox/src/gp.py
+++ b/sandbox/src/gp.py
@@ -111,6 +111,7 @@ def clean_data(full_data, response):
 # Load data
 data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
 data = pd.read_csv(data_path)
+data = data[data["KEY"] == "C_Lava"]
 output_names = ["ACTIVITY", "GROWTH", "SYMMETRY"]
 features = [
     "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
@@ -173,6 +174,8 @@ def clean_data(full_data, response):
 
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+print("X_train, X_test, y_train, y_test")
+print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
 scaler = StandardScaler()
 X_train = scaler.fit_transform(X_train)
 X_test = scaler.transform(X_test)
diff --git a/src/find_clusters/combine_temporal_data.py b/src/find_clusters/combine_temporal_data.py
new file mode 100644
index 0000000..dc3befe
--- /dev/null
+++ b/src/find_clusters/combine_temporal_data.py
@@ -0,0 +1,30 @@
+import os
+import pandas as pd
+import re
+from glob import glob
+
+
+def load_data(file_pattern, suffix_filter):
+    all_data = []
+    # Find all files matching the pattern
+    files = glob(file_pattern)
+    for file in files:
+        # Skip files that don't match the suffix_filter
+        if not file.endswith(suffix_filter):
+            continue
+
+        # Extract the time point from the filename
+        time_match = re.search(r'C-feature_(\d+\.\d+)_metric', os.path.basename(file))
+        if time_match:
+            time_point = float(time_match.group(1))  # Extract and convert the time point
+        else:
+            raise ValueError(f"Time point not found in {file}")
+
+        # Load the data and add the TIME column
+        data = pd.read_csv(file)
+        data['TIME'] = time_point  # Add the TIME column
+        all_data.append(data)
+
+    # Combine all data into a single DataFrame
+    combined_data = pd.concat(all_data, ignore_index=True)
+    return combined_data
diff --git a/src/find_clusters/find_best_features.py b/src/find_clusters/find_best_features.py
new file mode 100644
index 0000000..d9f4901
--- /dev/null
+++ b/src/find_clusters/find_best_features.py
@@ -0,0 +1,223 @@
+import os
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import matplotlib as mpl
+
+from sklearn.svm import SVC
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.decomposition import PCA
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report, accuracy_score
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import StandardScaler
+from combine_temporal_data import load_data
+import numpy as np
+
+def classify_and_visualize(data,
+                           time_point,
+                           features,
+                           label_column):
+    # Encode labels as integers
+    data = data.copy()
+    data_at_time = data[data['TIME'] == time_point].copy()  # Use .copy() to avoid the warning
+    label_encoder = LabelEncoder()
+    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])
+
+    X = data_at_time[features]
+    y = data_at_time[label_column]
+
+    # Split into train and test sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+    # Feature selection (ANOVA F-value)
+    selector = SelectKBest(score_func=f_classif, k='all')
+    X_train_selected = selector.fit_transform(X_train, y_train)
+    X_test_selected = selector.transform(X_test)
+
+    # Rank features by importance
+    feature_scores = pd.DataFrame({
+        "Feature": features,
+        "Score": selector.scores_
+    }).sort_values(by="Score", ascending=False)
+    print("Feature Importance Rankings:")
+    print(feature_scores)
+    # Train an SVM classifier
+    svm = SVC(kernel='linear', random_state=42)
+    svm.fit(X_train_selected, y_train)
+    y_pred = svm.predict(X_test_selected)
+
+    # Evaluate performance
+    print("Classification Report:")
+    print(classification_report(y_test, y_pred))
+    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
+
+
+    return feature_scores
+
+
+def visualize_features_response(data, time_point, features, response_name, label_column):
+    # Filter data for the specified time point
+    data_at_time = data[data['TIME'] == time_point].copy()
+    data_at_time = data_at_time[data_at_time["COMPONENTS"] == 1]
+    threshold = 0.2
+    columns_to_drop = [col for col in data_at_time.columns if ((data_at_time[col] == np.inf) | (data_at_time[col] == -np.inf)).mean() >= threshold]
+    data_at_time = data_at_time.drop(columns=columns_to_drop)
+    label_encoder = LabelEncoder()
+    data_at_time[label_column] = label_encoder.fit_transform(data_at_time[label_column])
+    data_at_time = data_at_time.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)
+    # Encode labels and prepare data
+    X = data_at_time[features]
+    scaler = StandardScaler()
+    X = scaler.fit_transform(X)
+    y = data_at_time[label_column]
+    categories = label_encoder.classes_
+
+    # Define a consistent colormap
+    cmap = plt.cm.viridis
+    norm = mpl.colors.Normalize(vmin=min(y), vmax=max(y))  # Normalize the label encoding
+    category_colors = {i: cmap(norm(i)) for i in range(len(categories))}  # Map encoded labels to colors
+    # Perform PCA for visualization
+    pca = PCA(n_components=2)
+    reduced_features = pca.fit_transform(X)
+
+    pc1_loadings = pca.components_[0]  # First principal component
+    feature_importance = pd.DataFrame({
+        "Feature": features,
+        "Loading": pc1_loadings,
+        "Absolute Loading": abs(pc1_loadings)
+    })
+
+    # Rank features by absolute loading
+    feature_importance = feature_importance.sort_values(by="Absolute Loading", ascending=False)
+    print("Feature Importance Rankings:")
+    print(feature_importance)
+
+    #reduced_features[:, 1] *= 0
+    print("PCA Explained Variance Ratio:")
+    print(pca.explained_variance_ratio_)
+    # Create a heatmap for feature correlation
+    plt.figure(figsize=(12, 10))
+    sns.heatmap(data[features].corr(), annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
+    plt.title("Feature Correlation Heatmap")
+    plt.savefig("feature_correlation_heatmap.png")
+    # Create subplots
+    fig, axes = plt.subplots(1, 2, figsize=(16, 6), gridspec_kw={'width_ratios': [2, 1]})
+    # PCA scatter plot
+    scatter = axes[0].scatter(reduced_features[:, 0],
+                              reduced_features[:, 1],
+                              c=y,
+                              cmap=cmap,
+                              s=50)
+
+    # Add custom colorbar with string labels
+    colorbar = fig.colorbar(scatter, ax=axes[0], label="Category")
+    colorbar_ticks = range(len(categories))
+    colorbar.set_ticks(colorbar_ticks)
+    colorbar.set_ticklabels(categories)  # Use string labels for colorbar
+
+    # Add legend to PCA plot
+    handles, _ = scatter.legend_elements()
+    axes[0].legend(handles, categories, title="Categories")
+    axes[0].set_xlabel("PCA Component 1")
+    axes[0].set_ylabel("PCA Component 2")
+    axes[0].set_title("PCA Visualization of Features")
+
+    # Box and swarm plots for the response distribution
+    sns.boxplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], palette=category_colors)
+    sns.swarmplot(x=label_column, y=response_name, data=data_at_time, ax=axes[1], color='black')
+    # Set custom x-ticks using category names
+    axes[1].set_xticks(range(len(categories)))  # Set the positions of the ticks
+    axes[1].set_xticklabels(categories)  # Set the category names as tick labels
+    axes[1].set_xlabel("Category")
+    axes[1].set_ylabel(response_name)
+    axes[1].set_title(f"{response_name} Distribution for TIME={time_point}")
+
+    # Adjust layout and save the figure
+    plt.tight_layout()
+    plt.savefig("combined_visualization.png")
+    plt.show()
+
+def visualize_vasculature_over_time(data, vasculature_type, features, label_column):
+    # Filter data for the selected vasculature type
+    data_filtered = data[data[label_column] == vasculature_type].copy()
+
+    # Remove columns with excessive inf/-inf values
+    threshold = 0.2
+    columns_to_drop = [col for col in data_filtered.columns if ((data_filtered[col] == np.inf) | (data_filtered[col] == -np.inf)).mean() >= threshold]
+    data_filtered = data_filtered.drop(columns=columns_to_drop)
+
+    # Replace inf/-inf with NaN and drop rows with NaN
+    data_filtered = data_filtered.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)
+    # Encode time points as labels
+    time_points = data_filtered['TIME'].unique()
+    time_points.sort()  # Ensure time points are sorted
+
+    # Standardize features
+    X = data_filtered[features]
+    scaler = StandardScaler()
+    X = scaler.fit_transform(X)
+
+    # Perform PCA
+    pca = PCA(n_components=2)
+    reduced_features = pca.fit_transform(X)
+
+    # Extract time labels for coloring
+    time_labels = data_filtered['TIME'].astype(str)  # Convert to string for color mapping
+
+    # PCA Explained Variance
+    print("PCA Explained Variance Ratio:")
+    print(pca.explained_variance_ratio_)
+
+    # Feature importance (loadings for PC1)
+    pc1_loadings = pca.components_[0]
+    feature_importance = pd.DataFrame({
+        "Feature": features,
+        "Loading": pc1_loadings,
+        "Absolute Loading": abs(pc1_loadings)
+    }).sort_values(by="Absolute Loading", ascending=False)
+    print("Feature Importance Rankings:")
+    print(feature_importance)
+
+    # Define colormap for time points
+    cmap = plt.cm.viridis
+    norm = plt.Normalize(vmin=min(data_filtered['TIME']), vmax=max(data_filtered['TIME']))
+    colors = cmap(norm(data_filtered['TIME']))
+
+    # PCA Scatter Plot
+    plt.figure(figsize=(10, 7))
+    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=data_filtered['TIME'], cmap=cmap, s=50)
+    colorbar = plt.colorbar(scatter, label="Time")
+    plt.xlabel("PCA Component 1")
+    plt.ylabel("PCA Component 2")
+    plt.title(f"PCA Visualization of {vasculature_type} Over Time")
+    plt.savefig(f"pca_{vasculature_type}_over_time.png")
+
+
+
+def main():
+    # Example usage
+    label_column = "KEY"
+    features = [
+        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
+        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
+        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
+        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
+        "AVG_BETWEENNESS", "AVG_CORENESS"
+    ]
+    #features = [
+#
+#    ]
+    # Define the pattern to locate the files and filter suffix
+    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
+    suffix_filter = "_15-04032023.csv"  # Specify the suffix to filter files
+    data = load_data(data_path_pattern, suffix_filter)
+    time_point = 15.0
+    response_name = "ACTIVITY"
+    #pca_visualization(data, time_point, features, label_column=label_column)
+    #visualize_response(data, time_point, response_name, label_column)
+    #visualize_features_response(data, time_point, features, response_name, label_column)
+    vasculature_type = "C_Savav"
+    visualize_vasculature_over_time(data, vasculature_type, features, label_column)
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/find_clusters/find_clusters.py b/src/find_clusters/find_clusters.py
new file mode 100644
index 0000000..c976ebd
--- /dev/null
+++ b/src/find_clusters/find_clusters.py
@@ -0,0 +1,85 @@
+import os
+from glob import glob
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
+import seaborn as sns
+from combine_temporal_data import load_data
+
+def cluster_analysis_with_ground_truth(data, time_point, features, label_column):
+    # Filter data for the specified time point
+    data_at_time = data[data['TIME'] == time_point].copy()  # Use .copy() to avoid the warning
+
+    # Select only the features for clustering
+    feature_data = data_at_time[features]
+
+    # Encode string ground truth labels to integers
+    label_encoder = LabelEncoder()
+    ground_truth_labels = label_encoder.fit_transform(data_at_time[label_column])  # Encode string labels to integers
+
+    # Normalize the features
+    scaler = StandardScaler()
+    normalized_features = scaler.fit_transform(feature_data)
+
+    optimal_clusters = len(set(ground_truth_labels))  # Use the number of unique ground truth categories
+    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init='auto')
+    cluster_labels = kmeans.fit_predict(normalized_features)
+
+    data_at_time.loc[:, 'CLUSTER'] = cluster_labels
+
+    # Compare clustering with ground truth
+    ari = adjusted_rand_score(ground_truth_labels, cluster_labels)
+    nmi = normalized_mutual_info_score(ground_truth_labels, cluster_labels)
+    print(f"Adjusted Rand Index (ARI): {ari}")
+    print(f"Normalized Mutual Information (NMI): {nmi}")
+
+    # Confusion Matrix
+    cm = confusion_matrix(ground_truth_labels, cluster_labels)
+    plt.figure(figsize=(8, 5))
+    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=range(optimal_clusters), yticklabels=label_encoder.classes_)
+    plt.xlabel("Predicted Clusters")
+    plt.ylabel("Ground Truth")
+    plt.title("Confusion Matrix")
+    plt.savefig("confusion_matrix.png")
+
+    # Optional PCA visualization
+    pca = PCA(n_components=2)
+    reduced_features = pca.fit_transform(normalized_features)
+    plt.figure(figsize=(8, 5))
+    plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=ground_truth_labels, cmap='viridis', s=50, label='Ground Truth')
+    #plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='plasma', s=20, label='Predicted Clusters', alpha=0.5)
+    plt.xlabel("PCA Component 1")
+    plt.ylabel("PCA Component 2")
+    plt.title(f"Clusters vs Ground Truth for TIME={time_point}")
+    plt.legend()
+    plt.savefig("pca_clusters.png")
+
+    return data_at_time
+
+def main():
+    features = [
+        "KEY", "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
+        "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
+        "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
+        "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
+        "AVG_BETWEENNESS", "AVG_CORENESS"
+    ]
+    label_column = "KEY"
+    features = [
+        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW"]
+    # Define the pattern to locate the files and filter suffix
+    data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
+    suffix_filter = "_15-04032023.csv"  # Specify the suffix to filter files
+    data = load_data(data_path_pattern, suffix_filter)
+    #print(data.head())  # Show the first few rows of the combined DataFrame
+    #print(data['TIME'].unique())  # Display the unique time points
+    # Analyze clusters for TIME=0.0 with ground truth comparison
+    clustered_data = cluster_analysis_with_ground_truth(data, time_point=0.0, features=features, label_column=label_column)
+    print(clustered_data.head())  # View the clustered data
+
+
+if __name__ == '__main__':
+    main()
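
Usage note (not part of the patch): a minimal sketch of how the new helpers are expected to fit together, mirroring the main() functions added above. The repo-root-relative glob pattern, the suffix filter, and the reduced feature list are illustrative assumptions taken from the scripts, not requirements of the diff itself.

    from combine_temporal_data import load_data
    from find_clusters import cluster_analysis_with_ground_truth

    # Stack every per-time-point ARCADE feature CSV into one DataFrame with a TIME column
    data = load_data("data/ARCADE/C-feature_*.csv", suffix_filter="_15-04032023.csv")

    # Cluster the TIME=0.0 snapshot with k-means and compare the clusters against the KEY labels
    clustered = cluster_analysis_with_ground_truth(
        data,
        time_point=0.0,
        features=["RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW"],
        label_column="KEY",
    )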