-
Notifications
You must be signed in to change notification settings - Fork 0
/
PAM.py
86 lines (73 loc) · 2.96 KB
/
PAM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
import time
import pandas as pd
import os
def euclidean_distance(x1, x2):
"""Compute the Euclidean distance between two points."""
return np.linalg.norm(x1 - x2)
def total_cost(data, medoids, clusters):
"""Calculate the total cost (sum of distances) of the clustering."""
cost = 0
for medoid_idx in medoids:
for j in clusters[medoid_idx]:
cost += euclidean_distance(data[j], data[medoid_idx])
return cost
def assign_clusters(data, medoids):
"""Assign data points to the nearest medoids."""
clusters = {medoid_idx: [] for medoid_idx in medoids}
for i, point in enumerate(data):
closest_medoid = min(medoids, key=lambda medoid_idx: euclidean_distance(point, data[medoid_idx]))
clusters[closest_medoid].append(i)
return clusters
def pam(data, k, max_iterations=1000):
"""Partitioning Around Medoids (PAM) algorithm."""
l = len(data)
# Initialize medoids: select k random data points as initial medoids
medoids = np.random.choice(range(l), k, replace=False)
clusters = assign_clusters(data, medoids)
old_cost = total_cost(data, medoids, clusters)
td = old_cost
for _ in range(max_iterations):
# Iterate over each non-medoid point and try swapping with each medoid
for x in range(l):
if x not in medoids:
for m in range(k):
# Swap the current non-medoid point with the m-th medoid
new_medoids = medoids.copy()
new_medoids[m] = x
new_clusters = assign_clusters(data, new_medoids)
new_cost = total_cost(data, new_medoids, new_clusters)
# If the new clustering has lower cost, update medoids and clusters
if new_cost < old_cost:
medoids = new_medoids
clusters = new_clusters
old_cost = new_cost
td = old_cost
# If no improvement, break the loop
else:
break
return td, medoids, clusters
# Function to run PAM on a file
def run_pam_on_file(file_path):
df = pd.read_csv(file_path, header=None)
X = df.to_numpy()
k = 2 # Number of clusters
# Run PAM algorithm
start_time = time.time()
td, medoids, clusters = pam(X, k) # Ignore returned medoids and clusters
end_time = time.time()
duration = end_time - start_time
# Print min_td and time taken
print("File:", file_path)
print("Min_td:", np.sqrt(td))
print("Time taken:", duration, "seconds\n")
print("")
# Main execution
if __name__ == "__main__":
# Directory containing CSV files
directory = "featurevector"
# List all CSV files in the directory
csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
# Iterate over each CSV file and run PAM algorithm
for file_path in csv_files:
run_pam_on_file(file_path)