-
Notifications
You must be signed in to change notification settings - Fork 0
/
removeKMBadSubint.py
171 lines (157 loc) · 7.15 KB
/
removeKMBadSubint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python
import sys,os,glob
import numpy as np
import subprocess
"""
remove bad archives due to snr and all:avg(flux) stat analysis
"""
# use K-Mmeans method to remove bad archives due to telescope error
def euclidean_distance(vecA, vecB):
""" Get euclidean distance of two vectors """
# return np.sqrt(np.sum(np.square(vecA - vecB)))
return np.linalg.norm(vecA - vecB)
def random_centroids(data, k):
""" K-Mmeans algorithm """
dim = np.shape(data)[1] # get the dimension of the data
centroids = np.mat(np.zeros((k, dim)))
for j in range(dim): # Generate random numbers between the min and max in each dimension
min_j = np.min(data[:, j])
range_j = np.max(data[:, j]) - min_j
centroids[:, j] = min_j * np.mat(np.ones((k, 1))) + np.random.rand(k, 1) * range_j
return centroids
def nearest(data, cluster_centers, distance_func=euclidean_distance):
min_dist = np.inf
m = np.shape(cluster_centers)[0] # the number of clustering center initiated now
for i in range(m):
d = distance_func(data, cluster_centers[i, ]) # cal the distance of each point with every clustering centers
if min_dist > d: # choose the closest distance
min_dist = d
return min_dist
def get_centroids(data, k, distance_func=euclidean_distance):
""" K-Mmeans++ algorithm """
m, n = np.shape(data)
cluster_centers = np.mat(np.zeros((k, n)))
index = np.random.randint(0, m) # 1. select a data point randomly as the first clustering cneter
cluster_centers[0, ] = np.copy(data[index, ])
d = [0.0 for _ in range(m)] # 2. initializing a series of distances
for i in range(1, k):
sum_all = 0
for j in range(m):
d[j] = nearest(data[j, ], cluster_centers[0:i, ], distance_func) # 3. find the closest clustering point for very data point
sum_all += d[j] # 4. add all the closest distances
sum_all *= np.random.random() # 5. get random number between 0 & sum_all
for j, di in enumerate(d): # 6. set farstest data point as the clustering centers
sum_all -= di
if sum_all > 0:
continue
cluster_centers[i] = np.copy(data[j, ])
break
return cluster_centers
def KMeans(data, k, distance_func=euclidean_distance):
'''get clustering centers based on K-means algoriths'''
m = np.shape(data)[0] # get the row number m
cluster_assment = np.mat(np.zeros((m, 2))) # Initiate a matrix, to record clustering indexes and store and distance^2
# centroids = random_centroids(data, k) # generate initiation points
centroids = get_centroids(data, k)
cluster_changed = True # determine if the clustering point needs to be recalculated
while cluster_changed:
cluster_changed = False
for i in range(m):
distance_min = np.inf # set the smallest distance of sample points and clustering centers, default value is inf
index_min = -1 # category belonging to
for j in range(k):
distance_ji = distance_func(centroids[j, :], data[i, :])
if distance_ji < distance_min:
distance_min = distance_ji
index_min = j
if cluster_assment[i, 0] != index_min:
cluster_changed = True
cluster_assment[i, :] = index_min, distance_min ** 2 # store ditance^2
for cent in range(k): # update centroid, taking the mean of all the points in each clustering as the centroid
pts_in_cluster = data[np.nonzero(cluster_assment[:, 0].A == cent)[0]]
centroids[cent, :] = np.mean(pts_in_cluster, axis=0)
return centroids, cluster_assment
def CreateandCheckSNRandFluxstats(fi):
import matplotlib.pyplot as plt
colors = ['C0', 'C1', 'C2']
fluxfile = fi.split(".")[0]+'_flux.txt'
clusterfile = fi.split(".")[0]+'_cluster'
snrstatfig = fi.split(".")[0]+'_cluster.png'
print("Produce flux stats file")
cmdline = "psrstat -jpF -l subint=0- -c snr -c length -c all:avg %s > %s"%(fi, fluxfile)
print("%s"%cmdline)
subprocess.call(cmdline, shell=True)
if 'Apple' in sys.version:
#Macbook
cmdline = "sed -i '' 's/=/\ /g' %s"%(fluxfile)
else:
#Linux
cmdline = "sed -i 's/=/\ /g' %s"%(fluxfile)
subprocess.call(cmdline, shell=True)
avgflux = np.loadtxt(fluxfile, usecols=8)
Nsubint = avgflux.size
clusterdata = []
fig = plt.figure(figsize=[8,4])
# clustering snrs: 2clusters
k = 2
# flux analysis
centroids, cluster_assment = KMeans(avgflux.reshape(-1,1), k)
clusterdata.append(centroids)
clusterdata.append(cluster_assment)
# plot
ax = fig.add_subplot(121)
for idx in range(Nsubint):
ax.scatter(idx, avgflux[idx], c=colors[int(cluster_assment[idx,0])], marker='x')
ax.set_ylabel("Average flux")
# clustering snrs: 2clusters
k = 3
# flux analysis
centroids, cluster_assment = KMeans(avgflux.reshape(-1,1), k)
clusterdata.append(centroids)
clusterdata.append(cluster_assment)
# plot
ax = fig.add_subplot(122)
for idx in range(Nsubint):
ax.scatter(idx, avgflux[idx], c=colors[int(cluster_assment[idx,0])], marker='x')
ax.set_ylabel("Average flux")
fig.savefig(snrstatfig)
plt.close()
print("snrstat fig saved to %s"%(snrstatfig))
np.save(fi.split(".")[0]+'_cluster', np.array(clusterdata))
def LabelBadArchs(fi, k_flux=None):
clusterdata = np.load(fi.split(".")[0]+'_cluster.npy')
if k_flux==2:
centroids, cluster_assment = clusterdata[0], clusterdata[1]
elif k_flux==3:
centroids, cluster_assment = clusterdata[2], clusterdata[3]
else:
print("Only 2 & 3 are allowed, retry")
exit(0)
removeIdxes = np.where(cluster_assment[:,0]==np.argmax(centroids))[0]
subints_remove = ' '.join((str(x) for x in removeIdxes))
cmdline = "paz -w '%s' -m %s"%(subints_remove, fi)
print(cmdline)
def _get_parser():
"""
Arguments parser.
"""
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Use K-Mmeans algorithm clustering method to remove bad archives \
due to telescope error (removeBadArchs.py) @Heng Xu.20200722")
parser.add_argument('files', nargs='+', help='Archive(s) to be look into' )
parser.add_argument("-i", default=None, type=int, help="Obs indexes those need to be labeled.")
parser.add_argument("-k_snr", default=None, type=int, help="Number of clusterings based on snrs, 2 and 3.")
parser.add_argument("-k_flux",default=None, type=int, help="Number of clusterings based on flux, 2 and 3.")
parser.add_argument("-label", action='store_true', help="Label bad subints")
return parser.parse_args()
if __name__ == "__main__":
import argparse,subprocess
args = _get_parser()
if args.label and args.k_flux:
print("Labels bad subints using existing data...")
for fi in args.files:
LabelBadArchs(fi, args.k_flux)
else:
print("Analysis flux data and make diagnose plots...")
for fi in args.files:
CreateandCheckSNRandFluxstats(fi)