forked from vterron/lemon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmining.py
690 lines (562 loc) · 31.8 KB
/
mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
#! /usr/bin/env python
# Copyright (c) 2012 Victor Terron. All rights reserved.
# Institute of Astrophysics of Andalusia, IAA-CSIC
#
# This file is part of LEMON.
#
# LEMON is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import division
description = """
This module implements several data mining and data analysis algorithms to
identify potentially interesting objects, such as, for example, those whose
light curves are strongly correlated in different photometric filters or those
that have the lowest standard deviation in their curves and are therefore the
most constant.
"""
import collections
import datetime
import itertools
import numpy
import operator
import scipy.stats
# LEMON modules
import database
import methods
class NoStarsSelectedError(ValueError):
""" Raised when no stars can be returned by the LEMONdBMiner """
pass
class LEMONdBMiner(database.LEMONdB):
""" Interface to identify potentially interesting stars in a LEMONdB.
Databases accessed through this class are expected to be read-only, and
thus methods expensive (whether in I/O or CPU) in the parent class are
memoized. Making changes to the LEMONdB would be a very, very bad idea,
as you might end up with outdated data.
"""
@methods.memoize
def get_star(self, *args):
return super(LEMONdBMiner, self).get_star(*args)
@methods.memoize
def get_light_curve(self, *args):
return super(LEMONdBMiner, self).get_light_curve(*args)
@methods.memoize
def get_period(self, *args):
return super(LEMONdBMiner, self).get_period(*args)
@staticmethod
def _ascii_table(headers, table_rows, sort_index = 1, descending = True,
ndecimals = 8, dates_columns = None):
""" Format data as an ASCII table.
Returns a string with the data in 'table_rows' formatted as an ASCII
table, sorted by the values of the sort_index-th column. 'table_rows'
must be a sequence (table rows) of sequences (table columns), all of
the latter having the same number of elements; use Nones for missing
values and therefore empty cells in the table.
In addition to the data received in 'table_rows', a first column is
automatically prefixed with the index of the row. There is, thus, no
need to manually include it. This numbering starts at zero, as we
belive in spreading the word of Dijkstra's teachings.
Keyword arguments:
sort_index - the column by which the rows of the table are sorted.
descending - sort the columns into decreasing order?
ndecimals - number of decimals shown for real numbers.
dates_columns - the indexes of the columns, containing seconds, to be
formatted as a string indicating hours, minutes and
seconds (e.g., '0:11:06'). The number of days is also
included in case the number of seconds is greater than
or equal to twenty-four hours ('17 days, 6:12:00').
"""
if not dates_columns:
dates_columns = ()
if len(set(len(row) for row in table_rows)) != 1:
raise ValueError("number of columns must be the same for all rows")
if len(headers) != len(table_rows[0]):
raise ValueError("elements in 'headers' must equal that of columns")
# Sort the table by the specified column
if sort_index is not None:
table_rows.sort(key = operator.itemgetter(sort_index),
reverse = descending)
# Give format to all the data in the table as a string, as (1) a date,
# (2) a real number with the specified number of decimals or (3) by
# simply casting it to a string. The formatted data is stored as a
# two-level dictionary, where the first index maps to the row of the
# table and the second to the column.
table_data = collections.defaultdict(dict)
table_data[0][0] = '' # top-left 'corner' of table left empty
# Populate the table with the headers...
for column_index in range(len(headers)):
table_data[0][column_index + 1] = headers[column_index]
# ... and with all the data
for row_index, row in enumerate(table_rows):
table_data[row_index + 1][0] = str(row_index)
for column_index, value in enumerate(row):
if value is None:
table_cell = ''
elif column_index in dates_columns:
table_cell = str(datetime.timedelta(seconds = value))
elif isinstance(value, float):
table_cell = '%.*f' % (ndecimals, value)
else:
table_cell = str(value)
table_data[row_index + 1][column_index + 1] = table_cell
# Determine how many characters are needed for each column. Two
# different measures are made: the width of the entire column (i.e, the
# maximum length among the cells that share the same column) and the
# width of only the data (the maximum length among the cells that share
# the same column, excluding the header). These values will be used to
# right-justify the values and at the same time center them.
widths = {}
data_widths = {}
for column_index in table_data[0].iterkeys():
column_values = (table_data[row][column_index]
for row in table_data.iterkeys())
column_lengths = [len(str(x))
for x in column_values if x is not None]
# The maximum width is increased by two, so that no value in the
# table touches the vertical cell borders (the '|' characters)
widths[column_index] = max(column_lengths) + 2
data_widths[column_index] = max(column_lengths[1:]) # only the data
# Finally, populate the ASCII table with the headers...
header_values = table_data[0].itervalues()
header_str = '|'.join([x.center(widths[index])
for index, x in enumerate(header_values)])
output = header_str + '\n' + '-' * len(header_str) + '\n'
# ... and with the rest of the data
for row_index in sorted(table_data.iterkeys()):
if not row_index: # headers (index == 0) already formatted
continue
row_values = table_data[row_index].values()
output += row_values[0].ljust(data_widths[0]).center(widths[0]) + '|'
output += '|'.join(x.rjust(data_widths[index]).center(widths[index])
for index, x in enumerate(row_values) if index)
output += '\n'
return output
def sort_by_period_similarity(self, minimum = 2, normalization = 'max'):
""" Sort the stars by their period similarity.
This method sorts the stars in a LEMONdB by the similarity between
their periods in the different photometric filters in which they were
observed. How similar the periods of a star are is determined by
computing their standard deviation after normalizing them. A list of
two-element tuples, with the ID of the star and the aforementioned
standard deviation, respectively, is returned, sorted in increasing
order by the latter.
The NoStarsSelectedError exception is raised in case no stars can be
returned, either because 'minimum' is set to a too high value or
because the database has no information about the star periods.
Keyword arguments:
minimum - ignore stars whose periods have been calculated in fewer than
this number of photometric filters. As the standard deviation
of a single element is zero, 'minimum' must have a value of
at least two, since otherwise the first returned stars would
almost always be those with a single period, which most of
the time are not those of interest to us.
normalization - defines how the periods of a star are normalized (i.e.,
the value by which they are divided) before their
standard deviation is calculated. May be any of the
following, self-explanatory values: 'max', 'median'
or 'mean'.
"""
if minimum < 2:
raise ValueError("a minimum of at least two periods is needed")
if normalization == 'max':
norm_func = numpy.max
elif normalization == 'mean':
norm_func = numpy.average
elif normalization == 'median':
norm_func = numpy.median
else:
raise ValueError("unrecognized normalization method")
periods_stdevs = []
for star_id in self.star_ids:
star_periods = self.get_periods(star_id)
if len(star_periods) < minimum:
continue
normalized_periods = star_periods / norm_func(star_periods)
periods_stdev = float(numpy.std(normalized_periods, dtype = self.dtype))
periods_stdevs.append((star_id, periods_stdev))
if not periods_stdevs:
msg = "no stars with at least %d known periods"
raise NoStarsSelectedError(msg % minimum)
return sorted(periods_stdevs, key = operator.itemgetter(1))
def period_similarity(self, how_many, minimum = 2,
normalization = 'max', ndecimals = 8):
""" Return an ASCII table with the stars with the most similar periods.
This is a wrapper around the LEMONdBMiner.sort_by_period_similarity and
LEMONdBMiner._ascii_table methods, to avoid having to invoke them
sequentially. In other words: first, the 'how_many' stars with the most
similar periods are found, and then, sorted increasingly by the
standard deviation of their normalized periods, returned in an ASCII
table. Along with this value, the period of each star in each
photometric filter is shown, formatted such as '0:11:06' or
'17 days, 6:12:00'.
Keyword arguments:
minimum - ignore stars whose periods have been calculated in fewer than
this number of photometric filters. As the standard deviation
of a single element is zero, 'minimum' must have a value of
at least two, since otherwise the first returned stars would
almost always be those with a single period, which most of
the time are not those of interest to us.
normalization - defines how the periods of a star are normalized (i.e.,
the value by which they are divided) before their
standard deviation is calculated. May be any of the
following, self-explanatory values: 'max', 'median'
or 'mean'.
ndecimals - number of decimals shown for real numbers.
"""
# The 'how_many' stars with the most similar normalized periods
most_similar = \
self.sort_by_period_similarity(normalization = normalization,
minimum = minimum)[:how_many]
header = []
header.append("Star")
header.append("Stdev. Norm.")
for pfilter in self.pfilters:
header.append("Period %s" % pfilter.letter)
table_columns = []
for star_id, star_stdev in most_similar:
column = [star_id, star_stdev]
for pfilter in self.pfilters:
# LEMONdB.get_period returns a two-element tuple (period,
# step), but a single None if the star period is unknown
star_period = self.get_period(star_id, pfilter)
if star_period is None:
period_secs = step_secs = None
else:
period_secs, step_secs = star_period
column.append(period_secs)
table_columns.append(column)
return LEMONdBMiner._ascii_table(header, table_columns, sort_index = 1,
descending = False, ndecimals = ndecimals,
dates_columns = (2, 3, 4))
def match_bands(self, star_id, first_pfilter, second_pfilter,
delta = 3600 * 1.5):
""" Return the paired magnitudes from two light curves of the star.
The method takes the light curves of a star in two photometric filters
and 'matches' each point of the first curve to the closest point in
time in the second. If they are not more than 'delta' seconds apart,
they are considered to be paired. Returns a list of two-element tuples
which contain the two matched magnitudes. If the star has no light
curve in one (or both) of the light curves, None is returned.
"""
try:
# Extract the light curves of the star in the two filters, as
# (Unix time, magnitude, S/N) three-element tuples. NoneType
# is returned (and therefore the conversion to list raises
# TypeError) if the star has no light curve in one filter.
phot1_points = list(self.get_light_curve(star_id, first_pfilter))
phot2_points = list(self.get_light_curve(star_id, second_pfilter))
matches = []
for point1 in phot1_points:
# Raises ValueError for empty sequences
closest = min(phot2_points,
key = lambda x: abs(point1[0] - x[0]))
# Second element of the tuple (index = 1) is the magnitude
if abs(point1[0] - closest[0]) < delta: # we have a match!
matches.append((point1[1], closest[1]))
return matches
except (TypeError, ValueError):
return None
def star_correlation(self, star_id, first_pfilter, second_pfilter,
min_matches = 10, delta = 3600 * 1.5):
""" Return the correlation between two photometric filters of a star.
Take the output of LEMONdBMiner.match_bands, which pairs the magnitudes
from two light curves of the star, and compute the least-squares
regression of the matched elements. Returns a four-element tuple,
containing (a) the ID of the star, (b) the slope of the regression
line, (c) the correlation coefficient and (d) the number of 'paired'
elements that were used for the regression.
If the star has absolutely no photometric information in one or both
filters, or if less than 'min_matches' points are paired, NoneType is
returned.
"""
matches = self.match_bands(star_id, first_pfilter, second_pfilter,
delta = delta)
if not matches: # either None or empty
return None
if len(matches) < min_matches:
return None
match_x, match_y = zip(*matches)
slope, intercept, r_value, p_value, std_err = \
scipy.stats.linregress(match_x, match_y)
return star_id, slope, r_value, len(matches)
def band_correlation(self, how_many, sort_index = 0, delta = 3600 * 1.5,
min_matches = 10, ndecimals = 8):
""" Return an ASCII table with the most correlated stars.
The method takes the combinations of two elements out of all the
photometric filters in the database and, for each one, computes the
correlation coefficient of the least-squares regression for the two
light curves and takes the 'how_many' stars with the highest value.
Not all the points of the curve are used for the regression, but only
those which are at most 'delta' seconds apart. Those stars whose light
curves have fewer than 'min_matches' are ignored. The returned table
has the stars sorted in decreasing order by the correlation coefficient
of the 'sort_index'-th combination of photometric filters.
The NoStarsSelectedError exception is raised in case no stars can be
returned, either because 'min_matches' is set to a too high value or
because the database has no information on the curves of the stars.
"""
pfilter_combinations = list(itertools.combinations(self.pfilters, 2))
# First, calculate the correlation for all the stars between the
# 'sort_index'-th combination of photometric filters (the one by which
# the table will be sorted) and take the 'how_many' most correlated. In
# this manner, the correlation in the other filters will only have to
# be calculated for these 'how_many' stars, instead of for all the
# stars in the database.
main_combination = pfilter_combinations[sort_index]
main_correlation = []
for star_id in self. star_ids:
kwargs = {'min_matches' : min_matches, 'delta' : delta}
star_corr = self.star_correlation(star_id, *main_combination, **kwargs)
if star_corr:
main_correlation.append(star_corr)
if not main_correlation:
msg = "no stars with at least %d correlated points in %s-%s"
raise NoStarsSelectedError(msg % ((min_matches,) + main_combination))
# Sort the stars by the correlation between these two photometric
# filters, decreasing order, and take the best 'how-many' stars. The
# correlation coefficient is the third element of the tuples, while
# the ID of each star is the first one (indexes 2 and 0, respectively)
main_correlation.sort(key = operator.itemgetter(2), reverse = True)
most_correlated = main_correlation[:how_many]
most_correlated_ids = [x[0] for x in most_correlated]
# A two-level dictionary, mapping the combination of filters (first
# level) to the ID of the star (second level), which in turn maps to a
# two-element tuple: (a) the correlation coefficient and (b) the number
# of elements that were paired and used in the linear regression.
correlations = collections.defaultdict(dict)
for star_id, slope, r_value, nmatches in most_correlated:
correlations[main_combination][star_id] = (r_value, nmatches)
# Find the correlation coefficients in the other filters, but only for
# these 'how_many', most correlated stars that we have just identified.
for index, secondary_combination in enumerate(pfilter_combinations):
if index == sort_index:
continue # correlations already calculated!
for star_id in most_correlated_ids:
star_corr = self.star_correlation(star_id, *secondary_combination,
min_matches = min_matches,
delta = delta)
if star_corr:
r_value, nmatches = star_corr[-2:]
else:
# NoneType leaves table cells empty
r_value, nmatches = None, None
correlations[secondary_combination][star_id] = (r_value, nmatches)
# Now that all the correlations have been calculated, store the
# information in a list of lists, properly sorted, and create the
# ASCII table which displays all the data.
header = []
header.append('Star')
for first_pfilter, second_pfilter in pfilter_combinations:
pfilters_str = '%s-%s' % (first_pfilter.letter, second_pfilter.letter)
header.append("r-value %s" % pfilters_str)
header.append('Matches')
table_columns = []
for star_id in most_correlated_ids:
column = [star_id]
for pfilters_pair in pfilter_combinations:
r_value, nmatches = correlations[pfilters_pair][star_id]
column.append(r_value)
column.append(nmatches)
table_columns.append(column)
return LEMONdBMiner._ascii_table(header, table_columns, sort_index = None,
descending = True, ndecimals = ndecimals)
@staticmethod
def dump(path, data, decimals = 8):
""" Dump a sequence of floating point numbers to a text file.
The method iterates over 'data', each one of whose elements must be a
sequence of floating point numbers, and saves them to the 'path' text
file. Each iterable in 'data' is written to a different row, with its
elements tab-separated. If it already exists, the output file is
silently overwritten.
Keyword arguments:
ndecimals - number of decimals with which real numbers are saved.
"""
with open(path, 'wt') as fd:
for row in data:
fd.write('\t'.join(('%.*f' % (decimals, column)
for column in row)) + '\n')
def sort_by_curve_stdev(self, pfilter, minimum = 10):
""" Sort the stars by the standard deviation of their light curves.
Take the light curves of the stars in a photometric filter and compute
the standard deviation of their points. Returns a list of two-element
tuples, with the ID of each star and the said standard deviation,
respectively, sorted in increasing order by the latter. Those stars
with fewer than 'minimum' points in their light curve are ignored.
The NoStarsSelectedError exception is raised in case no stars can be
returned, either because 'minimum' is set to a too high value or
because the database has no information on the curves of the stars.
"""
curves_stdevs = []
for star_id in self.star_ids:
star_curve = self.get_light_curve(star_id, pfilter)
# NoneType returned if the star has no light curve
if star_curve is None or len(star_curve) < minimum:
continue
curves_stdevs.append((star_id, star_curve.stdev))
if not curves_stdevs:
msg = "no light curves with at least %d points in %s"
raise NoStarsSelectedError(msg % (minimum, pfilter))
return sorted(curves_stdevs, key = operator.itemgetter(1))
def curve_stdev(self, how_many, sort_index = 0,
minimum = 10, ndecimals = 8):
""" Return an ASCII table with the most constant stars.
The method computes the standard deviation of the light curve of each
star in all the photometric filters and takes the 'how_many' stars for
which this value is lowest (that is, the most constant stars) in the
'sort_index'-th photometric filter. The order of the photometric
filters is given by their effective wavelength midpoint. Those stars
whose light curves have fewer than 'minimum' points are ignored. The
returned table has the stars sorted in increasing order by the standard
deviation of the light curve in the 'sort-index'-th filter.
Keyword arguments:
ndecimals - number of decimals shown for real numbers.
"""
# Sort the stars by the standard deviation of their light curve in the
# 'sort-index'-th photometric filter, ascending order, and take the
# best 'how-many' stars. The first element of each tuple is the ID of
# the star; the second is the standard deviation of its light curve.
main_pfilter = self.pfilters[sort_index]
most_similar = \
self.sort_by_curve_stdev(main_pfilter, minimum = minimum)[:how_many]
most_similar_ids = [x[0] for x in most_similar]
# A two-level dictionary, mapping the photometric filter (first level)
# to the ID of the star (second level), which in turn maps to the
# standard deviation of the corresponding light curve.
stdevs = collections.defaultdict(dict)
for star_id, curve_std in most_similar:
stdevs[main_pfilter][star_id] = curve_std
# Find the standard deviation of the light curves in the other
# photometric filters, but only for these 'how_many', most similar
# stars that we have just identified.
for index, pfilter in enumerate(self.pfilters):
if index == sort_index:
continue # stdevs already calculated!
for star_id in most_similar_ids:
star_curve = self.get_light_curve(star_id, pfilter)
# NoneType returned if the star has no light curve
if star_curve is None or len(star_curve) < minimum:
stdevs[pfilter][star_id] = None
else:
stdevs[pfilter][star_id] = star_curve.stdev
header = []
header.append('Star')
for pfilter in self.pfilters:
header.append("%s Stdev" % pfilter)
table_columns = []
for star_id in most_similar_ids:
column = [star_id]
for pfilter in self.pfilters:
curve_stdev = stdevs[pfilter][star_id]
column.append(curve_stdev)
table_columns.append(column)
# sort_index + 1 because the first column contains the star IDs
return LEMONdBMiner._ascii_table(header, table_columns,
sort_index = sort_index + 1,
descending = False,
ndecimals = ndecimals)
def amplitudes_by_wavelength(self, increasing, npoints, use_median,
exclude_noisy, noisy_nstdevs,
noisy_use_median, noisy_min_ratio):
""" Find those stars with amplitudes sorted by wavelength.
The method returns a generator that iterates through each star of the
database and determines whether the amplitudes of its light curves
increase or decrease (depending on the truth value of the 'increasing'
parameter) with the wavelength. Not only the amplitudes must be sorted,
but also the star must have a light curve in all the filters for which
the LEMONdB has data.
It is possible, depending on the truth value of 'exclude_noisy', to
discard stars with one or more noisy amplitudes. In order to evaluate
whether an amplitude is noisy, it is compared against the median or
mean ('noisy_use_median' parameter) standard deviation of the light
curves of the 'noisy_nstdev' stars with the most similar instrumental
magnitude (for example: if the star had a magnitude of 11.4, we would
use the light curves of the 'noisy_stdev' stars with a magnitude
closest to this value). If the ratio between the amplitude and this
median (or mean) standard deviation is smaller than 'noisy_min_ratio',
the amplitude is marked as noisy and the star ignored.
A two-element tuple is yielded for each star that satisfies the above
conditions, with (a) the ID of the star, (b) a list of three-element
tuples which map each photometric filter (b1) to the amplitude of that
light curve (b2) and the median (or mean) standard deviation of the
stars with the most similar instrumental magnitudes (b3). For example:
(1023, [(Passband('KS'), 0.046757, 0.01646), (Passband('H'), 0.33077,
0.022678), (Passband('J'), 0.856194, 0.024012), (Passband('Z'),
1.033273, 0.030219)]). The last item in the three-element tuples is
None when 'exclude_noisy' is False, as the calculation of the standard
deviation of the stars of similar brightness is skipped.
The peak-to-peak amplitude of each light curve is obtained by using as
the peak and trough the median or mean (depending on the truth value of
the 'use_median' parameter) of the 'npoint' highest and lowest points
of the light curve. In other words: the amplitude used by this method
is defined as the difference between the median (or mean) of the
'npoints' highest and 'npoints' lowest differential magnitudes.
The generator yields None for those stars whose amplitudes are not
sorted by wavelength or that have one or more missing light curves or
noisy amplitudes. These values are useless and most of the time will be
ignored, but they make it possible to know exactly how many items the
generator has stepped through. This information could be used, for
example, to update a progress bar.
"""
pfilters = sorted(self.pfilters, reverse = not increasing)
kwargs = dict(npoints = npoints, median = use_median)
for star_id in self.star_ids:
discarded = False
star_amplitudes = []
cmp_stdevs = []
for pfilter in pfilters:
star_curve = self.get_light_curve(star_id, pfilter)
if not star_curve:
discarded = True
break
else:
amplitude = star_curve.amplitude(**kwargs)
star_amplitudes.append(amplitude)
if exclude_noisy:
# Find the ID of the 'noisy_stdevs' stars with the most
# similar magnitude among those that have a light curve.
# Pick elements from the generator one by one, until we
# have as many stars as needed.
similar = []
g = self.most_similar_magnitude(star_id, pfilter)
for _ in xrange(noisy_nstdevs):
try:
similar.append(g.next()[0])
except StopIteration:
break
# If there are fewer than 'noisy_stdevs' with a light
# curve we cannot determine whether the amplitude is
# noisy, so to be safe we just discard the star.
if len(similar) != noisy_nstdevs:
discarded = True
break
# Extract the light curve of each similar star and
# compute its standard deviation. Then, calculate their
# median (or arithmetic mean) and check whether the
# ratio between the amplitude and this value is above
# the threshold.
stdevs = []
for id_ in similar:
stdevs.append(self.get_light_curve(id_, pfilter).stdev)
func = numpy.median if noisy_use_median else numpy.mean
cmp_stdevs.append(func(stdevs))
ratio = amplitude / cmp_stdevs[-1]
if ratio < noisy_min_ratio:
discarded = True
break
if not discarded and sorted(star_amplitudes) == star_amplitudes:
# Use Nones as the last item of the three-element tuple
# when stars with noisy amplitudes are not discarded
if not exclude_noisy:
cmp_stdevs = [None] * len(star_amplitudes)
yield star_id, zip(pfilters, star_amplitudes, cmp_stdevs)
else:
yield None