%% This is file `elsarticle-template-2-harv.tex',
%%
%% Copyright 2009 Elsevier Ltd
%%
%% This file is part of the 'Elsarticle Bundle'.
%% ---------------------------------------------
%%
%% It may be distributed under the conditions of the LaTeX Project Public
%% License, either version 1.2 of this license or (at your option) any
%% later version. The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.2 or later is part of all distributions of LaTeX
%% version 1999/12/01 or later.
%%
%% The list of all files belonging to the 'Elsarticle Bundle' is
%% given in the file `manifest.txt'.
%%
%% Template article for Elsevier's document class `elsarticle'
%% with harvard style bibliographic references
%%
%% $Id: elsarticle-template-2-harv.tex 155 2009-10-08 05:35:05Z rishi $
%% $URL: http://lenova.river-valley.com/svn/elsbst/trunk/elsarticle-template-2-harv.tex $
%%
%%\documentclass[preprint,authoryear,12pt]{elsarticle}
%% Use the option review to obtain double line spacing
%% \documentclass[authoryear,preprint,review,12pt]{elsarticle}
%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
%% for a journal layout:
%% Astronomy & Computing uses 5p
%% \documentclass[final,authoryear,5p,times]{elsarticle}
\documentclass[final,authoryear,5p,times,twocolumn]{elsarticle}
%% if you use PostScript figures in your article
%% use the graphics package for simple commands
%% \usepackage{graphics}
%% or use the graphicx package for more complicated commands
\usepackage{graphicx}
%% or use the epsfig package if you prefer to use the old commands
%% \usepackage{epsfig}
%% The amssymb package provides various useful mathematical symbols
\usepackage{amssymb}
%% The amsthm package provides extended theorem environments
%% \usepackage{amsthm}
\usepackage[pdftex,pdfpagemode={UseOutlines},bookmarks,bookmarksopen,colorlinks,linkcolor={blue},citecolor={green},urlcolor={red}]{hyperref}
\usepackage{hypernat}
%% Alternatives to hyperref for testing
%\usepackage{url}
%\newcommand{\htmladdnormallinkfoot}[2]{#1\footnote{\texttt{#2}}}
%\newcommand{\htmladdnormallink}[1]{\texttt{#1}}
%\newcommand{\href}[2]{\texttt{#2}}
%% The lineno packages adds line numbers. Start line numbering with
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
%% for the whole article with \linenumbers after \end{frontmatter}.
%% \usepackage{lineno}
%% natbib.sty is loaded by default. However, natbib options can be
%% provided with \biboptions{...} command. Following options are
%% valid:
%% round - round parentheses are used (default)
%% square - square brackets are used [option]
%% curly - curly braces are used {option}
%% angle - angle brackets are used <option>
%% semicolon - multiple citations separated by semi-colon (default)
%% colon - same as semicolon, an earlier confusion
%% comma - separated by comma
%% authoryear - selects author-year citations (default)
%% numbers- selects numerical citations
%% super - numerical citations as superscripts
%% sort - sorts multiple citations according to order in ref. list
%% sort&compress - like sort, but also compresses numerical citations
%% compress - compresses without sorting
%% longnamesfirst - makes first citation full author list
%%
%% \biboptions{longnamesfirst,comma}
% \biboptions{}
\journal{Astronomy \& Computing}
%% Make single quotes look right in verbatim mode
\usepackage{upquote}
\usepackage{upgreek}
\usepackage{color}
% Aim to be consistent, and correct, about how we refer to sections
\newcommand*\secref[1]{Sect.~\ref{#1}}
\newcommand*\appref[1]{\ref{#1}}
\newcommand*\figref[1]{Fig.~\ref{#1}}
\begin{document}
\begin{frontmatter}
%% Title, authors and addresses
%% use the tnoteref command within \title for footnotes;
%% use the tnotetext command for the associated footnote;
%% use the fnref command within \author or \address for footnotes;
%% use the fntext command for the associated footnote;
%% use the corref command within \author for corresponding author footnotes;
%% use the cortext command for the associated footnote;
%% use the ead command for the email address,
%% and the form \ead[url] for the home page:
%%
%% \title{Title\tnoteref{label1}}
%% \tnotetext[label1]{}
%% \author{Name\corref{cor1}\fnref{label2}}
%% \ead{email address}
%% \ead[url]{home page}
%% \fntext[label2]{}
%% \cortext[cor1]{}
%% \address{Address\fnref{label3}}
%% \fntext[label3]{}
\title{Observatory/data centre partnerships and the VO-centric archive:
The JCMT Science Archive experience}
%% use optional labels to link authors explicitly to addresses:
%% \author[label1,label2]{<author name>}
%% \address[label1]{<address>}
%% \address[label2]{<address>}
\author[lsst]{Frossie Economou\corref{cor1}}
\ead{[email protected]}
\author[cadc]{S\'{e}verin Gaudet}
\author[cornell,jac]{Tim Jenness}
\author[jac]{Russell O.\ Redman}
\author[cadc]{Sharon Goliath}
\author[cadc]{Patrick Dowler}
\author[jac]{Malcolm~J.~Currie}
\author[jac]{Graham~S.~Bell}
\author[jac]{Sarah~F.~Graves}
\author[cadc]{John~Ouellette}
\author[jac,nrc,uvic]{Doug~Johnstone}
\author[cadc]{David Schade}
\author[uherts,jac]{Antonio~Chrysostomou}
\cortext[cor1]{Corresponding author}
\address[lsst]{LSST Project Office, 933 N.\ Cherry Ave, Tucson, AZ 85721, USA}
\address[cadc]{Canadian Astronomy Data Centre, National Research Council of Canada, 5071 West Saanich Road., Victoria, BC V9E 2E7, Canada}
\address[cornell]{Department of Astronomy, Cornell University, Ithaca,
NY 14853, USA}
\address[jac]{Joint Astronomy Centre, 660 N.\ A`oh\=ok\=u Place, Hilo, HI
96720, USA}
\address[nrc]{NRC-Herzberg Institute of Astrophysics, 5071 West Saanich Road,
Victoria, BC V9E~2E7, Canada}
\address[uvic]{ Department of Physics and Astronomy, University of Victoria, PO Box 3055 STN CSC, Victoria, BC V8W~3P6, Canada}
\address[uherts]{Centre for Astrophysics Research, University of Hertfordshire, College Lane, Hatfield, Hertfordshire AL10 9AB, UK}
\begin{abstract}
%% Text of abstract
We present, as a case study, a description of the
partnership between an observatory (JCMT) and a data centre (CADC)
that led to the development of the JCMT Science Archive (JSA). The
JSA is a successful example of a service designed to use Virtual Observatory (VO)
technologies from the start. We describe the motivation, process and
lessons learned from this approach.
\end{abstract}
\begin{keyword}
%% keywords here, in the form: keyword \sep keyword
%% MSC codes here, in the form: \MSC code \sep code
%% or \MSC[2008] code \sep code (2000 is the default)
facilities: JCMT \sep
Virtual Observatory tools \sep
astronomical databases: misc \sep
ISM: individual objects (G34.3)
\end{keyword}
\end{frontmatter}
% \linenumbers
%% Journal abbreviations
\newcommand{\mnras}{MNRAS}
\newcommand{\aap}{A\&A}
\newcommand{\aaps}{A\&AS}
\newcommand{\pasp}{PASP}
\newcommand{\apj}{ApJ}
\newcommand{\apjs}{ApJS}
\newcommand{\qjras}{QJRAS}
\newcommand{\an}{Astron.\ Nach.}
\newcommand{\ijimw}{Int.\ J.\ Infrared \& Millimeter Waves}
\newcommand{\procspie}{Proc.\ SPIE}
\newcommand{\aspconf}{ASP Conf. Ser.}
%% Applications
%% Links
\newcommand{\ascl}[1]{\href{http://www.ascl.net/#1}{ascl:#1}}
%% main text
\section{Origins}
The James Clerk Maxwell Telescope (JCMT) has collaborated with the Canadian Astronomy Data Centre (CADC) to
create the JCMT Science Archive (JSA) which provides raw and reduced JCMT data to the astronomical community
\citep{2008SPIE.7016E..16G,2008ASPC..394..450E,2008ASPC..394..135G,2011ASPC..442..203E}.
As a new generation of instruments was being developed for the JCMT in
the early 2000s \citep[HARP/ACSIS \& SCUBA-2;][]{2000ASPC..217...33D,2003SPIE.4855....1H},
it became clear that
the data rates from these instruments, of order 10\,MB/s, were going to be significantly
higher than those from earlier submillimeter instrumentation. In
particular SCUBA-2 was the first generation of submillimeter camera
that could be considered to be suitable for use as a large-scale
survey instrument. Exploratory discussions on the JSA between JCMT and CADC
began in 2003 and culminated in a decision to approve the
collaboration in May 2005 \citep{2005JCMTN23}. Development effort was
obtained in-house and also from the addition of two programmers recruited from
the UK Starlink project \citep{1982QJRAS..23..485D}, which had recently been closed.
The commitment to a JCMT Science Archive was followed shortly
afterwards by the approval of the JCMT Legacy Survey programme in July
2005 \citep{2005JCMTN23}. To ensure survey participation in the JSA
the JCMT Data Users' Group (JDUG) was created in early 2006 to provide
stakeholder input into the pipeline operation and advanced data
products \citep{2006JCMTN24R}.
\section{Motivation: Observatory}
Submillimeter data has traditionally been rather esoteric, closer to
radio than to the optical/infrared regime familiar to most
astronomers. Raw data is typically in time-series format (\figref{fig:g34ts}), and requires
in-house algorithms for transformation to science-ready formats such
as spectra or images. Calibration is
difficult due to the dominant and highly variable effect of the water
vapour in Earth's atmosphere
\citep[e.g.,][]{2002MNRAS.336....1A,2013MNRAS.430.2534D}.
\begin{figure}[t]
\includegraphics[angle=-90,width=\columnwidth]{g34ts}
\caption{Single bolometer time-series from a subset of a SCUBA-2
observation of G34.3 from 2012 June 11th. The final image is shown in
\figref{fig:g34}. The negative spikes are the detections of the
bright central source.}
\label{fig:g34ts}
\end{figure}
JCMT invested significant effort in automated data reduction based on
the ORAC-DR pipeline framework
\citep[][\ascl{1310.001}]{1999ASPC..172...11E,1999ASPC..172..171J,2005ASPC..347..585G,2008ASPC..394..565J,2015A&C.....9...40J}. In
many cases these automatically generated products were publication
quality, and thanks to a constantly updated calibration model, better
than what an inexperienced astronomer could be expected to achieve on
their own. Moreover, with the advent of large bolometer arrays such as
SCUBA-2 \citep{2013MNRAS.430.2513H}, this data could be processed into
maps, resulting in images that could be readily understood by
non-submm specialists, an example of which can be seen in \figref{fig:g34}.
\begin{figure}[t]
\includegraphics[width=\columnwidth]{g34}
\caption{SCUBA-2 observation of G34.3.}
\label{fig:g34}
\end{figure}
The JCMT had in-house experience with setting up a data archive in the
``filing cabinet'' sense of allowing users to search and retrieve raw
data, but apart from a prototype involving the on-demand generation
of SCUBA data products \citep{2002ASPC..281..243J}, had not tackled
the integration of data processing with data
product distribution in a full science archive environment.
Indeed, distribution of publication-quality data became an issue of
the highest priority with the advent of the JCMT Legacy Survey
Programme \citep{2010HiA....15..797C,2008ASPC..394..450E} using the
SCUBA-2 and HARP/ACSIS \citep{2009MNRAS.399.1026B} instruments. Aside
from the normal desire to provide a uniformly reduced product to the
survey teams, the processing demands for this data required a
non-trivial IT infrastructure.
The complex iterative map-maker algorithm used to reduce SCUBA-2 data
\citep[SMURF;][\ascl{1310.007}]{2013MNRAS.430.2545C} was expected to
generate higher fidelity maps when more of an observation could be
fitted into memory at one time. It was estimated that at minimum a
machine with 64\,GB of RAM would be required (and 128\,GB is the
current recommendation) but circa 2008 machines of this size were not readily
available to the typical JCMT observer.
So there were intrinsic
reasons to have a JCMT Science Archive allowing the survey consortia
to download the processed products. Ultimately, usage of such a
standalone archive would be dominated by JCMT users retrieving their own data, or
after the proprietary period elapsed, other JCMT users working in the
same scientific areas who were explicitly searching for JCMT data.
JCMT formed a strong interest in going further, and exposing its
high-value data product to data-mining astronomers who would not have
a priori knowledge either of JCMT in particular or sub-mm astronomy in
general. To that end, the Virtual Observatory (VO) data discovery and publication protocols
seemed like a natural choice for reaching the large parts of the
astronomical community that were oblivious to the JCMT's existence. VO
publication would also have the advantage of exposing the JCMT data
sets to workhorse tools that VO-savvy astronomers already used, such
as TOPCAT \citep[][\ascl{1101.010}]{2005ASPC..347...29T} and Aladin
\citep[][\ascl{1112.019}]{2005ASPC..347..193O}.
However, despite being convinced of the desirability of
leveraging the VO tools and services for JCMT data, the observatory
had the usual constraints of time and effort. The small Scientific
Computing Group was busy with supporting the entire non-hardware-controlling
software suite at both JCMT and UKIRT \citep[see e.g.,][with both
telescopes operated by the same organization]{2002SPIE.4844..321E,2011tfa..confE..42J},
as well as developing data reduction for new instruments, helping with
their commissioning,
and supporting the JCMT Legacy Surveys. The ability to develop a
VO-aware data centre and support the demands of the hoped-for
increased usage base was just not there.
What JCMT had, however, was a pre-existing collaboration with CADC,
which hosted the older JCMT data archive \citep{1997ASPC..125..397T}
for the benefit of the Canadian astronomical community, Canada being
one of the three international partners funding the JCMT (the other
two being the United Kingdom and the Netherlands). CADC had early
involvement in VO protocols \citep{2002ASPC..281...36S,2015ACCADC}, was a
productive developer and enthusiastic supporter of VO standards, and
was known to ``eat its own dog
food''\footnote{See \url{http://en.wikipedia.org/wiki/Eating_your_own_dog_food} and \citet{2014arXiv1407.6463E} for more information.}
by using many of these
interfaces and services internally.
\section{Motivation: Data Centre}
\begin{figure*}[t]
\begin{center}
\includegraphics[width=0.8\textwidth]{acjsa_fig3}
\caption{AdvancedSearch Results.}
\label{fig:asresults}
\end{center}
\end{figure*}
CADC already had a varied collection of data from several
telescopes and space missions
\citep{1994ASPC...61..123C,2008SPIE.7016E..16G}. Keen to be able to
extend its holdings to new observatories and data sets while
requiring only a small and well-defined effort, CADC developed
the Common Archive Observation Model
\citep[CAOM:][]{2007ASPC..376..347D,2008ASPC..394..426D}. CAOM defines an extensive and
versatile data model that classifies every data file using a common set of physical,
observational, organizational, and processing metadata.
This allows a generic VO search tool, such as AdvancedSearch, to search the
entire set of CADC archives for data relevant to a chosen target in the sky.
% that allowed any data to be ingested in the CADC
% archive provided it could be adequately mapped to CAOM.
One of the main attractions of the JCMT data set was its significant
departure from many of the common forms of other astronomical data,
which predominantly came from optical and IR instrumentation.
Examples include:
\begin{itemize}
\item The ``photon energy'' axis for optical observations is normally described in wavelength
units like \AA{}ngstr\"{o}ms or microns, whereas most radio observations are defined in
frequency units like MHz and GHz. To ingest and search for JCMT observations
it was necessary to enhance the tools to handle both wavelength and frequency units, with the
consequence that CADC interfaces now transparently handle most standard conversions
amongst frequency, energy and wavelength units.
\item At the start of the collaboration, most optical
data consisted of two dimensional RA/Dec images and sets of spectra. Even at that time, JCMT data
came in RA/Dec, Galactic and offset co-ordinates, with up to 4 dimensions (2
spatial, wavelength and polarization). The JCMT standard pipeline generates a
diverse set of products, including spectra, data cubes, maps, previews showing both spectral
and spatial images, and catalogues for point sources, emission peaks and
clumps (extended regions of non-uniform emission).
\item Since most detector technologies only allow a photon to be
detected once, it can be safely assumed for optical instruments with multiple detectors
that the data products from different detectors will not overlap in WCS space. The ability
at radio wavelengths to amplify the detected signal and feed it into multiple
spectrometers allows the output of the JCMT multi-subsystem spectrometer ACSIS
to include spectra and data cubes that overlap in a variety of ways, sometimes with
different frequency resolution, sometimes overlapping just at the ends of the spectra
to allow a much wider frequency coverage for a given frequency resolution than could be managed
by any single subsystem.
\end{itemize}
The JCMT therefore
provided an excellent stretch to the model, and continues to do so; if JCMT data could be
described in CAOM, CADC would be in the unprecedented position of
being able to accept almost any data set from future observatories
with minimal changes to their system.
Another advantage in working with JCMT on its datasets was the high
level of completeness and accuracy that JCMT provided in its
metadata. Even modern instruments on some older telescopes follow
metadata conventions established by the observatory long before the FITS
World Coordinate System (WCS) conventions were agreed upon. At the
start of the collaboration, the CADC would assign an ``archive scientist'' to
each archive, whose job description included learning all the idiosyncrasies
of the observatory. A major part of that effort
involved working around poor or incomplete metadata that made
astronomical data archiving problematic, especially if the observatory tended to
change their data products and headers without warning. Maintaining a
proper ``Science Archive'' requires that both power users and
astronomers unfamiliar with an observatory's
internal conventions be able to find and download science-ready data products
without mastering an arcane interface or guessing how to interpret the
metadata that it presents.
JCMT's dedication to high-fidelity
metadata and quick response in the rare case of problems made this an
attractive test data set.
The success of
this approach can be seen from the screen shot in \figref{fig:asresults},
which shows the reduced (Calibration Level 2) data from May 2014, sorted by
observation date, filtered to remove reduced-450\,$\upmu$m data (since the atmosphere
at 450\,$\upmu$m is often very opaque) and to include observations with RA $>$ 125.0 deg.
A pop-up preview of G34.3 is shown; clicking would bring up a larger version
of the preview in a new tab. The \texttt{productID} column shows the kind of data that
can be downloaded for each selection, giving the product type (reduced data files in
this example) and basic wavelength information (filter for continuum observations,
rest frequency and spectrometer configuration for heterodyne observations).
\section{VO Standards Used in the JSA}
\begin{description}
\item[CAOM] : Common Archive Observation Model -- This is the data model
used in all archives at the CADC. It was designed to be a superset
of VO data models so that VO data models and services could be easily
implemented on top of CAOM. While CAOM is not a VO data model per se,
it was designed and is used as the metadata interface between archives
and standard VO data models. \citep{2007ASPC..376..347D,2013ASPC..475..159R}
\item[ObsCore] : Observation Data Model Core Components -- This VO data model
is designed to support data discovery by allowing identical
queries to be run against the TAP services of all data centres. In the JSA,
it is implemented simply as a view of CAOM, since it contains a subset of the CAOM metadata \citep{obscore}.
\item[SIA] : Simple Image Access -- Version 1.0 is an early VO service
interface that supports positional searching and retrieval of 2D
images \citep{siap}.
Version 2.0 \citep{siav2} is a new VO service interface that supports data discovery
of multi-dimensional datasets (images and data cubes) using the ObsCore
data model. Both of these are implemented using CAOM and TAP (below).
\item[TAP] : Table Access Protocol -- This VO service interface supports ad-hoc
querying of the CAOM metadata and standard views like ObsCore. All JSA
science data is discoverable through this interface \citep{tap,2014NandrekarHeinis201437}.
\item[ADQL] : Astronomical Data Query Language -- Queries to the TAP service
are formatted in ADQL, which is designed to closely resemble the popular SQL
syntax used by many relational database systems \citep{adql}.
\item[DataLink] : DataLink Service -- This VO service interface allows users
and client software to drill-down from discovered datasets to the list
of files to download and to services that can operate on the data. The
SIA-2.0 and TAP services use this interface to provide access to JSA
data files and services \citep{datalink}.
\item[AccessData] : Access Data Prototype -- This prototype VO service interface
allows users to perform cutouts on data files in a standard set of
world coordinates.
\item[CDP] : Credential Delegation Protocol -- This VO service interface enables
CADC services to call other services on behalf of the user so that the
correct identity and access rights are enforced. In the JSA, this allows
the user interface (AdvancedSearch) to pass the authenticated user
identity to the TAP service so that query results will include metadata
and access information for proprietary observations the user can access \citep{cdp}.
\item[VOTable] : Virtual Observatory Table Format -- This is a common tabular
format used to exchange metadata between clients and services. It is
the standard output format in SIA, TAP, and DataLink \citep{votable}.
\end{description}
\section{Evolution of the Data Flow}
\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=0.75\textwidth]{jsa_dataflow_2015}
\end{center}
\caption{Data and metadata flow through the JSA as it is intended to be in early 2015. The JCMT and CADC processes are arranged in four columns, with the JCMT-specific processes on the left, ``portable processes'' (data processing, file transfer and metadata ingestion) in the dotted box in the centre-left, CADC processes in the centre-right, and client processes running on archive users' machines on the right. The ``vertical drums'' in the figure represent relational databases. The ``horizontal drums'' represent file storage, but do not specify the technology used to implement the storage (disk drives at the JCMT, databases at the CADC and for the CANFAR VOSpace). Where the software is developed, maintained and run by the JCMT/JAC or CADC, this is indicated by dashed container boxes. Manual operations by JAC and CADC staff have been elided; user interactions are shown as arrows on the right side of the figure.}
\label{fig:jsadataflow}
\end{figure*}
The system that moves data from the JCMT to the CADC and on to our users has been under continuous development since the start of the collaboration. \figref{fig:jsadataflow} shows the current development goal, which should have been attained by the time this paper is published. Data files sent to be stored in the ``Archive Directory'' (AD) system at the CADC enter through the ``Data Web Service'' interface. File metadata in the databases comprising the ``JSA CAOM Metadata'' system are managed using the ``CAOM Repository'' interface and can be read through the ``TAP'' service. Similarly, users access data and metadata through the ``Data Web Service'' and TAP interfaces. The use of a small number of well tested interfaces improves the reliability of the service and makes it easier to maintain on a limited budget. Using the same interfaces that our users rely on ensures that problems are discovered and addressed quickly.
The system was initially quite different. Before the advent of CAOM, every archive maintained a custom database. Each file was stored in AD and ingested into the database as it arrived through e-transfer\footnote{For an introduction to the e-transfer system see \citet{2005ASPC..347..647M}.}. The JCMT supplied, by replication, a set of observatory databases that contained file metadata for raw data, and published an interface control document (ICD) describing the file headers in reduced data products. The JCMT committed itself to follow strict FITS standards for file headers and WCS, and for raw data reproduced a set of columns in the ``File Metadata'' database that was nearly identical to the set of headers in the reduced data for single observations. The CADC archive scientist was responsible for the design of software that read the metadata from the replicated database or from the reduced data headers. Writing and maintaining the software to ingest the metadata into the ``JSA CAOM Metadata'' database required a team of software developers at the CADC. The successful operation of this system required close collaboration between the JCMT and the JSA team at the CADC, with weekly progress videoconferences and regular (often annual) face-to-face meetings to discuss larger issues. Although the system worked, it was cumbersome and expensive. A leaner and more versatile system was clearly desirable.
The container labelled ``Portable Processes'' in \figref{fig:jsadataflow} illustrates how the leaner system was implemented. The custom software for each archive was refactored into a set of simpler processes. Data processing ran at the CADC for easy access to the stored data, but was developed and maintained by the JAC. This encouraged a clean separation between the ``Data Processing Queue'' and ``Data Processing'' itself. The JSA was an early adopter of CAOM, which allowed raw and processed data ingestion to be factored out as separate processes. Since raw data ingestion applies to whole observations, the ``Raw Data Discovery Agent'' verifies that all of the raw data for an observation is stored in AD before starting the ``Raw Data Ingestion'' process. Originally, ``Processed Data Ingestion'' had its own discovery agent, but it is now controlled by the ``Data Processing Queue''.
The refactored system is quite modular and deployment is extremely
flexible. These processes were deployed at the CADC for most of the
last decade, but over the last year have migrated to the JAC. Data
processing is currently run at the JAC using a queue system with
database tables similar to those used by CADC's original interface to
Sun Grid Engine. This has allowed the associated software to run with
minimal changes. The new system has a web interface which is tailored
to the JCMT, including a facility for in-house quality assurance. It
is anticipated that data processing might move onto a CANFAR Virtual
Machine in the near future and be orchestrated by the current queuing
system. Ingestion can now run on any node that can access the ``CAOM
Repository'', read existing metadata through the TAP service and
optionally access the ``File Metadata'' service at the JCMT. This
extraordinary flexibility allows JCMT staff who best understand the
data to handle all data reduction and CADC staff who best understand
the archive to maintain those services.
\section{A Continuous Data Release Model}
Using CADC's data processing infrastructure and the capabilities of
JCMT/UKIRT's ORAC-DR automated data reduction, the JCMT Science
Archive adopted a model of continuous release
\citep{2011ASPC..442..203E}. As data was taken it was pushed for
reduction and was ingested at CADC in the same 24-hour period it was
observed. Thus, high-quality science products were published in the VO
as soon as the PI had access to them. Moreover, with every major
improvement in the data reduction software, data could be re-processed
and again immediately released.
Proprietary data goes into CAOM and becomes available via VO
interfaces almost immediately. Restrictions on proprietary metadata and
data are enforced on all TAP queries, and authentication permits
authorized users to discover and download such data. Either
AdvancedSearch or direct TAP queries can be used by PIs and JCMT
legacy survey teams to find and download new data using this
authenticated access. For example, the Cosmology Legacy Survey team
\citep{2013MNRAS.432...53G} runs a script using the TAP interface to
keep track of new observations as they arrive in the archive. In
2014, approximately 40~percent of all queries to the JSA came through
the TAP interface.
Continuous release made the VO publication mechanisms even more useful
than they are in the normal data discovery process, as product
availability is, from the point of view of the astronomer,
unpredictable rather than coming in fixed, scheduled, announced ``data
releases''. An interested user can therefore run regular automated TAP
queries with the expectation that newly reduced data for
their field of interest can appear at any time.
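Such an automated query can be very simple. The following sketch, in
Python, issues a synchronous TAP query against the ObsCore view for
reduced (calibration level 2) JCMT observations taken after a chosen
MJD. It uses only the standard TAP query parameters and ObsCore column
names; the endpoint URL, the assumed collection name and the MJD
cut-off are illustrative placeholders rather than a description of the
JSA interface.
\begin{verbatim}
# Hedged sketch: poll a TAP service for
# reduced JCMT data observed after a given
# MJD.  The endpoint URL is illustrative.
import requests

TAP_SYNC = "https://example.org/tap/sync"

ADQL = """
SELECT obs_id, t_min, access_url
FROM ivoa.ObsCore
WHERE obs_collection = 'JCMT'
  AND calib_level = 2
  AND t_min > 56800
"""

r = requests.get(TAP_SYNC, params={
    "REQUEST": "doQuery",
    "LANG": "ADQL",
    "FORMAT": "votable",
    "QUERY": ADQL,
})
r.raise_for_status()
print(r.text)   # VOTable of matching rows
\end{verbatim}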
\section{Post-Observatory}
Meanwhile, CADC was working on the Canadian Advanced Network for
Astronomical Research \citep[CANFAR;][]{2010SPIE.7740E..51G,2015ACCADC} project aiming to
support a cloud-like model for astronomical data reduction. The system
is based on giving the user a Virtual Machine (VM) that is then customized to
provide the appropriate software, environment and data access. The
user then defines a number of jobs that are serviced on a Condor
compute platform composed of customized VM copies.
This service has been of great utility to the Canadian astronomical
community dealing with large data volumes, with downloads of raw data from
the JSA to CANFAR processing nodes accounting for more than 40~percent of all
JSA raw data downloads in 2014.
The Gould Belt Legacy Survey \citep[GBS;][]{2007PASP..119..855W} makes
use of CANFAR, and the GBS data
processing lifecycle is supported at every step by VO-compliant services.
Raw data is retrieved from VO-compliant discovery and delivery services,
that data is processed on the customized VMs provisioned on CANFAR, and
the resulting products are shared among survey members in VO-compliant
storage services using VOSpace \citep{vospace}. The total VOSpace
usage by the survey teams is currently approaching 1\,TB and this has
proven to be a critical part of the collaboration infrastructure when
dealing with teams spread over Canada, Hawaii and Europe.
The existence of the VOSpace system at CADC has also led to CADC
taking on the role of data publisher for JCMT science papers.
JSA data products and externally reduced products can be copied to a
VOSpace directory and associated with a
Digital Object Identifier. The first two data sets making use of this
functionality were \citet{2012MNRAS.424.3050W} and
\citet{2013ApJS..209....8D}.
\section{Extending VO for Radio Astronomy}
In the early days of the Virtual Observatory, the focus was
specifically on simple protocols \citep{siap,cone} to replace pre-existing
web services for image retrieval and cone search, with retrieval of
individual spectra coming somewhat later in VO developments
\citep{ssap,splatvo}. These were the pressing issues of the optical
community and this discussion dominated early protocol development.
Data cubes were seen as a task for the future as it was
felt that they were not yet mainstream products, and the
optical/IR instruments generating such cubes \citep[such as
the UIST IFU or the TAURUS imaging Fabry-Perot spectrometer;][]{2004SPIE.5492.1160R,1982MNRAS.201..661A}
were regarded as of niche interest, to be tackled later.
This was frustrating given that JCMT heterodyne
observations regularly generated cubes and, with the arrival of ACSIS
in 2006, gigabyte data cubes were commonplace. There was no standard
available for making all these cubes available to the VO and it is
only recently \citep[e.g.,][]{2014AAS...22325505T} that a cube access
protocol has been approached with any seriousness, driven mainly, in
the USA, by ALMA and JWST developments \citep[e.g.,
MIRI;][]{2010SPIE.7731E..10W}. The proposed
recommendation for SIA-2.0 \citep{siav2}
will be able to handle the many data cubes generated by the JCMT
over the last two decades.
In Table~\ref{tab:cadcvo}, the lines labeled ``TAP querying for
spectra'' and ``TAP querying for cubes''
indicate the number of 1-D spectra and data cubes in the JCMT collection. These can
easily be found using the CADC AdvancedSearch interface, or directly using
a TAP query. The full positional and photon energy WCS are provided
for these, even when the positional axes are degenerate. SIA-2.0
should be able to find all of these data, once it has been implemented.
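Counts like those shown in Table~\ref{tab:cadcvo} can be obtained with
a single ObsCore query grouped by data product type. The fragment below
gives the ADQL as a Python string constant, to be submitted to the TAP
service in the same way as the earlier sketch; only standard ObsCore
names are used, and the collection name is again an assumption.
\begin{verbatim}
# Hedged sketch: count JCMT products per
# ObsCore data product type (e.g. 'cube',
# 'spectrum', 'image').
COUNT_ADQL = """
SELECT dataproduct_type, COUNT(*) AS n
FROM ivoa.ObsCore
WHERE obs_collection = 'JCMT'
GROUP BY dataproduct_type
"""
# Submit via the TAP /sync endpoint exactly
# as in the previous example.
\end{verbatim}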
Another peculiarity of submillimetre data is the lack of point
sources. Most Galactic objects are extended, and dust and gas from
large clouds, outflows and filamentary structures are missed by
standard source extraction algorithms such as SExtractor
\citep[][\ascl{1010.064}]{1996A&AS..117..393B}. Instead, algorithms
such as FellWalker \citep[][\ascl{1311.007}]{2015FW,2007ASPC..376..425B} and
Clumpfind \citep[][\ascl{1107.014}]{1994ApJ...428..693W}, which detect
source emission in irregularly shaped clumps, were used for
source finding. VO ConeSearch was not
set up for this eventuality and the best we could hope for was to
provide a catalogue that indicated the peak of the emission. To work
around this problem clump catalogues are generated with the clump
outline approximated by a polygon specified in STC-S format
\citep{2010ASPC..434..213B}. These outlines can then be retrieved
using TAP for analysis or plotting. This is certainly less convenient
for the end user than a clump equivalent of ConeSearch so we are
extending the facilities in GAIA
\citep[][\ascl{1403.024}]{2009ASPC..411..575D} to hide the TAP
interface. We hope a variant of ConeSearch will be developed that
works for extended irregular sources.
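For plotting or overlap tests the retrieved outline only needs to be
split back into its vertices. The sketch below, in Python, assumes the
simple form of an STC-S polygon string in which the word
\texttt{Polygon} is followed by a coordinate frame token and a flat
list of longitude/latitude pairs in degrees; the coordinate values are
invented for illustration.
\begin{verbatim}
# Hedged sketch: turn a simple STC-S Polygon
# string into (lon, lat) vertex pairs.
def stcs_polygon_vertices(stcs):
    tokens = stcs.split()
    if tokens[0].lower() != "polygon":
        raise ValueError("not a Polygon")
    vals = [float(t) for t in tokens[2:]]
    return list(zip(vals[0::2], vals[1::2]))

# Illustrative outline (coordinates invented):
verts = stcs_polygon_vertices(
    "Polygon ICRS 283.32 1.24 283.34 1.24 "
    "283.34 1.26 283.32 1.26")
\end{verbatim}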
It should be sufficient for an enhanced ConeSearch query to return
the results as catalogues with STC-S columns representing the shape of
the object that matches, and for a match to be defined as an overlap
between the region specified by the caller and the region defining the
object. In this manner all existing ConeSearch services could simply
return objects with circular regions with size corresponding to the
point spread function.
\section{Current Status}
\begin{figure}[t]
\includegraphics[width=\columnwidth]{jcmt-pubs-crop}
\caption{Breakdown of the 405 JCMT refereed publications between 2010 and
2013 indicating the fraction using data from the JCMT Science
Archive. The remaining segments are from papers only using instrument
data directly. Figure derived from \citet{2014SPIE9152-93}.}
\label{fig:jsapubs}
\end{figure}
\begin{table}
\caption{Data holdings in the JCMT Science Archive available over VO
protocols as of 2014 November.}
\label{tab:cadcvo}
\begin{center}
\begin{tabular}{|l|r|}
\hline
Data model & Data sets available\\ \hline
TAP with CAOM (AdvancedSearch) & 1\,279\,617 \\
TAP with ObsCore & 1\,103\,787\\
TAP querying for cubes & 102\,392\\
TAP querying for spectra & 227\,839\\
SIA & 335\,185\\ \hline
\end{tabular}
\end{center}
\end{table}
\begin{table}
\caption{Downloads of raw and reduced data from the JSA for the first
11 months of 2014. 40~percent of the raw downloads are to CANFAR
processing nodes. When interpreting the relative count of raw and
processed files, note that SCUBA-2 generates 480 discrete data files
every half hour, which may result in only two output maps (one for
each wavelength, depending on tiling scheme).}
\label{tab:down}
\begin{center}
\begin{tabular}{|l|r|r|}
\hline
& Number of files & Data volume / GB \\ \hline
Processed & 72\,730 & 1\,764 \\
Raw & 4\,427\,478 & 63\,611 \\ \hline
\end{tabular}
\end{center}
\end{table}
\figref{fig:jsapubs} demonstrates that more
than half of the refereed papers containing JCMT data published
between 2010 and 2013 obtained data from the JSA. Table~\ref{tab:cadcvo} provides the
current size of the data holdings accessible via a variety of VO
protocols, and Table~\ref{tab:down} provides details of how the
downloads from the JSA are split between raw and reduced data.
The collaboration has proven so successful that the opportunity was
taken to transfer the UKIRT raw data from the Cassegrain instruments
to CADC \citep{adassxxiii_P01}. It has been possible to re-use
the JSA processing infrastructure for UKIRT data processing as the
pipeline environment is identical \citep{2015A&C.....9...40J}. Similarly, the ingestion
software initially developed for the JSA was easily adapted to ingest
data from several other CADC collections, including BLAST
(Balloon-borne Large Aperture Submillimeter Telescope), CGPS
(Canadian Galactic Plane Survey), IRIS (Improved Reprocessing
of the IRAS Survey), and VGPS (VLA Galactic Plane Survey).
The JSA data processing continues to be improved
\citep{2014JCMTN35..19J} and the current plan is to reduce all the
public HARP/ACSIS and SCUBA-2 data using an ``all-sky'' HEALPix
projection \citep{2005ApJ...622..759G,2014SPIE9152-93,2014JCMTN35..20B}. This processing
will also result in catalogue products that are specifically designed
to answer the question of whether the JCMT saw any emission in a
particular part of the sky. This is achieved by a two-pass
approach to clump
finding in which first the emission outline is determined, and ultimately
represented by an STC-S polygon, and then the individual peaks are located
\citep{2014JCMTN35..21G}.
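As an aside on how such an all-sky grid is used, locating the HEALPix
pixel (and hence the tile) covering a given position is a single
function call. The sketch below, in Python, uses the healpy package;
the resolution parameter and ordering scheme are chosen purely for
illustration and are not the values adopted for the JSA tiling.
\begin{verbatim}
# Hedged sketch: HEALPix pixel index covering
# a sky position.  nside and the ordering are
# illustrative, not the JSA's adopted tiling.
import healpy as hp

nside = 64            # illustrative resolution
ra, dec = 283.3, 1.2  # degrees (example only)

tile = hp.ang2pix(nside, ra, dec,
                  nest=True, lonlat=True)
print(tile)
\end{verbatim}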
There is also an intent to expand the holdings of the JSA to include
heterodyne data taken in an older format by the DAS
\citep{1986SPIE..598..134B} and AOS-C backends. Data from those
instruments is being converted from the GSD format \citep{GSD1999} to the newer ACSIS format and this
allows all the standard processing infrastructure to be used to create
reduced data products and make them available to the VO for the first
time.
The JSA pioneered the use of CAOM at the CADC, having been implemented in both
CAOM-0.9 and CAOM-1. The latest version, CAOM-2
\citep{2013ASPC..475..159R,2012ASPC..461..339D}, was released for general
use on 2014 May 1 and includes clarifications and improvements due to
lessons learned from the earlier models.
The metadata that is available for searching is richer, more complete, and easier
to understand than anything that has been available previously.
A full description of CAOM is in the early stages of preparation, but the earlier
references cited above still describe the core philosophy of the design, and the
current database schema can be found online\footnote{\url{http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/caom2/}}.
\section{Lessons Learned}
The JCMT Science Archive collaboration was a highly successful foray
into VO publication via an observatory-data centre collaboration.
Elements that we believe led to this success include:
\begin{itemize}
\item VO publication was a common goal with significant organizational
buy-in for both parties from the start, and was a primary technical
goal of the collaboration rather than an afterthought.
\item Within that shared vision, there was a clear division of
expertise and responsibilities for each side, allowing each
organization to focus on its proximate technical goals. Both
organizations had ``skin in the game'' that was served by the
technical work undertaken, which allowed this work to be carried out
without any kind of external agency funding (each institution
supported its own share of the work out of its normal budgetary
process).
\item Each organization worked from a position of strength based on an
advanced, robust and mature software architecture, allowing
development to focus on new functionality and interfaces between the
two systems. This minimized the communication overheads commonly
associated with distributed projects.
\item The role of ``data engineer'' responsible for developing software to
ingest new data into a CAOM archive no longer requires special privileges
at the CADC. It does require an expert
knowledgeable about both the CAOM model and the products generated
by the data reduction system, but the tools developed for CAOM
allow this role to be assigned to the best available expert regardless of their
location or institutional association. Thus, for UKIRT, the Joint Astronomy Centre has been able to
assign one of its own staff to this role, and for the JSA a retired CADC staff
member currently fills the role.
\item There was a high level of pre-existing trust between the two
groups from their previous relationship, leading to minimal need for
contractual language or management oversight. Indeed, the entire
collaboration's only official governance document was a two-paragraph
memorandum of understanding.
\end{itemize}
\section{Recommendations}
In the general case, for observatories that do not understand the
mechanisms or benefits of VO publication, collaboration with a
motivated, VO-involved data centre that has the appropriate
infrastructure and keeps up to date with the IVOA standards process is
a far more effective choice than trying to develop those capabilities
in-house. This is especially true given the apparent confusion in the
observatory community as to what ``VO publication'' involves and what
its merits are.
However, in order to properly leverage the capabilities of a
modern multi-mission data centre, a fanatical devotion to correct and
complete metadata should be considered a prerequisite.
Good communication within the team of collaborators is essential. Regular
weekly or bi-weekly teleconferences and occasional face-to-face meetings
have been important to keeping everyone aware of issues and working to
common purposes.
\section*{Acknowledgments}
The James Clerk Maxwell Telescope has historically been operated by
the Joint Astronomy Centre on behalf of the Science and Technology
Facilities Council of the United Kingdom, the National Research
Council of Canada and the Netherlands Organisation for Scientific
Research. Additional funds for the construction of SCUBA-2 were
provided by the Canada Foundation for Innovation.
The Canadian Astronomy Data Centre is operated by the
National Research Council of Canada with the support of the Canadian
Space Agency. This research has made use of NASA's Astrophysics
Data System.
%% References
%%
%% Following citation commands can be used in the body text:
%%
%% \citet{key} ==>> Jones et al. (1990)
%% \citep{key} ==>> (Jones et al., 1990)
%%
%% Multiple citations as normal:
%% \citep{key1,key2} ==>> (Jones et al., 1990; Smith, 1989)
%% or (Jones et al., 1990, 1991)
%% or (Jones et al., 1990a,b)
%% \cite{key} is the equivalent of \citet{key} in author-year mode
%%
%% Full author lists may be forced with \citet* or \citep*, e.g.
%% \citep*{key} ==>> (Jones, Baker, and Williams, 1990)
%%
%% Optional notes as:
%% \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2)
%% \citep[e.g.,][]{key} ==>> (e.g., Jones et al., 1990)
%% \citep[see][pg. 34]{key}==>> (see Jones et al., 1990, pg. 34)
%% (Note: in standard LaTeX, only one note is allowed, after the ref.
%% Here, one note is like the standard, two make pre- and post-notes.)
%%
%% \citealt{key} ==>> Jones et al. 1990
%% \citealt*{key} ==>> Jones, Baker, and Williams 1990
%% \citealp{key} ==>> Jones et al., 1990
%% \citealp*{key} ==>> Jones, Baker, and Williams, 1990
%%
%% Additional citation possibilities
%% \citeauthor{key} ==>> Jones et al.
%% \citeauthor*{key} ==>> Jones, Baker, and Williams
%% \citeyear{key} ==>> 1990
%% \citeyearpar{key} ==>> (1990)
%% \citetext{priv. comm.} ==>> (priv. comm.)
%% \citenum{key} ==>> 11 [non-superscripted]
%% Note: full author lists depends on whether the bib style supports them;
%% if not, the abbreviated list is printed even when full requested.
%%
%% For names like della Robbia at the start of a sentence, use
%% \Citet{dRob98} ==>> Della Robbia (1998)
%% \Citep{dRob98} ==>> (Della Robbia, 1998)
%% \Citeauthor{dRob98} ==>> Della Robbia
%% References with bibTeX database:
\bibliographystyle{model2-names-astronomy}
\bibliography{acjsa}
%% Authors are advised to submit their bibtex database files. They are
%% requested to list a bibtex style file in the manuscript if they do
%% not want to use model2-names.bst.
%% References without bibTeX database:
% \begin{thebibliography}{00}
%% \bibitem must have one of the following forms:
%% \bibitem[Jones et al.(1990)]{key}...
%% \bibitem[Jones et al.(1990)Jones, Baker, and Williams]{key}...
%% \bibitem[Jones et al., 1990]{key}...
%% \bibitem[\protect\citeauthoryear{Jones, Baker, and Williams}{Jones
%% et al.}{1990}]{key}...
%% \bibitem[\protect\citeauthoryear{Jones et al.}{1990}]{key}...
%% \bibitem[\protect\astroncite{Jones et al.}{1990}]{key}...
%% \bibitem[\protect\citename{Jones et al., }1990]{key}...
%% \harvarditem[Jones et al.]{Jones, Baker, and Williams}{1990}{key}...
%%
% \bibitem[ ()]{}
% \end{thebibliography}
\end{document}
%%
%% End of file `elsarticle-template-2-harv.tex'.