\documentclass{article} % For LaTeX2e
\usepackage{iclr2020,times}
% Optional math commands from https://github.com/goodfeli/dlbook_notation.
%\input{math_commands.tex}
\usepackage{amsmath,amsfonts,bm}
\usepackage[hidelinks]{hyperref}
\usepackage{url}
\usepackage{breqn}
\usepackage{booktabs}
\usepackage{siunitx} % used in SHREC table
\usepackage{bbm} % for mathbbm
\usepackage{graphicx} \graphicspath{{figures/}}
\usepackage[export]{adjustbox} % figure vertical alignment
%\usepackage{floatrow} % figure and table together
\usepackage{capt-of} % or \usepackage{caption} % figure and table together
\usepackage{subfig}
\usepackage{wrapfig}
%\usepackage{floatflt}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
%\title{Which spherical CNN should you use?\\ DeepSphere V2}
\title{DeepSphere: a graph-based spherical CNN}
%\title{DeepSphere: an efficient graph-based spherical CNN}
%\title{DeepSphere, an efficient (& equivariant) graph-based spherical CNN}
\author{Michaël Defferrard, Martino Milani \& Frédérick Gusset \\
École Polytechnique Fédérale de Lausanne (EPFL), Switzerland \\
\texttt{\{michael.defferrard,martino.milani,frederick.gusset\}@epfl.ch}
\AND
Nathanaël Perraudin \\
Swiss Data Science Center (SDSC), Switzerland \\
\texttt{[email protected]}
}
\newtheorem{definition}{Definition}[section]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{prop}{Proposition}
\newtheorem*{remark}{Remark}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\renewcommand{\b}[1]{{\bm{#1}}} % bold symbol
\newcommand{\bO}{\mathcal{O}}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\S}{\mathbb{S}}
\newcommand{\G}{\mathcal{G}} % graph
\newcommand{\V}{\mathcal{V}} % vertex set
\newcommand{\E}{\mathcal{E}} % edge set
\newcommand{\laplacian}{\Delta_{\mathbb S^2}}
\newcommand{\Rg}{R(g)}
\newcommand{\seminorm}[1]{\left|#1\right|}
\newcommand{\Ln}{\hat L_n^{t_n}}
\DeclareMathOperator*{\diag}{diag}
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
\newcommand{\todo}[1]{{\color[rgb]{.6,.1,.6}{#1}}}
\newcommand{\nati}[1]{{\color[rgb]{.3,.5,.9}{#1}}}
\newcommand{\mart}[1]{{\color[rgb]{.9,.5,.3}{#1}}}
\newcommand{\mdeff}[1]{{\color[rgb]{.8,.3,.2}{#1}}}
\newcommand{\figref}[1]{figure~\ref{fig:#1}}
\newcommand{\Figref}[1]{Figure~\ref{fig:#1}}
\newcommand{\tabref}[1]{table~\ref{tab:#1}}
\newcommand{\Tabref}[1]{Table~\ref{tab:#1}}
\newcommand{\secref}[1]{section~\ref{sec:#1}}
\newcommand{\Secref}[1]{Section~\ref{sec:#1}}
%\newcommand{\secref}[1]{\S\ref{sec:#1}}
\newcommand{\eqnref}[1]{(\ref{eqn:#1})}
%\newcommand{\eqnref}[1]{equation~(\ref{eqn:#1})}
%\newcommand{\Eqnref}[1]{Equation~(\ref{eqn:#1})}
\newcommand{\linefrac}[2]{
{#1/#2}
}
\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}
\maketitle
\begin{abstract}
Designing a convolution for a spherical neural network requires a delicate tradeoff between efficiency and rotation equivariance.
% exactitude with respect to equivariance.
DeepSphere, a method based on a graph representation of the sampled sphere, strikes a controllable balance between these two desiderata.
% Say something here about isotropy?
This contribution is twofold.
First, we study both theoretically and empirically how equivariance is affected by the underlying graph with respect to the number of vertices and neighbors.
Second, we evaluate DeepSphere on relevant problems.
Experiments show state-of-the-art performance and demonstrate the efficiency and flexibility of this formulation.
Perhaps surprisingly, comparison with previous work suggests that anisotropic filters might be an unnecessary price to pay.
%Say something about speed here... Not sure what?
% memory, computations, implementation complexity
Our code is available at \url{https://github.com/deepsphere}.
\end{abstract}
\section{Introduction}
Spherical data is found in many applications (\figref{examples}).
% spherical data f: S^2 -> R, projection from R^3
Planetary data (such as meteorological or geological measurements) and brain activity are examples of intrinsically spherical data.
The observation of the universe, LIDAR scans, and the digitization of 3D objects are examples of projections arising from the observation process.
Labels or variables often have to be inferred from such data.
Examples are the inference of cosmological parameters from the distribution of mass in the universe \citep{perraudin2019deepspherecosmo}, the segmentation of omnidirectional images \citep{khasanova2017sphericalcnn}, and the segmentation of cyclones from Earth observation \citep{mudigonda2017climateevents}.
\begin{figure}[h]
\centering
\subfloat[]{
\includegraphics[height=0.20\linewidth,valign=t]{example_brain_meg}
} \hspace{0.1em}
\subfloat[]{
\includegraphics[height=0.20\linewidth,valign=t]{example_cosmo_cmb}
} \hfill
\subfloat[]{
\includegraphics[height=0.20\linewidth,valign=t]{example_climate_TMQ}
\label{fig:examples:climate}
} \hfill
\subfloat[]{
\includegraphics[height=0.20\linewidth,valign=t]{example_ghcn_daily_tmax}
\label{fig:examples:ghcn:tmax}
} \hfill
\subfloat[]{
\includegraphics[height=0.20\linewidth,valign=t]{example_ghcn_graph}
\label{fig:examples:ghcn:graph}
}
\caption{
Examples of spherical data:
(a) brain activity recorded through magnetoencephalography (MEG),\protect\footnotemark
(b) the cosmic microwave background (CMB) temperature from \citet{planck2015overview},
(c) hourly precipitation from a climate simulation \citep{jiang2019sphericalcnn}, %\citep{mudigonda2017climateevents},
(d) daily maximum temperature from the Global Historical Climatology Network (GHCN).\protect\footnotemark
A rigid full-sphere sampling is not ideal: brain activity is only measured on the scalp, the Milky Way's galactic plane masks observations, climate scientists desire a variable resolution, and the position of weather stations is arbitrary and changes over time.
(e) Graphs can faithfully and efficiently represent sampled spherical data by placing vertices where it matters.
%\todo{horizontal colorbar for brain}
}
\label{fig:examples}
\end{figure}
\footnotetext[1]{\scriptsize\url{https://martinos.org/mne/stable/auto_tutorials/plot_visualize_evoked.html}}
\footnotetext[2]{\scriptsize\url{https://www.ncdc.noaa.gov/ghcn-daily-description}}
% balance of ??
% Exploit the geometrical properties of the domain, symmetries.
% desiderata or tradeoff
% We identified the following desiderata for a spherical NN.
%Those take different tradeoffs among the following desiderata.
%\begin{enumerate}
% \item \textbf{Respect the geometry.} % geometry, domain
% By respecting the domain, a spherical NN should respect the symmetries of the sphere, i.e., be equivariant to rotation, and not deform the sphere, e.g., by a projection on the icosahedron.
% \item \textbf{Generality.} % powerful
% How general the NN is as a function approximator.
% For example, isotropic filters are less general than anisotropic filters.
% \item \textbf{Scalability.}
% How a spherical CNN scales w.r.t.\ the number of pixels in terms of computational cost and memory usage.
% % memory: feature maps and parameters
% \item \textbf{Flexibility.}
% Whether the NN is constrained to work on specific samplings, part of the sphere (useful when data is masked), and irregular samplings.
% \item \textbf{Simplicity.}
% Simplicity of its derivation and implementation.
%\end{enumerate}
% keep the story simple and focus on the tradeoff between cost (time / computational and storage / memory) and exactitude
As neural networks (NNs) have proved to be great tools for inference, variants have been developed to handle spherical data.
Exploiting the locally Euclidean property of the sphere, early attempts used standard 2D convolutions on a grid sampling of the sphere \citep{boomsma2017sphericalcnn, su2017sphericalcnn, coors2018sphericalcnn}.
While simple and efficient, those convolutions are not equivariant to rotations.
On the other side of this tradeoff, \citet{cohen2018sphericalcnn} and \citet{esteves2018sphericalcnn} proposed to perform proper spherical convolutions through the spherical harmonic transform.
While equivariant to rotations, those convolutions are expensive (\secref{method:convolution}).
% On the one side, we have Cohen which is computationally very expensive but perfectly equivariant most general, on the other we have cube sphere which computationl very good but not equivariant at all. Other methods are tradedoff between these two. What is a good tradeoff? We think that DeepSphere is.
As a lack of equivariance can penalize performance (\secref{exp:cosmo}) and expensive convolutions prohibit their application to some real-world problems, methods standing between these two extremes are desired.
%Such high resolutions are not necessary to represent the depth maps of 3D objects, but are required to faithfully represent cosmological and climate data.
\citet{cohen2019gauge} proposed to reduce costs by projecting the data from the sphere to the icosahedron, which limits the size of the representation of the symmetry group.
% not sure if true
The distortions introduced by this projection might however hinder performance (\secref{exp:climate}).
% (from the continuous rotation group SO(3) to a discrete rotation group)
% the dimensionality of feature maps is equal to the size of the irreducible representation of the symmetry group
%As a platonic solid, it \todo{exhibits discrete rotations}.
% the representation of its symmetry group is much smaller than SO(3)
%\todo{not all rotations, subset of SO(3)?}
Another approach is to represent the sampled sphere as a graph connecting pixels according to the distance between them \citep{bruna2013gnn, khasanova2017sphericalcnn, perraudin2019deepspherecosmo}.
While Laplacian-based graph convolutions are more efficient than spherical convolutions, they are not exactly equivariant \citep{defferrard2019deepsphereequiv}.
In this work, we argue that graph-based spherical CNNs strike an interesting balance, with a controllable tradeoff between cost and equivariance (which is linked to performance).
Experiments on multiple problems of practical interest show the competitiveness and flexibility of this approach.
% Furthermore, experiments show that, perhaps surprisingly and contrarily to previously published results \citep{cohen2019gauge} \todo{[Bronstein?]}, anisotropic filters might be an unnecessary price to pay.
% This paper focuses on improving the rotation equivariance of \citet{perraudin2019deepspherecosmo}, a scalable graph-based method designed for cosmological applications.
\section{Method}
% \begin{figure}
% \centering
% \includegraphics[width=\linewidth]{figure_architecture_v3}
% \caption{Example architecture from \citet{perraudin2019deepspherecosmo}.
% Global tasks need a spatial summarization: the FCN variant is rotation invariant (and accepts inputs of varying sizes), while the CNN variant is not.
% Dense tasks (when the output lives on the sphere, like segmentation) are rotation equivariant.
% \todo{Shall we keep this figure? It's very nice, but we could save space. ML people probably have this in mind.}
% }
% \label{fig:architecture}
% \end{figure}
DeepSphere leverages graph convolutions to achieve the following properties: (i) computational efficiency, (ii) sampling flexibility, and (iii) rotation equivariance (\secref{equivariance}).
% An example architecture is shown in \figref{architecture}.
The main idea is to model the sampled sphere as a graph of connected pixels: the length of the shortest path between two pixels is an approximation of the geodesic distance between them.
We use the graph CNN formulation introduced by \citet{defferrard2016graphnn} and a pooling strategy that exploits hierarchical samplings of the sphere.
\paragraph{Sampling.}
A sampling scheme $\V = \{x_i \in \S^2\}_{i=1}^n$ is defined to be the discrete subset of the sphere containing the $n$ points where the values of the signals that we want to analyse are known. For a given continuous signal $f$, we represent such values in a vector $\b{f}\in\mathbb R^n$. As there is no analogue of uniform sampling on the sphere, many samplings have been proposed with different tradeoffs.
In this work, depending on the considered application, we will use the equiangular \citep{driscoll1994Fouriersphere}, HEALPix \citep{gorski2005healpix}, and icosahedral \citep{baumgardner1985icosahedral} samplings.
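For concreteness, the vertex positions of, e.g., a HEALPix sampling can be obtained with the healpy package \citep{healpy}; the following minimal sketch (resolution chosen arbitrarily) produces the coordinates used by the graph construction below.
\begin{verbatim}
import numpy as np
import healpy as hp

nside = 32                                    # HEALPix resolution parameter
npix = hp.nside2npix(nside)                   # n = 12 * nside**2 pixels
x, y, z = hp.pix2vec(nside, np.arange(npix))  # pixel centers on the unit sphere
coords = np.stack([x, y, z], axis=-1)         # sampling V = {x_i}, shape (n, 3)
\end{verbatim}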
\paragraph{Graph.}
From $\V$, we construct a weighted undirected graph $\G = (\V, w)$, where the elements of $\V$ are the vertices and the weight $w_{ij} = w_{ji}$ is a similarity measure between vertices $x_i$ and $x_j$.
The combinatorial graph Laplacian $\b{L} \in \R^{n \times n}$ is defined as $\b{L} = \b{D} - \b{A}$, where $\b{A} = (w_{ij})$ is the weighted adjacency matrix, $\b{D} = (d_{ii})$ is the diagonal degree matrix, and $d_{ii} = \sum_j w_{ij}$ is the weighted degree of vertex $x_i$.
%$\b{D} = \diag(\b{A} \b{1})$
Given a sampling $\V$, usually fixed by the application or the available measurements, the freedom in constructing $\G$ is in setting $w$.
\Secref{equivariance} shows how to set $w$ to minimize the equivariance error.
%building a good graph amounts to set a good $w$.
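For illustration, a minimal NumPy/SciPy sketch of this construction (the helper name \texttt{build\_graph\_laplacian} is ours, not the reference implementation) builds a $k$-nearest-neighbor graph with the Gaussian weights of \eqnref{weights:belkin} and its combinatorial Laplacian.
\begin{verbatim}
import numpy as np
from scipy import sparse
from scipy.spatial import cKDTree

def build_graph_laplacian(coords, k, t):
    """Combinatorial Laplacian L = D - A of a k-NN graph with
    Gaussian weights w_ij = exp(-||x_i - x_j||^2 / (4 t))."""
    n = coords.shape[0]
    dist, idx = cKDTree(coords).query(coords, k=k + 1)  # neighbor 0 is the vertex itself
    dist, idx = dist[:, 1:], idx[:, 1:]
    rows = np.repeat(np.arange(n), k)
    weights = np.exp(-dist.ravel()**2 / (4 * t))
    A = sparse.csr_matrix((weights, (rows, idx.ravel())), shape=(n, n))
    A = A.maximum(A.T)                                  # symmetrize the k-NN relation
    D = sparse.diags(np.asarray(A.sum(axis=1)).ravel())
    return (D - A).tocsr()

# L = build_graph_laplacian(coords, k=8, t=0.01)  # coords from the sampling sketch above
\end{verbatim}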
\paragraph{Convolution.} \label{sec:method:convolution}
On Euclidean domains, convolutions are efficiently implemented by sliding a window in the signal domain.
On the sphere however, there is no straightforward way to implement a convolution in the signal domain due to non-uniform samplings.
Convolutions are most often performed in the spectral domain through a spherical harmonic transform (SHT).
That is the approach taken by \citet{cohen2018sphericalcnn} and \citet{esteves2018sphericalcnn}, which has a computational cost of $\bO(n^{3/2})$ on isolatitude samplings (such as the HEALPix and equiangular samplings) and $\bO(n^2)$ in general.
On the other hand, following \citet{defferrard2016graphnn}, graph convolutions can be defined as
\begin{equation} \label{eqn:graph_conv}
h(\b{L}) \b{f} = \left(\sum_{i=0}^P \alpha_i \b{L}^i\right) \b{f},
\end{equation}
where $P$ is the polynomial order (which corresponds to the filter's size) and $\alpha_i$ are the coefficients to be optimized during training.\footnote{In practice, training with Chebyshev polynomials (instead of monomials) is slightly more stable. We believe it to be due to their orthogonality and uniformity.}
Those convolutions are used by \citet{khasanova2017sphericalcnn} and \citet{perraudin2019deepspherecosmo} and cost $\bO(n)$ operations through a recursive application of $\b{L}$.\footnote{As long as the graph is sparsified such that the number of edges, i.e., the number of non-zeros in $\b{A}$, is proportional to the number of vertices $n$. This can always be done as most weights are very small.}
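In code, the filter \eqnref{graph_conv} reduces to $P$ sparse matrix-vector products; a minimal sketch in the monomial basis (a Chebyshev basis is used in practice, as noted above) could read as follows.
\begin{verbatim}
def graph_conv(L, f, alpha):
    """h(L) f = sum_i alpha_i L^i f, by recursive application of the sparse L."""
    out = alpha[0] * f
    x = f
    for a in alpha[1:]:
        x = L @ x            # one sparse matrix-vector product: O(kn) operations
        out = out + a * x
    return out
\end{verbatim}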
\paragraph{Pooling.}
Down- and up-sampling is natural for hierarchical samplings,\footnote{The equiangular, HEALPix, and icosahedral samplings are of this kind.} where each subdivision divides a pixel into (an equal number of) child sub-pixels.
To pool (down-sample), the data supported on the sub-pixels is summarized by a permutation invariant function such as the maximum or the average.
To unpool (up-sample), the data supported on a pixel is copied to all its sub-pixels.
% Frédérick: In the case of the icosahedron, the pooling is not permutation invariant (we take the value of the first pixel, without looking at the others)
% Frédérick: And the unpooling is only correct for average pooling (but we don't do max pooling in that case). Once again, the icosahedron follows Jiang, where a single sub-pixel takes the value of the pixel and the rest are filled with '1's
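When the pixels are stored in an order that respects the hierarchy (e.g., HEALPix in nested ordering, where the four children of pixel $i$ are pixels $4i$ to $4i+3$), pooling and unpooling reduce to a reshape; a minimal single-channel sketch:
\begin{verbatim}
def pool(f, factor=4):
    """Down-sample: summarize each group of child sub-pixels (here with a maximum)."""
    return f.reshape(-1, factor).max(axis=1)

def unpool(f, factor=4):
    """Up-sample: copy the value of each pixel to all its child sub-pixels."""
    return np.repeat(f, factor)
\end{verbatim}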
\paragraph{Architecture.}
All our NNs are fully convolutional, and employ a global average pooling (GAP) for rotation invariant tasks.
Graph convolutional layers are always followed by batch normalization and ReLU activation, except in the last layer.
Note that batch normalization and activation act on the elements of $\b{f}$ independently, and hence don't depend on the domain of $f$.
%We denote graph convolutional layers with $C$ output channels as $GC_C$ and fully connected layers with $C$ output neurons as $FC_C$.
%Those linear layers are always followed by batch normalization and ReLU activation, except in the last layer.
%softmax normalization as $SM$
%Pooling and unpooling by a factor $F$ are denoted as $P_F$ and $P^{-1}_F$, global average pooling as $GAP$.
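For illustration only, a toy single-channel forward pass for a rotation-invariant task, reusing the \texttt{graph\_conv} and \texttt{pool} sketches above (real layers have multiple channels, learned weights, and batch normalization, and assume a hierarchy-respecting pixel ordering):
\begin{verbatim}
def deepsphere_features(f, laplacians, alphas):
    """Toy pipeline: graph convolution -> ReLU -> pooling per resolution, then GAP."""
    x = f
    for L, alpha in zip(laplacians, alphas):          # one Laplacian per resolution level
        x = np.maximum(graph_conv(L, x, alpha), 0.0)  # convolution + ReLU
        x = pool(x)                                   # down-sample to the coarser sampling
    return x.mean()                                   # global average pooling (invariant)
\end{verbatim}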
% \section{Notation}
% Throughout this contribution, we denote with bold font all vectors and matrices such as $\b{L}$ the graph Laplacian and $\b{f}$ the vector of the sampled function $f$.
% \nati{Right now, it is not True}
% % , a weighted undirected graph $G(V, E, \mathbf W)$,
% \nati{I do not think we should have a notation section, but should somehow write this somewhere...}
% mdeff: I think this convention is clear enough when reading.
\section{Graph convolution and equivariance} \label{sec:equivariance}
%\section{Is graph convolution equivariant to rotation? [2 pages]}
While the graph framework offers great flexibility, its ability to faithfully represent the underlying sphere --- for graph convolutions to be rotation equivariant --- highly depends on the sampling locations and the graph construction.
\subsection{Problem formulation}
% 0.1 Explain the sampling problem
%As the graph convolution acts on a sampled version of the function $\b{f}$ and the rotation on its continuous counterpart $f$, a sampling operator is needed.
A continuous function $f \in F_\V \subset \mathcal C(\S^2)$, $f: \S^2 \to \R$, is sampled as $T_\V(f) = \b{f}$ by the sampling operator $T_\V: C(\S^2) \supset F_\V \to \R^n$ defined as $(T_\V f)_i = f(x_i)$.
We require $F_\V$ to be a suitable subspace of continuous functions such that $T_\V$ is invertible, i.e., the function $f \in F_\V$ can be unambiguously reconstructed from its sampled values $\b{f}$.
The existence of such a subspace depends on the sampling $\V$ and its characterization is a common problem in signal processing \citep{driscoll1994Fouriersphere}.
For most samplings, it is not known if $F_\V$ exists and hence if $T_\V$ is invertible.
A special case is the equiangular sampling where a sampling theorem holds, and thus a closed-form of $T_\V^{-1}$ is known. % \citep{driscoll1994Fouriersphere}
For samplings where no such sampling formula is available, we leverage the discrete SHT to reconstruct $f$ from $\b{f}=T_\V f$, thus approximating $T_\V^{-1}$.
For all theoretical considerations, we assume that $F_\V$ exists and $f \in F_\V$.
%\mdeff{we need $T_\V^{-1}$ as we are concerned with continuous rotations}
% 0.2 Explain why the Laplacian is so important...
By definition, the (spherical) graph convolution is rotation equivariant if and only if it commutes with the rotation operator defined as $\Rg, g\in SO(3)$: $\Rg f(x) = f\left(g^{-1} x \right)$.
In the context of this work, graph convolution is performed by recursive applications of the graph Laplacian \eqnref{graph_conv}.
Hence, if $\Rg$ commutes with $\b{L}$, then, by recursion, it will also commute with the convolution $h(\b{L})$.
As a result, $h(\b{L})$ is rotation equivariant if and only if
% 0.3 Formulate the problem
\begin{equation*} %\label{eq:equivariance}
\b{R}_\V(g) \b{L} \b{f} = \b{L} \b{R}_\V(g) \b{f}, \hspace{1cm} \forall f\in F_\V \text{ and } \forall g\in SO(3),
\end{equation*}
where $\b{R}_\V(g) = T_\V \Rg T_\V^{-1}$.
For an empirical evaluation of equivariance, we define the \textit{normalized equivariance error} for a signal $\b{f}$ and a rotation $g$ as
\begin{equation} \label{eq:equivariance error}
E_{\b{L}}(\b{f}, g) = \left(\frac{ \norm {\b{R}_\V(g) \b{L} \b{f} - \b{L} \b{R}_\V(g) \b{f}} }{\norm {\b{L} \b{f}}}\right)^2.
\end{equation}
More generally for a class of signals $f \in C \subset F_\V$, the \textit{mean equivariance error} defined as
\begin{equation} \label{eq:mean equivariance error}
\overline E_{\b{L}, C} = \mathbb E_{\b{f}\in C, g\in SO(3)} \ E_{\b{L}}(\b{f}, g)
\end{equation}
represents the overall equivariance error.
The expected value is obtained by averaging over a finite number of random functions and random rotations.
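In practice, $\b{R}_\V(g)$ can be implemented through the discrete SHT; the following sketch does so for the HEALPix sampling with healpy \citep{healpy} (assuming a version that provides \texttt{Rotator.rotate\_alm}) and evaluates \eqref{eq:equivariance error} for one signal and one rotation.
\begin{verbatim}
import numpy as np
import healpy as hp

def rotate_sampled_map(f, angles, lmax):
    """R_V(g) f = T_V R(g) T_V^{-1} f, approximated with the discrete SHT."""
    nside = hp.npix2nside(len(f))
    alm = hp.map2alm(f, lmax=lmax)                            # approximate T_V^{-1}
    alm = hp.Rotator(rot=angles, deg=False).rotate_alm(alm)   # R(g) in harmonic space
    return hp.alm2map(alm, nside, lmax=lmax)                  # back to the sampled domain

def equivariance_error(L, f, angles, lmax):
    """Normalized equivariance error for one signal f and one rotation."""
    Lf = L @ f
    diff = rotate_sampled_map(Lf, angles, lmax) - L @ rotate_sampled_map(f, angles, lmax)
    return (np.linalg.norm(diff) / np.linalg.norm(Lf)) ** 2
\end{verbatim}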
%\subsection{Finding the optimal edge weights} \label{sec:optimal}
\subsection{Finding the optimal weighting scheme} \label{sec:optimal}
% 1) summarize the work of Khasanova
Considering the equiangular sampling and graphs where each vertex is connected to 4 neighbors (north, south, east, west), \citet{khasanova2017sphericalcnn} designed a weighting scheme to minimize \eqref{eq:mean equivariance error} for longitudinal and latitudinal rotations\footnote{Equivariance to longitudinal rotation is essentially given by the equiangular sampling.}.
% because longitudinal rotation can be discrete
Their solution gives weights inversely proportional to Euclidean distances:
\begin{equation} \label{eqn:weights:khasanova}
w_{ij} = \frac{1}{\norm{x_i-x_j}}.
\end{equation}
%leading to a high mean equivariance error if the mean is computed over rotations in SO(3).
While the resulting convolution is not equivariant to the whole of $SO(3)$ (\figref{equivariance_error}), it is enough for omnidirectional imaging because, as gravity consistently orients the sphere, objects only rotate longitudinally or latitudinally.
% 2) explain our approach
To achieve equivariance to all rotations, we take inspiration from \citet{belkin2005towards}.
They prove that for a \emph{random uniform sampling}, the graph Laplacian $\b{L}$ built from weights
\begin{equation} \label{eqn:weights:belkin}
w_{ij} = e^{- \frac{1}{4t} \norm{x_i - x_j}^2}
\end{equation}
converges to the Laplace-Beltrami operator $\Delta_{\mathbb{S}^2}$ as the number of samples grows to infinity.
This result is a good starting point as $\Delta_{\mathbb{S}^2}$ commutes with rotation, i.e., $\Delta_{\mathbb{S}^2}\Rg = \Rg\Delta_{\mathbb{S}^2}$.
% and (ii) is diagonalized by the spherical harmonics.
While this weighting scheme yields a full graph (i.e., every vertex is connected to every other vertex), most weights are small due to the exponential decay.
We hence make an approximation to limit the cost of the convolution \eqnref{graph_conv} by only considering the $k$ nearest neighbors ($k$-NN) of each vertex.
%In practice, the tricky part is to set $t$ in a non-asymptotic regime.
Given $k$, the optimal kernel width $t$ is found by searching for the minimizer of \eqref{eq:mean equivariance error}.
\Figref{kernel_widths} shows the optimal kernel widths found for various resolutions of the HEALPix sampling.
As predicted by the theory, $t_n \propto n^\beta, \beta \in \R$.
Importantly however, the optimal $t$ also depends on the number of neighbors $k$.
Considering the HEALPix sampling, \citet{perraudin2019deepspherecosmo} connected each vertex to its 8 adjacent vertices in the tiling of the sphere, computed the weights with \eqnref{weights:belkin}, and heuristically set $t$ to half the average squared Euclidean distance between connected vertices.
This heuristic however over-estimates $t$ (\figref{kernel_widths}) and leads to an increased equivariance error (\figref{equivariance_error}).
% We observe that using a small kernel size $t$, \cite{perraudin2019deepspherecosmo} would probably be more equivariant.
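The search for the optimal $t$ is a one-dimensional minimization of \eqref{eq:mean equivariance error}; a brute-force sketch, reusing the illustrative helpers \texttt{build\_graph\_laplacian} and \texttt{equivariance\_error} defined above:
\begin{verbatim}
def mean_equivariance_error(L, signals, rotations, lmax):
    """Average the normalized error over sampled signals and rotations."""
    errors = [equivariance_error(L, f, g, lmax) for f in signals for g in rotations]
    return np.mean(errors)

def optimal_kernel_width(coords, k, widths, signals, rotations, lmax):
    """Grid search over candidate kernel widths t, for a fixed number of neighbors k."""
    return min(widths, key=lambda t: mean_equivariance_error(
        build_graph_laplacian(coords, k, t), signals, rotations, lmax))
\end{verbatim}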
\begin{figure}
\begin{minipage}{0.6\linewidth}
\centering
\includegraphics[width=\linewidth]{equivariance_error}
\caption{
Mean equivariance error \eqref{eq:mean equivariance error}.
There is a clear tradeoff between equivariance and computational cost, governed by the number of vertices $n$ and edges $kn$.
}
\label{fig:equivariance_error}
\end{minipage}
\hfill
\begin{minipage}{0.35\linewidth}
\centering
\includegraphics[width=\linewidth]{kernel_widths}
\caption{Kernel widths.}
\label{fig:kernel_widths}
\vspace{1em}
\includegraphics[height=4.5em]{lamp_000018}
\hfill
\includegraphics[height=4.5em]{lamp_000018_sphere_nobar}
\caption{3D object represented as a spherical depth map.}
\label{fig:depthmap}
\vspace{1em}
\includegraphics[width=\linewidth]{spectrum}
\caption{Power spectral densities.}
\label{fig:spectrum}
\end{minipage}
\end{figure}
\subsection{Analysis of the proposed weighting scheme}
We analyze the proposed weighting scheme both theoretically and empirically.
\paragraph{Theoretical convergence.}
\begin{wrapfigure}[10]{r}{3cm}
\centering
\includegraphics[width=0.9\linewidth]{patch}
\caption{Patch.}
\label{fig:patch}
\end{wrapfigure}
We extend the work of \citet{belkin2005towards} to a sufficiently regular, deterministic sampling.
Following their setting, we define the \emph{extended graph Laplacian} operator as the linear operator $L_n^t: L^{2}(\S^2) \rightarrow L^{2}(\S^2)$ such that
\begin{equation} \label{eq:Heat Kernel Graph Laplacian operator}
L_n^t f(y) := \frac{1}{n}\sum_{i=1}^{n} e^{ -\frac{\|x_i-y\|^2}{4t}} \left(f(y)-f(x_i)\right).
\end{equation}
This operator extends the graph Laplacian with the weighting scheme \eqnref{weights:belkin} to each point of the sphere (i.e., $\b{L}_n^t \b{f} = T_\V L_n^t f$).
As the radius of the kernel $t$ will be adapted to the number of samples, we scale the operator as $\hat{L}_n^t := |\S^2| (4\pi t^2)^{-1} L_n^t$. % \frac{|\S^2|}{4\pi t^2}
% $\b{\hat L} = \frac{|\mathbb S^2|}{4\pi t^2}\b L_n^t$. \nati{why define this, we are not using it}
Given a sampling $\V$, we define $\sigma_i$ to be the patch of the surface of the sphere corresponding to $x_i$, $A_i$ its corresponding area, and $d_i$ the largest distance between the center $x_i$ and any point on the surface $\sigma_i$.
Define $d^{(n)} := \max_{i=1, \dots, n} d_i$ and $A^{(n)} := \max_{i=1, \dots, n} A_i$.
%Given any rotation $g\in SO(3)$, we define the corresponding rotation operator $\Rg$ such that for any function $f$ on the sphere
%\mdeff{Already defined in problem formulation.}
%\begin{equation} \label{eq:rotation operator}
% \Rg f(x) := f(g^{-1}(x)).
%\end{equation}
\begin{theorem}
For a sampling $\V$ of the sphere that is equi-area and such that $d^{(n)} \leq \frac{C}{n^\alpha}, \ \alpha\in (0,\linefrac{1}{2}]$, for all $f: \S^2 \rightarrow \R$ Lipschitz with respect to the Euclidean distance in $\R^3$, for all $y\in\S^2$, there exists a sequence $t_n = n^\beta$, $\beta\in\mathbb R$ such that
% the rescaled Heat Kernel Graph Laplacian $\frac{|\S^2|}{4\pi t_n^2}\b {L}^t_n$ converges pointwise to the Laplace Beltrami operator on the sphere $\Delta{\S^2}$
\begin{equation*}
\lim_{n\to\infty}\hat{L}_n^{t_n}f(y) = \laplacian f(y).
\end{equation*}
\label{theo:pointwise convergence for a regular sampling}
\end{theorem}
%As a direct implication of this theorem, the scaled graph Laplacian (the discrete counterpart of $\hat L^t_n$) converges pointwise towards the Laplace-Beltrami operator $\Delta_{\S^2}$.
This is a major step towards equivariance, as the Laplace-Beltrami operator commutes with rotation.
Based on this property, we show the equivariance of the scaled extended graph Laplacian.
\begin{theorem}\label{theo:equivariance}
Under the hypothesis of theorem~\ref{theo:pointwise convergence for a regular sampling}, the scaled graph Laplacian commutes with any rotation, in the limit of infinite sampling, i.e.,
\begin{equation*}
\forall y\in\mathbb S^2\quad\left| \Rg \hat L_n^{t_n} f (y) - \hat L_n^{t_n} \Rg f(y) \right| \xrightarrow{n\to\infty}0.
\end{equation*}
\end{theorem}
From this theorem, it follows that the discrete graph Laplacian will be equivariant in the limit of $n \rightarrow \infty$ as by construction $\b{L}_n^t \b{f} = T_\V L_n^t f$ and as the scaling does not affect the equivariance property of $\b{L}_n^t$.
Importantly, the proof of Theorem~\ref{theo:pointwise convergence for a regular sampling} (in Appendix~\ref{sec: appendix: proof of theorem}) inspires our construction of the graph Laplacian.
In particular, it tells us that $t$ should scale as $n^\beta$, which has been empirically verified (\figref{kernel_widths}).
%In particular, it gives insights on how the parameter $t$ needs to be adjusted with respect to the number of points $n$.
Nevertheless, it is important to keep in mind the limits of Theorem \ref{theo:pointwise convergence for a regular sampling} and \ref{theo:equivariance}.
Both theorems present asymptotic results, but in practice we will always work with finite samplings.
Furthermore, since this method is based on the capability of the eigenvectors of the graph Laplacian to approximate the spherical harmonics,
% maybe erase the previous sentence?
a stronger type of convergence of the graph Laplacian would be preferable, i.e., spectral convergence (that is proved for a full graph in the case of random sampling for a class of Lipschitz functions in \citep{belkin2007convergence}).
% First, it is \emph{not} a proof of equivariance of the graph convolution for a specific sampling. To construct such a proof, a stronger convergence result is needed, for example spectral convergence (instead of pointwise).
% \todo{Can we intuitively write what pointwise convergence means, and how it is more limited than spectral or uniform convergence?}
%We note that in the random sampling case, the graph laplacian does \emph{not} converge uniformly in general \citep[Sec. 3]{belkin2007convergence}, but does for a class of Lipschitz functions \citep[Sec. 6]{belkin2005towards}.
Finally, while we do not have a formal proof for it, we strongly believe that the HEALPix sampling does satisfy the hypothesis $d^{(n)}\leq \frac{C}{n^\alpha}, \ \alpha\in (0,\linefrac{1}{2}]$, with $\alpha$ very close or equal to $\linefrac{1}{2}$.
The empirical results discussed in the next paragraph also point in this direction.
This is further discussed in Appendix \ref{sec: appendix: proof of theorem}.
% 3) Show experimental results
\paragraph{Empirical convergence.}
% 3.0) explain how we obtained the figures
\Figref{equivariance_error} shows the equivariance error \eqref{eq:mean equivariance error} for different parameter sets of DeepSphere for the HEALPix sampling as well as for the graph construction of \citet{khasanova2017sphericalcnn} for the equiangular sampling.
The error is estimated as a function of the sampling resolution and signal frequency.
The resolution is controlled by the number of pixels $n = 12N_{side}^2$ for HEALPix and $n = 4b^2$ for the equiangular sampling.
The frequency is controlled by setting the set $C$ to functions $f$ made of spherical harmonics of a single degree $\ell$.
To allow for an almost perfect implementation (up to numerical errors) of the operator $\b{R}_\V$, the degree $\ell$ was chosen in the range $(0, 3N_{side}-1)$ for HEALPix and $(0, b)$ for the equiangular sampling \citep{gorski1999healpixprimer}.
Using these parameters, the measured error is mostly due to imperfections in the empirical approximation of the Laplace-Beltrami operator and not to the sampling.
% 3.1) explain what the figure tells us
\Figref{equivariance_error} shows that the weighting scheme \eqnref{weights:khasanova} from \citep{khasanova2017sphericalcnn} does indeed not lead to a convolution that is equivariant to all rotations $g \in SO(3)$.\footnote{We however verified that the convolution is equivariant to longitudinal and latitudinal rotations, as intended.}
For $k=8$ neighbors, selecting the optimal kernel width $t$ improves on \citep{perraudin2019deepspherecosmo} at no cost, highlighting the importance of this parameter.
Increasing the resolution decreases the equivariance error in the high frequencies, an effect most probably due to the sampling.
Most importantly, the equivariance error decreases when connecting more neighbors.
Hence, the number of neighbors $k$ gives us a precise control of the tradeoff between cost and equivariance.
\section{Experiments}
% Show:
% * meet the desiderata: scale
% * DeepSphere V1 and V2 are equivalent in practice: better equivariance doesn't translate to better performance
% * anisotropy doesn't help
\subsection{3D objects recognition} \label{sec:exp:shrec}
The recognition of 3D shapes is a rotation invariant task: rotating an object doesn't change its nature.
While 3D shapes are usually represented as meshes or point clouds, representing them as spherical maps (\figref{depthmap}) naturally allows a rotation invariant treatment.
%LIDAR is an example of a physical sensor that acquires spherical depth maps
%, which are a posteriori represented as point clouds.
% but there is gravity, might not really be SO(3) invariant
%\todo{move to intro?} no, good to motivate each task
The SHREC'17 shape retrieval contest \citep{shrec17} contains 51,300 randomly oriented 3D models from ShapeNet \citep{shapenet}, to be classified in 55 categories (tables, lamps, airplanes, etc.).
As in \citep{cohen2018sphericalcnn}, objects are represented by 6 spherical maps.
At each pixel, a ray is traced towards the center of the sphere.
The distance from the sphere to the object forms a depth map.
The $\cos$ and $\sin$ of the surface angle form two normal maps.
The same is done for the object's convex hull.\footnote{Although we didn't observe much improvement from using the convex hull.}
The maps are sampled by an equiangular sampling with bandwidth $b = 64$ ($n = 4 b^2 = 16,384$ pixels) or a HEALPix sampling with $N_{side} = 32$ ($n = 12 N_{side}^2 = 12,288$ pixels).
The equiangular graph is built with \eqnref{weights:khasanova} and $k = 4$ neighbors \citep[following][]{khasanova2017sphericalcnn}.
The HEALPix graph is built with \eqnref{weights:belkin}, $k = 8$, and a kernel width $t$ set to the average of the distances \citep[following][]{perraudin2019deepspherecosmo}.
The NN is made of $5$ graph convolutional layers, each followed by a max pooling layer which down-samples by $4$.
A GAP and a fully connected layer with softmax follow.
The polynomials are all of order $P=3$ and the number of channels per layer is $16, 32, 64, 128, 256$, respectively.
Following \citet{esteves2018sphericalcnn}, the cross-entropy plus a triplet loss is optimized with Adam for 30 epochs on the dataset augmented by 3 random translations.
The learning rate is $5 \cdot 10^{-2}$ and the batch size is 32.
\begin{table}
\centering
%\begin{tabular}{l cc r rr}
\begin{tabular}{l cc r rS[table-format=2.0]}
\toprule
& \multicolumn{2}{c}{performance} & \multicolumn{1}{c}{size} & \multicolumn{2}{c}{speed} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-4} \cmidrule(lr){5-6}
& F1 & mAP & params & \multicolumn{1}{c}{inference} & \multicolumn{1}{c}{training} \\
\midrule
% $SO(3)$ vs $S^2$ vs graph
\citet{cohen2018sphericalcnn} ($b=128$) & - & 67.6 & 1400\,k & 38.0\,ms & 50\,h \\
\citet{cohen2018sphericalcnn} (simplified,\protect\footnotemark $b=64$) & 78.9 & 66.5 & 400\,k & 12.0\,ms & 32\,h \\
\citet{esteves2018sphericalcnn} ($b=64$) & 79.4 & 68.5 & 500\,k & 9.8\,ms & 3\,h \\
DeepSphere (equiangular, $b=64$) & 79.4 & 66.5 & 190\,k & 0.9\,ms & 50\,m \\
DeepSphere (HEALPix, $N_{side}=32$) & 80.7 & 68.6 & 190\,k & 0.9\,ms & 50\,m \\
\bottomrule
\end{tabular}
\caption{
Results on SHREC'17 (3D shapes). DeepSphere achieves similar performance at a much lower cost, suggesting that anisotropic filters are an unnecessary price to pay.
% F1-score computed with sklearn, mAP from the official script of the competition.
% inference speed = time for a single instance to do a single training pass
% training speed = time for the neural net to train to peak performance
}
\label{tab:shrec17}
\end{table}
\footnotetext[7]{As implemented in \url{https://github.com/jonas-koehler/s2cnn}.}
Results are shown in \tabref{shrec17}.
As the network is trained for shape classification rather than retrieval, we report the classification F1 alongside the mAP used in the retrieval contest.\footnote{We omit the F1 for \citet{cohen2018sphericalcnn} as we didn't get the mAP reported in the paper when running it.}
DeepSphere achieves the same performance as \citet{cohen2018sphericalcnn} and \citet{esteves2018sphericalcnn} at a much lower cost, suggesting that anisotropic filters are an unnecessary price to pay.
% x times faster
As the information in those spherical maps resides in the low frequencies (\figref{spectrum}), reducing the equivariance error didn't translate into improved performance.
For the same reason, using the more uniform HEALPix sampling or lowering the resolution down to $N_{side} = 8$ ($n=768$ pixels) didn't impact performance either.
%\todo{The same conclusions are drawn from ModelNet40 (\tabref{modelnet40}).}
% Limited by representation of data? yep, treat them as point clouds
\subsection{Cosmological model classification} \label{sec:exp:cosmo}
Given observations, cosmologists estimate the posterior probability of cosmological parameters, such as the matter density $\Omega_m$ and the normalization of the matter power spectrum $\sigma_8$.
Those parameters are estimated by likelihood-free inference, which requires a method to extract summary statistics to compare simulations and observations.
As the sufficient and most concise summary statistics are the parameters themselves, one desires a method to predict them from simulations.
% such as Approximate Bayesian computation (ABC)
As that is complicated to set up, prediction methods are typically benchmarked on the classification of spherical maps instead \citep{schmelze2017cosmologicalmodel}.
We used the same task, data, and setup as \citet{perraudin2019deepspherecosmo}: the classification of $720$ partial convergence maps made of $n \approx 10^6$ pixels ($1/12 \approx 8\%$ of a sphere at $N_{side} = 1024$) from two $\Lambda$CDM cosmological models, ($\Omega_m = 0.31$, $\sigma_8 = 0.82)$ and ($\Omega_m = 0.26$, $\sigma_8 = 0.91)$, at a relative noise level of $3.5$ (i.e., the signal is hidden in noise of $3.5$ times higher standard deviation).
% 60 maps * 12 parts = 720 parts
Convergence maps represent the distribution of over- and under-densities of mass in the universe \citep[see][for a review of gravitational lensing]{bartelman2010gravitationallensing}.
Graphs are built with \eqnref{weights:belkin}, $k = 8, 20, 40$ neighbors, and the corresponding optimal kernel widths $t$ given in \secref{optimal}.
Following \citet{perraudin2019deepspherecosmo}, the NN is made of $5$ graph convolutional layers, each followed by a max pooling layer which down-samples by $4$.
A GAP and a fully connected layer with softmax follow.
The polynomials are all of order $P=4$ and the number of channels per layer is $16, 32, 64, 64, 64$, respectively.
The cross-entropy loss is optimized with Adam for 80 epochs.
The learning rate is $2 \cdot 10^{-4} \cdot 0.999^{\textrm{step}}$ and the batch size is 8.
\begin{table}
\begin{minipage}[b]{0.7\linewidth}
\centering
\begin{tabular}{l c c}
\toprule
& accuracy & time \\ % & training time & NN inv. error
\midrule
\citet{perraudin2019deepspherecosmo}, 2D CNN baseline & 54.2 & 104\,ms \\
\citet{perraudin2019deepspherecosmo}, CNN variant, $k=8$ & 62.1 & 185\,ms \\ % & 1\,h\,45
\citet{perraudin2019deepspherecosmo}, FCN variant, $k=8$ & 83.8 & 185\,ms \\ % & 1\,h\,45
$k=8$ neighbors, $t$ from \secref{optimal} & 87.1 & 185\,ms \\ % & 1\,h\,45
$k=20$ neighbors, $t$ from \secref{optimal} & 91.3 & 250\,ms \\ % & 2\,h\,30
$k=40$ neighbors, $t$ from \secref{optimal} & 92.5 & 363\,ms \\ % & 3\,h\,45
\bottomrule
\end{tabular}
\caption{
Results on the classification of partial convergence maps.
Lower equivariance error translates to higher performance.
} \label{tab:cosmo}
\end{minipage} \hfill
\begin{minipage}[b]{0.25\linewidth}
\includegraphics[width=\linewidth]{cosmo_cost_accuracy_tradeoff}
\captionof{figure}{Tradeoff between cost and accuracy.}
\label{fig:cosmo:tradeoff}
\end{minipage}
\end{table}
Unlike on SHREC'17, results (\tabref{cosmo}) show that a lower equivariance error on the convolutions translates to higher performance.
That is probably due to the high frequency content of those maps (\figref{spectrum}).
There is a clear cost-accuracy tradeoff, controlled by the number of neighbors $k$ (\figref{cosmo:tradeoff}).
This experiment moreover demonstrates DeepSphere's flexibility (using partial spherical maps) and scalability (competing spherical CNNs were tested on maps of at most $10,000$ pixels).
\subsection{Climate event segmentation} \label{sec:exp:climate}
%TODO: motivate -> summarize climate simulations to enable decisions
We evaluate our method on a task proposed by \citep{mudigonda2017climateevents}: the segmentation of extreme climate events, Tropical Cyclones (TC) and Atmospheric Rivers (AR), in global climate simulations (\figref{examples:climate}).
The data was produced by a 20-year run of the Community Atmospheric Model v5 (CAM5) and consists of 16 channels such as temperature, wind, humidity, and pressure at multiple altitudes.
We used the pre-processed dataset from \citep{jiang2019sphericalcnn}.\footnote{Available at \url{http://island.me.berkeley.edu/ugscnn/data}.}
%\url{https://github.com/maxjiang93/ugscnn}
%\url{https://github.com/maxjiang93/ugscnn/blob/master/experiments/exp4_sphere_climate/run.sh}
%\url{http://island.me.berkeley.edu/ugscnn/data/climate_sphere_l5.zip}
There are 1,072,805 spherical maps, down-sampled to a level-5 icosahedral sampling ($n = 10 \cdot 4^l + 2 = 10,242$ pixels).
The labels are heavily unbalanced with 0.1\% TC, 2.2\% AR, and 97.7\% background (BG) pixels.
The graph is built with \eqnref{weights:belkin}, $k = 6$ neighbors, and a kernel width $t$ set to the average of the distances.
Following \citet{jiang2019sphericalcnn}, the NN is an encoder-decoder with skip connections.
Details in \secref{climate:appendix}.
%The encoder is made of $5$ graph convolutional layers, each followed by a \todo{pick-one?} pooling layer which down-samples by $4$.
%the number of channels per layer is $16, 32, 64, 64, 64$, respectively.
%The decoder is made of
The polynomials are all of order $P=3$.
The cross-entropy loss (weighted or non-weighted) is optimized with Adam for 30 epochs.
The learning rate is $1 \cdot 10^{-3}$ and the batch size is 64.
\begin{table}
\centering
\begin{tabular}{l l l}
\toprule
& accuracy & mAP \\
\midrule
%\citet{jiang2019sphericalcnn} (paper) & 94.67 & - \\
\citet{jiang2019sphericalcnn} (rerun) & 94.95 & 38.41 \\
\citet{cohen2019gauge} (S2R) & 97.5 & 68.6 \\
\citet{cohen2019gauge} (R2R) & 97.7 & 75.9 \\
DeepSphere (weighted loss) & $97.8\pm 0.3$ & $77.15\pm 1.94$ \\
DeepSphere (non-weighted loss) & $87.8\pm 0.5$ & $89.16\pm 1.37$ \\
\bottomrule
\end{tabular}
\caption{
Results on climate event segmentation: mean accuracy (over TC, AR, BG) and mean average precision (over TC and AR).
DeepSphere achieves state-of-the-art performance. %, suggesting again that anisotropic filters are not necessary.
}
\label{tab:climate}
\end{table}
Results are shown in \tabref{climate} (details in tables~\ref{tab:climate:accuracy}, \ref{tab:climate:map} and~\ref{tab:climate:speed}).
The mean and standard deviation are computed over 5 runs.
Note that while \citet{jiang2019sphericalcnn} and \citet{cohen2019gauge} use a weighted cross-entropy loss, that is a suboptimal proxy for the mAP metric.
DeepSphere achieves state-of-the-art performance, suggesting again that anisotropic filters are unnecessary.
Note that results from \citet{mudigonda2017climateevents} cannot be directly compared as they don't use the same input channels.
In light of \citet{cohen2019gauge}'s conclusions, it is surprising that S2R does worse than DeepSphere (which is limited to S2S).
Potential explanations are (i) that their icosahedral projection introduces harmful distortions, or (ii) that a larger architecture can compensate for the lack of generality. % powerfulness?
We indeed observed that more feature maps and depth led to higher performance (\secref{climate:appendix}).
\subsection{Uneven sampling} \label{sec:exp:ghcn}
% time series modeling? GHCN?
% Weather prediction? But it's not a real task. Rather a demo.
To demonstrate the flexibility of modeling the sampled sphere by a graph, we collected historical measurements from $n \approx 10,000$ weather stations scattered across the Earth.\footnote{\url{https://www.ncdc.noaa.gov/ghcn-daily-description}}
The spherical data is heavily non-uniformly sampled, with a much higher density of weather stations over North America than the Pacific (\figref{examples:ghcn:tmax}).
For illustration, we devised two artificial tasks.
A dense regression: predict the temperature on a given day knowing the temperature on the previous 5 days.
A global regression: predict the day (represented as one period of a sine over the year) from temperature or precipitation.
Predicting from temperature is much easier as it has a clear yearly pattern.
The graph is built with \eqnref{weights:belkin}, $k = 5$ neighbors, and a kernel width $t$ set to the average of the distances.
The equivariance property of the resulting graph has not been tested, and we don't expect it to be good due to the heavily non-uniform sampling.
The NN is made of $3$ graph convolutional layers.
The polynomials are all of order $P=0$ or $4$ and the number of channels per layer is $50, 100, 100$, respectively.
For the global regression, a GAP and a fully connected layer follow.
For the dense regression, a graph convolutional layer follows instead.
The MSE loss is optimized with RMSprop for 250 epochs.
The learning rate is $1 \cdot 10^{-3}$ and the batch size is 64.
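To make the flexibility concrete, the graph is built directly on the (uneven) station positions with the same construction as before; a sketch where the station coordinate arrays and the reading of the averaging heuristic for $t$ are illustrative assumptions:
\begin{verbatim}
import numpy as np
from scipy.spatial import cKDTree

def latlon_to_xyz(lat_deg, lon_deg):
    """Map (latitude, longitude) in degrees to unit vectors in R^3."""
    lat, lon = np.radians(lat_deg), np.radians(lon_deg)
    return np.stack([np.cos(lat) * np.cos(lon),
                     np.cos(lat) * np.sin(lon),
                     np.sin(lat)], axis=-1)

coords = latlon_to_xyz(station_lat, station_lon)  # hypothetical arrays of station positions
dist, _ = cKDTree(coords).query(coords, k=6)      # 5 neighbors + the vertex itself
t = np.mean(dist[:, 1:]**2)                       # one reading of the average-distance heuristic
L = build_graph_laplacian(coords, k=5, t=t)       # helper sketched in the method section
\end{verbatim}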
\begin{table}
\centering
\begin{tabular}{
c
S[table-format=2.2]
S[table-format=1.2]
S[table-format=1.3]
S[table-format=1.2]
S[table-format=1.2]
S[table-format=1.3]
S[table-format=1.2]
S[table-format=1.2]
S[table-format=-1.3]
}
\toprule
& \multicolumn{3}{c}{temp. (from past temp.)} & \multicolumn{3}{c}{day (from temperature)} & \multicolumn{3}{c}{day (from precipitation)} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
order $P$ & {MSE} & {MAE} & {R2} & {MSE} & {MAE} & {R2} & {MSE} & {MAE} & {R2} \\
\midrule
$0$ & 10.88 & 2.42 & 0.896 & 0.10 & 0.10 & 0.882 & 0.58 & 0.42 & -0.980 \\
%%$1$ & 8.91 & 2.20 & 0.906 & - & - & - & - & - & - \\
$4$ & 8.20 & 2.11 & 0.919 & 0.05 & 0.05 & 0.969 & 0.50 & 0.18 & 0.597 \\
%$9$ & 8.38 & 2.12 & 0.915 & - & - & - & - & - & - \\
\bottomrule
\end{tabular}
\caption{
Prediction results on data from weather stations.
Structure always improves performance.
}
\label{tab:ghcn}
\end{table}
Results are shown in \tabref{ghcn}.
While using a polynomial order $P=0$ is like modeling each time series independently with an MLP, orders $P>0$ integrate neighborhood information.
Results show that using the structure induced by the spherical geometry always yields better performance.
% * increasing order is good until threshold \\
% * The NN indeed predicts the correct temperature at day T, and not day T-1 (baseline) even though there is small difference. \\
% * \todo{performance of baseline} \\
\section{Conclusion}
This work showed that DeepSphere strikes an interesting, and we think currently optimal, balance between desiderata for a spherical CNN.
A single parameter, the number of neighbors $k$ a pixel is connected to in the graph, controls the tradeoff between cost and equivariance (which is linked to performance).
% vague words
% cannot do better
As computational cost and memory consumption scale linearly with the number of pixels, DeepSphere scales to spherical maps made of millions of pixels, a resolution required to faithfully represent cosmological and climate data.
Also relevant in scientific applications is the flexibility offered by a graph representation (for partial coverage, missing data, and non-uniform samplings).
% partial coverage / missing data
Finally, the implementation of the graph convolution is straightforward, and the ubiquity of graph neural networks --- pushing for their first-class support in DL frameworks --- will make implementations even easier and more efficient.
A potential drawback of graph Laplacian-based approaches is the isotropy of graph filters, reducing in principle the expressive power of the NN.
% in principle -> may in fact not
% Our experiments showed that DeepSphere nonetheless achieved state-of-the-art performance, on par with methods featuring anisotropic filters.
% didn't hurt perf in any experiment
% expressive power / generality / powerful / function approximator
%Why anisotropic filters?
Experiments from \citet{cohen2019gauge} and \citet{boscaini2016anisotropicgraphnn} indeed suggest that more general convolutions achieve better performance.
Our experiments on 3D shapes (\secref{exp:shrec}) and climate (\secref{exp:climate}) however show that DeepSphere's isotropic filters do not hinder performance.
% the SHREC'17 experiment too shows that isotropic filters are sufficient
%Works on shape alignment indicates that anisotropic filters achieve better performance \todo{[Bronstein]}.
Possible explanations for this discrepancy are that NNs somehow compensate for the lack of anisotropic filters, or that some tasks can be solved with isotropic filters.
% why: icosahedron, can do anisotropy with isotropy + non-linearity + pooling
The distortions induced by the icosahedral projection in \citep{cohen2019gauge} or the leakage of curvature information in \citep{boscaini2016anisotropicgraphnn} might also alter performance.
%Further research is needed to determine if anisotropic filters are an unnecessary price to pay or if they are needed in some applications.
% As was recently shown for graph NNs used for graph classification, most general might not be the best, when the ultimate goal is generalization [Andreas paper? + previous by xx]
Developing graph convolutions on irregular samplings that respect the geometry of the sphere is another research direction of importance.
% known underlying manifold
Practitioners currently interpolate their measurements (coming from arbitrarily positioned weather stations, satellites or telescopes) to regular samplings.
This practice either results in a waste of resolution or computational and storage resources.
Our ultimate goal is for practitioners to be able to work directly on their measurements, however distributed.
% beyond the sphere => any manifold
% what are other manifolds of interest?
% done by monet & co on shapes
% beyond scalar fields => simplicial complexes
%\subsubsection*{Author Contributions}
%Left blank for anonymity reason.
% If you'd like to, you may include a section for author contributions as is done
% in many journals. This is optional and at the discretion of the authors.
\subsubsection*{Acknowledgments}
We thank Pierre Vandergheynst for his advice,
%and Andreas Loukas for having processed the GHCN data.
and Taco Cohen for his inputs on the intriguing results of our comparison with \citet{cohen2019gauge}.
We thank the anonymous reviewers for their constructive feedback.
The following software packages were used for computation and plotting: PyGSP \citep{pygsp}, healpy \citep{healpy}, matplotlib \citep{matplotlib}, SciPy \citep{scipy}, NumPy \citep{numpy}, TensorFlow \citep{tensorflow}.
%We used the Python Graph Signal Processing package (PyGSP) \citep{pygsp} for computations and plots.
\bibliography{references}
\bibliographystyle{iclr2020}
\newpage
{\LARGE \sc {Supplementary Material}}
\appendix
\section{Proof of theorem \ref{theo:pointwise convergence for a regular sampling}}\label{sec: appendix: proof of theorem}
\paragraph{Preliminaries.}
The proof of theorem \ref{theo:pointwise convergence for a regular sampling} is inspired by the work of \citet{belkin2005towards}. As a result, we start by restating some of their results.
Given a sampling $\V = \{x_i\in\mathcal M\}_{i=1}^n$ of a closed, compact, and infinitely differentiable manifold $\mathcal{M}$ and a smooth function $f\in\mathcal{C}_\infty(\mathcal{M})$, $f:\mathcal{M} \rightarrow \mathbb{R}$, we define the vector $\b{f}$ of samples of $f$ as $T_\V f = \b{f} \in \mathbb{R}^n,\ \b{f}_i = f(x_i)$.
The proof is constructed by leveraging 3 different operators:
\begin{itemize}
\item The extended graph Laplacian operator, already presented in \eqref{eq:Heat Kernel Graph Laplacian operator}, is a linear operator $L_n^t: L^{2}(\mathcal{M}) \rightarrow L^{2}(\mathcal{M})$ defined as
\begin{equation}
% \label{eq:Heat Kernel Graph Laplacian operator}
L_n^t f(y) := \frac{1}{n}\sum_{i=1}^{n} e^{ -\frac{\|x_i-y\|^2}{4t}} \left(f(y)-f(x_i)\right).
\end{equation}
Note that we have the following relation $\b{L}_n^t \b{f} = T_\V L_n^t f$.
\item The functional approximation to the Laplace-Beltrami operator is a linear operator $L^t: L^{2}(\mathcal{M}) \rightarrow L^{2}(\mathcal{M})$ defined as
\begin{equation}
\label{eq:Functional approximation to the Laplace-Beltrami operator}
L^tf(y) = \int_{\mathcal{M}} e^{-\frac{\|x-y\|^2}{4t}}\left(f(y)-f(x)\right)d\mu(x),
\end{equation}
where $\mu$ is the uniform probability measure on the manifold $\mathcal{M}$, and $\text{vol}(\mathcal{M})$ is the volume of $\mathcal{M}$.
\item
The Laplace-Beltrami operator $\Delta_{\mathcal{M}}$ is defined as the negative divergence of the gradient,
\begin{equation}
\label{eq:laplace-beltrami}
\Delta_{\mathcal M}f(y) := -\text{div}(\nabla_{\mathcal M}f)(y),
\end{equation}
of a differentiable function $f: \mathcal M\rightarrow \R$. The gradient $\nabla_{\mathcal M} f: \mathcal M \rightarrow T_p\mathcal M$ is a vector field on the manifold pointing in the direction of steepest ascent of $f$, where $T_p\mathcal M$ is the tangent space of $\mathcal M$ at $p$, i.e., the vector space of all vectors tangent to $\mathcal M$ at $p$.
\end{itemize}
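As a concrete example of this sign convention, take $f(\theta,\varphi)=\cos\theta$ on $\S^2$, i.e., the restriction of $x\mapsto x_3$ to the sphere. In spherical coordinates,
\begin{equation*}
\Delta_{\S^2} f = -\frac{1}{\sin\theta}\,\partial_\theta\!\left(\sin\theta\,\partial_\theta f\right) - \frac{1}{\sin^2\theta}\,\partial^2_\varphi f = -\frac{1}{\sin\theta}\,\partial_\theta\!\left(-\sin^2\theta\right) = 2\cos\theta = \ell(\ell+1)\,f \quad\text{with } \ell=1,
\end{equation*}
in agreement with the spherical harmonics being eigenfunctions of $\Delta_{\S^2}$ with eigenvalues $\ell(\ell+1)$.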
Leveraging these three operators, \citet{belkin2005towards, belkin2007convergence} have built proofs of both pointwise and spectral convergence of the extended graph Laplacian towards the Laplace-Beltrami operator in the general setting of any compact, closed and infinitely differentiable manifold $\mathcal M$, where the sampling $\V$ is drawn randomly on the manifold. For this reason, their results are all to be interpreted in a probabilistic sense.
Their proofs consist in establishing that \eqref{eq:Heat Kernel Graph Laplacian operator} converges in probability towards \eqref{eq:Functional approximation to the Laplace-Beltrami operator} as $n\rightarrow \infty$, and that \eqref{eq:Functional approximation to the Laplace-Beltrami operator} converges towards \eqref{eq:laplace-beltrami} as $t\rightarrow 0$. In particular, this second step is given by the following proposition.
\begin{prop}
[\citet{belkin2005towards}, Proposition 4.4]
Let $\mathcal{M}$ be a $k$-dimensional compact smooth manifold embedded in some Euclidean space $\mathbb{R}^N$, and fix $y\in\mathcal{M}$. Let $f\in\mathcal{C}_\infty(\mathcal{M})$. Then
\begin{equation}
\frac{1}{t}\frac{1}{(4\pi t)^{k/2}} L^tf(y) \xrightarrow{t\to 0 } \frac{1}{\text{vol}(\mathcal M)}\Delta_{\mathcal M}f(y).
\end{equation}
\label{prop:3}
\end{prop}
\paragraph{Building the proof.}
As the sphere is a compact smooth manifold embedded in $\mathbb{R}^3$, we can reuse Proposition \ref{prop:3}. Thus, our strategy to prove Theorem \ref{theo:pointwise convergence for a regular sampling} is to (i) show that
\begin{equation}\label{eq:continuous convergence}
\lim_{n\to\infty}L_n^{t} f(y) = L^t f(y)
\end{equation}
for a particular class of \emph{deterministic} samplings, and (ii) apply Proposition \ref{prop:3}.
We start by proving that, for smooth functions and any fixed $t$, the extended graph Laplacian $L^t_n$ converges towards its continuous counterpart $L^t$ as the sampling grows in size.
\begin{prop}\label{prop:1}
For an equal-area sampling $\{x_i\in\S^2\}_{i=1}^n$ of the sphere, i.e., one for which $A_i=A_j \ \forall i,j$, and for every function $f: \S^2 \rightarrow \R$ that is Lipschitz with respect to the Euclidean distance $\|\cdot\|$ with Lipschitz constant $C_f$, we have
\begin{equation*}
\left| \int_{\S^2}f({ x})\text{d}{\mu(x)} - \frac{1}{n}\sum_i f( x_i)\right|\leq C_f d^{(n)}.
\end{equation*}
Furthermore, for all $y\in\S^2$, the Heat Kernel Graph Laplacian operator $L^t_n$ converges pointwise to the functional approximation $L^t$ of the Laplace-Beltrami operator:
\begin{equation*}
L_n^tf(y)\xrightarrow{n\to\infty} L^tf(y).
\end{equation*}
\end{prop}
\begin{proof}
Assuming $f:\mathbb S^2 \rightarrow \R$ is Lipschitz with Lipschitz constant $C_f$, we have
\begin{equation*}
\left| \int_{\sigma_{i}}f({ x})\text{d}{\mu(x)} - \frac{1}{n}f( x_i)\right| = \left| \int_{\sigma_{i}}\left(f(x)-f(x_i)\right)\text{d}{\mu(x)}\right| \leq C_fd^{(n)}\frac{1}{n},
\end{equation*}
where $\sigma_i\subset \S^2$ is the patch of the sphere around $x_i$. Indeed, as the sampling is equal area, $\mu(\sigma_i)=\linefrac{1}{n}$, and every $x\in\sigma_i$ is at distance at most $d^{(n)}$ from $x_i$.
Hence, using the triangle inequality and summing the contributions of the $n$ patches, we obtain
\begin{equation*}
\left| \int_{\S^2}f({ x})\text{d}{\mu(x)} - \frac{1}{n}\sum_i f( x_i)\right| \leq \sum_i \left| \int_{\sigma_{i}}f({ x})\text{d}{\mu(x)} - \frac{1}{n}f( x_i)\right|\leq n\, C_fd^{(n)}\frac{1}{n} = C_fd^{(n)}.
\end{equation*}
A direct application of this result leads to the following pointwise convergences: for all Lipschitz $f$ and all $y\in\S^2$,
\begin{equation*}
\frac{1}{n}\sum_i e^{-\frac{\|x_i-y\|^2}{4t}}\xrightarrow{n\to\infty} \int_{\S^2} e^{-\frac{\|x-y\|^2}{4t}}d\mu(x),
\end{equation*}
\begin{equation*}
\frac{1}{n}\sum_i e^{-\frac{\|x_i-y\|^2}{4t}}f(x_i)\xrightarrow{n\to\infty} \int_{\S^2} e^{-\frac{\|x-y\|^2}{4t}}f(x)d\mu(x),
\end{equation*}
since both $x\mapsto e^{-\frac{\|x-y\|^2}{4t}}$ and $x\mapsto e^{-\frac{\|x-y\|^2}{4t}}f(x)$ are Lipschitz (the latter as a product of bounded Lipschitz functions).
Combining these two limits with definitions \eqref{eq:Heat Kernel Graph Laplacian operator} and \eqref{eq:Functional approximation to the Laplace-Beltrami operator} concludes the proof.
\end{proof}
The last proposition shows that, for a \emph{fixed} $t$, $L_n^tf(x)\rightarrow L^tf(x)$. To utilize Proposition \ref{prop:3} and complete the proof, we need to find a sequence $t_n \rightarrow 0$ for which this convergence still holds after rescaling by the diverging factor $\frac{1}{4\pi t_n^2}$.
\begin{prop}\label{prop:2}
Given a sampling that is regular enough, i.e., for which $A_i=A_j \ \forall i,j$ and $d^{(n)}\leq \frac{C}{n^\alpha}$ with $\alpha\in (0,\linefrac{1}{2}]$, there exists a sequence $t_n = n^\beta$, $\beta<0$, such that
\begin{equation*}
\forall f \text{ Lipschitz, } \forall x\in\S^2 \quad \left|\frac{1}{4\pi t_n^2}\left(L_n^{t_n}f(x) - L^{t_n}f(x)\right)\right|\xrightarrow{n\to \infty}0.
\end{equation*}
\end{prop}
\begin{proof}
% [Proof of Proposition \ref{prop:2}]
To ease the notation, we define
\begin{align}
K^t(x;y) &:= e^{-\frac{\|x-y\|^2}{4t}},\\
\phi^t(x;y) &:= e^{-\frac{\|x-y\|^2}{4t}}\left(f(y)-f(x)\right).
\end{align}
We start with the following inequality
\begin{align}
\|L_n^tf-L^tf\|_\infty &= \max _{y\in \S^2} \left|L_n^tf(y)-L^tf(y)\right| \nonumber\\
&= \max _{y\in \S^2} \left| \frac{1}{n} \sum_{i=1}^n \phi^t(x_i; y)- \int_{\S^2} \phi^t(x;y)d\mu(x) \right| \nonumber\\
&\leq \max _{y\in \S^2} \sum_{i=1}^n \left| \frac{1}{n} \phi^t(x_i; y)- \int_{\sigma_i} \phi^t(x;y)d\mu(x) \right| \nonumber\\
&\leq d^{(n)} \max _{y\in \S^2} C_{\phi^t_y} , \label{eq:prop3-base-ineq}
\end{align}
where $C_{\phi^t_y}$ is the Lipschitz constant of $x \mapsto \phi^t(x; y)$, and the last inequality follows from the patch-wise bound established in the proof of Proposition \ref{prop:1}.
Using the assumption $d^{(n)}\leq \frac{C}{n^\alpha}$, we find
\begin{equation*}
\|L_n^tf-L^tf\|_\infty \leq \frac{C}{n^\alpha} \max _{y\in \S^2} C_{\phi^t_y}.
\end{equation*}
We now make explicit the dependence of $C_{\phi^t_y}$ on $t$. Since $\phi^t(x;y) = K^t(x;y)\left(f(y)-f(x)\right)$, with $\norm{f(y)-f(\cdot)}_\infty \leq 2\norm{f}_\infty$, $\norm{K^t(\cdot;y)}_\infty = 1$ and $\norm{\partial_x f}_\infty = C_f$, we get
\begin{align*}
C_{\phi^t_y} &= \|\partial_x\phi^t(\cdot;y)\|_\infty\\&
= \norm{\partial_x K^t(\cdot;y)\left(f(y)-f\right) - K^t(\cdot;y)\,\partial_x f}_\infty\\&
\leq \|\partial_x K^t(\cdot;y)\|_\infty\norm{f(y)-f}_\infty + \|K^t(\cdot;y)\|_\infty\|\partial_x f\|_\infty\\&
\leq 2\, C_{K^t_y} \|f\|_\infty + C_f,
\end{align*}
where $C_{K^t_y}$ is the Lipschitz constant of the function $x\mapsto K^t(x;y)$. This constant does not depend on $y$: writing the kernel as a function of the distance $x=\|x-y\|$, its derivative $\frac{x}{2t}e^{-\frac{x^2}{4t}}$ is maximized at $x=\sqrt{2t}$, so that
\begin{equation*}
C_{K^t_y} = \norm{\partial_x e^{-\frac{x^2}{4t}}}_\infty = \norm{\frac{x}{2t}e^{-\frac{x^2}{4t}}}_\infty = \left. \frac{x}{2t}e^{-\frac{x^2}{4t}}\right|_{x=\sqrt{2t}}=(2et)^{-\frac{1}{2}}\propto t ^ {-\frac{1}{2}}.
\end{equation*}
Hence we have
\begin{align*}
\frac{C}{n^\alpha} \max _{y\in \S^2} C_{\phi^t_y}
&\leq \frac{C}{n^\alpha} \left( 2\,(2et)^{-\frac{1}{2}} \norm{f}_\infty + C_f \right)\\
&= \frac{2\,C \norm{f}_\infty}{n^\alpha(2et)^{1/2}} + \frac{C\,C_f}{n^\alpha}.
\end{align*}
Combining this bound with \eqref{eq:prop3-base-ineq} and rescaling by $\linefrac{1}{4\pi t^2}$, we obtain
\begin{align*}
\norm{\frac{1}{4\pi t^2}\left(L_n^tf-L^tf\right)}_\infty&= \frac{1}{4\pi t^2}\norm{L_n^tf-L^tf}_\infty \\
&\leq \frac{C}{4\pi}\left[\frac{2\norm{f}_\infty}{\sqrt{2e}}\frac{1}{n^\alpha t^{5/2}} + \frac{C_f}{n^\alpha t^2}\right].
\end{align*}
In order for $ \frac{C}{4\pi}\left[\frac{2\norm{f}_\infty}{\sqrt{2e}}\frac{1}{n^\alpha t^{5/2}} + \frac{C_f}{n^\alpha t^2}\right] \xrightarrow[t\to 0 ]{n\to\infty}0$,
we need $\begin{cases}
n^\alpha t^{5/2} \rightarrow \infty\\
n^\alpha t^2 \rightarrow \infty.
\end{cases}$ \\
This happens if $\begin{cases}
t(n) = n^\beta, &\beta\in(-\frac{2\alpha}{5}, 0) \\
t(n) = n^\beta, &\beta\in(-\frac{\alpha}{2}, 0)
\end{cases} \implies t(n) = n^\beta, \quad \beta\in(-\frac{2\alpha}{5}, 0)$.\\
Indeed, we have\\
$n^\alpha t^{5/2}=n^{\frac{5}{2}\beta+\alpha}\xrightarrow{n \to \infty} \infty$ since $\frac{5}{2}\beta+\alpha>0 \iff \beta>-\frac{2\alpha}{5}$,\\
and $n^\alpha t^2=n^{2\beta+\alpha}\xrightarrow {n \to \infty} \infty$ since $2\beta+\alpha>0 \iff \beta>-\frac{\alpha}{2}$.\\
As a result, for $t_n=n^\beta$ with $\beta\in(-\frac{2\alpha}{5}, 0)$ we have
$\begin{cases}
t_n\xrightarrow{n\to\infty}0\\
\norm{\frac{1}{4\pi t_n^2}L_n^{t_n}f-\frac{1}{4\pi t_n^2}L^{t_n}f}_\infty \xrightarrow{n\to\infty}0,
\end{cases}$\\
which concludes the proof.
\end{proof}
Theorem \ref{theo:pointwise convergence for a regular sampling} is then an immediate consequence of Propositions \ref{prop:2} and \ref{prop:3}.
\begin{proof}[Proof of Theorem \ref{theo:pointwise convergence for a regular sampling}]
Thanks to Propositions \ref{prop:2} and \ref{prop:3}, we conclude that, for all $y\in\S^2$,
\begin{equation*}
\lim_{n\to\infty}\frac{1}{4\pi t_n^2} L_n^{t_n}f(y) = \lim_{n\to\infty}\frac{1}{4\pi t_n^2} L^{t_n}f(y) = \frac{1}{|\S^2|}\laplacian f(y).
\end{equation*}
\end{proof}
In \citet{belkin2005towards}, the sampling is drawn from a uniform random distribution on the sphere, and their proof heavily relies on the uniformity of that distribution. In our case the sampling is deterministic, which is the obstacle we overcome by imposing the regularity conditions above.
To conclude, the result we obtain has a form similar to that of \citet{belkin2005towards}. Given the kernel width $t(n)=n^\beta$, \citet{belkin2005towards} proved convergence in the random case for $\beta \in (-\frac{1}{4}, 0)$, while we proved convergence in the deterministic case for $\beta \in (-\frac{2\alpha}{5}, 0)$, with $\alpha \in (0, \linefrac{1}{2}]$ (for the spherical manifold).
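As a sanity check (an illustration, not part of the proof), the convergence can be observed numerically with a Fibonacci lattice as a deterministic, approximately equal-area sampling, the test function $f(x)=x_3$ (for which $\laplacian f = 2f$), and the admissible rate $t_n=n^{-0.15}$; the rescaled operator evaluated at the north pole should approach $\laplacian f(y)/|\S^2| = \linefrac{1}{2\pi} \approx 0.159$.
\begin{verbatim}
import numpy as np

def fibonacci_sphere(n):
    # Deterministic, approximately equal-area sampling of the sphere.
    i = np.arange(n)
    z = 1 - (2 * i + 1) / n
    phi = np.pi * (1 + 5**0.5) * i
    r = np.sqrt(1 - z**2)
    return np.stack([r * np.cos(phi), r * np.sin(phi), z], axis=1)

def L_n_t(points, f, y, t):
    # Extended graph Laplacian L_n^t applied to f at the point y.
    d2 = np.sum((points - y)**2, axis=1)
    return np.mean(np.exp(-d2 / (4 * t)) * (f(y) - f(points)))

f = lambda p: p[..., 2]            # f(x) = x_3, an eigenfunction: Delta f = 2 f
y = np.array([0.0, 0.0, 1.0])      # evaluate at the north pole, f(y) = 1

for n in [10**4, 10**5, 10**6]:
    t = n ** (-0.15)               # t_n = n^beta with beta in (-2 alpha / 5, 0)
    print(n, L_n_t(fibonacci_sphere(n), f, y, t) / (4 * np.pi * t**2))
# The printed values should approach 1 / (2 pi) ~ 0.159 as n grows.
\end{verbatim}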
\section{Proof of Theorem~\ref{theo:equivariance}}
\begin{proof}
Fix $x\in \mathbb S^2$. Any rotation $\Rg$ is an isometry, and the Laplace-Beltrami operator $\laplacian$ commutes with all isometries of a Riemannian manifold. Writing $f' := \Rg f$ for ease of notation, we obtain
\begin{align*}
\seminorm{\Rg \Ln f (x) - \Ln \Rg f(x) } &\leq \seminorm{\Rg \Ln f (x) - \Rg \laplacian f(x)} + \seminorm{\Rg \laplacian f (x) - \Ln \Rg f(x)} \\
& =\seminorm{\Rg (\Ln f - \laplacian f ) (x)} + \seminorm{ \laplacian f' (x) - \Ln f'(x)} \\
& = \seminorm{(\Ln f - \laplacian f ) (g^{-1}(x))} + \seminorm{ \laplacian f' (x) - \Ln f'(x)}.
\end{align*}
Since $g^{-1}(x)\in\mathbb{S}^2$ and $f'$ still satisfies the hypotheses of Theorem \ref{theo:pointwise convergence for a regular sampling}, we can apply the theorem to obtain
\begin{align*}
&\seminorm{(\Ln f - \laplacian f ) (g^{-1}(x))} \xrightarrow{n\to\infty}0\\
&\seminorm{ \laplacian f' (x) - \Ln f'(x)}\xrightarrow{n\to\infty}0
\end{align*}
and conclude that $$\forall x\in\mathbb S^2 \quad\seminorm{\Rg \Ln f (x) - \Ln \Rg f(x) } \xrightarrow{n\to\infty}0.$$
\end{proof}
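This commutation can also be observed numerically on the extended operator $L_n^t$, which acts on functions and hence requires no interpolation: one compares $L_n^t(\Rg f)(x)$ with $(\Rg L_n^t f)(x) = L_n^t f(g^{-1}x)$ for an analytic $f$ and a random rotation $g$. The sketch below is an illustration under the same assumptions as the previous one (Fibonacci sampling, rate $t_n=n^{-0.15}$, an arbitrary test function); it assumes SciPy $\geq$ 1.4 for \texttt{Rotation}.
\begin{verbatim}
import numpy as np
from scipy.spatial.transform import Rotation

def fibonacci_sphere(n):
    # Deterministic, approximately equal-area sampling of the sphere.
    i = np.arange(n)
    z = 1 - (2 * i + 1) / n
    phi = np.pi * (1 + 5**0.5) * i
    r = np.sqrt(1 - z**2)
    return np.stack([r * np.cos(phi), r * np.sin(phi), z], axis=1)

def L_n_t(points, f, y, t):
    # Extended graph Laplacian L_n^t applied to f at the point y.
    d2 = np.sum((points - y)**2, axis=1)
    return np.mean(np.exp(-d2 / (4 * t)) * (f(y) - f(points)))

f = lambda p: p[..., 0] * p[..., 2]               # a smooth function on the sphere
g = Rotation.random(random_state=0).as_matrix()   # a random rotation g
x = np.array([0.0, 0.0, 1.0])

for n in [10**4, 10**5, 10**6]:
    pts, t = fibonacci_sphere(n), n ** (-0.15)
    lhs = L_n_t(pts, lambda p: f(p @ g), x, t)    # L_n^t(R_g f)(x), as (R_g f)(p) = f(g^{-1} p)
    rhs = L_n_t(pts, f, g.T @ x, t)               # (R_g L_n^t f)(x) = L_n^t f(g^{-1} x)
    print(n, abs(lhs - rhs) / (4 * np.pi * t**2))
# The rescaled commutator should decrease towards 0 as n grows.
\end{verbatim}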
\section{Experimental details}
\subsection{3D objects recognition}
\Tabref{shrec17_retrieval} shows the results obtained from the SHREC'17 competition's official evaluation script.
% \Tabref{modelnet40} shows results on ModelNet40.
\begin{table}
\centering
\scriptsize
\begin{tabular}{l cccc cccc}
& \multicolumn{4}{c}{micro (instance average)} & \multicolumn{4}{c}{macro (label average)} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& P@N & R@N & F1@N & mAP & P@N & R@N & F1@N & mAP \\
\toprule
\citet{cohen2018sphericalcnn} ($b=128$) & 0.701 & 0.711 & 0.699 & 0.676 & - & - & - & - \\
\citet{cohen2018sphericalcnn} (simplified, $b=64$) & 0.704 & 0.701 & 0.696 & 0.665 & 0.430 & 0.480 & 0.429 & 0.385 \\
\citet{esteves2018sphericalcnn} ($b=64$) & 0.717 & 0.737 & - & 0.685 & 0.450 & 0.550 & - & 0.444 \\
DeepSphere (equiangular $b=64$) & 0.709 & 0.700 & 0.698 & 0.665 & 0.439 & 0.489 & 0.439 & 0.403 \\
DeepSphere (HEALPix $N_{side}=32$) & 0.725 & 0.717 & 0.715 & 0.686 & 0.475 & 0.508 & 0.468 & 0.428\\
\bottomrule
\end{tabular}
\caption{Official metrics from the SHREC'17 object retrieval competition.}
\label{tab:shrec17_retrieval}
\end{table}
% \begin{table}
% \centering
% \begin{tabular}{l cccc}
% \toprule
% & no/no & z/z & SO3/SO3 & z/SO3 \\
% \midrule
% \citet{cohen2018sphericalcnn} & 85.0 & - & - & - \\
% \citet{jiang2019sphericalcnn} & 90.5 & - & - & - \\
% \citet{esteves2018sphericalcnn} & - & 88.9 & 86.9 & 76.7 \\
% %Esteves \emph{MVCNN} & 94.69 & - & - & - \\
% DeepSphere & 87.8 & 86.8 & 86.7 & 76.9 \\
% \bottomrule
% \end{tabular}
% \caption{
% Accuracy results on ModelNet40.
% ``no'' denotes no augmentation, ``z'' is augmented with rotations around the Z-axis, and ``SO3'' with ZYZ rotations.
% \todo{The first is training time augmentation, the second test time augmentation.}
% }
% \label{tab:modelnet40}
% \end{table}
The DeepSphere architecture used for this task (HEALPix, $N_{side}=32$ input) is:
\begin{dmath}
[GC_{16}\, +\, BN\, +\, ReLU]_{nside32}\, +\, \textrm{Pool}\, +\, [GC_{32}\, +\, BN\, +\, ReLU]_{nside16}\, +\, \textrm{Pool}\, +\, [GC_{64}\, +\, BN\, +\, ReLU]_{nside8}\, +\, \textrm{Pool}\, +\, [GC_{128}\, +\, BN\, +\, ReLU]_{nside4}\, +\,\textrm{Pool}\, +\, [GC_{256}\, +\, BN\, +\, ReLU]_{nside2}\, +\, \textrm{Pool}\, +\, GAP\, +\, FCN\, +\, \textrm{softmax}
\end{dmath}
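To make the notation above concrete, the following NumPy sketch spells out one $[GC + BN + ReLU] + \textrm{Pool}$ block: the graph convolution is a Chebyshev polynomial filter of the rescaled graph Laplacian, and pooling exploits the hierarchical HEALPix structure in NESTED ordering, where the four children of a coarser pixel are contiguous. The Laplacian stand-in, feature sizes, and filter coefficients are placeholders for shape checking only; they are not the trained model, and the pooling here averages sibling pixels (a max over the same groups is implemented identically).
\begin{verbatim}
import numpy as np

def chebyshev_conv(L_hat, x, theta):
    # GC_F: Chebyshev polynomial filter of the rescaled Laplacian.
    # L_hat: (n, n) rescaled graph Laplacian; x: (n, F_in); theta: (K, F_in, F_out).
    T = [x, L_hat @ x]
    for _ in range(2, theta.shape[0]):
        T.append(2 * (L_hat @ T[-1]) - T[-2])
    return sum(t @ th for t, th in zip(T, theta))

def batch_norm_relu(x, eps=1e-5):
    # BN (per feature map, without learned scale and offset) followed by ReLU.
    x = (x - x.mean(axis=0)) / np.sqrt(x.var(axis=0) + eps)
    return np.maximum(x, 0)

def healpix_pool(x):
    # Pool: in NESTED ordering, the 4 children of a coarser pixel are contiguous,
    # so going from N_side to N_side / 2 averages groups of 4 pixels.
    n, F = x.shape
    return x.reshape(n // 4, 4, F).mean(axis=1)

rng = np.random.default_rng(0)
n_pix, K, F_in, F_out = 12 * 8**2, 5, 6, 16        # N_side = 8 for a small demo
A = rng.random((n_pix, n_pix)); A = (A + A.T) / 2  # stand-in for the sphere graph
L = np.diag(A.sum(axis=1)) - A
L_hat = 2 * L / np.linalg.eigvalsh(L)[-1] - np.eye(n_pix)
x = rng.normal(size=(n_pix, F_in))
theta = 0.1 * rng.normal(size=(K, F_in, F_out))
y = healpix_pool(batch_norm_relu(chebyshev_conv(L_hat, x, theta)))
print(x.shape, '->', y.shape)                      # (768, 6) -> (192, 16): N_side 8 -> 4
\end{verbatim}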
\subsection{Cosmological model classification}
The architecture used for this task is:
\begin{dmath}
[GC_{16}\, +\, BN\, +\, ReLU]_{nside1024}\, +\, \textrm{Pool}\, +\, [GC_{32}\, +\, BN\, +\, ReLU]_{nside512}\, +\, \textrm{Pool}\, +\, [GC_{64}\, +\, BN\, +\, ReLU]_{nside256}\, +\, \textrm{Pool}\, +\, [GC_{64}\, +\, BN\, +\, ReLU]_{nside128}\, +\,\textrm{Pool}\, +\, [GC_{64}\, +\, BN\, +\, ReLU]_{nside64}\, +\, \textrm{Pool}\, +\, [GC_{2}]_{nside32}\, +\, GAP\, +\, \textrm{softmax}
\end{dmath}
\subsection{Climate event segmentation} \label{sec:climate:appendix}
\Tabref{climate:accuracy}, \ref{tab:climate:map}, and \ref{tab:climate:speed} show the accuracy, mAP, and efficiency of all the NNs we ran.
The experiment with the model from \citet{jiang2019sphericalcnn} was rerun in order to obtain the AP metrics, albeit with a batch size of 64 instead of 256 due to GPU memory limits.
Several experiments were run with different architectures for DeepSphere (DS). The Jiang architecture is similar to that of \citet{jiang2019sphericalcnn}, with only the convolutional operators replaced. The DeepSphere-only architecture is the original one giving the best results: it is deeper and has four times more feature maps than the Jiang architecture. The wider architecture is the same as the previous one with twice the number of feature maps.
Regarding the weighted loss, the weights are chosen with the \texttt{scikit-learn} function \texttt{compute\_class\_weight} on the training set.
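Concretely, this amounts to the following minimal sketch, assuming per-pixel integer labels \texttt{y\_train} (hypothetical values below) and the \texttt{balanced} heuristic:
\begin{verbatim}
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# y_train: flattened per-pixel class labels of the training set (hypothetical values).
y_train = np.array([0, 0, 0, 0, 0, 1, 1, 2])
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(y_train), y=y_train)
print(weights)  # inverse-frequency weights used to weight the per-class loss terms
\end{verbatim}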