% main.tex

% Ask pdfTeX for PDF 1.4 output.
\pdfminorversion=4
% Beamer slides with a 16:9 aspect ratio.
\documentclass[aspectratio=169]{beamer}

% Presentation-mode appearance: stock themes, no navigation icons,
% numbered captions, and a frame-number footline.
\mode<presentation>{
  % Themes (all left at the Beamer defaults).
  \usetheme{default}
  \usecolortheme{default}
  \usefonttheme{default}
  % Templates.
  \setbeamertemplate{navigation symbols}{}
  \setbeamertemplate{caption}[numbered]
  \setbeamertemplate{footline}[frame number]  % or "page number"
  % Colours: white frame titles, black footline text.
  \setbeamercolor{frametitle}{fg=white}
  \setbeamercolor{footline}{fg=black}
}

% Language and input encoding.
\usepackage[english]{babel}
% Standard UTF-8 input encoding; the former "utf8x" option depends on the
% unmaintained ucs package and is known to clash with other packages.
\usepackage[utf8]{inputenc}
% Graphics, fonts, tables.
\usepackage{tikz}
\usepackage{courier}
\usepackage{array}
\usepackage{bold-extra}
% Code listings and miscellaneous markup.
\usepackage{minted}
\usepackage[thicklines]{cancel}
\usepackage{fancyvrb}

% Named colours for the deck.  All use \definecolor (the legacy
% \xdefinecolor spelling is no longer needed), and each name is defined once.
\definecolor{dianablue}{rgb}{0.18,0.24,0.31}  % fill of the logo banner
\definecolor{darkblue}{rgb}{0.1,0.1,0.7}      % headings and highlighted terms
% darkgreen used to be defined twice (0,0.5,0 and then 0,0.6,0); the later
% definition was the one in effect, so that value is kept.
\definecolor{darkgreen}{rgb}{0,0.6,0}
\definecolor{darkgrey}{rgb}{0.35,0.35,0.35}
\definecolor{darkorange}{rgb}{0.8,0.5,0}
\definecolor{darkred}{rgb}{0.7,0,0}
\definecolor{mauve}{rgb}{0.58,0,0.82}

% Title-page metadata.  The optional argument of \title is the short title.
\title[2022-03-21-reload-stats-of-physicists]{Metrics of computing trends in NHEP}
\author{Jim Pivarski}
\institute{Princeton University -- IRIS-HEP}
\date{March 21, 2022}

% TikZ callout shapes.
\usetikzlibrary{shapes.callouts}

\begin{document}

% Title-page banner: a dianablue bar with the long-form Princeton and
% IRIS-HEP logos on top.  Every line ends in % so that no stray spaces
% enter the box.
\logo{%
  \pgfputat{\pgfxy(0.11, 7.4)}{%
    \pgfbox[right,base]{%
      \tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}%
      \mbox{%
        \hspace{-8 cm}%
        \includegraphics[height=1 cm]{princeton-logo-long.png}%
        \hspace{0.1 cm}%
        \raisebox{0.1 cm}{\includegraphics[height=0.8 cm]{iris-hep-logo-long.png}}%
        \hspace{0.1 cm}%
      }%
    }%
  }%
}

\begin{frame}
  \titlepage
\end{frame}

% Banner for the remaining frames: same dianablue bar, but with the
% short-form logos.  Every line ends in % so that no stray spaces enter
% the box.
\logo{%
  \pgfputat{\pgfxy(0.11, 7.4)}{%
    \pgfbox[right,base]{%
      \tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}%
      \mbox{%
        \hspace{-8 cm}%
        \includegraphics[height=1 cm]{princeton-logo.png}%
        \hspace{0.1 cm}%
        \raisebox{0.1 cm}{\includegraphics[height=0.8 cm]{iris-hep-logo.png}}%
        \hspace{0.1 cm}%
      }%
    }%
  }%
}

% Uncomment these lines for an automatically generated outline.
%\begin{frame}{Outline}
%  \tableofcontents
%\end{frame}

% START START START START START START START START START START START START START

% Opening slide: motivation text in the left column, revealed over
% overlays 1-4; the book cover in the right column appears with overlay 4.
% Emphasis uses \emph instead of the obsolete {\it ...} switch.
\begin{frame}{\mbox{ }}
\large
\vspace{0.5 cm}
\begin{columns}
\column{0.7\linewidth}
This is a talk about measuring \emph{physicists}: what they talk about and what they do for computing.

\vspace{0.25 cm}
\uncover<2->{Measuring people, such as advertising click-throughs, seemed odd to me when I first went from physics to data science, since the events are \emph{not} independent and it would be hard to quantify errors.}

\vspace{0.25 cm}
\uncover<3->{However, it can be a meaningful thing to do, taking all the caveats seriously, and certainly better than \emph{guessing} or \emph{assuming} we understand the community.}

\vspace{0.25 cm}
\uncover<4->{Inspiration: read Sharon Traweek's anthropological study of physicists at SLAC and KEK in the 1970s. Physicists can be data points!}

\column{0.3\linewidth}
\uncover<4->{\includegraphics[width=\linewidth]{traweek-beamtimes-and-lifetimes.jpg}}
\end{columns}
\end{frame}

% Four-step build of a single figure: each overlay swaps in the next image.
\begin{frame}{User needs are very different from my expectations, 6 years ago}
  \vspace{0.5 cm}
  \begin{columns}
    % The column is deliberately wider than the text block.
    \column{1.12\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{evolving-views-1.png}}%
    \only<2>{\includegraphics[width=\linewidth]{evolving-views-2.png}}%
    \only<3>{\includegraphics[width=\linewidth]{evolving-views-3.png}}%
    \only<4>{\includegraphics[width=\linewidth]{evolving-views-4.png}}
  \end{columns}
\end{frame}

% Taxonomy slide: qualitative vs. quantitative methods as a two-level list.
% Each inner item has a \large coloured label followed by \normalsize text;
% the trailing \large restores the size before the next item.
% Emphasis uses \emph instead of the obsolete {\it ...} switch.
\begin{frame}{Ways to study humans}
\vspace{0.1 cm}
\textcolor{gray}{\scriptsize (Important note: I am not an expert. Below is what I learned from college friends who went into social sciences.)}

\large
\vspace{0.1 cm}
\begin{itemize}\setlength{\itemsep}{0.25 cm}
\item \textcolor{darkblue}{Qualitative:}

\vspace{0.05 cm}
\begin{itemize}\large\setlength{\itemsep}{0.15 cm}
\item \textcolor{darkblue}{Focus groups:} \normalsize most open to unexpected ideas. Want to keep the group size and mix such that participants are willing to speak up. Goal is to discover new \emph{dimensions} of the vector space, not just points within it. \large

\item \textcolor{darkblue}{One-on-one interviews:} \normalsize can be deeper but less broad than focus groups. Lacks the multiplying effect of responding to each other's opinions. \large

\item \textcolor{darkblue}{History/documents:} \normalsize observational, rather than experimental, but this method can reach further into the past. \large
\end{itemize}

\item \textcolor{darkblue}{Quantitative:}

\vspace{0.05 cm}
\begin{itemize}\large\setlength{\itemsep}{0.15 cm}
\item \textcolor{darkblue}{Surveys:} \normalsize can get large, statistically meaningful datasets, at the cost of losing flexibility/openness to new ideas. Now you \emph{are} filling in a vector space. \large

\item \textcolor{darkblue}{Proxy metrics:} \normalsize can measure what people \emph{do}, rather than what they \emph{say}. \large
\end{itemize}

\end{itemize}
\end{frame}

% Google Flu Trends example: full-width figure, with a text box pulled up
% over it (negative \vspace) from overlay 2 onwards.  The matching positive
% \vspace at the end gives the vertical space back.
% Bold text uses \textbf instead of the obsolete {\bf ...} switch.
\begin{frame}{Proxy metrics: high statistics, cautious interpretation}
\vspace{0.25 cm}
\mbox{\hspace{0.75 cm}\includegraphics[width=\linewidth]{google-flu-trends.png}}

\begin{uncoverenv}<2->
\vspace{-3.35 cm}
\hspace{-0.25 cm}\begin{minipage}{0.3\linewidth}
\textbf{Google Flu Trends}

\textbf{(2008--2015)}

\small
\vspace{0.25 cm}
Count searches for

things like ``fever,'' ``cough,''

interpret as flu activity.

\vspace{0.25 cm}
(This was controversial.)
\end{minipage}
\vspace{3.35 cm}
\end{uncoverenv}
\end{frame}

% GitHub-stars example: plot on the left, commentary revealed step by step
% on the right.
\begin{frame}{Example: what happened here?}
  \vspace{0.25 cm}
  \textcolor{darkblue}{\mbox{\hspace{-0.5 cm}}GitHub stars versus time in IRIS-HEP projects}

  \vspace{0.25 cm}
  \begin{columns}
    \column{0.75\linewidth}
    % Overlays 1-2 show the first plot; from overlay 3 on the second replaces it.
    \only<1-2>{\includegraphics[width=\linewidth]{irishep-as-stars-1.pdf}}%
    \only<3->{\includegraphics[width=\linewidth]{irishep-as-stars-2.pdf}}

    \column{0.25\linewidth}
    \vspace{-0.5 cm}

    \uncover<2->{What we're trying to explain is a big, qualitative feature, not the little bumps.}

    \vspace{0.45 cm}
    \uncover<3->{Temporally coincides with this event:}

    \uncover<3->{\small\textcolor{blue}{\url{https://news.ycombinator.com/item?id=29576323}}}

    \vspace{0.45 cm}
    \uncover<4->{Is it causal?}

    \vspace{0.45 cm}
    \uncover<5->{It would be hard to believe it isn't.}

    \vspace{0.5 cm}
  \end{columns}
\end{frame}

% Stacked download plot: overlay 1 uses the "allos" file, overlay 2 the
% "macwin" file.  The box is shifted 1 cm to the left.
\begin{frame}{Stacked download statistics for Scikit-HEP and related packages}
  \vspace{-0.25 cm}
  \begin{columns}
    \column{1.2\linewidth}
    \mbox{%
      \hspace{-1 cm}%
      \only<1>{\includegraphics[width=\linewidth]{pip-allos-scikithep-log.pdf}}%
      \only<2>{\includegraphics[width=\linewidth]{pip-macwin-scikithep-log.pdf}}%
    }
  \end{columns}
\end{frame}

% Caveats for the download counts, revealed one paragraph per overlay.
% Emphasis uses \emph instead of the obsolete {\it ...} switch, and the
% third paragraph now has subject-verb agreement ("there are ... jobs").
\begin{frame}{\mbox{ }}
\Large
\vspace{0.4 cm}

Linux includes batch jobs, which sometimes \mintinline{bash}{pip install} the same package on thousands of workers.

\vspace{0.5 cm}
\uncover<2->{Selecting only MacOS and Windows removes most batch jobs, but it excludes some individual users (like me), probably with a behavioral bias.}

\vspace{0.5 cm}
\uncover<3->{Still, there are continuous testing jobs on MacOS and Windows.}

\vspace{0.5 cm}
\uncover<4->{Do we even \emph{want} to exclude these things? What do we \emph{want} the observable to quantify?}
\end{frame}

% Python-version plots: overlays 1-2 use the uproot files (title shows
% "pip install uproot"), overlays 3-4 the numpy files (title shows
% "pip install numpy").  Within each pair, "allos" comes before "macwin".
\begin{frame}{How about this one: Python version in \only<1-2>{\mintinline{bash}{pip install uproot}}\only<3-4>{\mintinline{bash}{pip install numpy}}}
  \vspace{-0.25 cm}
  \begin{columns}
    \column{1.2\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{pip-allos-pythonversion-uprootusers-lin.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{pip-macwin-pythonversion-uprootusers-lin.pdf}}%
    \only<3>{\includegraphics[width=\linewidth]{pip-allos-pythonversion-numpyusers-lin.pdf}}%
    \only<4>{\includegraphics[width=\linewidth]{pip-macwin-pythonversion-numpyusers-lin.pdf}}
  \end{columns}
\end{frame}

% Side-by-side comparison: three download plots (Awkward, Uproot, Coffea)
% placed in one row.  Emphasis in the title uses \emph instead of the
% obsolete {\it ...} switch.
\begin{frame}{More often useful when \emph{comparing} two things}
\vspace{0.25 cm}
\Large
\underline{Transition from ``old'' Awkward/Uproot/Coffea to ``new''}
\vspace{0.5 cm}
\begin{columns}
\column{1.2\linewidth}
% The trailing % keeps the three images flush, with no inter-word space.
\hspace{0.25 cm}%
\includegraphics[width=0.31\linewidth]{pip-allos-awkward-log.pdf}%
\includegraphics[width=0.31\linewidth]{pip-allos-uproot-log.pdf}%
\includegraphics[width=0.31\linewidth]{pip-allos-coffea-log.pdf}
\end{columns}
\end{frame}

% Method slide: headline numbers, the four-step procedure, and a centred
% screenshot underneath.
\begin{frame}{Directed study: how are physicists using C++ and Python?}
  \vspace{0.5 cm}
  {\Large Analyze code in 11\,635 GitHub repos written by 2\,172 physicists:}

  \vspace{0.25 cm}
  \begin{enumerate}
    \item Ask GitHub which users forked CMSSW and call them ``CMS physicists.'' (CMSSW has been on GitHub for a long enough time to see trends.)
    \item Clone all of the physicists' repos (the ones that are not forks of something else).
    \item Search the code of these repos and count matches.
    \item Take care to exclude CMSSW configuration files, which are also Python.
  \end{enumerate}

  \begin{center}
    \includegraphics[width=0.5\linewidth]{github-api-website.png}
  \end{center}
\end{frame}

% Two views of the same sample: overlay 1 is language use, overlay 2 is
% package use.  Title, subtitle, and figure all switch together.
\begin{frame}{\only<1>{Language use: C++, Python, and Jupyter}\only<2>{Packages: ROOT, Scientific Python, Uproot/Awkward}}
  \vspace{0.25 cm}
  \textcolor{darkblue}{\mbox{\hspace{-0.5 cm}}\only<1>{Number of non-fork GitHub repos created by CMS physicists (users who forked CMSSW)}\only<2>{Same sample, now counting regex matches for \mintinline{python}{import XYZ}, \mintinline{python}{from XYZ import}, etc.}}

  \vspace{-0.35 cm}
  \begin{columns}
    \column{1.15\linewidth}
    % NOTE(review): "gihub" looks like a typo for "github", but the names
    % must match the files on disk, so they are left as they are -- confirm.
    \only<1>{\includegraphics[width=\linewidth]{gihub-language-fullstudy.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{gihub-package-fullstudy.pdf}}
  \end{columns}
\end{frame}

% What changed relative to the earlier study (numbered list), followed by
% next steps and the raw-data link, which appear from overlay 2 onwards.
\begin{frame}{This is an update of my earlier GitHub-CMSSW study}
  \vspace{0.5 cm}
  \large

  \begin{enumerate}\setlength{\itemsep}{0.35 cm}
    \item Inclusively counting ``repo that contains a C++ file'' or ``a Python file,'' rather than GitHub's exclusive determination of ``repo language.''

    \item Distinguishing between Python files that are CMSSW configurations and other Python files (GitHub doesn't).

    \item I've downloaded all the repos, so I can run my own regex searches, rather than relying on GitHub's.
  \end{enumerate}

  \vspace{0.5 cm}
  \begin{uncoverenv}<2->
    We can do more: run clang-tidy/pylint? Features of libraries used?

    \textcolor{blue}{\scriptsize\url{https://pivarski-princeton.s3.amazonaws.com/GitHub-CMSSW-user-nonfork-raw-data.tar}}

    \vspace{0.5 cm}
    \textcolor{gray}{\normalsize (Note: these are all public repos/public data.)}
  \end{uncoverenv}
\end{frame}

% InspireHEP matches, programming languages: overlay 1 = CHEP, overlay 2 = ACAT.
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
  \vspace{0.35 cm}
  \textcolor{darkblue}{Programming languages in \only<1>{CHEP}\only<2>{ACAT} papers}

  \begin{columns}
    \column{1.05\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{chep-papers-language.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{acat-papers-language.pdf}}
  \end{columns}
\end{frame}

% Historical slide: an empty spacer column, the CHEP 2001 screenshot, and a
% side note on PyROOT.  The talk title is set with \emph instead of the
% obsolete {\it ...} switch.
\begin{frame}{Lucas Taylor, \emph{Summary of Data Analysis Track,} CHEP 2001}
\vspace{0.25 cm}
\begin{columns}
\column{0.05\linewidth}

\column{0.81\linewidth}
\includegraphics[width=\linewidth]{chep-2001-python.png}

\column{0.2\linewidth}
Note: PyROOT introduced in 2004 (v4.00/04).
\end{columns}
\end{frame}

% InspireHEP matches, programming paradigms: overlay 1 = CHEP, overlay 2 = ACAT.
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
  \vspace{0.35 cm}
  \textcolor{darkblue}{Programming paradigms in \only<1>{CHEP}\only<2>{ACAT} papers}

  \begin{columns}
    \column{1.05\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{chep-papers-paradigm.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{acat-papers-paradigm.pdf}}
  \end{columns}
\end{frame}

% InspireHEP matches, software frameworks: overlay 1 = CHEP, overlay 2 = ACAT.
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
  \vspace{0.35 cm}
  \textcolor{darkblue}{Software frameworks in \only<1>{CHEP}\only<2>{ACAT} papers}

  \begin{columns}
    \column{1.05\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{chep-papers-package-1.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{acat-papers-package-1.pdf}}
  \end{columns}
\end{frame}

% InspireHEP matches, machine learning: overlay 1 = CHEP, overlay 2 = ACAT.
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
  \vspace{0.35 cm}
  \textcolor{darkblue}{Machine learning in \only<1>{CHEP}\only<2>{ACAT} papers}

  \begin{columns}
    \column{1.05\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{chep-papers-ml.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{acat-papers-ml.pdf}}
  \end{columns}
\end{frame}

% InspireHEP matches, hardware accelerators: overlay 1 = CHEP, overlay 2 = ACAT.
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
  \vspace{0.35 cm}
  \textcolor{darkblue}{Hardware accelerators in \only<1>{CHEP}\only<2>{ACAT} papers}

  \begin{columns}
    \column{1.05\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{chep-papers-accelerator.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{acat-papers-accelerator.pdf}}
  \end{columns}
\end{frame}

% InspireHEP matches, kinds of tasks: overlay 1 = CHEP, overlay 2 = ACAT.
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
  \vspace{0.35 cm}
  \textcolor{darkblue}{Kinds of tasks in \only<1>{CHEP}\only<2>{ACAT} papers}

  \begin{columns}
    \column{1.05\linewidth}
    \only<1>{\includegraphics[width=\linewidth]{chep-papers-task.pdf}}%
    \only<2>{\includegraphics[width=\linewidth]{acat-papers-task.pdf}}
  \end{columns}
\end{frame}

% First conclusions slide: three high-level takeaways.
\begin{frame}{Conclusions}
  \Large
  \vspace{0.5 cm}
  \begin{itemize}\setlength{\itemsep}{0.5 cm}
    \item Different ways of understanding people, including the NHEP software community: focus groups, interviews, historical documents, surveys, and proxy metrics.

    \item This talk focused on proxy metrics, which are quantitative, but you have to pay close attention to what they're quantifying.

    % The \mbox with a negative \hspace keeps the last sentence unbroken.
    \item Some clear trends and conclusions emerged. \mbox{Others are muddled.\hspace{-1 cm}}
  \end{itemize}
\end{frame}

% Second conclusions slide: the specific trends, as a compact list.
% Emphasis uses \emph instead of the obsolete {\it ...} switch, decades are
% written without an apostrophe ("1990s"), and the stray comma after the
% parenthetical in the Python item is gone.
\begin{frame}{Conclusions}
\Large
\vspace{0.25 cm}
Relatively clear trends/conclusions.

\vspace{0.2 cm}
\begin{itemize}\setlength{\itemsep}{0.12 cm}\large
\item Scikit-HEP adoption increased by more than 10$\times$ since 2018. Maybe 100$\times$.
\item Most physicists use Python 3.9, ahead of the wider (NumPy) community.
\item The Awkward/Uproot/Coffea version update is done (more new than old).
\item C++ use in CMS is \emph{steady, not decreasing}, while scientific Python (NumPy/Matplotlib/Pandas/Jupyter) is increasing past that level.
\item Python (in general, e.g.\ for configuration files) has been slowly increasing since 2000, at the same time as the abrupt Fortran $\to$ C++ switch.
\item Declarative/columnar interest is increasing, but nothing like ``object oriented'' in the 1990s. ``Arrays'' are coming back (since Fortran).
\item Machine learning is a \emph{resurgence}, with more synonyms than in the 1990s. (GPUs/FPGAs on the same timescale.)
\item ``Analysis'' has grown as a preoccupation at CHEP and ACAT.
\end{itemize}
\end{frame}

\end{document}