-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.tex
370 lines (287 loc) · 14.1 KB
/
main.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
% ---------------------------------------------------------------------------
% Preamble for a 16:9 beamer slide deck.
% Sets up the presentation theme, loads packages, defines the named colors
% used on the slides, and declares the title-page metadata.
% ---------------------------------------------------------------------------

% Request PDF version 1.4 output (pdfTeX primitive).
\pdfminorversion=4
\documentclass[aspectratio=169]{beamer}

% Presentation-mode appearance: default themes, no navigation symbols,
% numbered captions, and a frame number in the footline.
\mode<presentation>
{
\usetheme{default}
\usecolortheme{default}
\usefonttheme{default}
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{caption}[numbered]
\setbeamertemplate{footline}[frame number] % or "page number"
% White frame titles: presumably so they stay legible over the dark banner
% that \logo draws in the document body -- confirm if the banner changes.
\setbeamercolor{frametitle}{fg=white}
\setbeamercolor{footline}{fg=black}
}

% Language and input encoding.
\usepackage[english]{babel}
% Standard "utf8" replaces the unmaintained "utf8x" (ucs) option; the source
% of this deck is plain ASCII, so the typeset output is unaffected.
\usepackage[utf8]{inputenc}

% Graphics, fonts, tables, code listings, and math decorations.
\usepackage{tikz}
\usepackage{courier}
\usepackage{array}
\usepackage{bold-extra}
\usepackage{minted}
\usepackage[thicklines]{cancel}
\usepackage{fancyvrb}

% Named colors. Each name is defined exactly once: "darkgreen" was previously
% defined twice (0,0.5,0 and then 0,0.6,0); only the later definition ever
% took effect, so that value is the one kept here.
\definecolor{dianablue}{rgb}{0.18,0.24,0.31}
\definecolor{darkblue}{rgb}{0.1,0.1,0.7}
\definecolor{darkgreen}{rgb}{0,0.6,0}
\definecolor{darkgrey}{rgb}{0.35,0.35,0.35}
\definecolor{darkorange}{rgb}{0.8,0.5,0}
\definecolor{darkred}{rgb}{0.7,0,0}
\definecolor{mauve}{rgb}{0.58,0,0.82}

% Title-page metadata; the optional argument of \title is the short title.
\title[2022-03-21-reload-stats-of-physicists]{Metrics of computing trends in NHEP}
\author{Jim Pivarski}
\institute{Princeton University -- IRIS-HEP}
\date{March 21, 2022}

\usetikzlibrary{shapes.callouts}
\begin{document}
% Banner used on the title slide: a 50 cm x 1 cm rectangle filled with
% dianablue, followed (after backing up 8 cm with a negative \hspace) by the
% Princeton and IRIS-HEP logo images. This first definition uses the "-long"
% variants of both image files. \pgfputat/\pgfxy position the whole box.
\logo{\pgfputat{\pgfxy(0.11, 7.4)}{\pgfbox[right,base]{\tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}\mbox{\hspace{-8 cm}\includegraphics[height=1 cm]{princeton-logo-long.png}\hspace{0.1 cm}\raisebox{0.1 cm}{\includegraphics[height=0.8 cm]{iris-hep-logo-long.png}}\hspace{0.1 cm}}}}}
% Title slide, built from the \title/\author/\institute/\date metadata.
\begin{frame}
\titlepage
\end{frame}
% Redefine the banner for every frame after the title slide: identical
% geometry, but with the non-"-long" logo image files.
\logo{\pgfputat{\pgfxy(0.11, 7.4)}{\pgfbox[right,base]{\tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}\mbox{\hspace{-8 cm}\includegraphics[height=1 cm]{princeton-logo.png}\hspace{0.1 cm}\raisebox{0.1 cm}{\includegraphics[height=0.8 cm]{iris-hep-logo.png}}\hspace{0.1 cm}}}}}
% Uncomment these lines for an automatically generated outline.
%\begin{frame}{Outline}
% \tableofcontents
%\end{frame}
% START START START START START START START START START START START START START
\begin{frame}{\mbox{ }}
\large
\vspace{0.5 cm}
\begin{columns}
\column{0.7\linewidth}
This is a talk about measuring {\it physicists}: what they talk about and what they do for computing.
\vspace{0.25 cm}
\uncover<2->{Measuring people, such as advertising click-throughs, seemed odd to me when I first went from physics to data science, since the events are {\it not} independent and it would be hard to quantify errors.}
\vspace{0.25 cm}
\uncover<3->{However, it can be a meaningful thing to do, taking all the caveats seriously, and certainly better than {\it guessing} or {\it assuming} we understand the community.}
\vspace{0.25 cm}
\uncover<4->{Inspiration: read Sharon Traweek's anthropological study of physicists at SLAC and KEK in the 1970s. Physicists can be data points!}
\column{0.3\linewidth}
\uncover<4->{\includegraphics[width=\linewidth]{traweek-beamtimes-and-lifetimes.jpg}}
\end{columns}
\end{frame}
\begin{frame}{User needs are very different from my expectations, 6 years ago}
\vspace{0.5 cm}
\begin{columns}
\column{1.12\linewidth}
\only<1>{\includegraphics[width=\linewidth]{evolving-views-1.png}}\only<2>{\includegraphics[width=\linewidth]{evolving-views-2.png}}\only<3>{\includegraphics[width=\linewidth]{evolving-views-3.png}}\only<4>{\includegraphics[width=\linewidth]{evolving-views-4.png}}
\end{columns}
\end{frame}
\begin{frame}{Ways to study humans}
\vspace{0.1 cm}
\textcolor{gray}{\scriptsize (Important note: I am not an expert. Below is what I learned from college friends who went into social sciences.)}
\large
\vspace{0.1 cm}
\begin{itemize}\setlength{\itemsep}{0.25 cm}
\item \textcolor{darkblue}{Qualitative:}
\vspace{0.05 cm}
\begin{itemize}\large\setlength{\itemsep}{0.15 cm}
\item \textcolor{darkblue}{Focus groups:} \normalsize most open to unexpected ideas. Want to keep the group size and mix such that participants are willing to speak up. Goal is to discover new {\it dimensions} of the vector space, not just points within it. \large
\item \textcolor{darkblue}{One-on-one interviews:} \normalsize can be deeper but less broad than focus groups. Lacks the multiplying effect of responding to each other's opinions. \large
\item \textcolor{darkblue}{History/documents:} \normalsize observational, rather than experimental, but this method can reach further into the past. \large
\end{itemize}
\item \textcolor{darkblue}{Quantitative:}
\vspace{0.05 cm}
\begin{itemize}\large\setlength{\itemsep}{0.15 cm}
\item \textcolor{darkblue}{Surveys:} \normalsize can get large, statistically meaningful datasets, at the cost of losing flexibility/openness to new ideas. Now you {\it are} filling in a vector space. \large
\item \textcolor{darkblue}{Proxy metrics:} \normalsize can measure what people {\it do}, rather than what they {\it say}. \large
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Proxy metrics: high statistics, cautious interpretation}
\vspace{0.25 cm}
\mbox{\hspace{0.75 cm}\includegraphics[width=\linewidth]{google-flu-trends.png}}
\begin{uncoverenv}<2->
\vspace{-3.35 cm}
\hspace{-0.25 cm}\begin{minipage}{0.3\linewidth}
{\bf Google Flu Trends}
{\bf (2008--2015)}
\small
\vspace{0.25 cm}
Count searches for
things like ``fever,'' ``cough,''
interpret as flu activity.
\vspace{0.25 cm}
(This was controversial.)
\end{minipage}
\vspace{3.35 cm}
\end{uncoverenv}
\end{frame}
\begin{frame}{Example: what happened here?}
\vspace{0.25 cm}
\textcolor{darkblue}{\mbox{\hspace{-0.5 cm}}GitHub stars versus time in IRIS-HEP projects}
\vspace{0.25 cm}
\begin{columns}
\column{0.75\linewidth}
\only<1-2>{\includegraphics[width=\linewidth]{irishep-as-stars-1.pdf}}\only<3->{\includegraphics[width=\linewidth]{irishep-as-stars-2.pdf}}
\column{0.25\linewidth}
\vspace{-0.5 cm}
\uncover<2->{What we're trying to explain is a big, qualitative feature, not the little bumps.}
\vspace{0.45 cm}
\uncover<3->{Temporally coincides with this event:}
\uncover<3->{\small\textcolor{blue}{\url{https://news.ycombinator.com/item?id=29576323}}}
\vspace{0.45 cm}
\uncover<4->{Is it causal?}
\vspace{0.45 cm}
\uncover<5->{It would be hard to believe it isn't.}
\vspace{0.5 cm}
\end{columns}
\end{frame}
\begin{frame}{Stacked download statistics for Scikit-HEP and related packages}
\vspace{-0.25 cm}
\begin{columns}
\column{1.2\linewidth}
\mbox{\hspace{-1 cm}\only<1>{\includegraphics[width=\linewidth]{pip-allos-scikithep-log.pdf}}\only<2>{\includegraphics[width=\linewidth]{pip-macwin-scikithep-log.pdf}}}
\end{columns}
\end{frame}
\begin{frame}{\mbox{ }}
\Large
\vspace{0.4 cm}
Linux includes batch jobs, which sometimes \mintinline{bash}{pip install} the same package on thousands of workers.
\vspace{0.5 cm}
\uncover<2->{Selecting only MacOS and Windows removes most batch jobs, but it excludes some individual users (like me), probably with a behavioral bias.}
\vspace{0.5 cm}
\uncover<3->{Still, there are continuous testing jobs on MacOS and Windows.}
\vspace{0.5 cm}
\uncover<4->{Do we even {\it want} to exclude these things? What do we {\it want} the observable to quantify?}
\end{frame}
\begin{frame}{How about this one: Python version in \only<1-2>{\mintinline{bash}{pip install uproot}}\only<3-4>{\mintinline{bash}{pip install numpy}}}
\vspace{-0.25 cm}
\begin{columns}
\column{1.2\linewidth}
\only<1>{\includegraphics[width=\linewidth]{pip-allos-pythonversion-uprootusers-lin.pdf}}\only<2>{\includegraphics[width=\linewidth]{pip-macwin-pythonversion-uprootusers-lin.pdf}}\only<3>{\includegraphics[width=\linewidth]{pip-allos-pythonversion-numpyusers-lin.pdf}}\only<4>{\includegraphics[width=\linewidth]{pip-macwin-pythonversion-numpyusers-lin.pdf}}
\end{columns}
\end{frame}
\begin{frame}{More often useful when {\it comparing} two things}
\vspace{0.25 cm}
\Large
\underline{Transition from ``old'' Awkward/Uproot/Coffea to ``new''}
\vspace{0.5 cm}
\begin{columns}
\column{1.2\linewidth}
\hspace{0.25 cm}\includegraphics[width=0.31\linewidth]{pip-allos-awkward-log.pdf}\includegraphics[width=0.31\linewidth]{pip-allos-uproot-log.pdf}\includegraphics[width=0.31\linewidth]{pip-allos-coffea-log.pdf}
\end{columns}
\end{frame}
\begin{frame}{Directed study: how are physicists using C++ and Python?}
\vspace{0.5 cm}
{\Large Analyze code in 11\,635 GitHub repos written by 2\,172 physicists:}
\vspace{0.25 cm}
\begin{enumerate}
\item Ask GitHub which users forked CMSSW and call them ``CMS physicists.'' (CMSSW has been on GitHub for a long enough time to see trends.)
\item Clone all of the physicists' repos (the ones that are not forks of something else).
\item Search the code of these repos and count matches.
\item Take care to exclude CMSSW configuration files, which are also Python.
\end{enumerate}
\begin{center}
\includegraphics[width=0.5\linewidth]{github-api-website.png}
\end{center}
\end{frame}
\begin{frame}{\only<1>{Language use: C++, Python, and Jupyter}\only<2>{Packages: ROOT, Scientific Python, Uproot/Awkward}}
\vspace{0.25 cm}
\textcolor{darkblue}{\mbox{\hspace{-0.5 cm}}\only<1>{Number of non-fork GitHub repos created by CMS physicists (users who forked CMSSW)}\only<2>{Same sample, now counting regex matches for \mintinline{python}{import XYZ}, \mintinline{python}{from XYZ import}, etc.}}
\vspace{-0.35 cm}
\begin{columns}
\column{1.15\linewidth}
\only<1>{\includegraphics[width=\linewidth]{gihub-language-fullstudy.pdf}}\only<2>{\includegraphics[width=\linewidth]{gihub-package-fullstudy.pdf}}
\end{columns}
\end{frame}
\begin{frame}{This is an update of my earlier GitHub-CMSSW study}
\vspace{0.5 cm}
\large
\begin{enumerate}\setlength{\itemsep}{0.35 cm}
\item Inclusively counting ``repo that contains a C++ file'' or ``a Python file,'' rather than GitHub's exclusive determination of ``repo language.''
\item Distinguishing between Python files that are CMSSW configurations and other Python files (GitHub doesn't).
\item I've downloaded all the repos, so I can run my own regex searches, rather than relying on GitHub's.
\end{enumerate}
\vspace{0.5 cm}
\begin{uncoverenv}<2->
We can do more: run clang-tidy/pylint? Features of libraries used?
\textcolor{blue}{\scriptsize\url{https://pivarski-princeton.s3.amazonaws.com/GitHub-CMSSW-user-nonfork-raw-data.tar}}
\vspace{0.5 cm}
\textcolor{gray}{\normalsize (Note: these are all public repos/public data.)}
\end{uncoverenv}
\end{frame}
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
\vspace{0.35 cm}
\textcolor{darkblue}{Programming languages in \only<1>{CHEP}\only<2>{ACAT} papers}
\begin{columns}
\column{1.05\linewidth}
\only<1>{\includegraphics[width=\linewidth]{chep-papers-language.pdf}}\only<2>{\includegraphics[width=\linewidth]{acat-papers-language.pdf}}
\end{columns}
\end{frame}
\begin{frame}{Lucas Taylor, {\it Summary of Data Analysis Track,} CHEP 2001}
\vspace{0.25 cm}
\begin{columns}
\column{0.05\linewidth}
\column{0.81\linewidth}
\includegraphics[width=\linewidth]{chep-2001-python.png}
\column{0.2\linewidth}
Note: PyROOT introduced in 2004 (v4.00/04).
\end{columns}
\end{frame}
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
\vspace{0.35 cm}
\textcolor{darkblue}{Programming paradigms in \only<1>{CHEP}\only<2>{ACAT} papers}
\begin{columns}
\column{1.05\linewidth}
\only<1>{\includegraphics[width=\linewidth]{chep-papers-paradigm.pdf}}\only<2>{\includegraphics[width=\linewidth]{acat-papers-paradigm.pdf}}
\end{columns}
\end{frame}
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
\vspace{0.35 cm}
\textcolor{darkblue}{Software frameworks in \only<1>{CHEP}\only<2>{ACAT} papers}
\begin{columns}
\column{1.05\linewidth}
\only<1>{\includegraphics[width=\linewidth]{chep-papers-package-1.pdf}}\only<2>{\includegraphics[width=\linewidth]{acat-papers-package-1.pdf}}
\end{columns}
\end{frame}
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
\vspace{0.35 cm}
\textcolor{darkblue}{Machine learning in \only<1>{CHEP}\only<2>{ACAT} papers}
\begin{columns}
\column{1.05\linewidth}
\only<1>{\includegraphics[width=\linewidth]{chep-papers-ml.pdf}}\only<2>{\includegraphics[width=\linewidth]{acat-papers-ml.pdf}}
\end{columns}
\end{frame}
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
\vspace{0.35 cm}
\textcolor{darkblue}{Hardware accelerators in \only<1>{CHEP}\only<2>{ACAT} papers}
\begin{columns}
\column{1.05\linewidth}
\only<1>{\includegraphics[width=\linewidth]{chep-papers-accelerator.pdf}}\only<2>{\includegraphics[width=\linewidth]{acat-papers-accelerator.pdf}}
\end{columns}
\end{frame}
\begin{frame}{Longer baseline: title/abstract matches in InspireHEP}
\vspace{0.35 cm}
\textcolor{darkblue}{Kinds of tasks in \only<1>{CHEP}\only<2>{ACAT} papers}
\begin{columns}
\column{1.05\linewidth}
\only<1>{\includegraphics[width=\linewidth]{chep-papers-task.pdf}}\only<2>{\includegraphics[width=\linewidth]{acat-papers-task.pdf}}
\end{columns}
\end{frame}
\begin{frame}{Conclusions}
\Large
\vspace{0.5 cm}
\begin{itemize}\setlength{\itemsep}{0.5 cm}
\item Different ways of understanding people, including the NHEP software community: focus groups, interviews, historical documents, surveys, and proxy metrics.
\item This talk focused on proxy metrics, which are quantitative, but you have to pay close attention to what they're quantifying.
\item Some clear trends and conclusions emerged. \mbox{Others are muddled.\hspace{-1 cm}}
\end{itemize}
\end{frame}
\begin{frame}{Conclusions}
\Large
\vspace{0.25 cm}
Relatively clear trends/conclusions.
\vspace{0.2 cm}
\begin{itemize}\setlength{\itemsep}{0.12 cm}\large
\item Scikit-HEP adoption increased by more than 10$\times$ since 2018. Maybe 100$\times$.
\item Most physicists use Python 3.9, ahead of the wider (NumPy) community.
\item The Awkward/Uproot/Coffea version update is done (more new than old).
\item C++ use in CMS is {\it steady, not decreasing}, while scientific Python (NumPy/Matplotlib/Pandas/Jupyter) is increasing past that level.
\item Python (in general, e.g.\ for configuration files) has been slowly increasing since 2000, at the same time as the abrupt Fortran $\to$ C++ switch.
\item Declarative/columnar interest is increasing, but nothing like ``object oriented'' in the 1990s. ``Arrays'' are coming back (since Fortran).
\item Machine learning is a {\it resurgence}, with more synonyms than in the 1990s. (GPUs/FPGAs on the same timescale.)
\item ``Analysis'' has grown as a preoccupation at CHEP and ACAT.
\end{itemize}
\end{frame}
\end{document}