-
Notifications
You must be signed in to change notification settings - Fork 1
/
bmc_article.tex
1568 lines (1258 loc) · 67.2 KB
/
bmc_article.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%% BioMed_Central_Tex_Template_v1.06
%% %
% bmc_article.tex ver: 1.06 %
% %
%%IMPORTANT: do not delete the first line of this template
%%It must be present to enable the BMC Submission system to
%%recognise this template!!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% LaTeX template for BioMed Central %%
%% journal article submissions %%
%% %%
%% <8 June 2012> %%
%% %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For instructions on how to fill out this Tex template %%
%% document please refer to Readme.html and the instructions for %%
%% authors page on the biomed central website %%
%% http://www.biomedcentral.com/info/authors/ %%
%% %%
%% Please do not use \input{...} to include other tex files. %%
%% Submit your LaTeX manuscript as one .tex document. %%
%% %%
%% All additional figures and files should be attached %%
%% separately and not embedded in the \TeX\ document itself. %%
%% %%
%% BioMed Central currently use the MikTex distribution of %%
%% TeX for Windows) of TeX and LaTeX. This is available from %%
%% http://www.miktex.org %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% additional documentclass options:
% [doublespacing]
% [linenumbers] - put the line numbers on margins
%%% loading packages, author definitions
\documentclass[twocolumn]{bmcart}% uncomment this for twocolumn layout and comment line below
% \documentclass{bmcart}
%%% Load packages
\usepackage{amsthm,amsmath}
\RequirePackage{natbib}
%\RequirePackage[authoryear]{natbib}% uncomment this for author-year bibliography
\RequirePackage{hyperref}
\usepackage[utf8]{inputenc} %unicode support
%\usepackage[applemac]{inputenc} %applemac support if unicode package fails
%\usepackage[latin1]{inputenc} %UNIX support if unicode package fails
\usepackage{lineno} % for line numbers
\usepackage{textcomp} % for single quotes
% orcid code from https://tex.stackexchange.com/questions/445563/ieeetran-how-to-include-orcid-in-tex-pdf-with-pdflatex/445583#445583
\usepackage{scalerel}
\usepackage{tikz}
\usetikzlibrary{svg.path}
\definecolor{orcidlogocol}{HTML}{A6CE39}
\tikzset{
orcidlogo/.pic={
\fill[orcidlogocol] svg{M256,128c0,70.7-57.3,128-128,128C57.3,256,0,198.7,0,128C0,57.3,57.3,0,128,0C198.7,0,256,57.3,256,128z};
\fill[white] svg{M86.3,186.2H70.9V79.1h15.4v48.4V186.2z}
svg{M108.9,79.1h41.6c39.6,0,57,28.3,57,53.6c0,27.5-21.5,53.6-56.8,53.6h-41.8V79.1z M124.3,172.4h24.5c34.9,0,42.9-26.5,42.9-39.7c0-21.5-13.7-39.7-43.7-39.7h-23.7V172.4z}
svg{M88.7,56.8c0,5.5-4.5,10.1-10.1,10.1c-5.6,0-10.1-4.6-10.1-10.1c0-5.6,4.5-10.1,10.1-10.1C84.2,46.7,88.7,51.3,88.7,56.8z};
}
}
\newcommand\orcidicon[1]{\href{https://orcid.org/#1}{\mbox{\scalerel*{
\begin{tikzpicture}[yscale=-1,transform shape]
\pic{orcidlogo};
\end{tikzpicture}
}{|}}}}
\usepackage{tcolorbox}
\usepackage{lineno}
\usepackage{textcomp}
\usepackage{enumitem}
\usepackage{graphicx}
\graphicspath{ {./images/} }
\usepackage{minted}
\usepackage{hyperref} %<--- Load after everything else
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% If you wish to display your graphics for %%
%% your own use using includegraphic or %%
%% includegraphics, then comment out the %%
%% following two lines of code. %%
%% NB: These line *must* be included when %%
%% submitting to BMC. %%
%% All figure files must be submitted as %%
%% separate graphics through the BMC %%
%% submission process, not included in the %%
%% submitted article. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% uncomment these to remove figures
%\def\includegraphic{}
%\def\includegraphics{}
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
%%% Put your definitions there:
\startlocaldefs
\usepackage{url}
\endlocaldefs
\newenvironment{handson_box}
{\begin{center}
\begin{tabular}{|p{0.9\textwidth/2}|}
\hline\\
}
{
\\\\\hline
\end{tabular}
\end{center}
}
\newcounter{handsoncounter}
\newtcolorbox[use counter=handsoncounter,number format=\arabic]{handson_box_colour}[2][]{boxrule=0.3mm,colback=gray!5!white,colframe=bmcblue, arc=0mm, title=Hands-on \thetcbcounter: #2,#1}
%\newtcolorbox{handson_box_colour}{boxrule=0.3mm,colback=gray!5!white,colframe=bmcblue, arc=0mm}
%%% Begin ...
\begin{document}
%%% Start of article front matter
\begin{frontmatter}
\begin{fmbox}
\dochead{Educational}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Intuitive, reproducible high-throughput molecular dynamics in Galaxy: a tutorial}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors here %%
%% %%
%% Specify information, if available, %%
%% in the form: %%
%% <key>={<id1>,<id2>} %%
%% <key>= %%
%% Comment or delete the keys which are %%
%% not used. Repeat \author command as much %%
%% as required. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author[ addressref={aff1}, % id's of addresses, e.g. {aff1,aff2}
%noteref={n1}, % id's of article notes, if any
email={[email protected]} % email address
]{\inits{SAB}\fnm{Simon A.} \snm{Bray}} \orcidicon{0000-0002-0621-6705}
\author[
addressref={aff2},
%noteref={n1},
email={}
]{\inits{TS}\fnm{Tharindu} \snm{Senapathi}} \orcidicon{0000-0002-3277-4022}
\author[
addressref={aff2},
email={[email protected]}
]{\inits{CBB}\fnm{Christopher B.} \snm{Barnett}} \orcidicon{0000-0002-1467-5741}
\author[
addressref={aff1},
corref={aff1,aff2},
email={[email protected]; [email protected]}
]{\inits{BAG}\fnm{Björn A.} \snm{Grüning}} \orcidicon{0000-0002-3079-6586}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors' addresses here %%
%% %%
%% Repeat \address commands as much as %%
%% required. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address[id=aff1]{% % unique id
\orgname{Department of Computer Science, University of Freiburg}, % university, etc
\street{Georges-Köhler-Allee 106}, %
%\postcode{} % post or zip code
\city{Freiburg}, % city
\cny{Germany} % country
}
\address[id=aff2]{%
\orgname{Scientific Computing Research Unit, University of Cape Town},
\postcode{7700}
\city{Cape Town},
\cny{South Africa}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter short notes here %%
%% %%
%% Short notes will be after addresses %%
%% on first page. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{artnotes}
%\note{Sample of title note} % note to the article
\note[id=n1]{Equal contributor} % note, connected to author
\end{artnotes}
\end{fmbox}% comment this for two column layout
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Abstract begins here %%
%% %%
%% Please refer to the Instructions for %%
%% authors on http://www.biomedcentral.com %%
%% and include the section headings %%
%% accordingly for your article type. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstractbox}
\begin{abstract} % abstract
This paper is a tutorial developed for the data analysis platform Galaxy. The purpose of Galaxy is to make high-throughput computational data analysis, such as molecular dynamics, a structured, reproducible and transparent process. In this tutorial we
focus on 3 questions: How are protein-ligand systems parameterized for
molecular dynamics simulation? What kind of analysis can be carried out
on molecular trajectories? How can high-throughput MD be used to study
multiple ligands? After finishing you will have learned about
force-fields and MD parameterization, how to conduct MD simulation
and analysis for a protein-ligand system, and understand how different
molecular interactions contribute to the binding affinity of
ligands to the Hsp90 protein.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The keywords begin here %%
%% %%
%% Put each keyword in separate \kwd{}. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{keyword}
\kwd{Galaxy}
\kwd{Molecular Dynamics}
\kwd{Reproducible}
\end{keyword}
% MSC classifications codes, if any
%\begin{keyword}[class=AMS]
%\kwd[Primary ]{}
%\kwd{}
%\kwd[; secondary ]{}
%\end{keyword}
\end{abstractbox}
%
%\end{fmbox}% uncomment this for twcolumn layout
\end{frontmatter}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Main Body begins here %%
%% %%
%% Please refer to the instructions for %%
%% authors on: %%
%% http://www.biomedcentral.com/info/authors%%
%% and include the section headings %%
%% accordingly for your article type. %%
%% %%
%% See the Results and Discussion section %%
%% for details on how to create sub-sections%%
%% %%
%% use \cite{...} to cite references %%
%% \cite{koon} and %%
%% \cite{oreg,khar,zvai,xjon,schn,pond} %%
%% \nocite{smith,marg,hunn,advi,koha,mouse}%%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%% start of article main body
% <put your article body there>
%%%%%%%%%%%%%%%%
%% Background %%
%%
%\linenumbers
\hypertarget{introduction}{%
\section*{Introduction}\label{introduction}}
Molecular dynamics (MD) is a commonly used method in computational chemistry and cheminformatics, in particular for studying the interactions between small molecules and large biological macromolecules such as proteins~\cite{berendsen01}. However, the barrier to entry for MD simulation is high; not only is the theory difficult to master, but commonly used MD software is technically demanding. Furthermore, generating reliable, reproducible simulation data requires the user to maintain detailed records of all parameters and files used, which again poses a challenge to newcomers to the field. One solution to the latter problem is usage of a workflow management system such as Galaxy~\cite{afgan_galaxy_2018}, which provides a selection of tools for molecular dynamics simulation and analysis~\cite{senapathi_biomolecular_2019}. MD simulations are rarely performed singly; in recent years, the concept of high-throughput molecular dynamics (HTMD) has come to the fore~\cite{HARVEY20121059,Guterres2020}. Galaxy lends itself well to this kind of study, as we will demonstrate in this paper, thanks to features allowing construction of complex workflows, which can then be executed on multiple inputs in parallel.
This tutorial provides a detailed workflow for high-throughput molecular dynamics with Galaxy, using the N-terminal domain (NTD) of Hsp90 (heat shock protein 90) as a case-study. Galaxy~\cite{afgan_galaxy_2018} is a data analysis platform that provides access to thousands of tools for scientific computation. It features a web-based user interface while automatically and transparently managing underlying computation details, enabling structured and reproducible high-throughput data analysis. This tutorial provides sample data, workflows, hands-on material and references for further reading. It presumes that the user has a basic understanding of the Galaxy platform. The aim is to guide the user through the various steps of a molecular dynamics study, from accessing publicly available crystal structures, to performing MD simulation (leveraging the popular GROMACS~\cite{abraham_gromacs_2015,Lemkul2019} engine), to analysis of the results.
The entire analysis described in this article can be conducted efficiently on any Galaxy server which has the needed tools. In particular, we recommend using the Galaxy Europe server
(\url{https://cheminformatics.usegalaxy.eu}) or the Galaxy South Africa server (\url{https://galaxy-compchem.ilifu.ac.za}). For users who wish to run their own Galaxy server locally, we provide a Docker container (\url{https://quay.io/repository/galaxy/computational-chemistry-training}) containing a full Galaxy installation, with all tools required for the tutorial preinstalled.
The tutorial presented in this article has been developed as part of the Galaxy Training Network~\cite{Batut2018} and its most up-to-date version is accessible online on the Galaxy Training Materials website~\cite{gtn_comp}, under the URL \url{https://training.galaxyproject.org/training-material/topics/computational-chemistry/tutorials/htmd-analysis/tutorial.html}.
\subsection*{What is high-throughput molecular
dynamics?}\label{what-is-high-throughput-molecular-dynamics}
Molecular dynamics (MD) is a method to simulate molecular motion by
iterative application of Newton's laws of motion. It is often applied to
large biomolecules such as proteins or nucleic acids. A common
application is to assess the interaction between these macromolecules
and a number of small molecules (e.g. potential drug candidates). This
tutorial provides a guide to setting up and running a high-throughput
workflow for screening multiple small molecules, using the open-source
GROMACS tools provided through the Galaxy platform. Following simulation, the trajectory data is analyzed using a range of tools to investigate structural properties and correlations over time.
\subsection*{Why is Hsp90 interesting to
study?}\label{why-is-hsp90-interesting-to-study}
The 90 kDa heat shock protein (Hsp90) is a chaperone protein responsible
for catalyzing the conversion of a wide variety of proteins to a
functional form; examples of the Hsp90 clientele, which totals several
hundred proteins, include nuclear steroid hormone receptors and protein
kinases~\cite{Pearl2006}. The mechanism by which Hsp90 acts varies between clients, as
does the client binding site; the process is dependent on
post-translational modifications of Hsp90 and the identity of
co-chaperones which bind and regulate the conformational cycle~\cite{Schopf2017}.
Due to its vital biochemical role as a chaperone protein involved in
facilitating the folding of many client proteins, Hsp90 is an attractive
pharmaceutical target. In particular, as protein folding is a potential
bottleneck to cellular reproduction and growth, blocking Hsp90
function using inhibitors which bind tightly to the ATP binding site of the NTD
could assist in treating cancer; for example, the antibiotic
geldanamycin and its analogs are under investigation as possible
anti-tumor agents~\cite{Stebbins1997,Hermane2019}.
In the structure which will be examined during this tutorial, the ligand of concern is a resorcinol, a common class of compounds with affinity for the Hsp90 N-terminal domain. It is registered in the PubChem database under the compound ID 135508238~\cite{ligand_resorcinol}. As can be seen by viewing the PDB structure, the resorcinol part of the structure is embedded in the binding site, bound by a hydrogen bond to residue aspartate-93. The ligand structure also contains a triazole and a fluorophenyl ring, which lie nearer to the surface of the protein.
\begin{figure}[ht!]
\includegraphics[width=0.4\textwidth]{hsp90lig}
\caption{\csentence{Hsp90 cartoon view.}
Hsp90 cartoon with ligands in active site, rendered using the Galaxy NGL plugin~\cite{Rose2018ngl}}
\label{fig:Hsp90}
\end{figure}
\hypertarget{methods}{%
\section*{Methods: Simulation}\label{methods}}
This section guides the reader through the step-by-step process required to prepare, run and analyze Hsp90. A brief explanation of the theory and purpose of each step is provided. Refer to the hands-on sections as these describe the task with tools and parameters to be carried out using Galaxy.
%\hypertarget{simulation}{%
%\subsection*{Simulation}\label{simulation}}
\subsection*{Get data}\label{get-data}
Create a new Galaxy history and then download a crystal structure for the Hsp90 protein from the Protein Data Bank (PDB). The structure is provided under accession code \texttt{6HHR}~\cite{Schuetz2018} and shows Hsp90 in complex with a the resorcinol ligand (Figure \ref{fig:Hsp90}).
\begin{handson_box_colour}{Data upload}
%\textbf{Hands-on: Data upload}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Create a new history for this tutorial
\item
Search Galaxy for the \textbf{Get PDB} tool. Request the accession code \texttt{6HHR}.
\item
Rename the dataset to `Hsp90 structure'
\end{enumerate}
\end{handson_box_colour}
\subsection*{Topology generation}\label{topology-generation}
The PDB structure is prepared for MD simulation in a process referred to as parameterization or topology generation.
GROMACS distinguishes between constant and dynamic attributes of the
atoms in the system. The constant attributes (e.g. atom charges, bonds
connecting atoms) are listed in the topology (TOP file), while dynamic
attributes (attributes that can change during a simulation, e.g. atom
position, velocities and forces) are stored in structure (PDB or GRO)
and trajectory (XTC and TRR) files.
The PDB file includes neither parameters for simulations, nor the positions of hydrogen atoms. Therefore, before beginning simulation, this information must be calculated and added to a topology file.
% perhaps some more discussion here and a citation of https://www.sciencedirect.com/science/article/pii/S1877117319302157
\subsection*{Extract protein and ligand
coordinates}\label{extract-protein-and-ligand-coordinates}
Parameterization is performed separately for the ligand and protein.
The PDB file is separated into two sets of
coordinates - one for the ligand and one for the protein - using the simple text manipulation tools integrated into Galaxy.
\begin{handson_box_colour}{Separate protein and ligand coordinates}
%\textbf{Hands-on: Separate protein and ligand coordinates}\label{hands-on-task-description}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
\textbf{Search in textfiles} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``Select lines from''}: `Hsp90 structure'
\item
\emph{``that''}: \texttt{Don\textquotesingle{}t\ Match}
\item
\emph{``Regular Expression''}: \texttt{HETATM}
\end{itemize}
\item
Rename output to `Protein (PDB)'
\item
\textbf{Search in textfiles} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``Select lines from''}: `Hsp90 structure'
\item
\emph{``that''}: \texttt{Match}
\item
\emph{``Regular Expression''}: \texttt{AG5E}
\end{itemize}
\item
Rename output to `Ligand (PDB)'
\end{enumerate}
\end{handson_box_colour}
The PDB file is filtered twice: once for lines which do not match \texttt{HETATM}, which returns a PDB file containing only protein, not ligand and solvent; and once for lines which match the ligand's identity code \texttt{AG5E}, which returns a PDB file containing only the ligand.
\hypertarget{set-up-protein-topology}{%
\subsection*{Set up protein topology}\label{set-up-protein-topology}}
The topology for the protein file is calculated with the \textbf{GROMACS initial setup} tool.
\begin{handson_box_colour}{Generate protein topology}
%\textbf{Hands-on: Generate protein topology}\label{hands-on-task-description-1}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{GROMACS initial setup} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``PDB input file''}: `Protein (PDB)' file
\item
\emph{``Force field''}: \texttt{AMBER99SB}
\item
\emph{``Water model''}: \texttt{TIP3P}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
A force field is essentially a function to calculate the potential energy of a system, based on various empirical parameters (for the atoms, bonds, charges, dihedral angles and so on). There are a number of families of force fields; some of the most commonly used include CHARMM~\cite{Vanommeslaeghe2009}, AMBER~\cite{Maier2015}, GROMOS~\cite{reif2012} and OpenFF~\cite{Mobley2018} (for a recent, accessible overview see ~\cite{Lemkul2020}). One of the main AMBER force fields for protein modeling, \texttt{ff99SB}, was selected.
Apart from the force field, a water model was selected to model the solvent; a wide range of models exist for this purpose. The common TIP3P model is selected, which is an example of a `three-site model' - so-called because the molecule is modeled using three points, corresponding to the three atoms of water (four- and five-site models include additional `dummy atoms' representing the negative charges of the lone pairs of the oxygen atom)~\cite{Onufriev2017}.
The tool produces four outputs: a GRO file (containing the coordinates
of the protein), a TOP file (containing other information, including
charges, masses, bonds and angles), an ITP file (which will be used to
restrain the protein position in the equilibration steps later on), and a
log file.
Please note all the GROMACS tools provided in Galaxy output a log file. These files provide useful information for
debugging purposes.
\subsection*{Generate a topology for the
ligand}\label{generate-a-topology-for-the-ligand}
The acpype~\cite{SousadaSilva2012} tool is used to generate a topology for the ligand. This provides a convenient interface to the AmberTools suite and creates the ligand topology in the format required by
GROMACS.
Inspecting the contents of the ligand PDB file shows that it contains no hydrogen atoms. Hydrogens were added to the topology using the `Compound conversion` tool (based on OpenBabel~\cite{OBoyle2011}).
\begin{handson_box_colour}{Generate ligand topology}
%\textbf{Hands-on: Generate ligand topology}\label{hands-on-task-description-2}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
\textbf{Compound conversion}:
\begin{itemize}
\tightlist
\item
\emph{``Molecular input file''}: `Ligand (PDB)'
\item
\emph{``Output format''}: \texttt{Protein Data Bank format (pdb)}
\item
\emph{``Add hydrogens appropriate for pH''}: \texttt{7.0}
\end{itemize}
\item
Rename the output file to ´Hydrated ligand (PDB)'
\item
\textbf{Generate MD topologies for small molecules}:
\begin{itemize}
\tightlist
\item
\emph{``Input file''}: `Ligand (PDB)'
\item
\emph{``Charge of the molecule''}: \texttt{0}
\item
\emph{``Multiplicity''}: \texttt{1}
\item
\emph{``Force field to use for parameterization''}:
\texttt{gaff}
\item
\emph{``Save GRO file?''}: \texttt{Yes}
\end{itemize}
\end{enumerate}
\end{handson_box_colour}
The GAFF (general AMBER force field) is selected, which is a generalized AMBER force field~\cite{Wang2004} which can be applied to almost any small organic molecule.
Appropriate charge and multiplicity parameters are entered. The ligand studied is a simple organic molecule, with no charge; therefore, the charge is set to 0 and the multiplicity to 1. Alternative values for multiplicity need only be considered for more exotic species such as metal complexes or carbenes.
%\subsection*{Solvation and energy minimization}\label{solvation-and-energy-minimization}
Next, the topologies are combined and the simulation box is defined.
\subsection*{Combine topology and GRO
files}\label{combine-topology-and-gro-files}
The separate topology and structure files for both protein and ligand are combined into a single set of files to continue with the simulation setup. A dedicated Galaxy tool is provided for this, using the Python library ParmEd~\cite{Swails2016}.
\begin{handson_box_colour}{Combine GRO and topology files}
%\textbf{Hands-on: Combine GRO and topology files}\label{hands-on-task-description-2}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{Merge GROMACS topologies} with the following
parameters:
\begin{itemize}
\tightlist
\item
\emph{``Protein topology (TOP) file''}: \texttt{TOP} file created by the \textbf{GROMACS initial setup tool}
\item
\emph{``Ligand topology (TOP or ITP) file''}: \texttt{Topology} file created by the \textbf{acpype tool}
\item
\emph{``Protein structure (GRO) file''}: \texttt{GRO} file created by the \textbf{GROMACS initial setup tool}
\item
\emph{``Ligand structure (GRO) file''}: \texttt{Structure file (GRO format)} file created by the \textbf{acpype tool}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
Note that, apart from this tool, the Galaxy platform also provides an integrated text editor for making more advanced changes to GROMACS topologies or configuration files.
\subsection*{\texorpdfstring{Create the simulation box}{Create the simulation box}}\label{create-the-simulation-box-with-gromacs-structure-configuration}
The next step, once combined coordinate (GRO) and topology (TOP) files
have been created, is to create a simulation box in which the system is
situated.
\begin{handson_box_colour}{Create simulation box}
%\hypertarget{hands-on-task-description-3}{%
%\textbf{Hands-on: Create simulation box}\label{hands-on-task-description-3}}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% %\tightlist
% \item
\textbf{GROMACS structure configuration} with the following
parameters:
\begin{itemize}
% \tightlist
\item
\emph{``Input structure''}: \texttt{System\ GRO\ file} (Input
dataset)
\item
\emph{``Configure box?''}: \texttt{Yes}
\begin{itemize}
% \tightlist
\item
\emph{``Box dimensions in nanometers''}: \texttt{1.0}
\item
\emph{``Box type''}: \texttt{Triclinic}
\end{itemize}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
This tool returns a new GRO structure file, containing the same coordinates as before, but defining a simulation box such that every atom is a minimum of 1 nm from the box boundary. A variety of box shapes are available to choose from: triclinic is selected, as it provides efficient packing in space and thus fewer computational resources need to be devoted to simulation of solvent.
\subsection*{Solvation}\label{solvation}
The next step is solvation of the newly created simulation box. Water was chosen as a solvent to in order to simulate biological conditions. Note
that the system is charged (depending on the pH) and it is neutralized by the addition of the sodium and chloride ions (replacing existing water molecules) using the solvation tool.
\begin{handson_box_colour}{Solvation}
%\textbf{Hands-on: Solvation}\label{hands-on-task-description-4}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{GROMACS solvation and adding ions} with the following
parameters:
\begin{itemize}
\tightlist
\item
\emph{``GRO structure file''}: output of
\textbf{GROMACS structure configuration}
\item
\emph{``System topology''}: \texttt{output}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
\subsection*{Energy minimization}\label{energy-minimization}
After the solvation step, parameterization of the system is complete and preparatory simulations can be performed. The first of these is energy minimization, which can be carried out using the
\textbf{GROMACS energy minimization} tool. The purpose of energy minimization is to relax the structure, removing any steric clashes or unusual geometry which would artificially raise the energy of the system.
\begin{handson_box_colour}{Energy minimization}
%\textbf{Hands-on: Energy minimization}\label{hands-on-task-description-5}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{GROMACS energy minimization} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``GRO structure file.''}: GRO output of
\textbf{GROMACS solvation and adding ions}
\item
\emph{``Topology (TOP) file.''}: TOP output of
\textbf{GROMACS solvation and adding ions}
\item
\emph{``Parameter input''}:
\texttt{Use\ default\ (partially\ customisable)\ setting}
\begin{itemize}
\tightlist
\item
\emph{``Number of steps for the MD simulation''}: \texttt{50000}
\item
\emph{``EM tolerance''}: \texttt{1000.0}
\end{itemize}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\item
Rename GRO output to \texttt{Minimized\ GRO\ file}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
The EM tolerance here refers to the maximum force which will be allowed in a minimized system. The simulation will be terminated when the maximum force is less than this value, or when 50000 steps have elapsed. The `Extract energy components' tool is used to plot the convergence of the potential energy during the minimization.
\begin{handson_box_colour}{Checking EM convergence}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
\textbf{Extract energy components with GROMACS} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``EDR file''}: EDR output of
\textbf{GROMACS energy minimization}
\item
\emph{``Terms to calculate''}: \texttt{Potential}
\item
\emph{``Output format''}:
\texttt{Galaxy tabular}
\end{itemize}
\item
On the output tabular file, click on the 'Visualize this data' icon. This provides a range of visualization options. Select `Line chart (jqPlot)'.
\item
In the visualization window which appears, click on `Select data.' Enter the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``Provide a label''}: \texttt{Energy potential}
\item
\emph{``Values for x-axis''}: \texttt{Column: 1}
\item
\emph{``Values for y-axis''}: \texttt{Column: 2}
\end{itemize}
\end{enumerate}
\end{handson_box_colour}
As seen in Figure \ref{fig:empot}, the system first drops rapidly in energy, before slowly converging on the minimized state.
\begin{figure}[ht!]
\includegraphics[width=0.4\textwidth]{em_potential}
\caption{\csentence{Energy potential}
during the EM simulation.}
\label{fig:empot}
\end{figure}
\subsection*{Equilibration}\label{equilibration}
At this point equilibration of the solvent around the solute (i.e. the protein) is necessary. This is performed in two stages: equilibration under an NVT (or isothermal-isochoric) ensemble, followed by an NPT (or isothermal-isobaric) ensemble. Use of the NVT ensemble entails maintaining constant number of particles, volume and temperature, while the NPT ensemble maintains constant number of particles, pressure and temperature. Simulation under the NVT ensemble allows the solvent to be brought to the desired temperature, while simulation under the NPT ensemble brings the solvent to the correct pressure.
For equilibration, the protein is held in place while the solvent is allowed to move freely around it. This is achieved using the position restraint file (ITP) created during the system setup. This restraint does not prevent protein movement rather movement is energetically penalized.
\begin{handson_box_colour}{NVT
equilibration}
%\textbf{Hands-on: NVT equilibration}\label{hands-on-nvt-equilibration}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{GROMACS simulation} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``GRO structure file''}: \texttt{Minimized\ GRO\ file} (from
energy minimization step)
\item
\emph{``Topology (TOP) file''}: TOP file produced by solvation step.
\item
In \emph{``Inputs''}:
\begin{itemize}
\tightlist
\item
\emph{``Position restraint (ITP) file''}: ITP file produced by
initial setup step.
\end{itemize}
\item
In \emph{``Outputs''}:
\begin{itemize}
\tightlist
\item
\emph{``Trajectory output''}:
\texttt{Return\ .xtc\ file\ (reduced\ precision)}
\item
\emph{``Structure output''}: \texttt{Return\ .gro\ file}
\item
\emph{``Produce a checkpoint (CPT) file''}:
\texttt{Produce\ CPT\ output}
\item
\emph{``Produce an energy (EDR) file''}:
\texttt{Produce\ EDR\ output}
\end{itemize}
\item
In \emph{``Settings''}:
\begin{itemize}
\tightlist
\item
\emph{``Parameter input''}:
\texttt{Use\ default\ (partially\ customisable)\ setting}
\begin{itemize}
\tightlist
\item
\emph{``Bond constraints (constraints)''}:
\texttt{All\ bonds\ (all-bonds).}
\item
\emph{``Temperature /K''}: \texttt{300}
\item
\emph{``Step length in ps''}: \texttt{0.002}
\item
\emph{``Number of steps that elapse between saving data points
(velocities, forces, energies)''}: \texttt{1000}
\item
\emph{``Number of steps for the simulation''}: \texttt{50000}
\end{itemize}
\end{itemize}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
Once the NVT equilibration is complete, it is worth using the `Extract energy components' tool again to check whether the system temperature has converged on 300 K. This can be done as described for energy minimization, this time specifying \texttt{Temperature} under \emph{Terms to calculate} rather than \texttt{Potential}. The plot should show the temperature reaching 300 K and remaining there, albeit with some fluctuation.
Having stabilized the temperature of the system with NVT equilibration, the pressure is stabilized by equilibrating using the NPT (constant number of particles, pressure,
temperature) ensemble. The NPT simulation is continued from the NVT simulation by using the checkpoint (CPT) file saved at the end of the NVT simulation.
\begin{handson_box_colour}{NPT
equilibration}
%\textbf{Hands-on: NPT equilibration}\label{hands-on-npt-equilibration}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{GROMACS simulation} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``GRO structure file''}: GRO output of \textbf{GROMACS
simulation} (NVT equilibration)
\item
\emph{``Topology (TOP) file''}: TOP file produced by solvation step.
\item
In \emph{``Inputs''}:
\begin{itemize}
\tightlist
\item
\emph{``Checkpoint (CPT) file''}: Output of \textbf{GROMACS
simulation} (NVT equilibration))
\item
\emph{``Position restraint (ITP) file''}: ITP file produced by
initial setup step.
\end{itemize}
\item
In \emph{``Outputs''}:
\begin{itemize}
\tightlist
\item
\emph{``Trajectory output''}:
\texttt{Return\ .xtc\ file\ (reduced\ precision)}
\item
\emph{``Structure output''}: \texttt{Return\ .gro\ file}
\item
\emph{``Produce a checkpoint (CPT) file''}:
\texttt{Produce\ CPT\ output}
\item
\emph{``Produce an energy (EDR) file''}:
\texttt{Produce\ EDR\ output}
\end{itemize}
\item
In \emph{``Settings''}:
\begin{itemize}
\tightlist
\item
\emph{``Ensemble''}: \texttt{Isothermal-isobaric\ ensemble\ (NPT)}
\item
\emph{``Parameter input''}:
\texttt{Use\ default\ (partially\ customisable)\ setting}
\begin{itemize}
\tightlist
\item
\emph{``Bond constraints (constraints)''}:
\texttt{All\ bonds\ (all-bonds).}
\item
\emph{``Temperature /K''}: \texttt{300}
\item
\emph{``Step length in ps''}: \texttt{0.002}
\item
\emph{``Number of steps that elapse between saving data points
(velocities, forces, energies)''}: \texttt{1000}
\item
\emph{``Number of steps for the simulation''}: \texttt{50000}
\end{itemize}
\end{itemize}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
After the NPT equilibration is complete, `Extract energy components' can be used again to view the pressure of the system. This is done as described for energy minimization, specifying \texttt{Pressure} under \emph{Terms to calculate}. The plot should show convergence on 1 bar and remain there, although some fluctuation is expected.
\subsection*{Production simulation}\label{production-simulation}
The restraints are removed and a production simulation is carried out for 1 million steps. With a step size of 1 fs, this simulation represents a total time length of 1 ns. This is rather short compared to the state-of-the-art, but sufficient for the purposes of a tutorial.
\begin{handson_box_colour}{Main simulation}
%\hypertarget{hands-on-task-description-6}{%
%\textbf{Hands-on: Task description}\label{hands-on-task-description-6}}
% \begin{enumerate}
% \def\labelenumi{\arabic{enumi}.}
% \tightlist
% \item
\textbf{GROMACS simulation} with the following parameters:
\begin{itemize}
\tightlist
\item
\emph{``GRO structure file''}: Output of \textbf{GROMACS simulation}
(NPT equilibration)
\item
\emph{``Topology (TOP) file''}: Output of the solvation step
\item
In \emph{``Inputs''}:
\begin{itemize}
\tightlist
\item
\emph{``Checkpoint (CPT) file''}: Output of \textbf{GROMACS
simulation} (NPT simulation))
\end{itemize}
\item
In \emph{``Outputs''}:
\begin{itemize}
\tightlist
\item
\emph{``Trajectory output''}:
\texttt{Return\ .xtc\ file\ (reduced\ precision)}
\item
\emph{``Structure output''}: \texttt{Return\ .gro\ file}
\item
\emph{``Produce a checkpoint (CPT) file''}:
\texttt{Produce\ CPT\ output}
\end{itemize}
\item
In \emph{``Settings''}:
\begin{itemize}
\tightlist
\item
\emph{``Ensemble''}: \texttt{Isothermal-isobaric\ ensemble\ (NPT)}
\item
\emph{``Parameter input''}:
\texttt{Use\ default\ (partially\ customisable)\ setting}
\begin{itemize}
\tightlist
\item
\emph{``Temperature /K''}: \texttt{300}
\item
\emph{``Step length in ps''}: \texttt{0.001}
\item
\emph{``Number of steps that elapse between saving data points
(velocities, forces, energies)''}: \texttt{1000}
\item
\emph{``Number of steps for the simulation''}: \texttt{1000000}
\end{itemize}
\end{itemize}
\item
\emph{``Generate detailed log''}: \texttt{Yes}
\end{itemize}
% \end{enumerate}
\end{handson_box_colour}
\hypertarget{analysis}{%
\section*{Methods: Analysis}\label{analysis}}
An analysis of the GROMACS simulation outputs (structure and trajectory file) will be carried out using Galaxy tools developed for computational chemistry~\cite{senapathi_biomolecular_2019} based on popular analysis software, such as MDAnalysis~\cite{michaudagrawal_mdanalysis_2011}, MDTraj~\cite{mcgibbon_mdtraj_2015}, and Bio3D~\cite{skjaerven_integrating_2014}. These tools output both tabular files as well as a variety of attractive plots.
\hypertarget{create-pdb-file-needed-by-most-analysis-tools}{%