diff --git a/manual/Ref_ZAS.bib b/manual/Ref_ZAS.bib index df9bff7..58708d2 100644 --- a/manual/Ref_ZAS.bib +++ b/manual/Ref_ZAS.bib @@ -382,6 +382,21 @@ @article{FriedlaenderEtAl08 Note="" } +@article{GarudEtAl15, + author = {Garud, Nandita R. and Messer, Philipp W. and Buzbas, Erkan O. and Petrov, Dmitri A.}, + journal = {PLOS Genetics}, + publisher = {Public Library of Science}, + title = {Recent Selective Sweeps in {North American} {Drosophila} melanogaster Show Signatures of Soft Sweeps}, + year = {2015}, + month = feb, + volume = {11}, + url = {https://doi.org/10.1371/journal.pgen.1005004}, + pages = {1--32}, + abstract = {Author Summary Evolutionary adaptation is a process in which beneficial mutations increase in frequency in response to selective pressures. If these mutations were previously rare or absent from the population, adaptation should generate a characteristic signature in the genetic diversity around the adaptive locus, known as a selective sweep. Such selective sweeps can be distinguished into hard selective sweeps, where only a single adaptive mutation rises in frequency, or soft selective sweeps, where multiple adaptive mutations at the same locus sweep through the population simultaneously. Here we design a new statistical method that can identify both hard and soft sweeps in population genomic data and apply this method to a Drosophila melanogaster population genomic dataset consisting of 145 sequenced strains collected in North Carolina. We find that selective sweeps were abundant in the recent history of this population. Interestingly, we also find that practically all of the strongest and most recent sweeps show patterns that are more consistent with soft rather than hard sweeps. 
We discuss the implications of these findings for the discovery and quantification of adaptation from population genomic data in Drosophila and other species with large population sizes.}, + number = {2}, + doi = {10.1371/journal.pgen.1005004} +} + @article{Garrod02, author="A. E. Garrod", year="1902", diff --git a/manual/selscan-manual.bbl b/manual/selscan-manual.bbl index 3ca4d44..fb0b399 100644 --- a/manual/selscan-manual.bbl +++ b/manual/selscan-manual.bbl @@ -21,6 +21,13 @@ Ferrer-Admetlla, A., Liang, M., Korneliussen, T., and Nielsen, R. 2014. structure. \newblock {\em Molecular Biology and Evolution\/}, {31}: 1275--1291. +\bibitem[Garud {\em et~al.}(2015)Garud, Messer, Buzbas, and + Petrov]{GarudEtAl15} +Garud, N.~R., Messer, P.~W., Buzbas, E.~O., and Petrov, D.~A. 2015. +\newblock Recent selective sweeps in {North American} {Drosophila} melanogaster + show signatures of soft sweeps. +\newblock {\em PLOS Genetics\/}, {11}(2): 1--32. + \bibitem[Purcell {\em et~al.}(2007)Purcell, Neale, K., Thomas, Ferreira, Bender, Maller, Sklar, {\relax de Bakker}, Daly, and Sham]{PurcellEtAl07} Purcell, S., Neale, B., K., T.-B., Thomas, L., Ferreira, M. 
A.~R., Bender, D., diff --git a/manual/selscan-manual.dvi b/manual/selscan-manual.dvi index 78f9097..cd2c391 100644 Binary files a/manual/selscan-manual.dvi and b/manual/selscan-manual.dvi differ diff --git a/manual/selscan-manual.pdf b/manual/selscan-manual.pdf index 15a0fdd..d041947 100644 Binary files a/manual/selscan-manual.pdf and b/manual/selscan-manual.pdf differ diff --git a/manual/selscan-manual.tex b/manual/selscan-manual.tex index 756f2d4..787edb4 100644 --- a/manual/selscan-manual.tex +++ b/manual/selscan-manual.tex @@ -17,7 +17,7 @@ \newcommand{\negspace}{\!\!\!\!\!\!\!\!\!\!\!\!} -\title{{\tt selscan} v 1.1.0b User Manual} +\title{{\tt selscan} v 1.2.0 User Manual} \date{\today} \author{Zachary A Szpiech} @@ -84,6 +84,11 @@ \section{Basic Usage} {\tt selscan --nsl --hap --map --out } +\noindent To calculate iHH12: + +{\tt selscan --ihh12 --hap --map --out } + + \section{Statistics implemented} Here we describe the various statistics implemented in {\tt selscan}. Sections @@ -214,6 +219,45 @@ \subsection{nSL}\label{sec:nsl} For the nSL option, there is no EHH decay cutoff, but the computation stops when more than $200$ snps are included in building the haplotypes (can be changed with {\tt --max-extend-nsl}). Scaling of $g(x_{i-1},x_i)$ and handling of gaps is done as for iHS, and these parameters are definable on the {\tt selscan} command line. +\subsection{Integrated Haplotype Homozygosity Pooled (iHH12)}\label{sec:ihh12} + +iHH12 is adapted from the H12 statistic \cite[]{GarudEtAl15}, with better power than iHS to detect soft sweeps. For this statistic, we calculate the integrated haplotype homozygosity of the entire sample, but we first collapse the top two most frequent haplotypes into a single class. + +To calculate iHH12 at a site, we first calculate the integrated haplotype +homozygosity (iHH) for the sample using $EHH12$ via trapezoidal quadrature. 
+\begin{align}\label{eq:ihh12} +iHH =& \notag\\ +&\negspace\sum_{i = 1}^{|\mathcal{D}|} \frac{1}{2}\left(EHH12(x_{i-1}) + EHH12(x_i)\right)g(x_{i-1},x_i) + \notag\\ +&\negspace\sum_{i = 1}^{|\mathcal{U}|} \frac{1}{2}\left(EHH12(x_{i-1}) + EHH12(x_i)\right)g(x_{i-1},x_i), +\end{align} +where $\mathcal{D}$ is the set of markers downstream from the current locus +such that $x_i \in \mathcal{D}$ denotes the $i^{th}$ closest downstream +marker from the locus of interest ($x_0$). $\mathcal{U}$ and $x_i \in \mathcal{U}$ are defined similarly +for upstream markers. $g(x_{i-1},x_i)$ gives the genetic distance between two +markers. $EHH12$ is then defined as +\begin{equation}\label{eq:ehh12} +EHH12(x_i) = \frac{{n_{h_1} + n_{h_2} \choose 2}}{{n \choose 2}} + \sum_{h \in \mathcal{C}(x_i)\setminus \{h_1, h_2\}} \frac{{n_h \choose 2}}{{n \choose 2}}, +\end{equation} +where $h_j$ is the $j^{th}$ most frequent haplotype in the sample and $n_{h_j}$ is the number of observed $h_j$ haplotypes. + +Finally, the scores are normalized across the +entire genome. +\begin{equation} +iHH12 = \frac{iHH - E\Big[iHH\Big]}{SD\Big[iHH\Big]}. +\end{equation} + +In practice, the summations in Equation \ref{eq:ihh12} are truncated once +$EHH12(x_i) < 0.05$ or the computation extends more than $1$ Mbp from the core. Additionally, with low-density SNP data, if the physical +distance $b$ (in kbp) between two markers is $> 20$, then $g(x_{i-1},x_i)$ is +scaled by a factor of $20/b$ in order to reduce possible spurious signals +induced by lengthy gaps. During computation, if the start/end of a chromosome +arm is reached before $EHH12(x_i) < 0.05$ or if a gap of $b > 200$ is +encountered, the iHH12 calculation is aborted for that locus. iHH12 is not +reported at core sites with minor allele frequency $< 0.05$. In {\tt selscan}, the +EHH truncation value, gap scaling factor, and core site MAF cutoff value are +all flexible parameters definable on the command line. 
+ + \subsection{Mean Pairwise Sequence Difference ($\pi$)}\label{sec:pi} The mean pairwise sequence difference among a sample of $n$ haplotypes is