diff --git a/.Rbuildignore b/.Rbuildignore index 7d41252..ac06f0b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,8 +18,7 @@ _files/ ^docs$ ^pkgdown$ ^\.github$ -^paper.md$ -^paper.bib$ +paper/ ^doc$ ^vignettes/articles$ ^optimize-figures.sh$ diff --git a/paper.bib b/paper.bib deleted file mode 100644 index f47fd74..0000000 --- a/paper.bib +++ /dev/null @@ -1,212 +0,0 @@ - -@ARTICLE{Robinson2020, - title = "{IPD-IMGT/HLA Database}", - author = "Robinson, James and Barker, Dominic J and Georgiou, Xenia and - Cooper, Michael A and Flicek, Paul and Marsh, Steven G E", - abstract = "The IPD-IMGT/HLA Database, http://www.ebi.ac.uk/ipd/imgt/hla/, - currently contains over 25 000 allele sequence for 45 genes, - which are located within the Major Histocompatibility Complex - (MHC) of the human genome. This region is the most polymorphic - region of the human genome, and the levels of polymorphism seen - exceed most other genes. Some of the genes have several thousand - variants and are now termed hyperpolymorphic, rather than just - simply polymorphic. The IPD-IMGT/HLA Database has provided a - stable, highly accessible, user-friendly repository for this - information, providing the scientific and medical community - access to the many variant sequences of this gene system, that - are critical for the successful outcome of transplantation. The - number of currently known variants, and dramatic increase in the - number of new variants being identified has necessitated a - dedicated resource with custom tools for curation and - publication. The challenge for the database is to continue to - provide a highly curated database of sequence variants, while - supporting the increased number of submissions and complexity of - sequences. 
In order to do this, traditional methods of accessing - and presenting data will be challenged, and new methods will need - to be utilized to keep pace with new discoveries.", - journal = "Nucleic acids research", - volume = 48, - number = "D1", - pages = "D948--D955", - month = jan, - year = 2020, - language = "en", - issn = "0305-1048, 1362-4962", - pmid = "31667505", - doi = "10.1093/nar/gkz950", - pmc = "PMC7145640" -} - - - -@ARTICLE{Pappas2016, - title = "{Bridging ImmunoGenomic Data Analysis Workflow Gaps (BIGDAWG): An - integrated case-control analysis pipeline}", - author = "Pappas, Derek J and Marin, Wesley and Hollenbach, Jill A and - Mack, Steven J", - abstract = "Bridging ImmunoGenomic Data-Analysis Workflow Gaps (BIGDAWG) is - an integrated data-analysis pipeline designed for the - standardized analysis of highly-polymorphic genetic data, - specifically for the HLA and KIR genetic systems. Most modern - genetic analysis programs are designed for the analysis of single - nucleotide polymorphisms, but the highly polymorphic nature of - HLA and KIR data require specialized methods of data analysis. - BIGDAWG performs case-control data analyses of highly polymorphic - genotype data characteristic of the HLA and KIR loci. BIGDAWG - performs tests for Hardy-Weinberg equilibrium, calculates allele - frequencies and bins low-frequency alleles for k$\times$2 and - 2$\times$2 chi-squared tests, and calculates odds ratios, - confidence intervals and p-values for each allele. When - multi-locus genotype data are available, BIGDAWG estimates - user-specified haplotypes and performs the same binning and - statistical calculations for each haplotype. For the HLA loci, - BIGDAWG performs the same analyses at the individual amino-acid - level. Finally, BIGDAWG generates figures and tables for each of - these comparisons. 
BIGDAWG obviates the error-prone reformatting - needed to traffic data between multiple programs, and streamlines - and standardizes the data-analysis process for case-control - studies of highly polymorphic data. BIGDAWG has been implemented - as the bigdawg R package and as a free web application at - bigdawg.immunogenomics.org.", - journal = "Human immunology", - volume = 77, - number = 3, - pages = "283--287", - month = mar, - year = 2016, - keywords = "Amino-acid analysis; BIGDAWG; Case-control analysis; HLA KIR data - analysis; Haplotype analysis; Hardy--Weinberg testing; R package; - Web app", - language = "en", - issn = "0198-8859, 1879-1166", - pmid = "26708359", - doi = "10.1016/j.humimm.2015.12.006", - pmc = "PMC4828284" -} - -@misc{imgthla, - author = "Robinson, James and Barker, Dominic and Georgiou, Xenia and Cooper, Michael", - title = {A GitHub repository with files currently published in the IPD-IMGT/HLA FTP Directory hosted at the European Bioinformatics Institute}, - year = {2014}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/ANHIG/IMGTHLA} -} - - -@UNPUBLISHED{Sakaue2022, - title = "{A statistical genetics guide to identifying HLA alleles driving - complex disease}", - author = "Sakaue, Saori and Gurajala, Saisriram and Curtis, Michelle and - Luo, Yang and Choi, Wanson and Ishigaki, Kazuyoshi and Kang, - Joyce B and Rumker, Laurie and Deutsch, Aaron J and - Sch{\"o}nherr, Sebastian and Forer, Lukas and LeFaive, Jonathon - and Fuchsberger, Christian and Han, Buhm and Lenz, Tobias L and - de Bakker, Paul I W and Smith, Albert V and Raychaudhuri, Soumya", - abstract = "The human leukocyte antigen (HLA) locus is associated with more - human complex diseases than any other locus. In many diseases it - explains more heritability than all other known loci combined. - Investigators have now demonstrated the accuracy of in silico HLA - imputation methods. 
These approaches enable rapid and accurate - estimation of HLA alleles in the millions of individuals that are - already genotyped on microarrays. HLA imputation has been used to - define causal variation in autoimmune diseases, such as type I - diabetes, and infectious diseases, such as HIV infection control. - However, there are few guidelines on performing HLA imputation, - association testing, and fine-mapping. Here, we present - comprehensive statistical genetics guide to impute HLA alleles - from genotype data. We provide detailed protocols, including - standard quality control measures for input genotyping data and - describe options to impute HLA alleles and amino acids including - a web-based Michigan Imputation Server. We updated the HLA - imputation reference panel representing global populations - (African, East Asian, European and Latino) available at the - Michigan Imputation Server ( n = 20,349) and achived high - imputation accuracy (mean dosage correlation r = 0.981). We - finally offer best practice recommendations to conduct - association tests in order to define the alleles, amino acids, - and haplotypes affecting human traits. This protocol will be - broadly applicable to the large-scale genotyping data and - contribute to defining the role of HLA in human diseases across - global populations. \#\#\# Competing Interest Statement B.H. is a - CTO of Genealogy Inc. T.L.L. 
is a co-inventor on a patent - application for using HED in predicting cancer immunotherapy - success.", - journal = "bioRxiv", - pages = "2022.08.24.504550", - month = aug, - year = 2022, - language = "en", - doi = "10.1101/2022.08.24.504550" -} - - -@ARTICLE{Kennedy2017, - title = "{What has GWAS done for HLA and disease associations?}", - author = "Kennedy, A E and Ozbek, U and Dorak, M T", - abstract = "The major histocompatibility complex (MHC) is located in - chromosome 6p21 and contains crucial regulators of immune - response, including human leucocyte antigen (HLA) genes, - alongside other genes with nonimmunological roles. More recently, - a repertoire of noncoding RNA genes, including expressed - pseudogenes, has also been identified. The MHC is the most gene - dense and most polymorphic part of the human genome. The region - exhibits haplotype-specific linkage disequilibrium patterns, - contains the strongest cis- and trans-eQTLs/meQTLs in the genome - and is known as a hot spot for disease associations. Another - layer of complexity is provided to the region by the extreme - structural variation and copy number variations. While the HLA-B - gene has the highest number of alleles, the HLA-DR/DQ subregion - is structurally most variable and shows the highest number of - disease associations. Reliance on a single reference sequence has - complicated the design, execution and analysis of GWAS for the - MHC region and not infrequently, the MHC region has even been - excluded from the analysis of GWAS data. Here, we contrast - features of the MHC region with the rest of the genome and - highlight its complexities, including its functional - polymorphisms beyond those determined by single nucleotide - polymorphisms or single amino acid residues. One of the several - issues with customary GWAS analysis is that it does not address - this additional layer of polymorphisms unique to the MHC region. 
- We highlight alternative approaches that may assist with the - analysis of GWAS data from the MHC region and unravel - associations with all functional polymorphisms beyond single - SNPs. We suggest that despite already showing the highest number - of disease associations, the true extent of the involvement of - the MHC region in disease genetics may not have been uncovered.", - journal = "International journal of immunogenetics", - volume = 44, - number = 5, - pages = "195--211", - month = oct, - year = 2017, - keywords = "HLA complex; disease predisposition; genetic predisposition to - disease; genetic variation; genome biology; genomewide - association studies", - language = "en", - issn = "1744-3121, 1744-313X", - pmid = "28877428", - doi = "10.1111/iji.12332" -} - -@ARTICLE{Gonzalez-Galarza2020, - title = "{Allele frequency net database (AFND) 2020 update: gold-standard - data classification, open access genotype data and new query - tools}", - author = "Gonzalez-Galarza, Faviel F and McCabe, Antony and Santos, Eduardo - J Melo Dos and Jones, James and Takeshita, Louise and - Ortega-Rivera, Nestor D and Cid-Pavon, Glenda M Del and - Ramsbottom, Kerry and Ghattaoraya, Gurpreet and Alfirevic, Ana - and Middleton, Derek and Jones, Andrew R", - journal = "Nucleic acids research", - volume = 48, - number = "D1", - pages = "D783--D788", - month = jan, - year = 2020, - language = "en", - issn = "0305-1048, 1362-4962", - pmid = "31722398", - doi = "10.1093/nar/gkz1029", - pmc = "PMC7145554" -} diff --git a/paper.md b/paper.md deleted file mode 100644 index a5d059c..0000000 --- a/paper.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: 'hlabud: An R package for analysis of HLA genotypes' -tags: - - R - - genetics - - immunology - - bioinformatics -authors: - - name: Kamil Slowikowski - orcid: 0000-0002-2843-6370 - corresponding: true - affiliation: "1, 2, 3, 4" - - name: Alexandra-Chloé Villani - orcid: 0000-0001-7461-0408 - affiliation: "1, 2, 3, 4" -affiliations: - - 
name: Center for Immunology and Inflammatory Diseases, Division of Rheumatology, Allergy an Immunology, Department of Medicine, Massachusetts General Hospital, Boston, MA, USA - index: 1 - - name: Massachusetts General Hospital, Cancer Center, Boston, MA, USA - index: 2 - - name: Broad Institute of Massachusetts Institute of Technology and Harvard, Cambridge, MA, USA - index: 3 - - name: Harvard Medical School, Boston, MA, USA - index: 4 -date: 12 June 2023 -bibliography: paper.bib - ---- - -# Summary - -Human leukocyte antigen (HLA) genes encode the proteins that display antigens for the immune system to recognize pathogens like bacteria and viruses. -Genes in the HLA locus on chromosome 6 in the human genome have thousands of different alleles in the human population. -The single-nucleotide polymorphisms (SNPs) encoding different amino acids in HLA genes are the genetic variants with the largest effect sizes, and they are associated with risk of developing autoimmune disease [@Kennedy2017]. -The fields of immunology and genomics aim to discover molecular factors, such as HLA genotypes, that explain the functions of the human immune system in health and disease. -Analysis of this genotype data requires computational methods for managing collections of genetic data and transforming data into different encodings for downstream analyses [@Sakaue2022]. - --![HLA-DRB1 genotypes embedded with UMAP](vignettes/articles/examples_files/figure-html/umap-pos13-1.png) - - -# Statement of need - -`hlabud` is an R package that simplifies the tasks of downloading and parsing data from the IMGT/HLA database of HLA genotypes and sequence alignments [@Robinson2020]. -The R programming language has a comprehensive repository of open-source libraries for statistical modeling and data visualization that can be applied to any data analysis. 
-The `hlabud` package provides functions that return convenient lists of matrices and tables to facilitate seamless integration with any downstream R packages. -HLA genotype data is lazily downloaded (as-needed) from the IMGT-HLA GitHub repository [@imgthla] and automatically cached in a user-configurable directory. -The documentation includes usage examples for analysis of the one-hot encoding of amino acid positions such as association analysis with logistic regression and low dimensional embedding with UMAP. -`hlabud` also provides direct access to the allele frequencies for all HLA genes from Allele Frequency Net Database (AFND) [@Gonzalez-Galarza2020]. - -`hlabud` can be used by biomedical researchers, but it can also be used by students in courses that teach immunology, genetics, and bioinformatics. -Interested readers might also consider another R package called `BIGDAWG` that provides functions for chi-squared Hardy-Weinberg and case-control association tests of highly polymorphic genetic data like HLA genotypes [@Pappas2016]. - -# Acknowledgments - -This work was supported by a NIAID grant T32AR007258 (to K.S.) and the National Institute of Health Director’s New Innovator Award (DP2CA247831; to A.C.V.) - - -# References - - diff --git a/paper/build.sh b/paper/build.sh new file mode 100755 index 0000000..079fadf --- /dev/null +++ b/paper/build.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# +# Install rustup: +# +# curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +# +# Then install rust and cargo with rustup. 
+# +# Then install typst: +# +# cargo install --git https://github.com/typst/typst +# + +# Build once +# typst compile main.typ + +# Continuously rebuild each time we make an edit +typst watch main.typ + + diff --git a/paper/figure-examples.png b/paper/figure-examples.png new file mode 100644 index 0000000..1ee8137 Binary files /dev/null and b/paper/figure-examples.png differ diff --git a/paper/lapreprint.typ b/paper/lapreprint.typ new file mode 100644 index 0000000..8e5884d --- /dev/null +++ b/paper/lapreprint.typ @@ -0,0 +1,303 @@ +#let template( + // The paper's title. + title: "Paper Title", + subtitle: none, + + // An array of authors. For each author you can specify a name, orcid, and affiliations. + // affiliations should be content, e.g. "1", which is shown in superscript and should match the affiliations list. + // Everything but the name is optional. + authors: (), + // This is the affiliations list. Include an id and `name` in each affiliation. These are shown below the authors. + affiliations: (), + // The paper's abstract. Can be omitted if you don't have one. + abstract: none, + // The short-title is shown in the running header + short-title: none, + // The short-citation is shown in the running header, if set to auto it will show the author(s) and the year in APA format. + short-citation: auto, + // The venue is shown in the footer + venue: none, + // An image path that is shown in the top right of the page. Can also be content. + logo: none, + // A DOI link, shown in the header on the first page. Should be just the DOI, e.g. `10.10123/123456`, not a URL + doi: none, + heading-numbering: "1.a.i", + // Show an Open Access badge on the first page, and support open science, default is true, because that is what the default should be. + open-access: true, + // A list of keywords to display after the abstract + keywords: (), + // The "kind" of the content, e.g. "Original Research", this is shown as the title of the margin content on the first page.
+ kind: none, + // Content to put on the margin of the first page + // Should be a list of dicts with `title` and `content` + margin: (), + paper-size: "us-letter", + // A color for the theme of the document + theme: blue.darken(30%), + // Date published, for example, when you publish your preprint to an archive server. + // To hide the date, set this to `none`. You can also supply a list of dicts with `title` and `date`. + date: datetime.today(), + // Feel free to change this, the font applies to the whole document + font-face: "Noto Sans", + // The path to a bibliography file if you want to cite some external works. + bibliography-file: none, + bibliography-style: "apa", + // The paper's content. + body +) = { + + /* Logos */ + let orcidSvg = ``` ```.text + + let spacer = text(fill: gray)[#h(8pt) | #h(8pt)] + + let dates; + if (type(date) == "datetime") { + dates = ((title: "Published", date: date),) + }else if (type(date) == "dictionary") { + dates = (date,) + } else { + dates = date + } + date = dates.at(0).date + + // Create a short-citation, e.g. Cockett et al., 2023 + let year = if (date != none) { ", " + date.display("[year]") } + if (short-citation == auto and authors.len() == 1) { + short-citation = authors.at(0).name.split(" ").last() + year + } else if (short-citation == auto and authors.len() == 2) { + short-citation = authors.at(0).name.split(" ").last() + " & " + authors.at(1).name.split(" ").last() + year + } else if (short-citation == auto and authors.len() > 2) { + short-citation = authors.at(0).name.split(" ").last() + " " + emph("et al.") + year + } + + // Set document metadata. 
+ set document(title: title, author: authors.map(author => author.name)) + + show link: it => [#text(fill: theme)[#it]] + show ref: it => [#text(fill: theme)[#it]] + + set page( + paper-size, + margin: (left: 25%), + header: locate(loc => { + if(loc.page() == 1) { + let headers = ( + if (open-access) {smallcaps[Open Access]}, + if (doi != none) { link("https://doi.org/" + doi, "https://doi.org/" + doi)} + ) + return align(left, text(size: 8pt, fill: gray, headers.filter(header => header != none).join(spacer))) + } else { + return align(right, text(size: 8pt, fill: gray.darken(50%), + (short-title, short-citation).join(spacer) + )) + } + }), + footer: block( + width: 100%, + stroke: (top: 1pt + gray), + inset: (top: 8pt, right: 2pt), + [ + #grid(columns: (75%, 25%), + align(left, text(size: 9pt, fill: gray.darken(50%), + ( + if(venue != none) {emph(venue)}, + if(date != none) {date.display("[month repr:long] [day], [year]")} + ).filter(t => t != none).join(spacer) + )), + align(right)[ + #text( + size: 9pt, fill: gray.darken(50%) + )[ + #counter(page).display() of #locate((loc) => {counter(page).final(loc).first()}) + ] + ] + ) + ] + ) + ) + + // Set the body font. + set text(font: font-face, size: 10pt) + // Configure equation numbering and spacing. + set math.equation(numbering: "(1)") + show math.equation: set block(spacing: 1em) + + // Configure lists. + set enum(indent: 10pt, body-indent: 9pt) + set list(indent: 10pt, body-indent: 9pt) + + // Configure headings. + set heading(numbering: heading-numbering) + show heading: it => locate(loc => { + // Find out the final number of the heading counter. + let levels = counter(heading).at(loc) + set text(10pt, weight: 400) + if it.level == 1 [ + // First-level headings are centered smallcaps. + // We don't want to number of the acknowledgment section. 
+ #let is-ack = it.body in ([Acknowledgment], [Acknowledgement]) + // #set align(center) + #set text(if is-ack { 10pt } else { 12pt }) + #show: smallcaps + #v(20pt, weak: true) + #if it.numbering != none and not is-ack { + numbering(heading-numbering, ..levels) + [.] + h(7pt, weak: true) + } + #it.body + #v(13.75pt, weak: true) + ] else if it.level == 2 [ + // Second-level headings are run-ins. + #set par(first-line-indent: 0pt) + #set text(style: "italic") + #v(10pt, weak: true) + #if it.numbering != none { + numbering(heading-numbering, ..levels) + [.] + h(7pt, weak: true) + } + #it.body + #v(10pt, weak: true) + ] else [ + // Third level headings are run-ins too, but different. + #if it.level == 3 { + numbering(heading-numbering, ..levels) + [. ] + } + _#(it.body):_ + ] + }) + + + if (logo != none) { + place( + top, + dx: -33%, + float: false, + box( + width: 27%, + { + if (type(logo) == "content") { + logo + } else { + image(logo, width: 100%) + } + }, + ), + ) + } + + + // Title and subtitle + box(inset: (bottom: 2pt), text(17pt, weight: "bold", fill: theme, title)) + if subtitle != none { + parbreak() + box(text(14pt, fill: gray.darken(30%), subtitle)) + } + // Authors and affiliations + if authors.len() > 0 { + box(inset: (y: 10pt), { + authors.map(author => { + text(11pt, weight: "semibold", author.name) + h(1pt) + if "affiliations" in author { + super(author.affiliations) + } + if "orcid" in author { + link("https://orcid.org/" + author.orcid)[#box(height: 1.1em, baseline: 13.5%)[#image.decode(orcidSvg)]] + } + }).join(", ", last: ", and ") + }) + } + if affiliations.len() > 0 { + box(inset: (bottom: 10pt), { + affiliations.map(affiliation => { + super(affiliation.id) + h(1pt) + affiliation.name + }).join(", ") + }) + } + + + place( + left + bottom, + dx: -33%, + dy: -10pt, + box(width: 27%, { + if (kind != none) { + show par: set block(spacing: 0em) + text(11pt, fill: theme, weight: "semibold", smallcaps(kind)) + parbreak() + } + if (dates != none) { + let 
formatted-dates + + grid(columns: (40%, 60%), gutter: 7pt, + ..dates.zip(range(dates.len())).map((formatted-dates) => { + let d = formatted-dates.at(0); + let i = formatted-dates.at(1); + let weight = "light" + if (i == 0) { + weight = "bold" + } + return ( + text(size: 7pt, fill: theme, weight: weight, d.title), + text(size: 7pt, d.date.display("[month repr:short] [day], [year]")) + ) + }).flatten() + ) + } + v(2em) + grid(columns: 1, gutter: 2em, ..margin.map(side => { + text(size: 7pt, { + if ("title" in side) { + text(fill: theme, weight: "bold", side.title) + [\ ] + } + set enum(indent: 0.1em, body-indent: 0.25em) + set list(indent: 0.1em, body-indent: 0.25em) + side.content + }) + })) + }), + ) + + + let abstracts + if (type(abstract) == "content") { + abstracts = (title: "Abstract", content: abstract) + } else { + abstracts = abstract + } + + box(inset: (top: 16pt, bottom: 16pt), stroke: (top: 1pt + gray, bottom: 1pt + gray), { + + abstracts.map(abs => { + set par(justify: true) + text(fill: theme, weight: "semibold", size: 9pt, abs.title) + parbreak() + abs.content + }).join(parbreak()) + }) + if (keywords.len() > 0) { + text(size: 9pt, { + text(fill: theme, weight: "semibold", "Keywords") + h(8pt) + keywords.join(", ") + }) + } + v(10pt) + + show par: set block(spacing: 1.5em) + + // Display the paper's contents. 
+ body + + if (bibliography-file != none) { + show bibliography: set text(8pt) + bibliography(bibliography-file, title: text(10pt, "References"), style: bibliography-style) + } +} \ No newline at end of file diff --git a/paper/logo.png b/paper/logo.png new file mode 100644 index 0000000..fbaa9f5 Binary files /dev/null and b/paper/logo.png differ diff --git a/paper/main.pdf b/paper/main.pdf new file mode 100644 index 0000000..1a77206 Binary files /dev/null and b/paper/main.pdf differ diff --git a/paper/main.typ b/paper/main.typ new file mode 100644 index 0000000..3939a65 --- /dev/null +++ b/paper/main.typ @@ -0,0 +1,147 @@ +//#import "template.typ": * + +#import "lapreprint.typ": template + +// Take a look at the file `template.typ` in the file panel +// to customize this template and discover how it works. +#show: template.with( + title: "hlabud: HLA genotype analysis in R", + short-title: "hlabud", + venue: [bio#text(fill: red.darken(20%))[R]$chi$iv], + // You can make all dates optional, however, `date` is by default `datetime.today()` + //date: ( + // (title: "Published", date: datetime(year: 2023, month: 08, day: 21)), + // (title: "Accepted", date: datetime(year: 2022, month: 12, day: 10)), + // (title: "Submitted", date: datetime(year: 2022, month: 12, day: 10)), + //), + logo: "logo.png", + theme: blue.darken(20%), + font-face: "Noto Sans", + paper-size: "us-letter", + authors: ( + ( + name: "Kamil Slowikowski", + email: "kslowikowski@mgh.harvard.edu", + affiliations: "1,2,3,4", + orcid: "0000-0002-2843-6370" + ), + ( + name: "Alexandra-Chloe Villani", + email: "avillani@mgh.harvard.edu", + affiliations: "1,2,3,4", + orcid: "0000-0001-7461-0408" + ), + ), + affiliations: ( + (id: "1", name: "Center for Immunology and Inflammatory Diseases, Division of Rheumatology, Allergy and Immunology, Department of Medicine, Massachusetts General Hospital"), + (id: "2", name: "Cancer Center, Massachusetts General Hospital"), + (id: "3", name: "Broad Institute"), + (id:
"4", name: "Harvard Medical School") + ), + kind: "Pre-Print", + // Insert your abstract after the colon, wrapped in brackets. + abstract: ( + (title: "Summary", content: [The human leukocyte antigen (HLA) genes have thousands of different alleles in the human population, and have more associations with human diseases than any other genes. Data for all known HLA genotypes are curated in the international ImMunoGeneTics (IMGT) database in versioned releases on #link("https://github.com/ANHIG/IMGTHLA")[GitHub]. Here, we introduce _hlabud_, an R package that provides access to data from the IMGT/HLA database and the Allele Frequency Net Database (AFND), functions to encode the data in different formats, and tutorials for association analysis, embedding, and HLA divergence.]), + (title: "Availability", content: [Source code and documentation are available at *#link("https://github.com/slowkow/hlabud")[github.com/slowkow/hlabud]*]), + (title: "Contact", content: [#link("mailto:kslowikowski@mgh.harvard.edu")[kslowikowski\@mgh.harvard.edu]]) + ), + keywords: ("immunoinformatics", "genetics", "immunology", "HLA"), + open-access: true, +) + += Introduction + +Human leukocyte antigen (HLA) genes encode the proteins that display antigens so the immune system can recognize pathogens such as bacteria and viruses. +Geneticists have identified thousands of variants (e.g. single nucleotide polymorphisms) in the human genome that are associated with hundreds of different diseases and phenotypes @Kennedy2017. + +The HLA genes encode a protein complex that presents antigens to other cells. + +To facilitate HLA genotype analysis, we developed _hlabud_, a free and open-source software package that downloads information from the IMGT/HLA database of HLA genotypes and sequence alignments @Robinson2020 directly in the R programming environment. +The _hlabud_ package provides functions that return convenient lists of items, where each item is either a matrix or a data frame.
+The simple design makes _hlabud_ easy to integrate with any downstream R packages for data analysis or visualization. + +_hlabud_ downloads HLA genotype data from the IMGT-HLA GitHub repository @imgthla and automatically caches it in a user-configurable folder. +Functionality includes parsing the custom IMGT/HLA file format for multiple sequence alignments, converting sequence alignments to a one-hot matrix, and calculating the Grantham divergence between HLA alleles @Pierini2018. + +The documentation includes tutorials for analysis of the one-hot encoding of amino acid positions, including association analysis with logistic regression and low-dimensional embedding with UMAP @McInnes2018. +_hlabud_ also provides direct access to the allele frequencies for all HLA genes from the Allele Frequency Net Database (AFND) @Gonzalez-Galarza2020. + += Description + +Comprehensive HLA genotype data is curated in the IMGT/HLA database, and the data is archived in a GitHub repository (#link("https://github.com/ANHIG/IMGTHLA")[github.com/ANHIG/IMGTHLA]). +We can use _hlabud_ to download the sequence alignment data, read it into R, and automatically encode the data as a one-hot matrix like this: + +```R +a <- hla_alignments("DRB1") +``` + +When the user runs this line of code, _hlabud_ will: + +- Download data from the IMGT/HLA GitHub repository. + +- Cache data files in a local folder that supports multiple releases of the data. + +- Read the data into data frames and matrices for downstream analysis. + +- Create a one-hot encoding of the multiple sequence alignment data. + +Many amino acid residues at specific loci have been associated with human diseases and blood protein levels @Krishna2023. +Researchers have developed software tools for calling HLA genotypes with high accuracy from DNA-seq or RNA-seq next-generation sequencing reads @Claeys2023, so there are opportunities to use that data for association studies. + +Once we have a list of genotypes for each individual (e.g.
`"DRB1*04:01,DRB1*05:01"`), we can use _hlabud_ to prepare data for regression analysis to find which amino acid positions are associated with a phenotype in a sample of individuals. We call `dosage(genotypes, a$onehot)` where `genotypes` is a vector of genotypes and `a$onehot` is a one-hot matrix representation of HLA alleles (from the example above). The `dosage()` function returns the number of copies of each amino acid at each position for each individual, which can then be used for omnibus regression @Sakaue2023 or single-position testing (@fig1\A). + +UMAP accepts the one-hot matrix of HLA alleles as input, and it can be used to visualize the dataset in a latent space with reduced dimensionality (@fig1\B). + +_hlabud_ provides direct access to the allele frequencies of HLA genes reported in the Allele Frequency Net Database (AFND) (#link("http://allelefrequencies.net")) (@fig1\C). + +Each HLA allele binds a specific set of peptides. +So, an individual with two highly dissimilar alleles can bind a greater number of different peptides than a homozygous individual @Wakeland1990. +_hlabud_ implements the Grantham divergence calculations based on the original Perl code @Pierini2018: + +```R +my_genos <- c("A*23:01:12,A*24:550", "A*25:12N,A*11:27", "A*24:381,A*33:85") +hla_divergence(my_genos, method = "grantham") +#> A*23:01:12,A*24:550 A*25:12N,A*11:27 A*24:381,A*33:85 +#> 0.4924242 3.3333333 4.9015152 +``` + +#figure( + image("figure-examples.png", width: 71%), + caption: [(*A*) Association between amino acid positions and simulated case-control status. The x-axis represents the odds ratio and the y-axis represents $-log_10 P$ from a logistic regression analysis in R. + (*B*) 3,516 HLA-DRB1 alleles represented as dots in a two-dimensional embedding computed by UMAP from a one-hot encoding of amino acids.
+ (*C*) Allele frequencies for HLA-DQB1*02:01 in the AFND.], +) <fig1> + + += Installation and documentation + +_hlabud_ can be installed in an R session with: + +```R +remotes::install_github("slowkow/hlabud") +``` + +Each function is documented extensively, and the complete manual can be viewed on the _hlabud_ website at #link("https://slowkow.github.io/hlabud"). _hlabud_ has been tested on Linux/Unix, Mac OS (Darwin) and Windows. + += Discussion + +Our open-source R package _hlabud_ enables easy access to HLA data from two public databases, and provides functions to enable HLA divergence calculations, regression analysis, and low-dimensional embedding. We hope that _hlabud_ will raise awareness of the IMGT/HLA and AFND databases and influence other developers to share more open-source tools for HLA analysis. We envision that _hlabud_ will be used by biomedical researchers, and also by teachers and students who study genetics and bioinformatics. + += Acknowledgments + +This work was supported by a NIAID grant T32AR007258 (to K.S.) and the National Institutes of Health Director’s New Innovator Award (DP2CA247831; to A.C.V.). Thanks to Sreekar Mantena for reporting issues with the code. + += Competing Interests + +No competing interest is declared. + += Author contributions statement + +K.S. wrote the software and the manuscript. A.C.V. reviewed the manuscript. + += Related Work + +BIGDAWG is an R package that provides functions for chi-squared Hardy-Weinberg and case-control association tests of highly polymorphic genetic data like HLA genotypes @Pappas2016. HATK is a set of Python scripts for processing and analyzing IMGT-HLA data @Choi2020.
+ + +#bibliography("references.bib") diff --git a/paper/references.bib b/paper/references.bib new file mode 100644 index 0000000..f36de68 --- /dev/null +++ b/paper/references.bib @@ -0,0 +1,205 @@ + +@article{bahdanau2014neural, + title={Neural machine translation by jointly learning to align and translate}, + author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, + journal={arXiv preprint arXiv:1409.0473}, + year={2014} +} + +@article{Robinson2020, + title = "{IPD-IMGT/HLA Database}", + author = "Robinson, James and Barker, Dominic J and Georgiou, Xenia and + Cooper, Michael A and Flicek, Paul and Marsh, Steven G E", + journal = "Nucleic acids research", + volume = 48, + number = "D1", + pages = "D948--D955", + month = jan, + year = 2020, + language = "en", + issn = "0305-1048, 1362-4962", + pmid = "31667505", + doi = "10.1093/nar/gkz950", + pmc = "PMC7145640" +} + +@ARTICLE{Pappas2016, + title = "{Bridging ImmunoGenomic Data Analysis Workflow Gaps (BIGDAWG): An integrated case-control analysis pipeline}", + author = "Pappas, Derek J and Marin, Wesley and Hollenbach, Jill A and + Mack, Steven J", + journal = "Human immunology", + volume = 77, + number = 3, + pages = "283--287", + month = mar, + year = 2016, + keywords = "Amino-acid analysis; BIGDAWG; Case-control analysis; HLA KIR data + analysis; Haplotype analysis; Hardy--Weinberg testing; R package; + Web app", + language = "en", + issn = "0198-8859, 1879-1166", + pmid = "26708359", + doi = "10.1016/j.humimm.2015.12.006", + pmc = "PMC4828284" +} + +@misc{imgthla, + author = "Robinson, James and Barker, Dominic and Georgiou, Xenia and Cooper, Michael", + title = {A GitHub repository with files currently published in the IPD-IMGT/HLA FTP Directory hosted at the European Bioinformatics Institute}, + year = {2014}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/ANHIG/IMGTHLA} +} + + +@UNPUBLISHED{Sakaue2022, + title = "{A statistical genetics guide to identifying HLA 
alleles driving + complex disease}", + author = "Sakaue, Saori and Gurajala, Saisriram and Curtis, Michelle and + Luo, Yang and Choi, Wanson and Ishigaki, Kazuyoshi and Kang, + Joyce B and Rumker, Laurie and Deutsch, Aaron J and + Sch{\"o}nherr, Sebastian and Forer, Lukas and LeFaive, Jonathon + and Fuchsberger, Christian and Han, Buhm and Lenz, Tobias L and + de Bakker, Paul I W and Smith, Albert V and Raychaudhuri, Soumya", + journal = "bioRxiv", + pages = "2022.08.24.504550", + month = aug, + year = 2022, + language = "en", + doi = "10.1101/2022.08.24.504550" +} + + +@ARTICLE{Kennedy2017, + title = "{What has GWAS done for HLA and disease associations?}", + author = "Kennedy, A E and Ozbek, U and Dorak, M T", + journal = "International journal of immunogenetics", + volume = 44, + number = 5, + pages = "195--211", + month = oct, + year = 2017, + keywords = "HLA complex; disease predisposition; genetic predisposition to disease; genetic variation; genome biology; genomewide association studies", + language = "en", + issn = "1744-3121, 1744-313X", + pmid = "28877428", + doi = "10.1111/iji.12332" +} + +@ARTICLE{Gonzalez-Galarza2020, + title = "{Allele frequency net database (AFND) 2020 update: gold-standard data classification, open access genotype data and new query tools}", + author = "Gonzalez-Galarza, Faviel F and McCabe, Antony and Santos, Eduardo + J Melo Dos and Jones, James and Takeshita, Louise and + Ortega-Rivera, Nestor D and Cid-Pavon, Glenda M Del and + Ramsbottom, Kerry and Ghattaoraya, Gurpreet and Alfirevic, Ana + and Middleton, Derek and Jones, Andrew R", + journal = "Nucleic acids research", + volume = 48, + number = "D1", + pages = "D783--D788", + month = jan, + year = 2020, + language = "en", + issn = "0305-1048, 1362-4962", + pmid = "31722398", + doi = "10.1093/nar/gkz1029", + pmc = "PMC7145554" +} + +@article{Pierini2018, + title = {Divergent Allele Advantage at Human MHC Genes: Signatures of Past and Ongoing Selection}, + volume = {35}, + ISSN 
= {1537-1719}, + url = {http://dx.doi.org/10.1093/molbev/msy116}, + DOI = {10.1093/molbev/msy116}, + number = {9}, + journal = {Molecular Biology and Evolution}, + publisher = {Oxford University Press (OUP)}, + author = {Pierini, Federica and Lenz, Tobias L}, + editor = {Wilke, Claus}, + year = {2018}, + month = jun, + pages = {2145--2158} +} + +@misc{McInnes2018, + doi = {10.48550/ARXIV.1802.03426}, + url = {https://arxiv.org/abs/1802.03426}, + author = {McInnes, Leland and Healy, John and Melville, James}, + keywords = {Machine Learning (stat.ML), Computational Geometry (cs.CG), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}, + publisher = {arXiv}, + year = {2018}, + copyright = {arXiv.org perpetual, non-exclusive license} +} + +@article{Krishna2023, + title = {The influence of HLA genetic variation on plasma protein expression}, + url = {http://dx.doi.org/10.1101/2023.07.24.550394}, + DOI = {10.1101/2023.07.24.550394}, + publisher = {Cold Spring Harbor Laboratory}, + author = {Krishna, Chirag and Chiou, Joshua and Lee, Isac and Kim, Hye In and Aksit, Melis Atalar and Sakaue, Saori and Schack, David Von and Raychaudhuri, Soumya and Ziemek, Daniel and Hu, Xinli}, + year = {2023}, + month = jul +} + +@article{Claeys2023, + title = {Benchmark of tools for in silico prediction of MHC class I and class II genotypes from NGS data}, + volume = {24}, + ISSN = {1471-2164}, + url = {http://dx.doi.org/10.1186/s12864-023-09351-z}, + DOI = {10.1186/s12864-023-09351-z}, + number = {1}, + journal = {BMC Genomics}, + publisher = {Springer Science and Business Media LLC}, + author = {Claeys, Arne and Merseburger, Peter and Staut, Jasper and Marchal, Kathleen and Van den Eynden, Jimmy}, + year = {2023}, + month = may +} + +@article{Sakaue2023, + title = {Tutorial: a statistical genetics guide to identifying HLA alleles driving complex 
disease}, + volume = {18}, + ISSN = {1750-2799}, + url = {http://dx.doi.org/10.1038/s41596-023-00853-4}, + DOI = {10.1038/s41596-023-00853-4}, + number = {9}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media LLC}, + author = {Sakaue, Saori and Gurajala, Saisriram and Curtis, Michelle and Luo, Yang and Choi, Wanson and Ishigaki, Kazuyoshi and Kang, Joyce B. and Rumker, Laurie and Deutsch, Aaron J. and Sch{\"o}nherr, Sebastian and Forer, Lukas and LeFaive, Jonathon and Fuchsberger, Christian and Han, Buhm and Lenz, Tobias L. and de Bakker, Paul I. W. and Okada, Yukinori and Smith, Albert V. and Raychaudhuri, Soumya}, + year = {2023}, + month = jul, + pages = {2625--2641} +} + +@article{Choi2020, + title = {HATK: HLA analysis toolkit}, + volume = {37}, + ISSN = {1367-4811}, + url = {http://dx.doi.org/10.1093/bioinformatics/btaa684}, + DOI = {10.1093/bioinformatics/btaa684}, + number = {3}, + journal = {Bioinformatics}, + publisher = {Oxford University Press (OUP)}, + author = {Choi, Wanson and Luo, Yang and Raychaudhuri, Soumya and Han, Buhm}, + editor = {Valencia, Alfonso}, + year = {2020}, + month = jul, + pages = {416--418} +} + +@article{Wakeland1990, + title = {Ancestral polymorphisms of MHC class II genes: Divergent allele advantage}, + volume = {9}, + ISSN = {1559-0755}, + url = {http://dx.doi.org/10.1007/BF02918202}, + DOI = {10.1007/bf02918202}, + number = {2}, + journal = {Immunologic Research}, + publisher = {Springer Science and Business Media LLC}, + author = {Wakeland, Edward K. and Boehme, Stefen and She, Jin Xiong and Lu, Cheng-Chan and McIndoe, Richard A. and Cheng, Ivan and Ye, Ying and Potts, Wayne K.}, + year = {1990}, + month = jun, + pages = {115--122} +} \ No newline at end of file