#!/usr/bin/env bash

####################################################

#FILE         : courseparse.sh
#DESCRIPTION  : This bash script parses biocswirl course yaml files and imports them as bash variables

#USAGE        : courseparse.sh [--debug]
#OPTIONS      : --debug  echo script lines as they are read and print the
#                        parse_yaml output for the selected lesson
#REQUIREMENTS : yaml.sh in the same directory as this script; it is expected
#               to provide parse_yaml and create_variables
#BUGS         : ---
#NOTES        : ---

#AUTHOR       : Lisa N. Cao
#CONTACT      : ---
#DATE CREATED : 05/23/2020
#LAST REVISION:

####################################################

# Print a message on stderr and stop.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# source scripts
# Resolve yaml.sh next to this script rather than in the caller's working
# directory, so the script can be started from anywhere.
# shellcheck source=yaml.sh
source "$(dirname -- "${BASH_SOURCE[0]}")/yaml.sh"

# -a: export every variable assigned from here on
# -e: stop at the first unhandled failure
set -a -e

# pulled from jasperes test.sh
# Default to empty so a missing argument is handled explicitly.
DEBUG="${1:-}"

# Succeeds only when the first argument was --debug.
is_debug() {
  [[ "$DEBUG" == "--debug" ]]
}

# Echoing every script line is only useful while debugging.
if is_debug; then
  set -v
fi

# select lesson
# -r keeps backslashes in the typed path literal.
read -r -p "Specify Lesson YAML: " lesson || die "no lesson file given"
printf '%s\n' "$lesson"

[[ -r "$lesson" ]] || die "cannot read lesson file: $lesson"

# In debug mode show what the parser produces for the selected lesson.
if is_debug; then
  parse_yaml "$lesson" && echo
fi

# Execute
create_variables "$lesson"
+ AnswerChoices: FastQC;samtools;STAR;DESeq2 #uses ;, not like JSON + CorrectAnswer: FastQC + AnswerTests: omnitest(correctVal='FastQC') #instructions for evaluating correct answer + +- Class: text + Output: Go to + +#Import data into R + +- Class: text + Output: Now + +- Class: cmd_question + Output: In the R console, enter (without asterisks) *if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install()* + CorrectAnswer: if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install() + AnswerTests: omnitest(correctExpr='if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + BiocManager::install()') #seems like something to leave for an autograder + Hint: Type the expression into the R console. + +- Class: cmd_question + Output: Install the packages tidyverse, ___. + CorrectAnswer: BiocManager::install(c("tidyverse","___")) + AnswerTests: omnitest(correctExpr='BiocManager::install(c("tidyverse","___"))') + Hint: Refer to the bioconductor install page. Use quotations. + +- Class: cmd_question + Output: Load the installed packages. + CorrectAnswer: library("") + AnswerTests: omnitest(correctExpr='library("")') + Hint: Load each library separately. + +- Class: text + Output: Set working directory. + +- Class: cmd_question + Output: Input the .csv metadata file. + CorrectAnswer: read_csv("cell_metadata.csv") + AnswerTests: omnitest(correctExpr='read_csv("cell_metadata.csv")') + Hint: Use tidyverse. + +- Class: mult_question + Output: What are the dimensions of the metadata table? + AnswerChoices: 1680,16;16,1679;16,1680;1679,16; none of the above + CorrectAnswer: 1679,16 + AnswerTests: omnitest(correctVal='1679,16') + Hint: Rows first, then columns. + +- Class: text + Output: Confirm whether your data was imported correctly using the head() function. 
+ +# Counting reads + +- Class: text + Output: From the aligned sequence BAM files and a list of genomic features, we can now count how many reads map to each feature. First we must obtain the list which contains annotated genomic features. + +- Class: mult_question + Output: Which of the following file types contain annotated genomic features? + AnswerChoices: SAM, BAM, VCF, GTF, PDF + CorrectAnswer: GTF + AnswerTests: omnitest(correctVal='GTF') + +- Class: text + Output: General Transfer Format (GTF) files generally contain the chromosome number, the feature type name, and the nucleotide start/end position of the feature. Find and download your specific species from the Ensembl FTP Downloads https://uswest.ensembl.org/info/data/ftp/index.html. + +- Class: text + Output: Once the GTF file is downloaded, read counting can proceed. htseq-count is a python script that outputs a table with counts for each feature. Follow https://htseq.readthedocs.io/en/release_0.11.1/install.html for installation instructions. + +- Class: cmd_question + Output: Write a function that counts the features of a BAM input file, skipping all reads with alignment quality lower than 10. For the file names, use 'sample.gtf' and count all bam files + CorrectAnswer: htseq-count -f bam -a *.bam sample.gtf + AnswerTests: omnitest(correctExpr='htseq-count -f bam -a *.bam sample.gtf') + Hint: Make sure the correct options are used. Recall the bash wildcard when searching for a pattern to return all BAM files. + +# Remove low read counts + +- Class: text + Output: Raw single-cell data is noisy in that read counts contain many genes with zero counts or insignificantly low count numbers. We will be filtering these low counts before proceeding to differential gene expression. + +- Class: mult_question + Output: For less than how many counts should genes be filtered out? 
+ AnswerChoices: 1;5;10;20;it depends + CorrectAnswer: it depends + AnswerTests: omnitest(correctVal='it depends') + +- Class: text + Output: The filter threshold will depend on the differential expression algorithm that is being used. Refer to the specific differential expression vignette for details. + +# Normalization + +- Class: text + Output: There are many factors to account for which contribute to the number of reads mapped to a gene. Such factors include gene length, GC content, and sequencing depth; during library prep, variability in the number of molecules sequenced between samples may contribute to different total read counts for different samples. Normalization is therefore an essential step when converting raw read counts to gene expression values. + +- Class: text + Output: Normalization methods can be categorized into several types - Normalization by library size, normalization by distribution, and normalization by controls. Proceed to learn more about each method. + +- Class: text + Output: Normalization by library size removes differences in sequencing depth by dividing by the total number of reads in each sample. It assumes that the amount of total expression is the same under different experimental conditions. Reads per kilobase per million mapped reads (RPKM) and fragments per kilobase per million mapped reads (FPKM) are common methods that normalize by library size. + +- Class: text + Output: Normalization by distribution equilibrates expression levels for non-DE genes by comparing distributions of read counts across samples. This method assumes that DE and non-DE genes behave the same, and that there is roughly balanced expression across conditions. + +- Class: text + Output: Normalization by controls uses control genes that are not affected by biological conditions, so the same amount of control molecules is expected in every sample. A disparity in the number of control molecules sequenced allows for normalization.
+ +- Class: cmd_question + Output: Our sample data was normalized by library size. RPKM data was generated by aligning 100bp single-end reads using RSEM to the mm10 mouse genome. Now, import the file containing RPKM values into R. + CorrectAnswer: read_csv("genes_rpkm.csv") + AnswerTests: omnitest(correctExpr='read_csv("genes_rpkm.csv")') + Hint: Import using tidyverse, similarly to the metadata file. + +# Dimensionality Reduction + +- Class: text + Output: Raw single-cell data is noisy in that the data is high dimensional. High dimensional data is complex and can affect the accuracy of downstream algorithms since there is more data that needs to be generalized. Principal component analysis is a common method to perform dimensionality reduction. + +- Class: cmd_question + Output: Plot a PCA plot of the sample dataset. + CorrectAnswer: plotPCA(genes_rpkm) + AnswerTests: omnitest(correctExpr='plotPCA(genes_rpkm)') + Hint: Use plotPCA. + +# References +#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6171491/ diff --git a/bash-cli/yaml_parse/examples/studentinfo.yml b/bash-cli/yaml_parse/examples/studentinfo.yml new file mode 100644 index 0000000..7f9f714 --- /dev/null +++ b/bash-cli/yaml_parse/examples/studentinfo.yml @@ -0,0 +1,23 @@ +#################################################### + +#FILE : studentinfo.yaml +#DESCRIPTION : This YAML file stores and writes information about the student + +#OPTIONS : --- +#REQUIREMENTS : --- +#BUGS : --- +#NOTES : --- + +#AUTHOR : Lisa Cao +#CONTACT : --- +#DATE CREATED : 05/23/2020 +#LAST REVISION: + +#################################################### + +# ${login} + +student info: + name: '${student_uname}' + password: "${student_pass}" + diff --git a/bash-cli/yaml_parse/installyq.sh b/bash-cli/yaml_parse/installyq.sh new file mode 100755 index 0000000..70f723b --- /dev/null +++ b/bash-cli/yaml_parse/installyq.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys CC86BB64 +sudo 
add-apt-repository ppa:rmescandon/yq +sudo apt update +sudo apt install yq -y diff --git a/bash-cli/yaml_parse/yaml.sh b/bash-cli/yaml_parse/yaml.sh new file mode 100755 index 0000000..7b991fd --- /dev/null +++ b/bash-cli/yaml_parse/yaml.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1003 + +# Based on https://gist.github.com/pkuczynski/8665367 + +parse_yaml() { + local yaml_file=$1 + local prefix=$2 + local s + local w + local fs + + s='[[:space:]]*' + w='[a-zA-Z0-9_.-]*' + fs="$(echo @|tr @ '\034')" + + ( + sed -e '/- [^\“]'"[^\']"'.*: /s|\([ ]*\)- \([[:space:]]*\)|\1-\'$'\n'' \1\2|g' | + + sed -ne '/^--/s|--||g; s|\"|\\\"|g; s/[[:space:]]*$//g;' \ + -e "/#.*[\"\']/!s| #.*||g; /^#/s|#.*||g;" \ + -e "s|^\($s\)\($w\)$s:$s\"\(.*\)\"$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)${s}[:-]$s\(.*\)$s\$|\1$fs\2$fs\3|p" | + + awk -F"$fs" '{ + indent = length($1)/2; + if (length($2) == 0) { conj[indent]="+";} else {conj[indent]="";} + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i + +#include +#include + +