diff --git a/.dockerignore b/.dockerignore index aa69bb0..eda7e90 100644 --- a/.dockerignore +++ b/.dockerignore @@ -8,3 +8,6 @@ /.git /perl/docs /perl/docs.tar.gz +/.travis.yml +/cwls +/examples diff --git a/.gitignore b/.gitignore index 50901a3..a69a29d 100755 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,16 @@ /perl/docs/ /perl/perltidy.LOG /env +# python code related +/run-cgprna/env/ +__pycache__/ +/run-cgprna/*.py[cod] +/run-cgprna/*$py.class +/run-cgprna/.coverage +/run-cgprna/.cache +/run-cgprna/htmlcov/ +/run-cgprna/.eggs +/run-cgprna/build +/run-cgprna/dist +/run-cgprna/run_cgprna.egg-info +.vscode diff --git a/CHANGES.md b/CHANGES.md index 2103981..bde09f8 100755 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,21 @@ # CHANGES +## 2.4.0 + +* Revised Dockerfile so that the builder stage is properly used and the image size is reduced. Resolved #28. +* Added a CLI, so that users can complete a step in the RNA-seq data workflow with just one command, which also eases the development of CWL files. + * The CLI is written in Python. + * Currently 4 subcommands are implemented: + * `map`: uses star_mapping.pl to map and marks duplicates after mapping. + * `stats`: generates mapping stats using bam_stats and RSeQC. + * `bigwig`: generates a bigwig file using bamToBw.pl. + * `count`: counts reads using htseq-count. +* Built a new set of reference files for the CLI to use. They're available on ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/. +* Added CWL files: + * added a workflow to map a sample by lanes, generate mapping stats for lanes, merge lane BAMs, generate a bigwig file and count reads. + * added CWL tools/workflows for the workflow above to use. + * added example JSON for most of the CWL files. + ## 2.3.4 * RG tags are converted to shell safe strings before passing to Star. partially resolve #30.
diff --git a/Dockerfile b/Dockerfile index 0770303..d713549 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,26 @@ FROM quay.io/wtsicgp/dockstore-cgpmap:3.1.4 as builder USER root -# Tool version ENVs, some of them are also used in the second build stage, make sure version are consistent between the two stages. +# Version of tools that are installed in both stages, make sure they are consistent. +## VAGrENG dependcies +ENV VER_BEDTOOLS "2.25.0-1" +## cgpRna dependencies +ENV VER_BOWTIE1 "1.1.2-3" +ENV VER_BOWTIE2 "2.2.6-2" +ENV VER_BLAST "2.2.31-4" +ENV VER_GMAP "2015-12-31.v7-1" + +# Version of other tools ## VAGrENG dependcies ENV VER_VCFTOOLS "0.1.16" ENV VER_Set_IntervalTree "0.12" -ENV VER_BEDTOOLS "2.25.0-1" ## CancerIT dependencies ENV VER_CGPVCF "v2.2.1" ENV VER_GRASS "v2.1.1" ENV VER_VAGRENT "v3.3.3" ## cgpRna dependencies +ENV VER_RSEQC "3.0.0" +ENV VER_HTSEQ "0.7.2" ENV VER_File_ShareDir_Install "0.13" ENV VER_Config_IniFiles "3.000002" ENV VER_STAR "2.5.0c" @@ -20,6 +30,13 @@ ENV VER_DEFUSE "v0.8.2" ENV SOURCE_FATOTWOBIT "http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit" ENV SOURCE_BLAT "https://hgwdev.gi.ucsc.edu/~kent/src/blatSrc35.zip" +RUN apt-get -yq update + +RUN apt-get install -qy --no-install-recommends lsb-release + +RUN echo "deb http://cran.rstudio.com/bin/linux/ubuntu `lsb_release -cs`/" >> /etc/apt/sources.list +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 + RUN apt-get -yq update RUN apt-get install -yq --no-install-recommends \ locales \ @@ -28,12 +45,24 @@ make \ gcc \ pkg-config \ zlib1g-dev \ -software-properties-common \ zip \ unzip \ libpng-dev \ -libboost-all-dev +python3 python3-dev python3-pip python3-setuptools \ +r-base r-base-dev \ +libcurl4-gnutls-dev zlib1g-dev \ +bedtools=${VER_BEDTOOLS} \ +bowtie=${VER_BOWTIE1} \ +bowtie2=${VER_BOWTIE2} \ +ncbi-blast+=${VER_BLAST} \ +gmap=${VER_GMAP} + # libboost-all-dev is required to compile defuse. 
Its installation installs python2 as well, which is required for building Bedtools +RUN apt-get install -yq --no-install-recommends libboost-all-dev +# for building matplotlib +RUN apt-get install -yq --no-install-recommends libfreetype6-dev + +RUN apt-get install -yq --no-install-recommends liblzo2-dev RUN locale-gen en_US.UTF-8 RUN update-locale LANG=en_US.UTF-8 @@ -41,6 +70,9 @@ RUN update-locale LANG=en_US.UTF-8 ENV OPT /opt/wtsi-cgp ENV PATH $OPT/bin:$OPT/biobambam2/bin:$PATH ENV PERL5LIB $OPT/lib/perl5 +ENV R_LIBS $OPT/R-lib +ENV R_LIBS_USER $R_LIBS +ENV PYTHONPATH $OPT/python-lib/lib/python3.5/site-packages ENV LD_LIBRARY_PATH $OPT/lib ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 @@ -52,14 +84,30 @@ RUN bash build/opt-build.sh $OPT # build the tools in this repo, separate to reduce build time on errors COPY . . RUN bash build/opt-build-local.sh $OPT +RUN bash build/config-defuse.sh $OPT FROM ubuntu:16.04 LABEL maintainer="cgphelp@sanger.ac.uk" \ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \ - version="2.3.4" \ + version="2.4.0" \ description="cgpRna docker" +# Version of tools that are installed in both stages, make sure they are consistent. 
+## VAGrENG dependcies +ENV VER_BEDTOOLS "2.25.0-1" +## cgpRna dependencies +ENV VER_BOWTIE1 "1.1.2-3" +ENV VER_BOWTIE2 "2.2.6-2" +ENV VER_BLAST "2.2.31-4" +ENV VER_GMAP "2015-12-31.v7-1" + +RUN apt-get -yq update +RUN apt-get install -qy --no-install-recommends lsb-release + +RUN echo "deb http://cran.rstudio.com/bin/linux/ubuntu `lsb_release -cs`/" >> /etc/apt/sources.list +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 + RUN apt-get -yq update RUN apt-get install -yq --no-install-recommends \ apt-transport-https \ @@ -74,8 +122,14 @@ zlib1g \ liblzma5 \ libncurses5 \ p11-kit \ -software-properties-common \ -unattended-upgrades && \ +unattended-upgrades \ +python3 \ +r-base \ +bedtools=${VER_BEDTOOLS} \ +bowtie=${VER_BOWTIE1} \ +bowtie2=${VER_BOWTIE2} \ +ncbi-blast+=${VER_BLAST} \ +gmap=${VER_GMAP} && \ unattended-upgrade -d -v && \ apt-get remove -yq unattended-upgrades && \ apt-get autoremove -yq @@ -83,21 +137,12 @@ apt-get autoremove -yq RUN locale-gen en_US.UTF-8 RUN update-locale LANG=en_US.UTF-8 -# dependecy tool versions, some of them are also used in the first stage, make sure they are consistent between stages. 
-## VAGrENG dependcies -ENV VER_VCFTOOLS "0.1.16" -ENV VER_BEDTOOLS "2.25.0-1" -## cgpRna dependencies -ENV VER_BOWTIE1 "1.1.2-3" -ENV VER_BOWTIE2 "2.2.6-2" -ENV VER_BLAST "2.2.31-4" -ENV VER_GMAP "2015-12-31.v7-1" -ENV VER_RSEQC "3.0.0" -ENV VER_HTSEQ "0.7.2" - ENV OPT /opt/wtsi-cgp -ENV PATH $OPT/bin:$OPT/biobambam2/bin:$PATH +ENV PATH $OPT/bin:$OPT/biobambam2/bin:$OPT/python-lib/bin/:$PATH ENV PERL5LIB $OPT/lib/perl5 +ENV R_LIBS $OPT/R-lib +ENV R_LIBS_USER $R_LIBS +ENV PYTHONPATH $OPT/python-lib/lib/python3.5/site-packages ENV LD_LIBRARY_PATH $OPT/lib ENV LC_ALL en_US.UTF-8 ENV LANG en_US.UTF-8 @@ -105,9 +150,8 @@ ENV LANG en_US.UTF-8 RUN mkdir -p $OPT COPY --from=builder $OPT $OPT -COPY build/opt-build-sys-dependencies.sh ./ -COPY build/config-defuse.sh ./ -RUN bash opt-build-sys-dependencies.sh && rm -f opt-build-sys-dependencies.sh && bash config-defuse.sh $OPT && rm -f config-defuse.sh +COPY run-cgprna $OPT/cgprna +RUN cd $OPT/cgprna && python3 setup.py develop ## USER CONFIGURATION RUN adduser --disabled-password --gecos '' ubuntu && chsh -s /bin/bash && mkdir -p /home/ubuntu diff --git a/build/config-defuse.sh b/build/config-defuse.sh old mode 100644 new mode 100755 diff --git a/build/opt-build-sys-dependencies.sh b/build/opt-build-sys-dependencies.sh deleted file mode 100644 index aaaf50a..0000000 --- a/build/opt-build-sys-dependencies.sh +++ /dev/null @@ -1,48 +0,0 @@ -#! 
/bin/bash - -set -xe - -# Add python ppa -add-apt-repository -y ppa:deadsnakes/ppa -# Add R key -echo "deb http://cran.rstudio.com/bin/linux/ubuntu xenial/" | tee -a /etc/apt/sources.list -# gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 -# gpg -a --export E084DAB9 | apt-key add - - -# install python3, R, bedtools, bowtie1, bowtie2, blast, gmap and other packages for RSeQC to install -apt-get update - -# using --allow-unauthenticated because the added R key is not recognised, it worked fine without the option before 27 Jul 2019 -apt-get install -yq --no-install-recommends --allow-unauthenticated \ -python3.7 python3.7-dev \ -r-base r-base-dev \ -bedtools=${VER_BEDTOOLS} \ -bowtie=${VER_BOWTIE1} \ -bowtie2=${VER_BOWTIE2} \ -ncbi-blast+=${VER_BLAST} \ -gmap=${VER_GMAP} \ -libcurl4-gnutls-dev zlib1g-dev -apt-get upgrade -yq gcc - -# install R packages: -Rscript -e 'install.packages("ada")' # required by Defuse - -# Replace python3 -update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 - -# install pip3 -curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py -rm -f get-pip.py - -# for bx-python (one of RSeQC package dependencies) -apt-get install -yq --no-install-recommends liblzo2-dev - -# install RSeQC and HTSeq -pip3 install RSeQC=="$VER_RSEQC" -pip3 install HTSeq=="$VER_HTSEQ" - -# if use HTSeq to plot -pip3 install matplotlib - -# cleanning -apt-get autoremove -yq diff --git a/build/opt-build.sh b/build/opt-build.sh index 281569f..26285f8 100755 --- a/build/opt-build.sh +++ b/build/opt-build.sh @@ -39,6 +39,8 @@ echo "Max compilation CPUs set to $CPU" SETUP_DIR=$INIT_DIR/install_tmp mkdir -p $SETUP_DIR/distro # don't delete the actual distro directory until the very end mkdir -p $INST_PATH/bin +mkdir -p $INST_PATH/R-lib +mkdir -p $INST_PATH/python-lib cd $SETUP_DIR # make sure tools installed can see the install loc of libraries @@ -49,6 +51,21 @@ export MANPATH=`echo 
$INST_PATH/man:$INST_PATH/share/man:$MANPATH | perl -pe 's/ export PERL5LIB=`echo $INST_PATH/lib/perl5:$PERL5LIB | perl -pe 's/:\$//;'` set -u + +# install R packages +Rscript -e "install.packages(\"ada\", \"$INST_PATH/R-lib\")" # required by Defuse + +# install python packages +OPT_BK=$OPT # Somehow OPT affects compilation of numpy +unset OPT +pip3 install --install-option="--prefix=$INST_PATH/python-lib" --ignore-installed numpy # for HTSeq installation. +# matplotlib is required by HTSeq for plotting. Later version of matplotlib requires python3.6 or above. +pip3 install --install-option="--prefix=$INST_PATH/python-lib" --ignore-installed \ + RSeQC=="$VER_RSEQC" \ + HTSeq=="$VER_HTSEQ" \ + matplotlib==3.0 +OPT=$OPT_BK + ## vcftools if [ ! -e $SETUP_DIR/vcftools.success ]; then curl -sSL --retry 10 https://github.com/vcftools/vcftools/releases/download/v${VER_VCFTOOLS}/vcftools-${VER_VCFTOOLS}.tar.gz > distro.tar.gz @@ -63,9 +80,6 @@ if [ ! -e $SETUP_DIR/vcftools.success ]; then touch $SETUP_DIR/vcftools.success fi -# install bedtools so that VAGrENT can be installed properly -apt-get install -yq --no-install-recommends bedtools=${VER_BEDTOOLS} - ## add File::ShareDir::Install for VAGrENT if [ ! 
-e $SETUP_DIR/File_ShareDir_Install.success ]; then cpanm -l $INST_PATH --mirror http://cpan.metacpan.org File::ShareDir::Install@$VER_File_ShareDir_Install diff --git a/cwls/cgpRna_workflow.cwl b/cwls/cgpRna_workflow.cwl new file mode 100644 index 0000000..bfd8d33 --- /dev/null +++ b/cwls/cgpRna_workflow.cwl @@ -0,0 +1,231 @@ +#!/usr/bin/env cwl-runner + +class: Workflow + +id: "multi-lane-sample-workflow" + +label: "A CGP workflow to generate mapping stats and gene counts from RNA-seq data using tools in cgpRna" + +cwlVersion: v1.0 + +requirements: + - class: ScatterFeatureRequirement + - class: SubworkflowFeatureRequirement + - class: InlineJavascriptRequirement + - class: StepInputExpressionRequirement + +inputs: + raw_reads: + doc: "RAW read input, can be multiple bam files, or several pairs of FastQ files (optionally gzip compressed), but not a mixture of BAM and FastQs. They'll be treated as if they were from the same lane of a sample, i.e. all reads will have the same read group ID in the mapped BAM." + type: + type: array + items: + type: array + items: File + + map_reference: + type: File + doc: "The core STAR reference and a GTF file bundled in a tar.gz." + + sample_name: + type: string + doc: "Sample name, which will used to prefix output file names and SM tag in the BAM file header." + default: '' + + stats_reference: + type: File + doc: "The reference files bundled in a tar.gz." + + count_reference: + type: File + doc: "A reference GTF file." + + bigwig_reference: + type: File + doc: "FASTA file of a reference file, which the input BAM file was mapped to." + secondaryFiles: + - .fai + + bigwig_threads: + type: int? + default: 1 + doc: "Number of threads to use for generating bigwig." + + map_threads: + type: int? + default: 1 + doc: "Number of threads to use for each mapping process." + + merge_threads: + type: int? + default: 1 + doc: "Number of threads to use for merging step." 
+ + rg_id_tags: + type: + type: array + items: ["null", string] + doc: "Readgroup ID tag values. It should have one value for each group of input raw files. Use empty string to use defaults or existing RG ID in the input BAM. It only uses the RG ID value in the first BAM file of a group." + + lb_tags: + type: + type: array + items: ["null", string] + doc: "Sequencing library tag values in the output BAM header. It should have one value for each group of input raw files. Use empty string to set it to none or existing LB tag in the input BAM. It only uses the LB tag value in the first BAM file of a group." + + ds_tags: + type: + type: array + items: ["null", string] + doc: "Description tag value in the output BAM header. It should have one value for each group of input raw files. Use empty string to set it to none or existing DS tag in the input BAM. It only uses the DS tag value in the first BAM file of a group." + + pl_tags: + type: + type: array + items: ["null", string] + doc: "Platform tag value in the output BAM header. It should have one value for each group of input raw files. Use empty string to set it to none or existing PL tag in the input BAM. It only uses the PL tag value in the first BAM file of a group." + + pu_tags: + type: + type: array + items: ["null", string] + doc: "Platform unit tag value in the output BAM header. It should have one value for each group of input raw files. Use empty string to set it to none or existing PU tag in the input BAM. It only uses the PU tag value in the first BAM file of a group." 
+ +outputs: + dup_marked_bam: + type: File + outputSource: merge/dup_marked_merged_bam + + dup_marked_bam_md5: + type: File + outputSource: merge/dup_marked_bam_md5 + + dup_marked_bam_dup_met: + type: File + outputSource: merge/dup_marked_bam_dup_met + + transcriptome_lane_bams: + type: File[] + outputSource: map_and_stats/transcriptome_bam + + dup_marked_lane_bam_dup_mets: + type: File[] + outputSource: map_and_stats/dup_marked_bam_dup_met + + rna_bas_files: + type: File[] + outputSource: map_and_stats/rna_bas + + gene_cover_pngs: + type: File[] + outputSource: map_and_stats/gene_cover_png + + gene_body_coverage_rscripts: + type: File[] + outputSource: map_and_stats/gene_body_coverage_rscript + + gene_body_coverage_txts: + type: File[] + outputSource: map_and_stats/gene_body_coverage_txt + + gene_body_coverage_updated_rscripts: + type: File[] + outputSource: map_and_stats/gene_body_coverage_updated_rscript + + read_dists: + type: File[] + outputSource: map_and_stats/read_dist + + out_bw: + type: File + outputSource: bigwig/out_bw + + out_count: + type: File + outputSource: count/out_count + +steps: + map_and_stats: + in: + raw_reads: + source: raw_reads + map_reference: + source: map_reference + sample_name: + source: sample_name + stats_reference: + source: stats_reference + map_threads: + source: map_threads + rg_id_tag: + source: rg_id_tags + lb_tag: + source: lb_tags + ds_tag: + source: ds_tags + pl_tag: + source: pl_tags + pu_tag: + source: pu_tags + out: [dup_marked_bam, dup_marked_bam_dup_met, transcriptome_bam, rna_bas, gene_cover_png, gene_body_coverage_rscript, gene_body_coverage_txt, gene_body_coverage_updated_rscript, read_dist] + scatter: [raw_reads, rg_id_tag, lb_tag, ds_tag, pl_tag, pu_tag] + scatterMethod: dotproduct + run: tools/lane_map_and_stats.cwl + + merge: + in: + sorted_bams: + source: map_and_stats/dup_marked_bam + threads: + source: merge_threads + out_bam_name: + source: sample_name + valueFrom: $(self).bam + out_bam_index_name: + 
source: sample_name + valueFrom: $(self).bam.bai + md5_file_name: + source: sample_name + valueFrom: $(self).bam.md5 + dup_met_file_name: + source: sample_name + valueFrom: $(self).bam.met + out: [dup_marked_merged_bam, dup_marked_bam_dup_met, dup_marked_bam_md5] + run: tools/merge_and_mark_dups.cwl + + bigwig: + in: + sample_bam: + source: merge/dup_marked_merged_bam + reference: + source: bigwig_reference + threads: + source: bigwig_threads + out: [out_bw] + run: tools/run-cgprna_bigwig.cwl + + count: + in: + sample_bam: + source: merge/dup_marked_merged_bam + reference: + source: count_reference + out: [out_count] + run: tools/run-cgprna_htseq-count.cwl + +doc: | + A workflow to generate mapping stats and gene counts from RNA-seq data using cgpRna container. See the [cgpRna](https://github.com/cancerit/cgpRna) website for more information. + +$schemas: + - http://schema.org/docs/schema_org_rdfa.html + +$namespaces: + s: http://schema.org/ + +s:codeRepository: https://github.com/cancerit/cgpRna +s:license: https://spdx.org/licenses/AGPL-3.0 + +s:author: + - class: s:Person + s:email: mailto:yx2@sanger.ac.uk + s:name: Yaobo Xu diff --git a/cwls/tools/lane_map_and_stats.cwl b/cwls/tools/lane_map_and_stats.cwl new file mode 100644 index 0000000..b941a02 --- /dev/null +++ b/cwls/tools/lane_map_and_stats.cwl @@ -0,0 +1,99 @@ +class: Workflow +cwlVersion: v1.0 + +requirements: + - class: InlineJavascriptRequirement + - class: StepInputExpressionRequirement # a step input of this workflow uses valueFrom; declared here so the file also validates when run on its own + +inputs: + raw_reads: + type: + type: array + items: File + map_reference: + type: File + sample_name: + type: string + stats_reference: + type: File + map_threads: + type: int? + rg_id_tag: + type: string? + lb_tag: + type: string? + ds_tag: + type: string? + pl_tag: + type: string? + pu_tag: + type: string?
+ +outputs: + dup_marked_bam: + type: File + outputSource: map/dup_marked_bam + dup_marked_bam_dup_met: + type: File + outputSource: map/dup_marked_bam_dup_met + transcriptome_bam: + type: File + outputSource: map/star_transcriptome_bam + rna_bas: + type: File + outputSource: stats/rna_bas + gene_cover_png: + type: File + outputSource: stats/gene_cover_png + gene_body_coverage_rscript: + type: File + outputSource: stats/gene_body_coverage_rscript + gene_body_coverage_txt: + type: File + outputSource: stats/gene_body_coverage_txt + gene_body_coverage_updated_rscript: + type: File + outputSource: stats/gene_body_coverage_updated_rscript + read_dist: + type: File + outputSource: stats/read_dist + +steps: + map: + in: + raw_reads: + source: raw_reads + reference: + source: map_reference + sample_name: + source: sample_name + output_file_prefix: + source: sample_name + valueFrom: | + ${ + return self + '.lane.' + inputs.raw_reads[0].nameroot; + } + threads: + source: map_threads + rg_id_tag: + source: rg_id_tag + lb_tag: + source: lb_tag + ds_tag: + source: ds_tag + pl_tag: + source: pl_tag + pu_tag: + source: pu_tag + out: [star_transcriptome_bam, dup_marked_bam, dup_marked_bam_dup_met] + run: run-cgprna_star-map.cwl + stats: + in: + sample_bam: + source: map/dup_marked_bam + reference: + source: stats_reference + transcriptome_bam: + source: map/star_transcriptome_bam + out: [rna_bas, gene_cover_png, gene_body_coverage_rscript, gene_body_coverage_txt, gene_body_coverage_updated_rscript, read_dist] + run: run-cgprna_stats.cwl diff --git a/cwls/tools/merge_and_mark_dups.cwl b/cwls/tools/merge_and_mark_dups.cwl new file mode 100644 index 0000000..666fb62 --- /dev/null +++ b/cwls/tools/merge_and_mark_dups.cwl @@ -0,0 +1,108 @@ +#!/usr/bin/env cwl-runner + +class: CommandLineTool + +id: "run-cgprna_merge-mark-dups" + +label: "cgpRna merge mark dups" + +cwlVersion: v1.0 + +doc: | + ![build_status](https://quay.io/repository/wtsicgp/cgprna/status) + A Docker container for 
the cgpRna mapping flow. See the [cgpRna](https://github.com/cancerit/cgpRna) website for more information. + + Please read the relevant [changes](https://github.com/cancerit/cgpRna/blob/dev/CHANGES.md) when upgrading. + + Parameters for a CWL definition are generally described in a json file, but parameters can be provided on the command line. + + To see the parameters descriptions please run: cwltool --tool-help path_to.cwl + +requirements: + - class: DockerRequirement + dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + +hints: + - class: ResourceRequirement + coresMin: 4 + ramMin: 8000 + +inputs: + sorted_bams: + doc: "BAM files to be merged." + type: + type: array + items: File + inputBinding: + prefix: I= + separate: false + + threads: + type: int? + doc: "Number of threads to use." + inputBinding: + prefix: markthreads= + separate: false + + out_bam_name: + type: string? + default: 'out_merged.bam' + inputBinding: + prefix: O= + separate: false + + out_bam_index_name: + type: string? + doc: "if specified, make sure it matches '{out_bam_name}.bai'." + default: 'out_merged.bam.bai' + inputBinding: + prefix: indexfilename= + separate: false + + md5_file_name: + type: string? + default: 'out_merged.bam.md5' + inputBinding: + prefix: md5filename= + separate: false + + dup_met_file_name: + type: string? 
+ default: 'out_merged.bam.met' + inputBinding: + prefix: M= + separate: false + +outputs: + dup_marked_merged_bam: + type: File + outputBinding: + glob: $(inputs.out_bam_name) + secondaryFiles: + - .bai + + dup_marked_bam_dup_met: + type: File + outputBinding: + glob: $(inputs.dup_met_file_name) # file name passed to the tool via M= + + dup_marked_bam_md5: + type: File + outputBinding: + glob: $(inputs.md5_file_name) # file name passed to the tool via md5filename= + +baseCommand: ["bammarkduplicates2", "md5=1", "index=1"] + +$schemas: + - http://schema.org/docs/schema_org_rdfa.html + +$namespaces: + s: http://schema.org/ + +s:codeRepository: https://github.com/cancerit/cgpRna +s:license: https://spdx.org/licenses/AGPL-3.0-only + +s:author: + - class: s:Person + s:email: mailto:cgphelp@sanger.ac.uk + s:name: Yaobo Xu diff --git a/cwls/tools/run-cgprna_bigwig.cwl b/cwls/tools/run-cgprna_bigwig.cwl new file mode 100644 index 0000000..cb536bf --- /dev/null +++ b/cwls/tools/run-cgprna_bigwig.cwl @@ -0,0 +1,77 @@ +#!/usr/bin/env cwl-runner + +class: CommandLineTool + +id: "run-cgprna_bigwig" + +label: "cgpRna bigwig" + +cwlVersion: v1.0 + +doc: | + ![build_status](https://quay.io/repository/wtsicgp/cgprna/status) + A Docker container for the cgpRna mapping flow. See the [cgpRna](https://github.com/cancerit/cgpRna) website for more information. + + Please read the relevant [changes](https://github.com/cancerit/cgpRna/blob/dev/CHANGES.md) when upgrading. + + Parameters for a CWL definition are generally described in a json file, but parameters can be provided on the command line. + + To see the parameters descriptions please run: cwltool --tool-help path_to.cwl + +requirements: + - class: DockerRequirement + dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + +hints: + - class: ResourceRequirement + coresMin: 1 + ramMin: 4000 + +inputs: + sample_bam: + type: File + doc: "Input BAM file, in which reads are mapped to a reference genome (NOT transcriptome)."
+ inputBinding: + prefix: --input + separate: true + secondaryFiles: + - .bai + + reference: + type: File + doc: "FASTA file of a reference file, which the input BAM file was mapped to." + inputBinding: + prefix: --reference + separate: true + secondaryFiles: + - .fai + + threads: + type: int? + doc: "Number of threads to use." + inputBinding: + prefix: --threads + separate: true + shellQuote: true + +outputs: + out_bw: + type: File + outputBinding: + glob: '*.bw' + +baseCommand: ["run-cgprna", "bigwig"] + +$schemas: + - http://schema.org/docs/schema_org_rdfa.html + +$namespaces: + s: http://schema.org/ + +s:codeRepository: https://github.com/cancerit/cgpRna +s:license: https://spdx.org/licenses/AGPL-3.0-only + +s:author: + - class: s:Person + s:email: mailto:cgphelp@sanger.ac.uk + s:name: Yaobo Xu diff --git a/cwls/tools/run-cgprna_htseq-count.cwl b/cwls/tools/run-cgprna_htseq-count.cwl new file mode 100644 index 0000000..43eb7de --- /dev/null +++ b/cwls/tools/run-cgprna_htseq-count.cwl @@ -0,0 +1,65 @@ +#!/usr/bin/env cwl-runner + +class: CommandLineTool + +id: "run-cgprna_htseq-count" + +label: "cgpRna htseq-count" + +cwlVersion: v1.0 + +doc: | + ![build_status](https://quay.io/repository/wtsicgp/cgprna/status) + A Docker container for the cgpRna mapping flow. See the [cgpRna](https://github.com/cancerit/cgpRna) website for more information. + + Please read the relevant [changes](https://github.com/cancerit/cgpRna/blob/dev/CHANGES.md) when upgrading. + + Parameters for a CWL definition are generally described in a json file, but parameters can be provided on the command line. + + To see the parameters descriptions please run: cwltool --tool-help path_to.cwl + +requirements: + - class: DockerRequirement + dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + +hints: + - class: ResourceRequirement + coresMin: 1 + ramMin: 4000 + +inputs: + sample_bam: + type: File + doc: "Input BAM file, in which reads are mapped to a reference genome (NOT transcriptome)." 
+ inputBinding: + prefix: --input + separate: true + + reference: + type: File + doc: "A reference GTF file." + inputBinding: + prefix: --reference + separate: true + +outputs: + out_count: + type: File + outputBinding: + glob: rna_htseqcount.gz + +baseCommand: ["run-cgprna", "count"] + +$schemas: + - http://schema.org/docs/schema_org_rdfa.html + +$namespaces: + s: http://schema.org/ + +s:codeRepository: https://github.com/cancerit/cgpRna +s:license: https://spdx.org/licenses/AGPL-3.0-only + +s:author: + - class: s:Person + s:email: mailto:cgphelp@sanger.ac.uk + s:name: Yaobo Xu diff --git a/cwls/tools/run-cgprna_star-map.cwl b/cwls/tools/run-cgprna_star-map.cwl new file mode 100644 index 0000000..04c3da9 --- /dev/null +++ b/cwls/tools/run-cgprna_star-map.cwl @@ -0,0 +1,182 @@ +#!/usr/bin/env cwl-runner + +class: CommandLineTool + +id: "run-cgprna_map" + +label: "cgpRna mapping" + +cwlVersion: v1.0 + +doc: | + ![build_status](https://quay.io/repository/wtsicgp/cgprna/status) + A Docker container for the cgpRna mapping flow. See the [cgpRna](https://github.com/cancerit/cgpRna) website for more information. + + Please read the relevant [changes](https://github.com/cancerit/cgpRna/blob/dev/CHANGES.md) when upgrading. + + Parameters for a CWL definition are generally described in a json file, but parameters can be provided on the command line. + + To see the parameters descriptions please run: cwltool --tool-help path_to.cwl + +requirements: + - class: DockerRequirement + dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + - class: InlineJavascriptRequirement + +hints: + - class: ResourceRequirement + coresMin: 4 + ramMin: 30000 + outdirMin: 20000 + +inputs: + raw_reads: + doc: "RAW read input, can be multiple bam files, or several pairs of FastQ files (optionally gzip compressed), but not a mixture of BAM and FastQs." 
+ type: + type: array + items: File + inputBinding: + prefix: --input + separate: true + itemSeparator: ' ' + + reference: + type: File + doc: "The core STAR reference and a GTF file bundled in a tar.gz." + inputBinding: + prefix: --reference + separate: true + + sample_name: + type: string + doc: "Sample name, which will used to prefix output file names and SM tag in the BAM file header." + default: '' + inputBinding: + prefix: --sample-name + separate: true + shellQuote: true + + output_file_prefix: + type: string? + doc: "Output files prefix, if final outputs should have prefix different from sample_name. This should not contain any folder path." + inputBinding: + prefix: --output-file-prefix + separate: true + shellQuote: true + + threads: + type: int? + doc: "Number of threads to use." + inputBinding: + prefix: --threads + separate: true + shellQuote: true + + rg_id_tag: + type: string? + doc: "Readgroup ID tag value in the output BAM. Default: taken from the input raw BAM file or 1." + inputBinding: + prefix: --rg-id-tag + separate: true + shellQuote: true + + lb_tag: + type: string? + doc: "Sequencing library tag value in the output BAM header. Default: None or taken from the input raw BAM file." + inputBinding: + prefix: --lb-tag + separate: true + shellQuote: true + + ds_tag: + type: string? + doc: "Description tag value in the output BAM header. Default: None or taken from the input raw BAM file." + inputBinding: + prefix: --ds-tag + separate: true + shellQuote: true + + pl_tag: + type: string? + doc: "Platform tag value in the output BAM header. Default: None or taken from the input raw BAM file." + inputBinding: + prefix: --pl-tag + separate: true + shellQuote: true + + pu_tag: + type: string? + doc: "Platform unit tag value in the output BAM header. Default: None or taken from the input raw BAM file." 
+ inputBinding: + prefix: --pu-tag + separate: true + shellQuote: true + +outputs: + star_transcriptome_bam: + type: File + outputBinding: + glob: | + ${ + if (inputs.output_file_prefix != null) { + return inputs.output_file_prefix + '.star.AlignedtoTranscriptome.out.bam'; + } else { + return inputs.sample_name + '.star.AlignedtoTranscriptome.out.bam'; + } + } + secondaryFiles: + - .bai + + dup_marked_bam: + type: File + outputBinding: + glob: | + ${ + if (inputs.output_file_prefix != null) { + return inputs.output_file_prefix + '.bam'; + } else { + return inputs.sample_name + '.bam'; + } + } + secondaryFiles: + - .bai + + dup_marked_bam_dup_met: + type: File + outputBinding: + glob: | + ${ + if (inputs.output_file_prefix != null) { + return inputs.output_file_prefix + '.bam.met'; + } else { + return inputs.sample_name + '.bam.met'; + } + } + + dup_marked_bam_md5: + type: File + outputBinding: + glob: | + ${ + if (inputs.output_file_prefix != null) { + return inputs.output_file_prefix + '.bam.md5'; + } else { + return inputs.sample_name + '.bam.md5'; + } + } + +baseCommand: ["run-cgprna", "map"] + +$schemas: + - http://schema.org/docs/schema_org_rdfa.html + +$namespaces: + s: http://schema.org/ + +s:codeRepository: https://github.com/cancerit/cgpRna +s:license: https://spdx.org/licenses/AGPL-3.0-only + +s:author: + - class: s:Person + s:email: mailto:cgphelp@sanger.ac.uk + s:name: Yaobo Xu diff --git a/cwls/tools/run-cgprna_stats.cwl b/cwls/tools/run-cgprna_stats.cwl new file mode 100644 index 0000000..0b88681 --- /dev/null +++ b/cwls/tools/run-cgprna_stats.cwl @@ -0,0 +1,101 @@ +#!/usr/bin/env cwl-runner + +class: CommandLineTool + +id: "run-cgprna_stats" + +label: "cgpRna stats" + +cwlVersion: v1.0 + +doc: | + ![build_status](https://quay.io/repository/wtsicgp/cgprna/status) + A Docker container for the cgpRna mapping flow. See the [cgpRna](https://github.com/cancerit/cgpRna) website for more information. 
+ + Please read the relevant [changes](https://github.com/cancerit/cgpRna/blob/dev/CHANGES.md) when upgrading. + + Parameters for a CWL definition are generally described in a json file, but parameters can be provided on the command line. + + To see the parameters descriptions please run: cwltool --tool-help path_to.cwl + +requirements: + - class: DockerRequirement + dockerPull: "quay.io/wtsicgp/cgprna:2.4.0" + +hints: + - class: ResourceRequirement + coresMin: 1 + ramMin: 2000 + +inputs: + sample_bam: + type: File + doc: "Input BAM file, in which reads are mapped to a reference genome (NOT transcriptome)." + inputBinding: + prefix: --input + separate: true + secondaryFiles: + - .bai + + reference: + type: File + doc: "The reference files bundled in a tar.gz." + inputBinding: + prefix: --reference + separate: true + + transcriptome_bam: + type: File + doc: "BAM file, in which reads are mapped to a reference transcriptome (NOT genome)." + inputBinding: + prefix: --transcriptome-bam + separate: true + secondaryFiles: + - .bai + +outputs: + rna_bas: + type: File + outputBinding: + glob: $(inputs.sample_bam.nameroot).RNA.bas + + gene_cover_png: + type: File + outputBinding: + glob: $(inputs.sample_bam.nameroot).geneBodyCoverage.curves.png + + gene_body_coverage_rscript: + type: File + outputBinding: + glob: $(inputs.sample_bam.nameroot).geneBodyCoverage.r + + gene_body_coverage_txt: + type: File + outputBinding: + glob: $(inputs.sample_bam.nameroot).geneBodyCoverage.txt + + gene_body_coverage_updated_rscript: + type: File + outputBinding: + glob: $(inputs.sample_bam.nameroot).geneBodyCoverage_UPDATED.r + + read_dist: + type: File + outputBinding: + glob: $(inputs.sample_bam.nameroot).read_dist.txt + +baseCommand: ["run-cgprna", "stats"] + +$schemas: + - http://schema.org/docs/schema_org_rdfa.html + +$namespaces: + s: http://schema.org/ + +s:codeRepository: https://github.com/cancerit/cgpRna +s:license: https://spdx.org/licenses/AGPL-3.0-only + +s:author: + - class:
s:Person + s:email: mailto:cgphelp@sanger.ac.uk + s:name: Yaobo Xu diff --git a/examples/json/bigwig.json b/examples/json/bigwig.json new file mode 100644 index 0000000..87667cb --- /dev/null +++ b/examples/json/bigwig.json @@ -0,0 +1,14 @@ +{ + "sample_bam": { + "class":"File", + "path": "ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/small_RNAseq/small_RNAseq_mapped_dupMarked.bam" + }, + "reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/genome.fa" + }, + "out_bw": { + "class":"File", + "path": "/tmp/small_RNAseq.bw" + } +} diff --git a/examples/json/cgpRna_workflow.json b/examples/json/cgpRna_workflow.json new file mode 100644 index 0000000..5f772a8 --- /dev/null +++ b/examples/json/cgpRna_workflow.json @@ -0,0 +1,89 @@ +{ + "raw_reads": [ + [{ + "class":"File", + "path": "ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/small_RNAseq/small_RNAseq.bam" + }], + [{ + "class":"File", + "path": "ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/part_of_EM-2_0.5.bam" + }] + ], + "rg_id_tags": ["1", "B"], + "lb_tags": ["libA", ""], + "ds_tags": ["a small test sample", "a sample that have a bad header"], + "pl_tags": ["Illumina", "Illumina"], + "pu_tags": ["", "PlatFormUnitB"], + "map_reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/cgpRna-mapRefBundle-star2.4.1c-GRCh38_full_analysis_set_plus_decoy_hla-ensembl77.tar.gz" + }, + "sample_name": "cgpRna_test_RNAseq", + "stats_reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/GRCh38_rseqc_ref.tar.gz" + }, + "count_reference": { + "class":"File", + "path":
"ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/ensembl.gtf" + }, + "bigwig_reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/genome.fa" + }, + "map_threads": 8, + "bigwig_threads": 8, + "merge_threads": 8, + "dup_marked_bam": { + "class":"File", + "path": "/tmp/cgpRna_test_RNAseq.bam" + }, + "dup_marked_bam_md5": { + "class":"File", + "path": "/tmp/cgpRna_test_RNAseq.bam.md5" + }, + "dup_marked_bam_dup_met": { + "class":"File", + "path": "/tmp/cgpRna_test_RNAseq.bam.met" + }, + "transcriptome_lane_bams": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "dup_marked_lane_bam_dup_mets": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "rna_bas_files": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "gene_cover_pngs": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "gene_body_coverage_rscripts": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "gene_body_coverage_txts": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "gene_body_coverage_updated_rscripts": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "read_dists": { + "class":"Directory", + "path":"/tmp/cgpRna_test_RNAseq_lane_stats/" + }, + "out_bw": { + "class":"File", + "path": "/tmp/cgpRna_test_RNAseq.bw" + }, + "out_count": { + "class":"File", + "path": "/tmp/cgpRna_test_RNAseq.htseq_count.txt.gz" + } +} diff --git a/examples/json/htseq_count.json b/examples/json/htseq_count.json new file mode 100644 index 0000000..82710d9 --- /dev/null +++ b/examples/json/htseq_count.json @@ -0,0 +1,14 @@ +{ + "sample_bam": { + "class":"File", + "path":
"ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/small_RNAseq/small_RNAseq_mapped_dupMarked.bam" + }, + "reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/ensembl.gtf" + }, + "out_count": { + "class":"File", + "path": "/tmp/htseq_count.gz" + } +} diff --git a/examples/json/mapping_stats.json b/examples/json/mapping_stats.json new file mode 100644 index 0000000..716bbb7 --- /dev/null +++ b/examples/json/mapping_stats.json @@ -0,0 +1,38 @@ +{ + "sample_bam": { + "class":"File", + "path": "ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/small_RNAseq/small_RNAseq_mapped_dupMarked.bam" + }, + "reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/GRCh38_rseqc_ref.tar.gz" + }, + "transcriptome_bam": { + "class":"File", + "path": "ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/small_RNAseq/small_RNAseq.star.AlignedtoTranscriptome.out.bam" + }, + "rna_bas": { + "class":"File", + "path": "/tmp/small_RNAseq.RNA.bas" + }, + "gene_cover_png": { + "class":"File", + "path": "/tmp/small_RNAseq.geneBodyCoverage.curves.png" + }, + "gene_body_coverage_rscript":{ + "class":"File", + "path": "/tmp/small_RNAseq.geneBodyCoverage.r" + }, + "gene_body_coverage_txt":{ + "class":"File", + "path": "/tmp/small_RNAseq.geneBodyCoverage.txt" + }, + "gene_body_coverage_updated_rscript": { + "class":"File", + "path": "/tmp/small_RNAseq.geneBodyCoverage_UPDATED.r" + }, + "read_dist": { + "class":"File", + "path": "/tmp/small_RNAseq.read_dist.txt" + } +} diff --git a/examples/json/merge_and_mark_dups.json b/examples/json/merge_and_mark_dups.json new file mode 100644 index 0000000..edf1a54 --- /dev/null +++ b/examples/json/merge_and_mark_dups.json @@ -0,0 +1,25 @@ +{ + "sorted_bams": [ + { + "class":"File", + "path": 
"/home/ubuntu/data/test_cgpRna_cwls/test_bam_map/test_small_bam.bam" + }, + { + "class":"File", + "path": "/home/ubuntu/data/test_cgpRna_cwls/test_fq_map/test_small_fq.bam" + } + ], + "threads": 8, + "dup_marked_merged_bam":{ + "class":"File", + "path": "/tmp/a_small_sample.bam" + }, + "dup_marked_bam_dup_met": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam.met" + }, + "dup_marked_bam_md5": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam.md5" + } +} diff --git a/examples/json/star_map_bam.json b/examples/json/star_map_bam.json new file mode 100644 index 0000000..c7b05ee --- /dev/null +++ b/examples/json/star_map_bam.json @@ -0,0 +1,29 @@ +{ + "raw_reads": [ + { + "class":"File", + "path": "ftp://ngs.sanger.ac.uk/production/cancer/dockstore/cgprna/small_RNAseq/small_RNAseq.bam" + } + ], + "reference": { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/cgpRna-mapRefBundle-star2.4.1c-GRCh38_full_analysis_set_plus_decoy_hla-ensembl77.tar.gz" + }, + "sample_name": "a_small_sample", + "star_transcriptome_bam":{ + "class":"File", + "path": "/tmp/a_small_sample.star.AlignedtoTranscriptome.bam" + }, + "dup_marked_bam":{ + "class":"File", + "path": "/tmp/a_small_sample.bam" + }, + "dup_marked_bam_dup_met": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam.met" + }, + "dup_marked_bam_md5": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam.md5" + } +} diff --git a/examples/json/star_map_fastqs.json b/examples/json/star_map_fastqs.json new file mode 100644 index 0000000..b69a96d --- /dev/null +++ b/examples/json/star_map_fastqs.json @@ -0,0 +1,37 @@ +{ + "raw_reads": [ + { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/dockstore/examples/tiny_1.fq.gz" + }, + { + "class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/dockstore/examples/tiny_2.fq.gz" + } + ], + "reference": { + 
"class":"File", + "path": "ftp://ftp.sanger.ac.uk/pub/cancer/support-files/cgpRna_container/GRCh38_full_analysis_set_plus_decoy_hla-ensembl77/cgpRna-mapRefBundle-star2.4.1c-GRCh38_full_analysis_set_plus_decoy_hla-ensembl77.tar.gz" + }, + "sample_name": "a_small_sample", + "star_sample_bam": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam" + }, + "star_transcriptome_bam":{ + "class":"File", + "path": "/tmp/a_small_sample.star.AlignedtoTranscriptome.bam" + }, + "dup_marked_bam":{ + "class":"File", + "path": "/tmp/a_small_sample.bam" + }, + "dup_marked_bam_dup_met": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam.met" + }, + "dup_marked_bam_md5": { + "class":"File", + "path": "/tmp/a_small_sample.star.aligned.bam.md5" + } +} diff --git a/perl/bin/star_mapping.pl b/perl/bin/star_mapping.pl index 142d7e4..d82e46a 100755 --- a/perl/bin/star_mapping.pl +++ b/perl/bin/star_mapping.pl @@ -106,7 +106,7 @@ sub setup { 'p|process=s' => \$opts{'process'}, 'i|index=i' => \$opts{'index'}, 'c|config=s' => \$opts{'config'}, - 'l|lane-id=i' => \$opts{'ID'}, + 'l|lane-id=s' => \$opts{'ID'}, 'b|library=s' => \$opts{'LB'}, 'ds|ds-tag=s' => \$opts{'DS'}, 'y|machine-type=s' => \$opts{'PL'}, diff --git a/perl/lib/Sanger/CGP/CgpRna.pm b/perl/lib/Sanger/CGP/CgpRna.pm index d6a8962..0301cf3 100755 --- a/perl/lib/Sanger/CGP/CgpRna.pm +++ b/perl/lib/Sanger/CGP/CgpRna.pm @@ -36,7 +36,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '2.3.4'; +our $VERSION = '2.4.0'; our @EXPORT = qw($VERSION); 1; diff --git a/perl/lib/Sanger/CGP/Star/Implement.pm b/perl/lib/Sanger/CGP/Star/Implement.pm index ae0f11c..3ba792e 100755 --- a/perl/lib/Sanger/CGP/Star/Implement.pm +++ b/perl/lib/Sanger/CGP/Star/Implement.pm @@ -154,14 +154,18 @@ sub filter_fusions { } sub format_rg_tags { - my $options = shift; + my $options = shift; - my $sample = $options->{'sample'}; - - # Format the PU RG tag if the npg_run and lane_pos parameters have been 
provided - $options->{'PU'} = $options->{'npg'}."_".$options->{'lane_pos'} if(defined $options->{'npg'} && $options->{'lane_pos'}); - - # Check the input data + my $sample = $options->{'sample'}; + + # Format the PU RG tag if the npg_run and lane_pos parameters have been provided + if(defined $options->{'npg'} && defined $options->{'lane_pos'}) { + $options->{'PU'} = $options->{'npg'}."_".$options->{'lane_pos'}; + } elsif (defined $options->{'npg'}) { # Use npg_run alone if lane_pos is not defined, which allows users to set PU tag more easily. + $options->{'PU'} = $options->{'npg'}; + } + + # Check the input data my $input_meta = $options->{'meta_set'}; # Get the RG header information to format the @RG line for the mapped BAM diff --git a/run-cgprna/run_cgprna/__init__.py b/run-cgprna/run_cgprna/__init__.py new file mode 100644 index 0000000..bfeccec --- /dev/null +++ b/run-cgprna/run_cgprna/__init__.py @@ -0,0 +1,38 @@ +from subprocess import Popen, PIPE, STDOUT +import sys +import os +import tarfile + + +def run_shell_command(command): + print('+' + command, flush=True) + with Popen( + command, shell=True, universal_newlines=True, bufsize=1, stdout=PIPE, stderr=STDOUT + ) as p: + for out in p.stdout: + print(out, end='') + if p.returncode != 0: + sys.exit('Exit code: %d' % p.returncode) + + +def untar(tar_file, untar_to): + print('untar files to %s ...' % untar_to, flush=True, end='') + # extract files from tar ball + try: + with tarfile.open(tar_file, 'r:gz') as tar: + tar.extractall(path=untar_to) + print('Done.') + except Exception as e: + sys.exit('Error: unexpected exception. When extracting files: %s' % str(e)) + + +def run_templates_in_shell(list_of_templates, mapping): + for template in list_of_templates: + run_shell_command(template.substitute(mapping)) + + +def mkdir(dir_path): + try: + os.makedirs(dir_path, exist_ok=True) + except Exception as e: + sys.exit('Error: unexpected exception.
failed to create the directory %s: %s' % (dir_path, str(e))) diff --git a/run-cgprna/run_cgprna/bigwig.py b/run-cgprna/run_cgprna/bigwig.py new file mode 100644 index 0000000..f4f6b47 --- /dev/null +++ b/run-cgprna/run_cgprna/bigwig.py @@ -0,0 +1,28 @@ +import os +from string import Template +from . import run_templates_in_shell, mkdir + +BIGWIG_TEMPLATE = Template('bamToBw.pl -o $out_dir -t $threads -r $ref -b $input') + + +# NOTE: Require secondary input: the BAM index file +def generate_bigwig(args): + ''' + Top level entry point for generating bigwig coverage files from mapped RNA-Seq sequence files. + ''' + # prepare the output dir + mkdir(args.out_dir) + + # gathering parameters + params = { + 'threads': args.threads, + 'input': os.path.abspath(args.input), + 'out_dir': os.path.abspath(args.out_dir), + 'ref': os.path.abspath(args.ref), + } + + run_templates_in_shell( + [ + BIGWIG_TEMPLATE + ], + params) diff --git a/run-cgprna/run_cgprna/command_line.py b/run-cgprna/run_cgprna/command_line.py new file mode 100644 index 0000000..73f0693 --- /dev/null +++ b/run-cgprna/run_cgprna/command_line.py @@ -0,0 +1,194 @@ +""" +Handle the command line parsing and select the correct sub process. 
+""" + +import argparse +import sys +import pkg_resources # part of setuptools + +from .map import map_seq_files +from .mapping_stats import generate_stats +from .htseq_count import count +from .bigwig import generate_bigwig + +version = pkg_resources.require("run_cgprna")[0].version + + +def main(): + """ + Sets up the parser and handles triggering of the correct sub-command + """ + common_parser = argparse.ArgumentParser('parent', add_help=False) + common_parser.add_argument('-v', '--version', + action='version', + version='%(prog)s ' + version) + + parser = argparse.ArgumentParser(prog='run-cgprna', parents=[common_parser]) + + subparsers = parser.add_subparsers(help='sub-command help') + + # mapping arguments + parser_a = subparsers.add_parser( + 'map', + parents=[common_parser], + description='Use STAR to map RNA-Seq reads to a reference genome', + epilog='Input can be either bam or \'f(ast)?q(\\.gz)?\'.') + parser_a.add_argument( + '-i', '--input', dest='input', + metavar='FILE', + nargs='+', + help='An input raw bam file, or a pair of FastQ files split with spaces. (optionally gzip compressed).', + required=True) + parser_a.add_argument( + '-r', '--reference', dest='ref', + metavar='TAR|PATH', + help='A reference bundle tar file or the path to reference root directory.', + required=True) + parser_a.add_argument( + '-s', '--sample-name', dest='sample_name', + metavar='STR', + help='Sample name, which will be used to prefix output file names and SM tag in the BAM file header.', + required=True) + parser_a.add_argument( + '-sp', '--species', dest='species', + metavar='STR', + help='Species name. No need to set if using a pre-built reference bundle. If using a folder as the reference, it\'ll be used to locate reference files.', + required=False) + parser_a.add_argument( + '-rb', '--reference-build', dest='ref_build', + metavar='STR', + help='Reference build name. No need to set if using a pre-built reference bundle.
If using a folder as the reference, it\'ll be used to locate reference files.', + required=False) + parser_a.add_argument( + '-gb', '--gene-build', dest='gene_build', + metavar='STR', + help='Gene build name. No need to set if using a pre-built reference bundle. If using a folder as the reference, it\'ll be used to locate the GTF file.', + required=False) + parser_a.add_argument( + '-gtf', '--gene-build-gtf-name', dest='gene_build_gtf_name', + metavar='STR', + help='File name of the gene build file. No need to set if using a pre-built reference bundle. If using a folder as the reference, it\'ll be used to locate the GTF file.', + required=False) + parser_a.add_argument( + '-od', '--output-directory', dest='out_dir', + metavar='DIR', default='.', + help='Output directory. Default: current directory.', + required=False) + parser_a.add_argument( + '-op', '--output-file-prefix', dest='out_file_prefix', + metavar='STR', + help='Output file name prefix. Default: value of --sample-name.', + required=False) + parser_a.add_argument( + '-t', '--threads', dest='threads', + metavar='INT', type=int, default=1, + help='Number of threads to use.', + required=False) + parser_a.add_argument( + '--rg-id-tag', dest='rg_id_tag', + metavar='STR', + help='Readgroup ID tag value in the output BAM. Default: "1" or taken from the first input raw BAM file.', + required=False) + parser_a.add_argument( + '--lb-tag', dest='lb_tag', + metavar='STR', + help='Sequencing library tag value in the output BAM header. Default: None or taken from the input raw BAM file.', + required=False) + parser_a.add_argument( + '--ds-tag', dest='ds_tag', + metavar='STR', + help='Description tag value in the output BAM header. Default: None or taken from the input raw BAM file.', + required=False) + parser_a.add_argument( + '--pl-tag', dest='pl_tag', + metavar='STR', + help='Platform tag value in the output BAM header.
Default: None or taken from the input raw BAM file.', + required=False) + parser_a.add_argument( + '--pu-tag', dest='pu_tag', + metavar='STR', + help='Platform unit tag value in the output BAM header. Default: None or taken from the input raw BAM file.', + required=False) + parser_a.set_defaults(func=map_seq_files) + + # create the parser for the "stats" command + parser_b = subparsers.add_parser( + 'stats', + parents=[common_parser], + description='Generate mapping stats from a BAM file, with/without a BAM file in which reads were mapped to the transcriptome instead of genome.') + parser_b.add_argument( + '-i', '--input', dest='input', + metavar='FILE', + help='Input BAM file, in which reads are mapped to a reference genome (NOT transcriptome).', + required=True) + parser_b.add_argument( + '-r', '--reference', dest='ref', + metavar='TAR|PATH', + help='A reference bundle tar file or the path to reference root directory.', + required=True) + parser_b.add_argument( + '-tb', '--transcriptome-bam', dest='trans_bam', + metavar='FILE', + help='BAM file, in which reads are mapped to a reference transcriptome (NOT genome).', + required=False) + parser_b.add_argument( + '-od', '--output-directory', dest='out_dir', + metavar='DIR', default='.', + help='Output directory.
Default: current directory.', + required=False) + parser_b.set_defaults(func=generate_stats) + + # create the parser for the "bigwig" command + parser_c = subparsers.add_parser( + 'bigwig', + parents=[common_parser], + description='Generate bigwig file from a BAM file.') + parser_c.add_argument( + '-i', '--input', dest='input', + metavar='FILE', + help='Input BAM file, in which reads are mapped to a reference genome (NOT transcriptome).', + required=True) + parser_c.add_argument( + '-r', '--reference', dest='ref', + metavar='FASTA_FILE', + help='FASTA file of a reference file, which the input BAM file was mapped to.', + required=True) + parser_c.add_argument( + '-od', '--output-directory', dest='out_dir', + metavar='DIR', default='.', + help='Output directory. Default: current directory.', + required=False) + parser_c.add_argument( + '-t', '--threads', dest='threads', + metavar='INT', type=int, default=1, + help='Number of threads to use.', + required=False) + parser_c.set_defaults(func=generate_bigwig) + + # create the parser for the "count" command + parser_d = subparsers.add_parser( + 'count', + parents=[common_parser], + description='Generate gene counts from a BAM file.') + parser_d.add_argument( + '-i', '--input', dest='input', + metavar='FILE', + help='Input BAM file, in which reads are mapped to a reference genome (NOT transcriptome).', + required=True) + parser_d.add_argument( + '-r', '--reference', dest='ref', + metavar='GTF_FILE', + help='A reference GTF file.', + required=True) + parser_d.add_argument( + '-od', '--output-directory', dest='out_dir', + metavar='DIR', default='.', + help='Output directory. 
Default: current directory.', + required=False) + parser_d.set_defaults(func=count) + + args = parser.parse_args() + if len(sys.argv) > 1: + args.func(args) + else: + sys.exit('\nError: missed required arguments.\n\tPlease run: run-cgprna --help\n') diff --git a/run-cgprna/run_cgprna/htseq_count.py b/run-cgprna/run_cgprna/htseq_count.py new file mode 100644 index 0000000..2b0e4af --- /dev/null +++ b/run-cgprna/run_cgprna/htseq_count.py @@ -0,0 +1,37 @@ +import os +import shutil +from string import Template +from . import run_templates_in_shell, untar, mkdir + +BAMCLOLLATE_TEMPLATE = Template('bamcollate2 collate=1 filename=$input inputformat=bam outputformat=bam level=1 exclude=SECONDARY,SUPPLEMENTARY O=$temp_dir/tmpCollated.bam') +HTSEQ_COUNT_TEMPLATE = Template('htseq-count --format=bam --order=name --stranded="no" --type="exon" --idattr="gene_id" --mode="union" --quiet $temp_dir/tmpCollated.bam $ref | bgzip -c > $out_dir/rna_htseqcount.gz') + + +def count(args): + ''' + Top level entry point for generating gene counts from mapped RNA-Seq sequence files. + ''' + # temp_dir is for the temp bam + temp_dir = os.path.join(os.path.abspath(args.out_dir), 'cgpRna_count_temp') + + # prepare the output dir and temp dir + mkdir(args.out_dir) + mkdir(temp_dir) + + # gathering parameters + params = { + 'input': os.path.abspath(args.input), + 'out_dir': os.path.abspath(args.out_dir), + 'ref': os.path.abspath(args.ref), + 'temp_dir': temp_dir + } + + run_templates_in_shell( + [ + BAMCLOLLATE_TEMPLATE, + HTSEQ_COUNT_TEMPLATE + ], + params) + + # clean temp dir + shutil.rmtree(temp_dir) diff --git a/run-cgprna/run_cgprna/map.py b/run-cgprna/run_cgprna/map.py new file mode 100644 index 0000000..8a902f8 --- /dev/null +++ b/run-cgprna/run_cgprna/map.py @@ -0,0 +1,158 @@ +import os +import sys +import re +import shutil +import fnmatch +from string import Template +from . 
import run_templates_in_shell, untar, mkdir + +STAR_MAP_TEMPLATE = Template('star_mapping.pl -s $sample_name -o $out_dir -t $threads -r $reference_data_root -sp $species -rb $ref_build -gb $gene_build -g $gene_build_gtf_name $other_options $raw_reads_string') +MARK_DUPS_TEMPLATE = Template('bammarkduplicates2 I=$out_dir/$sample_name.star.Aligned.out.bam O=$out_dir/$sample_name.bam md5=1 index=1 markthreads=$threads md5filename=$out_dir/$sample_name.bam.md5 indexfilename=$out_dir/$sample_name.bam.bai M=$out_dir/$sample_name.bam.met tmpfile=$out_dir/biormdup') +BAM_INDEX_TEMPLATE = Template('bamindex < $out_dir/$sample_name.star.AlignedtoTranscriptome.out.bam > $out_dir/$sample_name.star.AlignedtoTranscriptome.out.bam.bai') +RENAME_OUTPUT_TEMPLATE = Template('mv "$out_dir/${sample_name}.$file_ext" "$out_dir/${out_file_prefix}.$file_ext"') + + +def map_seq_files(args): + ''' + Top level entry point for mapping RNA-Seq sequence files. + ''' + # keys should be the same as what they have in command_line.py + ref_related_args = { + '--species': args.species, + '--reference-build': args.ref_build, + '--gene-build': args.gene_build, + '--gene-build-gtf-name': args.gene_build_gtf_name + } + + # only because star_mapping.pl will try to find files in a particular structure + ref_related_defaults = { + '--species': 'unspecified_species', + '--reference-build': 'unspecified_ref_build', + '--gene-build': 'unspecified_gene_build' + } + + # only use temp_dir when needed to extract reference files + temp_dir = os.path.join(os.path.abspath(args.out_dir), 'cgpRna_map_temp') + clean_temp = 0 + + other_options = [] + if args.rg_id_tag: + other_options.append('-lane-id "%s"' % args.rg_id_tag) + if args.lb_tag: + other_options.append('-library "%s"' % args.lb_tag) + if args.ds_tag: + other_options.append('-ds-tag "%s"' % args.ds_tag) + if args.pl_tag: + other_options.append('-machine-type "%s"' % args.pl_tag) + if args.pu_tag: + other_options.append('-npg-run "%s"' % args.pu_tag) + + # 
prepare the output dir + mkdir(args.out_dir) + + reference_data_root = os.path.abspath(args.ref) + # Anything not a file will be treated as a reference root folder + if not os.path.isfile(reference_data_root) and any(ele is None for ele in ref_related_args.values()): + sys.exit( + 'Error: missing required input. When "--reference" is not a reference bundle tar file, you have to provide: %s' % ', '.join( + [ key for key, value in ref_related_args.items() if value is None]) + ) + + # a reference *file* must be a tar.gz bundle; folders are exempt from the extension check + if os.path.isfile(reference_data_root) and not os.path.basename(reference_data_root).endswith('.tar.gz'): + sys.exit('Error: wrong input format. "--reference" can only be a tar.gz file or a folder.') + + # If a pre-built ref bundle tar file is supplied, prepare the reference + if os.path.isfile(reference_data_root) and re.match(r'.*\.tar\.gz$', os.path.basename(reference_data_root)): + mkdir(temp_dir) + clean_temp = 1 + + # use the tar file name as the ref_root_dir_name, so that in BAM header, people can tell which bundle file was used + ref_root_dir_name = re.match(r'(.*)\.tar\.gz$', os.path.basename(reference_data_root)).group(1) + reference_data_root=os.path.join(temp_dir, ref_root_dir_name) + + # set ref related values for star_mapping.pl + for key,value in ref_related_args.items(): + # GTF file name will be the same as in the bundle + if key!= '--gene-build-gtf-name': + if value is None: + ref_related_args[key] = ref_related_defaults[key] + else: + if value is not None: + print('Warning: provided "--gene-build-gtf-name" will be overwritten by the GTF file name in the reference bundle.') + + # make the folder structure + bundle_decompress_path = os.path.join(temp_dir, ref_root_dir_name, ref_related_args['--species'], ref_related_args['--reference-build'], 'star') + final_gtf_folder = os.path.join(bundle_decompress_path, ref_related_args['--gene-build']) + mkdir(final_gtf_folder) + + # dump reference bundle + untar(args.ref, bundle_decompress_path) + + # find the GTF file + gtfs = find('*.gtf',
bundle_decompress_path) + if len(gtfs) == 1: + ref_related_args['--gene-build-gtf-name'] = os.path.basename(gtfs[0]) + # link the file to final_gtf_folder + os.symlink( + gtfs[0], + os.path.join(final_gtf_folder, ref_related_args['--gene-build-gtf-name']) + ) + else: + sys.exit('Error: none or too many GTF files in reference bundle. Found GTF(s): %s' % ','.join(gtfs)) + + # gathering parameters + params = { + **vars(args), + 'raw_reads_string': ' '.join([os.path.abspath(path) for path in args.input]), + 'reference_data_root': reference_data_root, + 'other_options': ' '.join(other_options), + 'out_dir': os.path.abspath(args.out_dir), # overwrite the value in args with absolute path + 'species': ref_related_args['--species'], # overwrite the value in args + 'ref_build': ref_related_args['--reference-build'], # overwrite the value in args + 'gene_build': ref_related_args['--gene-build'], # overwrite the value in args + 'gene_build_gtf_name': ref_related_args['--gene-build-gtf-name'] # overwrite the value in args + } + + run_templates_in_shell( + [ + STAR_MAP_TEMPLATE, + MARK_DUPS_TEMPLATE, + BAM_INDEX_TEMPLATE + ], + params) + + if args.out_file_prefix: + to_rename = [ + "bam", + "bam.bai", + "bam.md5", + "bam.met", + "star.Aligned.out.bam", + "star.AlignedtoTranscriptome.out.bam", + "star.AlignedtoTranscriptome.out.bam.bai" + ] + for file_ext in to_rename: + run_templates_in_shell( + [ + RENAME_OUTPUT_TEMPLATE + ], + { + 'out_dir': os.path.abspath(args.out_dir), + 'sample_name': args.sample_name, + 'out_file_prefix': args.out_file_prefix, + 'file_ext': file_ext + } + ) + + # clean temp dir + if clean_temp: + shutil.rmtree(temp_dir) + + +def find(pattern, path): + return [ + os.path.join(root, name) + for root, _, files in os.walk(path) if files + for name in files if fnmatch.fnmatch(name, pattern) + ] diff --git a/run-cgprna/run_cgprna/mapping_stats.py b/run-cgprna/run_cgprna/mapping_stats.py new file mode 100644 index 0000000..d5ebad2 --- /dev/null +++
b/run-cgprna/run_cgprna/mapping_stats.py @@ -0,0 +1,79 @@ +import os +import sys +import re +import shutil +from string import Template +from . import run_templates_in_shell, untar, mkdir + +BAMSTAT_GENOME_TEMPLATE = Template('bam_stats -r $fai_file -i $input -o $out_dir/$sample_name.bam.bas') +BAMSTAT_TRANSCRIPTOME_TEMPLATE = Template('bam_stats -i $trans_bam -o $out_dir/$sample_name.transcriptome.bas') +RSEQC_RRNA_TEMPLATE = Template('split_bam.py -i $input -r $ribsomal_rna_bed -o $out_dir/$sample_name.rRNA > $out_dir/$sample_name.rrna.txt') +RSEQC_GENE_COVERAGE_TEMPLATE = Template('geneBody_coverage.py -i $input -r $house_keeping_gene_bed -f png -o $out_dir/$sample_name') +RSEQC_READ_DISTRIBUTION_TEMPLATE = Template('read_distribution.py -i $input -r $reference_bed > $out_dir/$sample_name.read_dist.txt') +PROCESS_RNA_LANE_STATS_TEMPLATE = Template('process_qcstats.pl -s $sample_name -i $out_dir -o $out_dir') +COLLATE_RNA_LANE_STATS_TEMPLATE = Template('paste $out_dir/$sample_name.bam.bas $out_dir/$sample_name.insert.bas $out_dir/$sample_name.read.dist.bas $out_dir/$sample_name.rrna.bas $out_dir/$sample_name.gene.cov.bas > $out_dir/$sample_name.RNA.bas') + +FAI_FILE='genome.fa.fai' +RIBSOMAL_RNA_BED='rRNA.bed' +HOUSE_KEEPING_GENE_BED='HouseKeepingGenes.bed' +REFERENCE_BED='RefSeq.bed' + + +def generate_stats(args): + ''' + Top level entry point for generating stats from mapped RNA-Seq sequence files. + ''' + # only use temp_dir when needed to extract reference files + temp_dir = os.path.join(os.path.abspath(args.out_dir), 'cgpRna_mappingStats_temp') + clean_temp = args.ref.endswith('.tar.gz') # temp_dir is only created for a tar.gz bundle, so only then is it removed at the end + + # guess a sample name from input file name + # this sample_name will be used as output file name prefix. + # NOTE: geneBody_coverage.py (from RSeQC) uses bam file name without extension as a variable name in its output r script, unless this is changed, there's no way to use other source as output file prefix.
+ sample_name, _ = os.path.splitext(os.path.basename(args.input)) + + # prepare the output dir + mkdir(args.out_dir) + + # prepare the reference + reference_data_root=os.path.abspath(args.ref) + # If a ref bundle tar file is supplied + # Anything else will be treated as a reference folder + if args.ref.endswith('.tar.gz'): + mkdir(temp_dir) + reference_data_root=os.path.join(temp_dir, 'ref') + untar(args.ref, reference_data_root) + + # gathering parameters + params = { + 'sample_name': sample_name, + 'input': os.path.abspath(args.input), + 'out_dir': os.path.abspath(args.out_dir), + 'trans_bam': None if args.trans_bam is None else os.path.abspath(args.trans_bam), + 'fai_file': os.path.join(reference_data_root, FAI_FILE), + 'ribsomal_rna_bed': os.path.join(reference_data_root, RIBSOMAL_RNA_BED), + 'house_keeping_gene_bed': os.path.join(reference_data_root, HOUSE_KEEPING_GENE_BED), + 'reference_bed': os.path.join(reference_data_root, REFERENCE_BED) + } + + run_templates_in_shell( + [ + BAMSTAT_GENOME_TEMPLATE, + RSEQC_RRNA_TEMPLATE, + RSEQC_GENE_COVERAGE_TEMPLATE, + RSEQC_READ_DISTRIBUTION_TEMPLATE + ], + params) + + if args.trans_bam is not None: + run_templates_in_shell( + [ + BAMSTAT_TRANSCRIPTOME_TEMPLATE, + PROCESS_RNA_LANE_STATS_TEMPLATE, + COLLATE_RNA_LANE_STATS_TEMPLATE + ], + params) + + # clean temp dir + if clean_temp: + shutil.rmtree(temp_dir) diff --git a/run-cgprna/run_tests.sh b/run-cgprna/run_tests.sh new file mode 100644 index 0000000..10e44f1 --- /dev/null +++ b/run-cgprna/run_tests.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -e +pytest --cov-branch --cov-report term --cov-report html --cov=run_cgprna --cov-fail-under=10 +set +e + +# these should not die: + +echo -e "\n#################################" +echo "# Running pycodestyle (style) #" +echo "#################################" +pycodestyle run_cgprna + +echo -e "\n#########################################" +echo "# Running radon (cyclomatic complexity) #" +echo 
"#########################################" +radon cc -nc run_cgprna + +echo -e "\n#########################################" +echo "# Running radon (maintainability index) #" +echo "#########################################" +radon mi -s run_cgprna + +echo -e "\n##############################" +echo "# Running mdl (markdownlint) #" +echo "##############################" +mdl -r ~MD013 . # ignore line length rule. + +exit 0 # don't die based on assements of code quality diff --git a/run-cgprna/setup.py b/run-cgprna/setup.py new file mode 100644 index 0000000..e9d1bce --- /dev/null +++ b/run-cgprna/setup.py @@ -0,0 +1,23 @@ +#!/usr/bin/python3 + +from setuptools import setup + +config = { + 'name': 'run-cgprna', + 'description': 'Toolkits for RNA-Seq data analysis using cgpRna', + 'author': 'Yaobo Xu', + 'url': 'https://github.com/cancerit/cgpRna/run_cgprna', + 'download_url': '', + 'author_email': 'cgphelp@sanger.ac.uk', + 'version': '0.1.0', + 'python_requires': '>= 3.5', + 'setup_requires': ['pytest'], + 'install_requires': [], + 'packages': ['run_cgprna'], + 'package_data': {'run_cgprna': ['config/*.json']}, + 'entry_points': { + 'console_scripts': ['run-cgprna=run_cgprna.command_line:main'], + }, +} + +setup(**config) diff --git a/run-cgprna/tox.ini b/run-cgprna/tox.ini new file mode 100644 index 0000000..21943c5 --- /dev/null +++ b/run-cgprna/tox.ini @@ -0,0 +1,5 @@ +[pep8] +max-line-length = 100 + +[pycodestyle] +max-line-length = 100