Merge pull request #235 from lcdb/v1.5.3.rc

V1.5.3.rc
lcdb · Apr 20, 2020 · a9d9443 · a9d9443
2 parents 63dd9b3 + fef118f
commit a9d9443
Show file tree

Hide file tree

Showing 18 changed files with 215 additions and 198 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -5,7 +5,7 @@ variables:
   # default settings for all steps
   defaults: &defaults
     docker:
-      - image: bioconda/bioconda-utils-build-env
+      - image: continuumio/miniconda3
 
   # --------------------------------------------------------------------------
   # The caching dramatically speeds up testing time, because we can do the
@@ -28,7 +28,7 @@ variables:
     save_cache:
       key: v1-{{ checksum "requirements.txt" }}-{{ checksum ".circleci/setup.sh" }}
       paths:
-        - miniconda
+        - /opt/conda
 
   restore_cache: &restore_cache
     restore_cache:
@@ -49,11 +49,8 @@ variables:
     run:
       name: Download example data
       command: |
-        yum install -yy rsync
-        # container has a default python of 2.6, and we're using argparse, so
-        # we need to use the environment.
         source activate lcdb-wf-test
-
+        apt install -y rsync
         # Note that $DEPLOY is set in the "set-paths" step configured below.
         python deploy.py --flavor full --dest $DEPLOY
 
@@ -184,8 +181,10 @@ variables:
       run:
         name: Set path
         command: |
-          echo 'export PATH=$PATH:/root/project/miniconda/bin' >> $BASH_ENV
+          apt install -y locales-all locales
           echo 'export DEPLOY=/tmp/lcdb-wf-test' >> $BASH_ENV
+          echo 'export LC_ALL=en_US.utf8' >> $BASH_ENV
+          echo 'export LANG=en_US.utf8' >> $BASH_ENV
           source $BASH_ENV
 jobs:
 
@@ -301,9 +300,6 @@ jobs:
       - run:
           name: OK for unknown github host
           command: mkdir -p ~/.ssh/ && echo -e "Host github.com\n\tStrictHostKeyChecking no\n" > ~/.ssh/config
-      - run:
-          name: Install git
-          command: yum install git -y
       - add_ssh_keys:
           fingerprints:
             - 2d:0c:b4:27:44:cf:f4:50:cc:14:a4:2b:c2:3c:09:06

diff --git a/.circleci/setup.sh b/.circleci/setup.sh
@@ -1,39 +1,14 @@
 #!/bin/bash
 set -e
 
-WORKSPACE=`pwd`
-MINICONDA_VER=4.3.21
-
-# Set path
-echo "export PATH=$WORKSPACE/miniconda/bin:$PATH" >> $BASH_ENV
-source $BASH_ENV
-
-if ! type conda > /dev/null; then
+apt install -y locales-all locales
+export LC_ALL=en_US.utf8
+export LANG=en_US.utf8
+if ! conda env list | grep -q "lcdb-wf-test"; then
     echo "Setting up conda..."
-
-    # setup conda if not loaded from cache
-    mkdir -p $WORKSPACE
-
-    # step 1: download and install miniconda
-    if [[ $OSTYPE == darwin* ]]; then
-        tag="MacOSX"
-    elif [[ $OSTYPE == linux* ]]; then
-        tag="Linux"
-    else
-        echo "Unsupported OS: $OSTYPE"
-        exit 1
-    fi
-    curl -L -o miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-$MINICONDA_VER-$tag-x86_64.sh
-    bash miniconda.sh -b -p $WORKSPACE/miniconda
-
     conda config --system --add channels defaults
     conda config --system --add channels bioconda
     conda config --system --add channels conda-forge
-
-    # After SSHing in, for some reason this seems to fix it...
-    #conda install -y r-base=3.4.1 bioconductor-genomeinfodbdata bioconductor-annotationhub
-    conda update -y conda
     conda create -n lcdb-wf-test -y --file requirements.txt
-    yum install -y git
 fi
 
diff --git a/.gitignore b/.gitignore
@@ -6,16 +6,21 @@ workflows/rnaseq/downstream/group.tsv
 workflows/rnaseq/downstream/rnaseq.html
 workflows/rnaseq/downstream/rnaseq_cache
 workflows/rnaseq/downstream/rnaseq_files
+workflows/rnaseq/downstream/clusterprofiler
+workflows/rnaseq/downstream/upset_plots
+workflows/rnaseq/downstream/final_clusters
+workflows/rnaseq/downstream/rnaseq.log
 workflows/rnaseq/downstream/*tsv
 workflows/chipseq/data
 workflows/rnaseq/data
 workflows/colocalization/results
+work
 logs
 slurm*out
 workflows/figures/figures
 docs/_build
 \.cache/v/cache/
-/workflows/chipseq/Snakefile.test
-/workflows/rnaseq/Snakefile.test
+*Snakefile.test
 workflows/rnaseq/staging/
 env
+include/AnnotationHubCache
diff --git a/README.md b/README.md
@@ -1,5 +1,6 @@
 # `lcdb-wf`
 
+
 A collection of [snakemake](https://snakemake.readthedocs.io/en/stable/)
 workflows and tools for common high-throughput sequencing analysis, along with
 associated infrastructure.

diff --git a/deploy.py b/deploy.py
@@ -91,7 +91,7 @@
     default="full",
     help="""Options are {0}. Default is full.""".format(list(flavors.keys())),
 )
-ap.add_argument("--dest", help="""Destination directory in which to copy files""")
+ap.add_argument("--dest", help="""Destination directory in which to copy files""", required=True)
 ap.add_argument(
     "--build-env",
     action="store_true",

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,8 +1,60 @@
 Changelog
 =========
 
-v1.5.1rc
---------
+v1.5.3
+------
+
+General
+~~~~~~~
+- default 12-hr wall time in WRAPPER_SLURM
+- update .gitignore (`#223 <https://github.com/lcdb/lcdb-wf/issues/223>`_)
+- remove the FastQC status checks section from the MultiQC report (which shows
+  up in recent MultiQC versions) (`#246 <https://github.com/lcdb/lcdb-wf/issues/246>`_)
+
+Bugs
+~~~~
+
+- add bed12 conversion for all species with default reference configs
+- presence of an orig_filename_R2 in sampletable is sufficient to consider the
+  experiment PE
+- ensure DEGpattern output only contains unique genes
+- bring back featurecounts in multiqc report
+- "attach" chunk in rnaseq.Rmd was not properly set to depend on the "results" chunk
+
+RNA-seq
+~~~~~~~
+
+- dds objects can now be created from a full featureCounts input file and
+  a subsetted colData table, if subset.counts=TRUE
+- improve the dependencies between rnaseq.Rmd chunks so that cache=TRUE behaves
+  as expected (`#232 <https://github.com/lcdb/lcdb-wf/issues/232>`_)
+- add plots for rnaseq.Rmd size factors (`#222 <https://github.com/lcdb/lcdb-wf/issues/222>`_)
+- run rseqc instead of CollectRnaSeqMetrics (the multiqc output is nicer for
+  it, and it's pretty much doing the same thing) (`#218 <https://github.com/lcdb/lcdb-wf/issues/218>`_)
+- when converting Ensembl to symbol, if there is no symbol then fall back to
+  the Ensembl ID to avoid NA (`#246
+  <https://github.com/lcdb/lcdb-wf/issues/246>`_)
+- in rnaseq.Rmd, all caches will be invalidated if the sampletable or the
+  featurecounts table have changed.
+
+Tests
+~~~~~
+- using continuumio/miniconda3 container; finally got en_US.utf8 locale
+  installed and working correctly in that container so that multiqc works.
+
+
+v1.5.2
+------
+
+Bug fixes
+~~~~~~~~~
+
+- When some samples were substrings of other samples (e.g., `WT_1_1` and
+  `WT_1_10`), DESeqDataSetFromCombinedFeatureCounts was assigning the wrong
+  names. This has now been fixed in `helpers.Rmd`.
+
+v1.5.1
+------
 
 Bug fixes
 ~~~~~~~~~

diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --job-name="lcdb-wf"
 #SBATCH --partition="norm"
-#SBATCH --time=24:00:00
+#SBATCH --time=12:00:00
 
 # make logdir
 if [[ ! -e logs ]]; then mkdir -p logs; fi

diff --git a/include/reference_configs/Homo_sapiens.yaml b/include/reference_configs/Homo_sapiens.yaml
@@ -23,6 +23,7 @@ references:
           args: 'https://raw.githubusercontent.com/NICHD-BSPC/chrom-name-mappings/d73fdd4d62ca7e845f9357ea5f08d7a918c17e97/mappings/human/gencode_GRCh38.28_to_ucsc_hg38/mappings_gencode_GRCh38.28_to_ucsc_hg38.tsv'
         conversions:
           - 'refflat'
+          - 'bed12'
 
     gencode-v28-transcriptome:
       fasta:

diff --git a/include/reference_configs/Mus_musculus.yaml b/include/reference_configs/Mus_musculus.yaml
@@ -22,6 +22,7 @@ references:
           args: 'https://raw.githubusercontent.com/NICHD-BSPC/chrom-name-mappings/d73fdd4d62ca7e845f9357ea5f08d7a918c17e97/mappings/mouse/gencode_GRCm38.m18_to_ucsc_mm10/mappings_gencode_GRCm38.m18_to_ucsc_mm10.tsv'
         conversions:
           - 'refflat'
+          - 'bed12'
 
 
     gencode_m18_transcriptome:
@@ -46,6 +47,7 @@ references:
         url: 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M12/gencode.vM12.annotation.gtf.gz'
         conversions:
           - 'refflat'
+          - 'bed12'
 
 
     gencode_m12_transcriptome:

diff --git a/lib/common.py b/lib/common.py
@@ -664,6 +664,8 @@ def is_paired_end(sampletable, sample):
         Assumed to be found in the first column of `sampletable`
     """
     row = sampletable.set_index(sampletable.columns[0]).loc[sample]
+    if 'orig_filename_R2' in row:
+        return True
     if 'layout' in row and 'LibraryLayout' in row:
         raise ValueError("Expecting column 'layout' or 'LibraryLayout', "
                          "not both")

diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile
@@ -419,7 +419,7 @@ rule multiqc:
         outdir = os.path.dirname(c.targets['multiqc'][0])
         basename = os.path.basename(c.targets['multiqc'][0])
         shell(
-            'LC_ALL=en_US.UTF.8 LC_LANG=en_US.UTF-8 '
+            'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 '
             'multiqc '
             '--quiet '
             '--outdir {outdir} '

diff --git a/workflows/colocalization/scripts/colocalization_heatmap.py b/workflows/colocalization/scripts/colocalization_heatmap.py
@@ -43,7 +43,7 @@ def dataframe_for_domain(domain, algorithm):
         _df['query'] = query
         _df['reference'] = reference
         df.append(
-            _df.ix[0].to_dict()
+            _df.iloc[0].to_dict()
         )
     return pd.DataFrame(df)
 
@@ -197,7 +197,7 @@ def plot_heatmap(fill_piv, vmin, vmax, title, units, metric='euclidean',
     fill_piv = fill_piv.astype(float)
     # subset if requested
     if idx is not None:
-        fill_piv = fill_piv.ix[idx, idx]
+        fill_piv = fill_piv.loc[idx, idx]
 
     # Distance matrix, setting NaN to zero if necessary
     dist = distance.pdist(fill_piv.values, metric=metric)

diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile
@@ -407,6 +407,7 @@ rule featurecounts:
         counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt'
     log:
         '{sample_dir}/rnaseq_aggregation/featurecounts.txt.log'
+    threads: 4
     run:
         # NOTE: By default, we use -p for paired-end
         p_arg = ''
@@ -561,7 +562,7 @@ rule multiqc:
         outdir = os.path.dirname(c.targets['multiqc'][0])
         basename = os.path.basename(c.targets['multiqc'][0])
         shell(
-            'LC_ALL=en_US.UTF.8 LC_LANG=en_US.UTF-8 '
+            'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 '
             'multiqc '
             '--quiet '
             '--outdir {outdir} '
@@ -733,18 +734,6 @@ rule salmon:
             )
 
 
-rule rseqc_bam_stat:
-    """
-    Calculate various BAM stats with RSeQC
-    """
-    input:
-        bam=c.patterns['bam']
-    output:
-        txt=c.patterns['rseqc']['bam_stat']
-    shell:
-        'bam_stat.py -i {input.bam} > {output.txt}'
-
-
 rule rseqc_infer_experiment:
     """
     Infer strandedness of experiment
@@ -760,6 +749,19 @@ rule rseqc_infer_experiment:
     shell:
         'infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}'
 
+rule rseqc_read_distribution:
+    """Run RSeQC's read_distribution.py on a sample BAM against the bed12 annotation.
+    The report printed on stdout becomes the output file; only stderr is logged.
+    """
+    input:
+        bam=c.patterns['bam'],
+        bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'],
+    output:
+        txt=c.patterns['rseqc']['read_distribution']
+    log:
+        c.patterns['rseqc']['read_distribution'] + '.log'
+    shell:
+        'read_distribution.py -i {input.bam} -r {input.bed12} > {output} 2> {log}'
 
 rule bigwig_neg:
     """

diff --git a/workflows/rnaseq/config/multiqc_config.yaml b/workflows/rnaseq/config/multiqc_config.yaml
@@ -14,7 +14,6 @@ extra_fn_clean_exts:
   - '.salmon'
   - '_R1'
   - '_R2'
-  - '_bam_stat'
 
 
 # Modify the module search patterns to match what we're creating in the
@@ -67,11 +66,15 @@ module_order:
     - picard
     - rseqc
     - salmon
-    - featureCounts:
+    - featurecounts:
         name: 'featureCounts'
         path_filters:
             - 'featurecounts.txt.summary'
 
+remove_sections:
+  - fastqc_status_checks
+
+
 # This organizes the FastQC general sample table columns so that the different
 # stages are right next to each other, making it easier to compare the effects
 # of the different stages on the stats.

diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml
@@ -37,8 +37,8 @@ dupradar:
 preseq: 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt'
 salmon: 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf'
 rseqc:
-   bam_stat: 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt'
    infer_experiment: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt'
+   read_distribution: 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt'
 bigwig:
    pos: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig'
    neg: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig'