Skip to content

Commit

Permalink
Merge pull request #359 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
prep release 0.41.0
  • Loading branch information
dozy authored Jan 31, 2025
2 parents 5e76668 + d083fa8 commit 660fd6b
Show file tree
Hide file tree
Showing 23 changed files with 660 additions and 87 deletions.
10 changes: 10 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Dependabot configuration: keep GitHub Actions used by workflows up to date.
# Reference:
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot

version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
6 changes: 3 additions & 3 deletions .github/workflows/testing_and_building_repo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ jobs:
runs-on: ubuntu-latest
name: Distribution Perl on ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

# Caching cpanm external modules
- name: Cache cpanm external modules
id: cpanmCache
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: ~/perl5ext
key: ${{ runner.os }}-build-cpanm-external
Expand Down Expand Up @@ -58,7 +58,7 @@ jobs:
# Archive logs if failure
- name: Archive CPAN logs
if: ${{ failure() }}
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: cpan_log
path: /home/runner/.cpanm/work/*/build.log
Expand Down
2 changes: 2 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
CHANGES LOG
-----------

- Added .github/dependabot.yml file to auto-update GitHub actions

0.40.0
- change permissions of STAR working directory to allow deletion by standard pipeline tools
- add Dockerfile to create container with appropriate tools for basic analyses
Expand Down
42 changes: 21 additions & 21 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@ ARG BASE_IMAGE=ubuntu:22.04

ARG BAMBI_VERSION="0.18.0"
ARG BIOBAMBAM2_VERSION="2.0.185-release-20221211202123"
ARG BWA_VERSION="0.7.18"
ARG BWA_VERSION="0.7.17"
ARG BWA_MEM2_VERSION="2.2.1"
ARG DEFLATE_VERSION="1.20"
ARG HTSLIB_VERSION="1.20"
ARG HTSLIB_VERSION="1.21"
ARG IO_LIB_VERSION="1.15.0"
ARG LIBMAUS2_VERSION="2.0.813-release-20221210220409"
ARG NPG_SEQ_COMMON_VERSION="51.1"
ARG SAMTOOLS_VERSION="1.20"
ARG SAMTOOLS_VERSION="1.21"
ARG TEEPOT_VERSION="1.2.0"
ARG PCAP_CORE_VERSION="5.7.0"

Expand Down Expand Up @@ -82,21 +82,6 @@ RUN SLUG=$(echo ${IO_LIB_VERSION} | tr '.' '-') && \
make -j $(nproc) install && \
ldconfig

ARG LIBMAUS2_VERSION
RUN curl -sSL -O "https://gitlab.com/german.tischler/libmaus2/-/archive/${LIBMAUS2_VERSION}/libmaus2-${LIBMAUS2_VERSION}.tar.bz2" && \
tar xfj libmaus2-${LIBMAUS2_VERSION}.tar.bz2 && \
cd libmaus2-${LIBMAUS2_VERSION} && \
./configure --prefix=/usr/local --with-io_lib --with-nettle && \
make -j $(nproc) install && \
ldconfig

ARG BIOBAMBAM2_VERSION
RUN curl -sSL -O "https://gitlab.com/german.tischler/biobambam2/-/archive/${BIOBAMBAM2_VERSION}/biobambam2-${BIOBAMBAM2_VERSION}.tar.bz2" && \
tar xfj biobambam2-${BIOBAMBAM2_VERSION}.tar.bz2 && \
cd biobambam2-${BIOBAMBAM2_VERSION} && \
./configure && \
make -j $(nproc) install

ARG TEEPOT_VERSION
RUN curl -sSL -O "https://github.com/wtsi-npg/teepot/releases/download/${TEEPOT_VERSION}/teepot-${TEEPOT_VERSION}.tar.gz" && \
tar xzf teepot-${TEEPOT_VERSION}.tar.gz && \
Expand All @@ -121,9 +106,10 @@ RUN curl -sSL -O "https://github.com/samtools/samtools/releases/download/${SAMTO

ARG BWA_VERSION
RUN curl -sSL -O "https://github.com/lh3/bwa/archive/refs/tags/v${BWA_VERSION}.tar.gz" && \
tar xzf v${BWA_VERSION}.tar.gz && \
cd bwa-${BWA_VERSION} && \
make -j $(nproc) && \
tar xzvf ./v${BWA_VERSION}.tar.gz && \
cd ./bwa-${BWA_VERSION} && \
pwd && \
make CC='gcc -fcommon' -j $(nproc) && \
cp ./bwa /usr/local/bin/ && \
chmod +x /usr/local/bin/bwa && \
ln -s /usr/local/bin/bwa /usr/local/bin/bwa0_6
Expand All @@ -146,6 +132,20 @@ RUN git clone --single-branch --branch="$BAMBI_VERSION" --depth=1 "https://githu
./configure && \
make -j $(nproc) install

ARG LIBMAUS2_VERSION
RUN curl -sSL -O "https://gitlab.com/german.tischler/libmaus2/-/archive/${LIBMAUS2_VERSION}/libmaus2-${LIBMAUS2_VERSION}.tar.bz2" && \
tar xfj libmaus2-${LIBMAUS2_VERSION}.tar.bz2 && \
cd libmaus2-${LIBMAUS2_VERSION} && \
./configure --prefix=/usr/local --with-io_lib --with-nettle && \
make -j $(nproc) install && \
ldconfig

ARG BIOBAMBAM2_VERSION
RUN curl -sSL -O "https://gitlab.com/german.tischler/biobambam2/-/archive/${BIOBAMBAM2_VERSION}/biobambam2-${BIOBAMBAM2_VERSION}.tar.bz2" && \
tar xfj biobambam2-${BIOBAMBAM2_VERSION}.tar.bz2 && \
cd biobambam2-${BIOBAMBAM2_VERSION} && \
./configure && \
make -j $(nproc) install

ARG PCAP_CORE_VERSION
RUN git clone --single-branch --branch="$PCAP_CORE_VERSION" --depth=1 "https://github.com/cancerit/PCAP-core.git" && \
Expand Down
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# P<sup>4</sup> | Process and Pipe Pipeline Panacea

p4 is a tool to help streaming of data between processes (creating a data pipeline).

## Rationale

The UNIX pipe `|` is typically used for this purpose of streaming data between processes.

When there is a need for part of the same data stream to be processed in more than one way the `tee` tool may be used together with shell process substitution e.g.

```
tool1 | tee >(tool2a > 2a.out) | tool2b > 2b.out
```

Should another tool need to combine those two outputs (in a streaming manner) then UNIX fifos can be used.

Scripts to perform such combinations of streaming data flow can rapidly become somewhat messy and tricky to maintain.

Better, perhaps, to declare this streaming data flow as a graph and create the processes (nodes) and appropriate pipes and fifos between them (edges) in a standard manner from that declaration: this is p4.

## Nuance

- (unless using async IO) pipes will only allow an `open` to complete when the other end of the pipe also has an `open` called on it. Deadlocks can be created when one tool is waiting on another which is effectively waiting on the original.
- pipes have a limited size. Where there are bubble-like structures in a dataflow, one side of the bubble may try to grab data in very different chunk sizes to the other side. This can then lead to a deadlock when joining the data flows together.
- `SIGPIPE` on one output will terminate `tee` - if only the beginning of the data stream is required on one output this needs to be dealt with to allow the stream to continue on the other outputs.

To help resolve the latter two issues we can use the [`teepot`](https://github.com/wtsi-npg/teepot) tool in the graph of processes in place of `tee`.


## Component scripts

There are two key scripts in p4:

- [`viv.pl`](./README) creates the processes, and the fifos and pipes to stream data between them for a particular graph (provided in a JSON file).
- [`vtfp.pl`](README.vtfp) allows for reuse of standard pipelines by taking parameters and template JSON files to create the graph JSON file fed to `viv.pl`


## Motivation and Bioinformatics

In the early 2010s, we in NPG, a core informatics team at the Wellcome Sanger Institute, found ourselves having to process a rapidly increasing amount of short-read sequencing data on "fat" NFS servers rather than high-performance filesystems, such as Lustre, to keep hardware costs at an acceptable level. This drove us to avoid disk IO (and with it IO wait and poor CPU usage) where possible by streaming data between tools.

We still advocate this as it avoids:
- the need for performant staging disk for many intermediate files
- the added latency of writing and reading data from such staging disk
- the CPU spent on compressing and decompressing data as it is shuttled on and off the staging disk.

The downsides of streaming data are that
- restarting a failed job cannot be from intermediate files which do not exist (we find this only really impacts when developing pipelines rather than once in production),
- the size of the data pipeline in terms of CPU and RAM required is limited by the size of individual machines available, and that
- contemporary (2024) bio-informatics frameworks only support such streaming in a limited way and often advise against it, as it does not fit well with the reusability of components in their paradigm (however, p4 or something like it could still fit happily as a functionally and computationally big step in such a pipeline).

34 changes: 29 additions & 5 deletions bin/vtfp.pl
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,12 @@ sub apply_subst {
$ewi->{removelabel}->();
}

if($cfg->{subgraph_io}) {
$ewi->{addlabel}->(q{subgraph_io});
$cfg->{subgraph_io} = subst_walk($cfg->{subgraph_io}, $params, $ewi);
$ewi->{removelabel}->();
}

return;
}

Expand Down Expand Up @@ -1412,9 +1418,9 @@ sub validate_splice_candidates {
}
}

# all edge termini must be unique (over replacement and pruning edges) except for STDIN/STDOUT
# all edge termini must be unique (over replacement edges) except for STDIN/STDOUT
my %endpoints;
for my $edge (@{$splice_candidates->{replacement_edges}}, @{$prune_edges}) {
for my $edge (@{$splice_candidates->{replacement_edges}}) {
my $from_end = $edge->{from};
if($from_end and $from_end !~ /:/) { $from_end .= q[:STDOUT] };

Expand Down Expand Up @@ -1472,15 +1478,33 @@ sub final_splice {
# add new edges
push @{$flat_graph->{edges}}, @{$splice_candidates->{replacement_edges}};

# remove pruned ports - prune edges are not required to be two-ended; just disregard undefined to/from attributes
# remove pruned ports - prune edges are not required to be two-ended; just disregard undefined to/from attributes; only remove ports
# that do not appear in splice edges (aka replacement edges)
for my $prune_edge (@{$splice_candidates->{prune_edges}}) {
if($prune_edge->{from}) { remove_port($prune_edge->{from}, $SRC, $flat_graph); }
if($prune_edge->{to}) { remove_port($prune_edge->{to}, $DST, $flat_graph); }
if($prune_edge->{from} and not _in_replacement_edges($prune_edge->{from}, $splice_candidates, $SRC)) { remove_port($prune_edge->{from}, $SRC, $flat_graph); }
if($prune_edge->{to} and not _in_replacement_edges($prune_edge->{to}, $splice_candidates, $DST)) { remove_port($prune_edge->{to}, $DST, $flat_graph); }
}

return $flat_graph;
}

#################################################################################################
# _in_replacement_edges:
#  returns 1 if the given port specification appears as an endpoint of one of the
#  replacement edges in $splice_candidates, 0 otherwise. $type selects which end
#  of each edge is examined ($SRC => "from", otherwise "to"); endpoints without an
#  explicit port are normalised by appending the standard port name (STDIN for
#  $SRC, STDOUT otherwise) before comparison, mirroring validate_splice_candidates.
#################################################################################################
sub _in_replacement_edges {
	my ($port_spec, $splice_candidates, $type) = @_;

	my $direction = ($type == $SRC)? q[from]: q[to];
	my $std_port = ($type == $SRC)? q[STDIN]: q[STDOUT];

	for my $edge (@{$splice_candidates->{replacement_edges}}) {
		my $end = $edge->{$direction};
		next unless $end; # edge has no endpoint on this side; avoid undef in eq comparison below
		if($end !~ /:/) { $end .= qq[:$std_port] };

		if($end eq $port_spec) { return 1; }
	}

	return 0;
}

################################################################################################
# resolve_ports:
# given a splice_pair specification, fully determine the [set of] source and destination ports
Expand Down
13 changes: 13 additions & 0 deletions data/static_params/stage2_reanalysis/align_bwa_mem2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"assign": [
{
"alignment_method": "bwa_mem",
"bwa_executable": "bwa-mem2"
}
],
"assign_local": {},
"ops": {
"splice": [],
"prune": []
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"assign": [
{
"spatial_filter_switch":"off",
"markdup_optical_distance_value": "100",
"s2_se_pe": "pe",
"samtools_executable": "samtools",
"s2_input_format": "cram",
"markdup_method": "duplexseq",
"s2_ppi_switch":"s2_ppi",
"pp_read2tags":"on",
"pp_import_method":"crammerge",
"fastq_s2_pi_fq1": "DUMMY",
"fastq_s2_pi_fq2": "DUMMY",
"fastq_s2_pi_RG_ID": "DUMMY",
"s2_filter_files": "DUMMY",
"spatial_filter_file": "DUMMY",
"phix_reference_genome_fasta":"DUMMY",
"realignment_switch":0
}
],
"assign_local": {},
"ops": {
"splice": [
"aln_bam12auxmerge:-foptgt_000_fixmate:",
"foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk"
],
"prune": [
"foptgt.*_bmd_multiway:calibration_pu-",
"foptgt_cram_tee:c2a-",
"foptgt.*samtools_stats_F0.*_target.*-",
"foptgt.*samtools_stats_F0.*00_bait.*-",
"aln_tee3_tee3:to_phix_aln-scs_cmp_seqchksum:outputchk",
"ssfqc_tee_ssfqc:subsample-"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"assign": [
{
"spatial_filter_switch":"off",
"markdup_optical_distance_value": "100",
"s2_se_pe": "pe",
"samtools_executable": "samtools",
"s2_input_format": "cram",
"markdup_method": "duplexseq",
"s2_ppi_switch":"s2_ppi",
"pp_read2tags":"on",
"pp_import_method":"fastq",
"incrams": "DUMMY",
"s2_filter_files": "DUMMY",
"spatial_filter_file": "DUMMY",
"phix_reference_genome_fasta":"DUMMY",
"realignment_switch":0
}
],
"assign_local": {},
"ops": {
"splice": [
"aln_bam12auxmerge:-foptgt_000_fixmate:",
"foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk"
],
"prune": [
"foptgt.*_bmd_multiway:calibration_pu-",
"foptgt_cram_tee:c2a-",
"foptgt.*samtools_stats_F0.*_target.*-",
"foptgt.*samtools_stats_F0.*00_bait.*-",
"aln_tee3_tee3:to_phix_aln-scs_cmp_seqchksum:outputchk",
"ssfqc_tee_ssfqc:subsample-"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"assign": [
{
"spatial_filter_switch":"off",
"markdup_optical_distance_value": "100",
"s2_se_pe": "pe",
"samtools_executable": "samtools",
"s2_input_format": "cram",
"markdup_method": "samtools",
"s2_ppi_switch":"s2_ppi",
"pp_import_method":"crammerge",
"fastq_s2_pi_fq1": "DUMMY",
"fastq_s2_pi_fq2": "DUMMY",
"fastq_s2_pi_RG_ID": "DUMMY",
"s2_filter_files": "DUMMY",
"spatial_filter_file": "DUMMY",
"phix_reference_genome_fasta":"DUMMY",
"realignment_switch":1
}
],
"assign_local": {},
"ops": {
"splice": [
"aln_bam12auxmerge:-foptgt_000_fixmate:",
"foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk"
],
"prune": [
"foptgt.*_bmd_multiway:calibration_pu-",
"foptgt_cram_tee:c2a-",
"foptgt.*samtools_stats_F0.*_target.*-",
"foptgt.*samtools_stats_F0.*00_bait.*-",
"aln_tee3_tee3:to_phix_aln-scs_cmp_seqchksum:outputchk",
"ssfqc_tee_ssfqc:subsample-"
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"assign": [
{
"spatial_filter_switch":"off",
"markdup_optical_distance_value": "100",
"s2_se_pe": "pe",
"samtools_executable": "samtools",
"s2_input_format": "cram",
"markdup_method": "samtools",
"s2_ppi_switch":"s2_ppi",
"pp_read2tags":"off",
"pp_import_method":"crammerge",
"fastq_s2_pi_fq1": "DUMMY",
"fastq_s2_pi_fq2": "DUMMY",
"fastq_s2_pi_RG_ID": "DUMMY",
"s2_filter_files": "DUMMY",
"spatial_filter_file": "DUMMY",
"phix_reference_genome_fasta":"DUMMY",
"realignment_switch":1
}
],
"assign_local": {},
"ops": {
"splice": [],
"prune": [
"fop[ht].*_bmd_multiway:calibration_pu-",
"foptgt_cram_tee:c2a-",
"fop[ht].*samtools_stats_F0.*_target.*-",
"fop[ht].*samtools_stats_F0.*00_bait.*-",
"aln_tee4_tee4:to_phix_aln-alignment_filter:phix_bam_in",
"alignment_filter:phix_bam_out-scs_merge_output_seqchksum:__PHIX_CHKSUM_IN__",
"ssfqc_tee_ssfqc:subsample-"
]
}
}
Loading

0 comments on commit 660fd6b

Please sign in to comment.