diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c6020da..0000000 --- a/.travis.yml +++ /dev/null @@ -1,32 +0,0 @@ -language: python -python: - - 3.7 -install: - - sudo apt-get -y update - - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - - bash miniconda.sh -b -p $HOME/miniconda - - source "$HOME/miniconda/etc/profile.d/conda.sh" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - - conda info -a - - conda config --append channels bioconda - - conda config --append channels conda-forge - - conda install pandas numpy scipy scikit-learn - - conda install -c bioconda sra-tools - - conda install -c bioconda meme=5.1.1 - - conda install -c bioconda bedtools homer star samtools pysam - - conda install -c bioconda pybedtools - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION bedtools homer star samtools pysam sra-tools meme pandas numpy scipy scikit-learn pybedtools - - conda activate test-environment - - pip install nose codecov pathos - - python setup.py install -script: - - coverage run --source=inferelator_prior setup.py test -after_success: - - codecov -after_failure: - - python -c "import os; print(repr(os.name))" - - pwd - - find . - - conda list \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d0423f..3594b85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Version 0.3.0 + +* Added `link_atac_bed_to_genes` module to link specific peaks from a BED file to nearby genes + ### Version 0.2.3 * Added additional messaging and a `--debug` mode diff --git a/inferelator_prior/link_atac_bed_to_genes.py b/inferelator_prior/link_atac_bed_to_genes.py new file mode 100644 index 0000000..d6e33e8 --- /dev/null +++ b/inferelator_prior/link_atac_bed_to_genes.py @@ -0,0 +1,100 @@ +from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, GTF_CHROMOSOME, + SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME) +from inferelator_prior.processor.bedtools import load_bed_to_bedtools, intersect_bed + +import argparse +import pandas as pd + + +def main(): + ap = argparse.ArgumentParser(description="Link ATAC peaks in a BED file to genes in a GTF file") + + ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE", required=True) + ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=None, nargs="+") + ap.add_argument("-b", "--bed", dest="bed", help="Peak BED file", default=None) + ap.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_const', + const=False, default=True) + ap.add_argument("--no_intergenic", dest="no_intergenic", help="Drop peaks not linked to a gene", action='store_const', + const=True, default=False) + ap.add_argument("-o", "--out", dest="out", help="Output BED", metavar="FILE", default="./peaks_to_genes.bed") + + + args = ap.parse_args() + link_bed_to_genes(args.bed, args.annotation, args.out, use_tss=args.tss, window_size=args.window_size, + non_gene_key=None if args.no_intergenic else "Intergenic") + + +def link_bed_to_genes(bed_file, gene_annotation_file, out_file, use_tss=True, window_size=1000, dprint=print, + non_gene_key="Intergenic"): + """ + Link a BED file (of arbitraty origin) to a set of genes from a GTF file based on proximity + + :param bed_file: Path to the BED file + :type bed_file: str + :param gene_annotation_file: Path to the genome annotation file (GTF) + :type gene_annotation_file: str + :param out_file: Path to the output file + :type out_file: str + :param use_tss: Base gene proximity on the TSS, not the gene body; defaults to True + :type use_tss: bool, optional + :param window_size: Window size (N, M) for proximity, where N is upstream of the gene and M is downstream. + If given as an integer K, interpreted as (K, K); defaults to 1000 + :type window_size: int, tuple, optional + :param dprint: Debug message function (can be overridden to silence), defaults to print + :type dprint: callable, optional + :param non_gene_key: Name for BED peaks that aren't in the genome feature windows. + Set to None to drop peaks that aren't in the genome feature windows; defaults to "Intergenic" + :type non_gene_key: str, optional + :return: Number of peaks before mapping, number of peaks after mapping, dataframe of peaks + :rtype: int, int, pd.DataFrame + """ + + dprint("Loading genes from file ({f})".format(f=gene_annotation_file)) + # Load genes and open a window + genes = load_gtf_to_dataframe(gene_annotation_file) + dprint("{n} genes loaded".format(n=genes.shape[0])) + + + _msg = "Promoter regions defined with window {w} around {g}".format(w=window_size, g="TSS" if use_tss else "gene") + dprint(_msg) + + genes_window = open_window(genes, window_size=window_size, use_tss=use_tss, include_entire_gene_body=True) + + # Create a fake bed file with the gene promoter + genes_window = genes.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME]].copy() + genes_window[[SEQ_START, SEQ_STOP]] = genes_window[[SEQ_START, SEQ_STOP]].astype(int) + genes_window = genes_window.sort_values(by=[GTF_CHROMOSOME, SEQ_START]) + + gene_bed = load_bed_to_bedtools(genes_window) + bed_locs = load_bed_to_bedtools(bed_file) + + ia = intersect_bed(gene_bed, bed_locs, wb=True).to_dataframe() + ia.rename({'score': 'gene'}, axis=1, inplace=True) + + # Rebuild an A/B bed file + ia.columns = ['a_chrom', 'a_start', 'a_end', 'a_strand', 'gene', 'b_chrom', 'b_start', 'b_end'] + ia = ia[['b_chrom', 'b_start', 'b_end', 'a_strand', 'gene']] + ia.columns = ['chrom', 'start', 'end', 'strand', 'gene'] + + # Add an intergenic key if set; otherwise peaks that don't overlap will be dropped + if non_gene_key is not None: + ia = ia.merge(bed_locs.to_dataframe(), how="outer", on=['chrom', 'start', 'end']) + ia['gene'] = ia['gene'].fillna(non_gene_key) + + # Make unique peak IDs based on gene + ia['peak'] = ia['gene'].groupby( + ia['gene'] + ).transform( + lambda x: pd.Series(map(lambda y: "_" + str(y), range(len(x))), index=x.index) + ) + ia['peak'] = ia['gene'].str.cat(ia['peak']) + + # Sort for output + ia = ia.sort_values(by=['chrom', 'start']) + ia.to_csv(out_file, sep="\t", index=False, header=False) + + return bed_locs.count(), len(ia), ia + + +if __name__ == '__main__': + main() diff --git a/inferelator_prior/make_regulator_bed.py b/inferelator_prior/make_regulator_bed.py new file mode 100644 index 0000000..fcf4d40 --- /dev/null +++ b/inferelator_prior/make_regulator_bed.py @@ -0,0 +1,49 @@ +from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, select_genes, GTF_CHROMOSOME, + SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME, get_fasta_lengths) + +import argparse +import pandas as pd + + +def main(): + ap = argparse.ArgumentParser(description="Create a BED file from a GTF file") + + ap.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True) + ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE", required=True) + ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=None, nargs="+") + ap.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_const', + const=False, default=True) + ap.add_argument("--intergenic", dest="intergenic", help="Only consider intergenic regions", action='store_const', + const=True, default=None) + ap.add_argument("-o", "--out", dest="out", help="Output BED", metavar="FILE", default="./gene.bed") + + + args = ap.parse_args() + + _intergenic = args.intergenic if args.intergenic is not None else False + _use_tss = args.tss + + print("Loading genes from file ({f})".format(f=args.annotation)) + # Load genes and open a window + fasta_gene_len = get_fasta_lengths(args.fasta) + genes = load_gtf_to_dataframe(args.annotation, fasta_record_lengths=fasta_gene_len) + print("{n} genes loaded".format(n=genes.shape[0])) + + + _msg = "Promoter regions defined with window {w} around {g}".format(w=args.window_size, g="TSS" if _use_tss else "gene") + _msg += " [Intergenic]" if _intergenic else "" + print(_msg) + + genes = open_window(genes, window_size=args.window_size, use_tss=_use_tss, fasta_record_lengths=fasta_gene_len, + constrain_to_intergenic=_intergenic) + + # Create a fake bed file with the gene promoter + gene_locs = genes.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME]].copy() + gene_locs[[SEQ_START, SEQ_STOP]] = gene_locs[[SEQ_START, SEQ_STOP]].astype(int) + gene_locs = gene_locs.sort_values(by=[GTF_CHROMOSOME, SEQ_START]) + + gene_locs.to_csv(args.out, sep="\t", index=False) + + +if __name__ == '__main__': + main() diff --git a/inferelator_prior/network_from_motifs.py b/inferelator_prior/network_from_motifs.py index a616bc9..bd17d48 100644 --- a/inferelator_prior/network_from_motifs.py +++ b/inferelator_prior/network_from_motifs.py @@ -132,8 +132,6 @@ def parse_common_arguments(args): _tandem = SPECIES_MAP[_species]['tandem'] if args.tandem is None else args.tandem - - # Load gene and regulator lists _gl = pd.read_csv(args.genes, index_col=None, header=None)[0].tolist() if args.genes is not None else None _tfl = pd.read_csv(args.tfs, index_col=None, header=None)[0].tolist() if args.tfs is not None else None @@ -160,7 +158,7 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file shuffle=None, lowmem=False, intergenic_only=True): """ Build a motif-based prior from windows around annotated genes. - + :param motif_file: Path to motif file (meme or transfac format) :type motif_file: str :param annotation_file: Path to GTF file containing gene annotations diff --git a/inferelator_prior/network_from_motifs_fasta.py b/inferelator_prior/network_from_motifs_fasta.py index 2838498..77a5bf3 100644 --- a/inferelator_prior/network_from_motifs_fasta.py +++ b/inferelator_prior/network_from_motifs_fasta.py @@ -20,6 +20,7 @@ def main(): # Process common arguments into values add_common_arguments(ap) args = ap.parse_args() + out_prefix, _window, _tandem, _use_tss, _gl, _tfl, _minfo, _ = parse_common_arguments(args) _, _, prior_data = build_motif_prior_from_fasta(args.motif, args.fasta, diff --git a/inferelator_prior/processor/bedtools.py b/inferelator_prior/processor/bedtools.py index fb92f51..bc2b6fe 100644 --- a/inferelator_prior/processor/bedtools.py +++ b/inferelator_prior/processor/bedtools.py @@ -1,8 +1,5 @@ -from inferelator_prior.processor.gtf import GTF_CHROMOSOME, GTF_GENENAME, SEQ_START, SEQ_STOP, GTF_STRAND import pandas as pd import pybedtools -import os -import subprocess import tempfile BEDTOOLS_EXTRACT_SUFFIX = ".extract.fasta" @@ -48,12 +45,10 @@ def load_bed_to_bedtools(bed): return pybedtools.BedTool(bed) -def intersect_bed(*beds): +def intersect_bed(*beds, wa=False, wb=False): if len(beds) == 1: return beds[0] beds = [b.sort() for b in beds] - return beds[0].intersect(beds[1:], sorted=True) - - + return beds[0].intersect(beds[1:], sorted=True, wa=wa, wb=wb) diff --git a/inferelator_prior/processor/gtf.py b/inferelator_prior/processor/gtf.py index e78ba7e..3b746e1 100644 --- a/inferelator_prior/processor/gtf.py +++ b/inferelator_prior/processor/gtf.py @@ -74,7 +74,7 @@ def load_gtf_to_dataframe(gtf_path, fasta_record_lengths=None): def open_window(annotation_dataframe, window_size, use_tss=False, fasta_record_lengths=None, - constrain_to_intergenic=False): + constrain_to_intergenic=False, include_entire_gene_body=False): """ This needs to adjust the start and stop in the annotation dataframe with window sizes @@ -111,6 +111,13 @@ def open_window(annotation_dataframe, window_size, use_tss=False, fasta_record_l window_annotate.loc[window_annotate[WINDOW_UP] < 1, WINDOW_UP] = 1 + if include_entire_gene_body: + to_fix_pos = (window_annotate[GTF_STRAND] == "+") & (window_annotate[WINDOW_DOWN] < window_annotate[SEQ_STOP]) + to_fix_neg = (window_annotate[GTF_STRAND] == "-") & (window_annotate[WINDOW_UP] > window_annotate[SEQ_STOP]) + + window_annotate.loc[to_fix_pos, WINDOW_DOWN] = window_annotate.loc[to_fix_pos, SEQ_STOP] + window_annotate.loc[to_fix_neg, WINDOW_UP] = window_annotate.loc[to_fix_neg, SEQ_START] + if fasta_record_lengths is not None: _gtf_fasta_match = set(window_annotate[GTF_CHROMOSOME].unique()).intersection(set(fasta_record_lengths.keys()))