Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to make this pipeline work with the new modular environment as well #13

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ Toolkit for annotation
| Repository name | branch | URL|
|-----------------|--------|----|
| ensembl | default | https://github.com/Ensembl/ensembl.git |
| ensembl-analysis | experimental/gbiab | https://github.com/Ensembl/ensembl-analysis.git | (need to make sure depencies are on main and update this to main/default for branch)
| ensembl-analysis | experimental/gbiab | https://github.com/Ensembl/ensembl-analysis.git | (need to make sure dependencies are on main and update this to main/default for branch)
| ensembl-io | default | https://github.com/Ensembl/ensembl-io.git |
| ensembl-taxonomy | default | https://github.com/Ensembl/ensembl-taxonomy.git |
| ensembl-variation | default | https://github.com/Ensembl/ensembl-variation.git |


Expand Down
4 changes: 2 additions & 2 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
"maxperiod" : 500
},
"trnascan": {
"software": "/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/bin/tRNAscan-SE",
"filter_path" : "/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/bin/EukHighConfidenceFilter"
"software": "tRNAscan-SE",
"filter_path" : "EukHighConfidenceFilter"
}
}
72 changes: 24 additions & 48 deletions ensembl_anno.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,12 @@
import simple_feature_utils
import utils

with open(os.environ["ENSCODE"] + "/ensembl-anno/config.json", "r") as f:

_REPO_ROOT = pathlib.Path(__file__).parent


config_file = _REPO_ROOT / "config.json"
with config_file.open("r") as f:
config = json.load(f)


Expand Down Expand Up @@ -524,7 +529,7 @@ def run_trnascan_regions(
utils.check_exe(trnascan_path)
logger.info(trnascan_path)
# check_exe(trnascan_filter_path)
check_file(trnascan_filter_path)
utils.check_file(trnascan_filter_path)
logger.info(trnascan_filter_path)

trnascan_output_dir = utils.create_dir(main_output_dir, "trnascan_output")
Expand Down Expand Up @@ -1440,22 +1445,12 @@ def run_genblast_align(
else:
logger.info("No gtf file, go on with the analysis")

genblast_output_file = os.path.join(genblast_dir, "genblast")

asnb_file = masked_genome_file + ".asnb"
logger.info("ASNB file: %s" % asnb_file)

if not os.path.exists("alignscore.txt"):
shutil.copy(
os.environ["ENSCODE"] + "/ensembl-anno/support_files/alignscore.txt", "./"
)
# subprocess.run(
# [
# "cp",
# os.environ["ENSCODE"] + "/ensembl-anno/support_files/alignscore.txt",
# "./",
# ]
# )
alignscore_path = pathlib.Path().absolute() / "alignscore.txt"
if not alignscore_path.exists():
shutil.copyfile(_REPO_ROOT / "support_files" / "alignscore.txt", alignscore_path)

if not os.path.exists(masked_genome_file):
raise IOError("Masked genome file does not exist: %s" % masked_genome_file)
Expand Down Expand Up @@ -1735,7 +1730,7 @@ def run_makeblastdb(makeblastdb_path, masked_genome_file, asnb_file):
"-mask_data",
asnb_file,
"-max_file_sz",
"10000000000",
"4000000000",
]
)
logger.info("Completed running makeblastdb")
Expand Down Expand Up @@ -3544,8 +3539,8 @@ def validate_coding_transcripts(
subprocess.run(cpc2_cmd)
cpc2_output_path = cpc2_output_path + ".txt"

check_file(rnasamba_output_path)
check_file(cpc2_output_path)
utils.check_file(rnasamba_output_path)
utils.check_file(cpc2_output_path)

logger.info("diamond validation")
diamond_results = None
Expand Down Expand Up @@ -4192,12 +4187,6 @@ def create_paired_paths(fastq_file_paths):
return final_list


def check_file(file_path):
    """
    Ensure that something exists at the given path.

    Args:
        file_path: Path to look for on the file system
    Raises:
        FileNotFoundError: when nothing exists at file_path
    """
    if os.path.exists(file_path):
        return

    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)


def coallate_results(main_output_dir):

results_dir = utils.create_dir(main_output_dir, "results")
Expand Down Expand Up @@ -4227,6 +4216,7 @@ def coallate_results(main_output_dir):
parser.add_argument(
"--output_dir",
type=str,
default="",
help="Path where the output and temp files will write to. \
Uses current dir by default",
)
Expand Down Expand Up @@ -4490,7 +4480,7 @@ def coallate_results(main_output_dir):
)
args = parser.parse_args()

work_dir = args.output_dir
work_dir = pathlib.Path(args.output_dir).absolute()
genome_file = args.genome_file
num_threads = args.num_threads
# masked_genome_file = genome_file # This will be updated later if Red is run
Expand Down Expand Up @@ -4555,17 +4545,12 @@ def coallate_results(main_output_dir):
species = args.repeatmasker_species

main_script_dir = os.path.dirname(os.path.realpath(__file__))
# work_dir=glob.glob(work_dir)
if not os.path.exists(genome_file):
raise IOError("File does not exist: %s" % genome_file)

if not work_dir:
work_dir = os.getcwd()
# work_dir=glob.glob(work_dir)

# set up logger
log_file_path = pathlib.Path(work_dir) / "ensembl_anno.log"
loginipath = pathlib.Path(os.environ["ENSCODE"] + "/ensembl-anno/logging.conf")
log_file_path = work_dir / "ensembl_anno.log"
loginipath = _REPO_ROOT / "logging.conf"
logging.config.fileConfig(
loginipath,
defaults={"logfilename": log_file_path},
Expand All @@ -4575,25 +4560,16 @@ def coallate_results(main_output_dir):
logger.propagate = False

logger.info("work directory: %s" % work_dir)
if not os.path.exists(work_dir):
logger.info("Work dir does not exist, will create")
utils.create_dir(work_dir, None)
work_dir.mkdir(parents=True, exist_ok=True)

if num_threads == 1:
logger.info("Thread count is set to the default value 1; this might be slow.")

if os.path.exists(
os.path.join(work_dir, "red_output", "mask_output")
) or os.path.join(work_dir, "red_output", "mask_output").endswith(".msk"):
red_genome_file = [
f
for f in os.listdir(os.path.join(work_dir, "red_output", "mask_output"))
if f.endswith(".msk")
]
mask_output_path = work_dir / "red_output" / "mask_output"
if mask_output_path.exists() or (mask_output_path.suffix == ".msk"):
red_genome_file = [f for f in mask_output_path.iterdir() if f.suffix == ".msk"]
logger.info("red_genome_file %s", red_genome_file)
masked_genome_file = os.path.join(
work_dir, "red_output", "mask_output", red_genome_file[0]
)
masked_genome_file = mask_output_path / red_genome_file[0]
else:
masked_genome_file = genome_file
logger.info("Masked genome file %s", masked_genome_file)
Expand Down Expand Up @@ -4783,7 +4759,7 @@ def coallate_results(main_output_dir):
genblast_path,
convert2blastmask_path,
makeblastdb_path,
os.path.join(work_dir, "genblast_output"),
work_dir / "genblast_output",
protein_file,
masked_genome_file,
max_intron_length,
Expand All @@ -4800,7 +4776,7 @@ def coallate_results(main_output_dir):
genblast_path,
convert2blastmask_path,
makeblastdb_path,
os.path.join(work_dir, "busco_output"),
work_dir / "busco_output",
busco_protein_file,
masked_genome_file,
max_intron_length,
Expand Down
21 changes: 13 additions & 8 deletions repeatmasking_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,22 @@

import utils


_REPO_ROOT = pathlib.Path(__file__).parent


logger = logging.getLogger(__name__)
with open(os.environ["ENSCODE"] + "/ensembl-anno/config.json", "r") as f:
config_file = _REPO_ROOT / "config.json"
with config_file.open("r") as f:
config = json.load(f)


def run_repeatmasker_regions( # pylint: disable=too-many-arguments
genome_file: typing.Union[pathlib.Path, str],
genome_file: os.PathLike,
repeatmasker_path: str,
library: str,
species: str,
main_output_dir: str,
main_output_dir: os.PathLike,
num_threads: int,
):
"""
Expand Down Expand Up @@ -228,9 +233,9 @@ def create_repeatmasker_gtf( # pylint: disable=too-many-locals


def run_dust_regions(
genome_file: typing.Union[pathlib.Path, str],
genome_file: os.PathLike,
dust_path: str,
main_output_dir: str,
main_output_dir: os.PathLike,
num_threads: int,
):
"""
Expand Down Expand Up @@ -366,9 +371,9 @@ def create_dust_gtf(


def run_trf_repeats( # pylint: disable=too-many-locals
genome_file: typing.Union[pathlib.Path, str],
genome_file: os.PathLike,
trf_path: str,
main_output_dir: str,
main_output_dir: os.PathLike,
num_threads: int,
):
"""
Expand Down Expand Up @@ -563,7 +568,7 @@ def create_trf_gtf(


def run_red(
red_path: str, main_output_dir: str, genome_file: typing.Union[pathlib.Path, str]
red_path: str, main_output_dir: os.PathLike, genome_file: os.PathLike
):
"""
Run Red on genome file
Expand Down
15 changes: 10 additions & 5 deletions simple_feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,21 @@

import utils


_REPO_ROOT = pathlib.Path(__file__).parent


logger = logging.getLogger(__name__)
with open(os.environ["ENSCODE"] + "/ensembl-anno/config.json", "r") as f:
config_file = _REPO_ROOT / "config.json"
with config_file.open("r") as f:
config = json.load(f)


def run_eponine_regions( # pylint: disable=too-many-locals
genome_file: typing.Union[pathlib.Path, str],
genome_file: os.PathLike,
java_path: str,
eponine_path: str,
main_output_dir: str,
main_output_dir: os.PathLike,
num_threads: int,
):
"""
Expand Down Expand Up @@ -193,9 +198,9 @@ def create_eponine_gtf(


def run_cpg_regions(
genome_file: typing.Union[pathlib.Path, str],
genome_file: os.PathLike,
cpg_path: str,
main_output_dir: str,
main_output_dir: os.PathLike,
num_threads: int,
):
"""
Expand Down
39 changes: 19 additions & 20 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,25 +37,20 @@ def create_dir(main_output_dir, dir_name):
Returns:
str Path to the created directory
"""
target_dir = pathlib.Path(main_output_dir)
if dir_name:
target_dir = os.path.join(main_output_dir, dir_name)
else:
target_dir = main_output_dir

if os.path.exists(target_dir):
target_dir = target_dir / dir_name
if target_dir.exists():
logger.warning("Directory already exists, will not create again")
return target_dir

logger.info("Attempting to create target dir: %s", target_dir)

try:
os.mkdir(target_dir)
except OSError:
logger.error("Creation of the dir failed, path used: %s", target_dir)
else:
logger.info("Successfully created the dir on the following path: %s", target_dir)

return target_dir
logger.info("Attempting to create target dir: %s", target_dir)
try:
target_dir.mkdir(parents=True)
except OSError:
logger.error("Creation of the dir failed, path used: %s", target_dir)
else:
logger.info("Successfully created the dir on the following path: %s", target_dir)
return str(target_dir)


def check_exe(exe_path):
Expand Down Expand Up @@ -388,13 +383,17 @@ def reverse_complement(sequence):
return sequence.translate(rev_matrix)[::-1]


def check_file(file_path: os.PathLike):
    """
    Raise an error when the file doesn't exist

    A value that is not an existing file is also looked up on PATH, so that a
    bare executable name such as "EukHighConfidenceFilter" is accepted.

    Args:
        file_path: Path to the file, or the name of an executable on PATH
    Raises:
        FileNotFoundError: when file_path is neither an existing file nor
            resolvable through PATH to one
    """
    # Imported locally so the PATH fallback does not rely on a module-level import.
    import shutil

    if pathlib.Path(file_path).is_file():
        return

    # Check if the given file path needs to be resolved, e.g. which EukHighConfidenceFilter
    resolved = shutil.which(os.fspath(file_path))
    if not resolved or not pathlib.Path(resolved).is_file():
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)