diff --git a/.github/workflows/kinship_lint.yml b/.github/workflows/kinship_lint.yml new file mode 100644 index 0000000..3e109cc --- /dev/null +++ b/.github/workflows/kinship_lint.yml @@ -0,0 +1,17 @@ +name: Kinship Lint +on: + pull_request: + paths: [Kinship/**] + push: + branches: [master, develop] + paths: [Kinship/**] + +jobs: + ruff: + runs-on: ubuntu-latest + defaults: + run: + working-directory: Kinship/ + steps: + - uses: actions/checkout@v4 + - uses: chartboost/ruff-action@v1 diff --git a/.github/workflows/kinship_test.yml b/.github/workflows/kinship_test.yml new file mode 100644 index 0000000..25acb4e --- /dev/null +++ b/.github/workflows/kinship_test.yml @@ -0,0 +1,63 @@ +# Source: https://github.com/marketplace/actions/install-poetry-action +name: Kinship Test +on: + pull_request: + paths: [Kinship/**] + push: + branches: [master, develop] + paths: [Kinship/**] + +jobs: + pytest: + runs-on: ubuntu-latest + defaults: + run: + working-directory: Kinship/ + steps: + #---------------------------------------------- + # check-out repo and set-up python + #---------------------------------------------- + - name: Check out repository + uses: actions/checkout@v4 + - name: Set up python + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: '3.11.5' + #---------------------------------------------- + # install & configure poetry + #---------------------------------------------- + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + version: 1.8.3 + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + #---------------------------------------------- + # load cached venv if cache exists + #---------------------------------------------- + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v4 + with: + path: .venv + key: venv_kinship-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + 
#---------------------------------------------- + # install dependencies if cache does not exist + #---------------------------------------------- + - name: Install dependencies + if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + #---------------------------------------------- + # install root project + #---------------------------------------------- + - name: Install project + run: poetry install --no-interaction + #---------------------------------------------- + # run pytest + #---------------------------------------------- + - name: Run tests + run: | + poetry run pytest . diff --git a/Kinship/Dockerfile b/Kinship/Dockerfile index c626fb2..0a9db8a 100644 --- a/Kinship/Dockerfile +++ b/Kinship/Dockerfile @@ -1,10 +1,44 @@ -FROM --platform=linux/amd64 continuumio/miniconda3 +################## BASE IMAGEs ###################### +FROM continuumio/miniconda3:24.5.0-0 -LABEL base_image="continuumio/miniconda3" -LABEL version="1.0.0" -LABEL extra.binaries="vcftools, plink, king" +################## METADATA ###################### +LABEL base_image="continuumio/miniconda3:24.5.0-0" +LABEL version="1.1.1" +LABEL extra.binaries="vcftools, plink, king, pandas" +################## INSTALLATIONS ###################### +# Use conda to install other dependencies. RUN conda config --add channels bioconda && \ conda config --add channels conda-forge && \ conda install -y vcftools=0.1.14 plink=1.90b4 king=2.2.7 +# Use poetry to install virtualenv. 
+ENV POETRY_VERSION=1.8.3 +ENV POETRY_HOME=/opt/poetry +ENV POETRY_VENV=/opt/poetry-venv + +# Tell Poetry where to place its cache and virtual environment +ENV POETRY_CACHE_DIR=/tmp/poetry_cache + +# Do not ask any interactive question +ENV POETRY_NO_INTERACTION=1 + +# Make poetry create the virtual environment in the project's root +# it gets named `.venv` +ENV POETRY_VIRTUALENVS_IN_PROJECT=1 +ENV POETRY_VIRTUALENVS_CREATE=1 + +# Set virtual_env variable +ENV VIRTUAL_ENV=/.venv +# Prepend virtual environments path +ENV PATH="${VIRTUAL_ENV}/bin:${POETRY_VENV}/bin:${PATH}" + +# Creating a virtual environment just for poetry and install it with pip +RUN python3 -m venv $POETRY_VENV \ + && $POETRY_VENV/bin/pip3 install poetry==${POETRY_VERSION} + +# Copy project requirement files here to ensure they will be cached. +COPY pyproject.toml poetry.lock ./ + +# Install dependencies. +RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --without dev --no-interaction --no-cache diff --git a/Kinship/Kinship.nf b/Kinship/Kinship.nf index 9241dbf..3e670b0 100644 --- a/Kinship/Kinship.nf +++ b/Kinship/Kinship.nf @@ -1,7 +1,8 @@ process Kinship { tag {"Kinship ${analysis_id}"} label 'Kinship' - container = 'docker.io/umcugenbioinf/kinship:1.0.0' + container = 'ghcr.io/umcugenetics/kinship:1.1.1' + shell = ['/bin/bash', '-euo', 'pipefail'] input: @@ -17,6 +18,6 @@ process Kinship { plink --file out --make-bed --noweb king -b plink.bed --kinship cp king.kin0 ${analysis_id}.kinship - python ${projectDir}/CustomModules/Kinship/check_kinship.py ${analysis_id}.kinship ${ped_file} > ${analysis_id}.kinship_check.out + python ${projectDir}/CustomModules/Kinship/check_kinship.py ${analysis_id}.kinship ${ped_file} --output_prefix ${analysis_id} --output_path . """ -} \ No newline at end of file +} diff --git a/Kinship/check_kinship.py b/Kinship/check_kinship.py index 82c23dc..0c9f4cf 100644 --- a/Kinship/check_kinship.py +++ b/Kinship/check_kinship.py @@ -1,97 +1,250 @@ #! 
/usr/bin/env python +# Import statements, alphabetic order of main package. import argparse +from errno import ENOENT as errno_ENOENT +from os import strerror as os_strerror +from pathlib import Path +from sys import argv, exit +import tempfile + +# Third party libraries alphabetic order of main package. +from pandas import read_table + + +def validate_non_empty_existing_path(file_or_dir): + """ + This function checks whether the provided file or dir exists and is not empty. + + Args: + file_or_dir (string): Input file or directory + + Raises: + FileNotFoundError: If input string file_or_dir is neither a file nor a dir. + OSError: If input is not a dir and file is empty. + + Returns: + string: Provided input file or directory. If dir, suffix '/' might be added. + """ + input_path = Path(file_or_dir) + if not input_path.is_file() and not input_path.is_dir(): + raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir) + elif not input_path.is_dir() and not input_path.stat().st_size: + raise OSError(f"File {file_or_dir} is empty.") + elif input_path.is_dir() and file_or_dir[::-1][0] != '/': + return f"{file_or_dir}/" + else: + return file_or_dir + + +def parse_arguments_and_check(args_in): + """ + Parses arguments and validates / checks format of input. + + Args: + args_in (list of strings): Commandline input arguments. + + Returns: + Namespace: Convert argument strings to objects and assign them as attributes of the namespace. 
+ """ + parser = argparse.ArgumentParser(description='Check kinship output based on ped file.') + parser.add_argument('kinship_file', type=validate_non_empty_existing_path, help='Kinship file') + parser.add_argument('ped_file', type=validate_non_empty_existing_path, help='PED file') + parser.add_argument( + '-p', '--output_path', type=validate_non_empty_existing_path, default=None, + help='Kinship output path where output file will be stored.', + ) + parser.add_argument( + '-o', '--output_prefix', type=str, default=None, + help='Kinship output prefix for all output file names.' + ) + parser.add_argument( + '-s', '--kinship_settings', type=float, nargs=2, metavar=('minimum', 'maximum'), default=[0.177, 0.354], + help='Kinship settings defining minimum and maximum threshold.' + ) + arguments = parser.parse_args(args_in) + return arguments def parse_ped(ped_file): + """ + Parse ped file to a samples dict where per sample (key) a metadata dict (value) is created + with the family ID (string), parents (list) and children (list) as content. + + Args: + ped_file (open file): Open file object to a table with sample metadata, including the following values: + familyID (string): Unique identifier per family. In other words, s + samples from the same family will have the same familyID. + sampleID (string): Unique identifier of sample. + father (string): reference to another sampleID. + mother (string): reference to another sampleID. + sex (int): 0 (unknown), 1 (male) or 2 (female) + phenotype (int): 0 (unknown), 1 (unaffected) or 2 (affected) + + Returns: + dict: Per sample (key) a metadata dict (value) with + the family ID (string), parents (list) and children (list). 
+ """ samples = {} # 'sample_id': {'family': 'fam_id', 'parents': ['sample_id', 'sample_id']} - for line in ped_file: - ped_data = line.strip().split() - family, sample, father, mother, sex, phenotype = ped_data - - # Create samples - if sample not in samples: - samples[sample] = {'family': family, 'parents': [], 'children': []} - if father != '0' and father not in samples: - samples[father] = {'family': family, 'parents': [], 'children': []} - if mother != '0' and mother not in samples: - samples[mother] = {'family': family, 'parents': [], 'children': []} - - # Save sample relations - if father != '0': - samples[sample]['parents'].append(father) - samples[father]['children'].append(sample) - if mother != '0': - samples[sample]['parents'].append(mother) - samples[mother]['children'].append(sample) + with open(ped_file, "r") as ped_file_open: + for line in ped_file_open: + ped_data = line.strip().split() + try: + family, sample, father, mother, sex, phenotype = ped_data + except ValueError as error: + print(f"Failed to parse ped file data {line}; {error}") + exit(1) + + # Create samples + if sample not in samples: + samples[sample] = {'family': family, 'parents': [], 'children': []} + if father != '0' and father not in samples: + samples[father] = {'family': family, 'parents': [], 'children': []} + if mother != '0' and mother not in samples: + samples[mother] = {'family': family, 'parents': [], 'children': []} + + # Save sample relations + if father != '0': + samples[sample]['parents'].append(father) + samples[father]['children'].append(sample) + if mother != '0': + samples[sample]['parents'].append(mother) + samples[mother]['children'].append(sample) return samples -def check_kinship(kinship_file, samples, kinship_setting): - kinship_errors = False - print_kinship('sample_1', 'sample_2', 'kinship', 'related', 'type', 'status') # header - for line in kinship_file: - # Parse kinship data - if line.startswith('FID1'): - continue # skip header line - kinship_data = 
line.strip().split() - sample_1, sample_2, kinship = kinship_data[1], kinship_data[3], float(kinship_data[7]) - - # Check kinship data +def read_and_modify_kinship(kinship_file, kinship_min, kinship_max): + """ + Read and modify kinship file content by renaming and adding columns. + + Args: + kinship_file (string): File with retrieved kinship values as result of running the tool + 'KING' (Kinship-based INference for Gwas) + kinship_min (float): Minimum threshold to check if samples are kin. + kinship_max (float): Maximum threshold to check if samples are kin, without being self-self relationship. + + Returns: + pandas DataFrame: Retrieved kinship data input with additional columns. + """ + kinship_data = ( + # Read kinship data specific columns + read_table(kinship_file, delimiter='\t', usecols=['FID1', 'FID2', 'Kinship']) + # Rename columns + .rename(columns={'FID1': 'sample_1', 'FID2': 'sample_2', 'Kinship': 'kinship'}) + # Add columns with default values + .assign( + related=None, type=None, status=None, + thresholds=f"{kinship_min},{kinship_max}", + message='' + ) + ) + return kinship_data + + +def check_and_annotate_kinship(kinship_data, samples, kinship_min, kinship_max): + """ + Calculated kinship values are judged and results are added to the dataframe. + Results include: + related (boolean): Are samples related to each other, aka kin. + type (string): The relationship type in words, one of: unrelated, parent_parent, parent_child, sibling_sibling + status (string): Whether the kinship value is within the expected range: 'FAIL' or 'OK' + message (string): User friendly message to explain error if status equals 'FAIL'. Empty when status equals 'OK'. + + Args: + kinship_data (pandas DataFrame): Retrieved kinship data input with additional columns. + samples (dict): Per sample (key) a metadata dict (values) with + the family ID (string), parents (list) and children (list). + kinship_min (float): Minimum threshold to check if samples are kin. 
+ kinship_max (float): Maximum threshold to check if samples are kin, without being self-self relationship. + + Returns: + pandas DataFrame: Retrieved kinship data with annotated / judged results. + """ + for index, row in kinship_data.iterrows(): + status = 'OK' + message = '' # Related - if samples[sample_1]['family'] == samples[sample_2]['family']: + if samples[row.sample_1]['family'] == samples[row.sample_2]['family']: + related = True # Parent - child - if sample_2 in samples[sample_1]['parents'] or sample_1 in samples[sample_2]['parents']: - if kinship > kinship_setting[0] and kinship < kinship_setting[1]: - print_kinship(sample_1, sample_2, kinship, True, 'parent_child', 'OK') - else: - print_kinship(sample_1, sample_2, kinship, True, 'parent_child', 'FAIL') - kinship_errors = True + if row.sample_2 in samples[row.sample_1]['parents'] or row.sample_1 in samples[row.sample_2]['parents']: + type = 'parent_child' + if row.kinship <= kinship_min or row.kinship >= kinship_max: + status = 'FAIL' + expected_value_range = f"> {kinship_min} and < {kinship_max}" # Parent - Parent -> both samples have the same children - elif samples[sample_1]['children'] and samples[sample_1]['children'] == samples[sample_2]['children']: - if kinship <= kinship_setting[0]: - print_kinship(sample_1, sample_2, kinship, True, 'parent_parent', 'OK') - else: - print_kinship(sample_1, sample_2, kinship, True, 'parent_parent', 'FAIL') - kinship_errors = True - # Aassume siblings + elif samples[row.sample_1]['children'] and samples[row.sample_1]['children'] == samples[row.sample_2]['children']: + type = 'parent_parent' + if row.kinship > kinship_min: + status = 'FAIL' + expected_value_range = f"<= {kinship_min}" + # Assume siblings else: - if kinship > kinship_setting[0] and kinship < kinship_setting[1]: - print_kinship(sample_1, sample_2, kinship, True, 'sibling_sibling', 'OK') - else: - print_kinship(sample_1, sample_2, kinship, True, 'sibling_sibling', 'FAIL') - kinship_errors = True + 
type = 'sibling_sibling' + if row.kinship <= kinship_min or row.kinship >= kinship_max: + status = 'FAIL' + expected_value_range = f"> {kinship_min} and < {kinship_max}" # Unrelated else: - if kinship <= kinship_setting[0]: - print_kinship(sample_1, sample_2, kinship, False, 'NA', 'OK') - else: - print_kinship(sample_1, sample_2, kinship, False, 'NA', 'FAIL') - kinship_errors = True - - return kinship_errors + related = False + type = 'unrelated' + if row.kinship > kinship_min: + status = 'FAIL' + expected_value_range = f"<= {kinship_min}" + + # Create end user message if status is fail + if status == 'FAIL': + message = ( + f"Kinship value {row.kinship} between " + f"{row.sample_1} ({samples[row.sample_1]['family']}) and {row.sample_2} ({samples[row.sample_2]['family']}) " + f"is not between expected values for {type}: {expected_value_range}" + ) + # Update row with retrieved related (boolean), relationship type, status (OK / FAIL) and message. + kinship_data.loc[index, ['related', 'type', 'status', 'message']] = related, type, status, message + return kinship_data + + +def write_kinship(df_kinship_out, output_path, output_prefix): + """ + Write the retrieved and annoted kinship data to file or stdout. Include comments as header. + + Args: + df_kinship_out (pandas DataFrame): Retrieved kinship data with annotated / judged results. + output_path (string): Path to output dir where outputfile is stored. + output_prefix (string): Prefix to use output filename. + """ + # Collect all comments + comments = [] + if any(df_kinship_out.status == 'FAIL'): + comments.append('# WARNING: Kinship errors found.\n') + else: + comments.append('# No kinship errors found.\n') + # Assume all row values of column thresholds are the same. 
+ comments.append(f"# Used kinship check settings: {df_kinship_out.loc[0, 'thresholds']}\n") + # Write to provided output settings or to a tempfile + if output_path and output_prefix: + file_out = open(f"{output_path}/{output_prefix}.kinship_check.out", 'a+') + else: + file_out = tempfile.TemporaryFile(mode='a+') -def print_kinship(sample_1, sample_2, kinship, fam_status, relation_status, kinship_status): - print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(sample_1, sample_2, kinship, fam_status, relation_status, kinship_status)) + # Write comments as header + file_out.writelines(comments) + # Append annotated kinship results + df_kinship_out.to_csv(file_out, sep='\t', index=False, header=True) + # Decide if file should be printed to stdout instead + if not output_path or not output_prefix: + file_out.seek(0) + print(file_out.read()) + # Closing a tempfile will delete it as well + file_out.close() -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Check kinship output based on ped file.') - parser.add_argument('kinship_file', type=argparse.FileType('r'), help='Kinship file') - parser.add_argument('ped_file', type=argparse.FileType('r'), help='PED file') - arguments = parser.parse_args() +if __name__ == '__main__': + arguments = parse_arguments_and_check(args_in=argv[1:]) + kinship_min, kinship_max = arguments.kinship_settings - # settings - kinship_setting = [0.177, 0.354] - - # Parse ped file and check kinship samples = parse_ped(arguments.ped_file) - kinship_errors = check_kinship(arguments.kinship_file, samples, kinship_setting) - - # Print summary - if kinship_errors: - print("\n# WARNING: Kinship errors found.") - else: - print("\n# No kinship errors found.") - print("# Used kinship check settings: {0}".format(kinship_setting)) + df_kinship_in = read_and_modify_kinship(arguments.kinship_file, kinship_min, kinship_max) + df_kinship_out = check_and_annotate_kinship(df_kinship_in, samples, kinship_min, kinship_max) + 
write_kinship(df_kinship_out, arguments.output_path, arguments.output_prefix) diff --git a/Kinship/poetry.lock b/Kinship/poetry.lock new file mode 100644 index 0000000..a6bcba0 --- /dev/null +++ b/Kinship/poetry.lock @@ -0,0 +1,278 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "numpy" +version = "1.26.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = 
"numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = 
"sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pandas" +version = "2.1.4" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdec823dc6ec53f7a6339a0e34c68b144a7a1fd28d80c260534c39c62c5bf8c9"}, + {file = "pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:294d96cfaf28d688f30c918a765ea2ae2e0e71d3536754f4b6de0ea4a496d034"}, + {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b728fb8deba8905b319f96447a27033969f3ea1fea09d07d296c9030ab2ed1d"}, + {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:00028e6737c594feac3c2df15636d73ace46b8314d236100b57ed7e4b9ebe8d9"}, + {file = "pandas-2.1.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:426dc0f1b187523c4db06f96fb5c8d1a845e259c99bda74f7de97bd8a3bb3139"}, + {file = "pandas-2.1.4-cp310-cp310-win_amd64.whl", hash = "sha256:f237e6ca6421265643608813ce9793610ad09b40154a3344a088159590469e46"}, + {file = "pandas-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b7d852d16c270e4331f6f59b3e9aa23f935f5c4b0ed2d0bc77637a8890a5d092"}, + {file = "pandas-2.1.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7d5f2f54f78164b3d7a40f33bf79a74cdee72c31affec86bfcabe7e0789821"}, + {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0aa6e92e639da0d6e2017d9ccff563222f4eb31e4b2c3cf32a2a392fc3103c0d"}, + {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d797591b6846b9db79e65dc2d0d48e61f7db8d10b2a9480b4e3faaddc421a171"}, + {file = "pandas-2.1.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d2d3e7b00f703aea3945995ee63375c61b2e6aa5aa7871c5d622870e5e137623"}, + {file = "pandas-2.1.4-cp311-cp311-win_amd64.whl", hash = "sha256:dc9bf7ade01143cddc0074aa6995edd05323974e6e40d9dbde081021ded8510e"}, + {file = "pandas-2.1.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:482d5076e1791777e1571f2e2d789e940dedd927325cc3cb6d0800c6304082f6"}, + {file = "pandas-2.1.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a706cfe7955c4ca59af8c7a0517370eafbd98593155b48f10f9811da440248b"}, + {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0513a132a15977b4a5b89aabd304647919bc2169eac4c8536afb29c07c23540"}, + {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9f17f2b6fc076b2a0078862547595d66244db0f41bf79fc5f64a5c4d635bead"}, + {file = "pandas-2.1.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:45d63d2a9b1b37fa6c84a68ba2422dc9ed018bdaa668c7f47566a01188ceeec1"}, + {file = "pandas-2.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:f69b0c9bb174a2342818d3e2778584e18c740d56857fc5cdb944ec8bbe4082cf"}, + {file = "pandas-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3f06bda01a143020bad20f7a85dd5f4a1600112145f126bc9e3e42077c24ef34"}, + {file = "pandas-2.1.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab5796839eb1fd62a39eec2916d3e979ec3130509930fea17fe6f81e18108f6a"}, + {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edbaf9e8d3a63a9276d707b4d25930a262341bca9874fcb22eff5e3da5394732"}, + {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ebfd771110b50055712b3b711b51bee5d50135429364d0498e1213a7adc2be8"}, + {file = "pandas-2.1.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8ea107e0be2aba1da619cc6ba3f999b2bfc9669a83554b1904ce3dd9507f0860"}, + {file = "pandas-2.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:d65148b14788b3758daf57bf42725caa536575da2b64df9964c563b015230984"}, + {file = "pandas-2.1.4.tar.gz", hash = "sha256:fcb68203c833cc735321512e13861358079a96c174a61f5116a1de89c58c0ef7"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.23.2,<2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat 
(>=1.1.5)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +aws = ["s3fs (>=2022.05.0)"] +clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] +compression = ["zstandard (>=0.17.0)"] +computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2022.05.0)"] +gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] +hdf5 = ["tables (>=3.7.0)"] +html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] +mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] +spss = ["pyreadstat (>=1.1.5)"] +sql-other = ["SQLAlchemy (>=1.4.36)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.8.0)"] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = "8.3.3" +description = "pytest: simple powerful testing with Python" +optional = false 
+python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, + {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-datadir" +version = "1.5.0" +description = "pytest plugin for test data directories and files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-datadir-1.5.0.tar.gz", hash = "sha256:1617ed92f9afda0c877e4eac91904b5f779d24ba8f5e438752e3ae39d8d2ee3f"}, + {file = "pytest_datadir-1.5.0-py3-none-any.whl", hash = "sha256:34adf361bcc7b37961bbc1dfa8d25a4829e778bab461703c38a5c50ca9c36dc8"}, +] + +[package.dependencies] +pytest = ">=5.0" + +[[package]] +name = "pytest-mock" +version = "3.14.0" +description = "Thin-wrapper around the mock package for easier use with pytest" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"}, + {file = "pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f"}, +] + +[package.dependencies] +pytest = ">=6.2.5" + +[package.extras] +dev = ["pre-commit", "pytest-asyncio", "tox"] + +[[package]] +name = "pytest-unordered" +version = "0.5.2" +description = "Test equality of unordered collections in pytest" +optional = false +python-versions = "*" +files = [ + {file = "pytest-unordered-0.5.2.tar.gz", hash = "sha256:8187e6d68a7d54e5447e88c229cbeafa38205e55baf7da7ae57cc965c1ecdbb3"}, + {file = "pytest_unordered-0.5.2-py3-none-any.whl", hash = 
"sha256:b01bb0e8ba80db6dd8c840fe24ad1804c8672919303dc9302688221390a7dc29"}, +] + +[package.dependencies] +pytest = ">=6.0.0" + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.2" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, + {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, +] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tzdata" +version = "2024.2" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, + {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = 
"a3517d391acbd94caec47c2dca39bc638a86888ade39588f514f467486362547" diff --git a/Kinship/pyproject.toml b/Kinship/pyproject.toml new file mode 100644 index 0000000..0214f6d --- /dev/null +++ b/Kinship/pyproject.toml @@ -0,0 +1,25 @@ +[tool.poetry] +name = "kinship" +version = "1.1.1" +description = "" +authors = ["Bioinformatica Genetica "] +license = "MIT" +package-mode = false + +[tool.poetry.dependencies] +python = "^3.11" +pandas = "2.1.4" + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.3" +pytest-unordered = "0.5.2" +pytest-datadir = "^1.5.0" +pytest-mock = "^3.14.0" + +[tool.ruff] +line-length = 127 +indent-width = 4 + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/Kinship/test_check_kinship.py b/Kinship/test_check_kinship.py new file mode 100644 index 0000000..c1dcd36 --- /dev/null +++ b/Kinship/test_check_kinship.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python +# Import statements, alphabetic order of main package. +from pathlib import Path + + +# Third party libraries alphabetic order of main package. +from pandas import DataFrame +import pytest +from pytest_unordered import unordered + +# Custom libraries alphabetic order of main package. 
+import check_kinship
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup_test_path(tmp_path_factory):
+    """Create a module-scoped temporary directory that contains an empty file.
+
+    Returns:
+        str: Path of the temporary directory, always ending in "/". The directory holds one
+            zero-byte file named ``empty.txt``.
+    """
+    test_tmp_path = str(tmp_path_factory.mktemp("test")) + "/"
+    # Path() joins the parts itself, so the trailing "/" does not end up doubled in the file path.
+    Path(test_tmp_path, "empty.txt").touch()
+    return test_tmp_path
+
+
+@pytest.fixture(scope="module", autouse=True)
+def kinship_settings():
+    """Return the (kinship_min, kinship_max) thresholds used throughout these tests."""
+    return 0.177, 0.354
+
+
+class TestNonEmptyExistingPath():
+    """Tests for check_kinship.validate_non_empty_existing_path."""
+
+    def test_existing_dir(self, setup_test_path):
+        file_or_dir = check_kinship.validate_non_empty_existing_path(setup_test_path)
+        assert file_or_dir
+
+    def test_not_file_not_dir(self):
+        fake_string = "fake_string"
+        with pytest.raises(FileNotFoundError) as file_dir_error:
+            check_kinship.validate_non_empty_existing_path(fake_string)
+        # Inspect the exception after the with-block; statements behind the raising call never run.
+        assert fake_string in str(file_dir_error.value)
+
+    def test_empty_file(self, setup_test_path):
+        with pytest.raises(OSError) as empty_error:
+            check_kinship.validate_non_empty_existing_path(setup_test_path + "empty.txt")
+        assert f"File {setup_test_path}empty.txt is empty." in str(empty_error.value)
+
+    def test_append_suffix(self, setup_test_path):
+        dir_without_suffix = setup_test_path.rstrip("/")
+        dir_with_suffix = check_kinship.validate_non_empty_existing_path(dir_without_suffix)
+        assert dir_without_suffix[-1] != "/"
+        assert dir_with_suffix[-1] == "/"
+
+
+class TestParsePed():
+    """Tests for check_kinship.parse_ped, using the ped files from the pytest-datadir directory."""
+
+    @pytest.mark.parametrize("input_file,exp_dict_samples", [
+        (
+            "multi_family.ped",
+            {
+                "2024D00001": {'family': 'U000001', 'parents': [], 'children': ["2024D00003"]},
+                "2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00003"]},
+                "2024D00003": {'family': 'U000001', 'parents': ["2024D00001", "2024D00002"], 'children': []},
+                "2024D00004": {'family': 'U000002', 'parents': [], 'children': ["2024D00006"]},
+                "2024D00005": {'family': 'U000002', 'parents': [], 'children': ["2024D00006"]},
+                "2024D00006": {'family': 'U000002', 'parents': ["2024D00004", "2024D00005"], 'children': []}
+            },
+        ),
+        (
+            "multi_siblings.ped",
+            {
+                "2024D00001": {'family': 'U000001', 'parents': [], 'children': ["2024D00003", "2024D00004"]},
+                "2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00003", "2024D00004"]},
+                "2024D00003": {'family': 'U000001', 'parents': ["2024D00001", "2024D00002"], 'children': []},
+                "2024D00004": {'family': 'U000001', 'parents': ["2024D00001", "2024D00002"], 'children': []}
+            },
+        ),
+        (
+            "multi_unrelated_samples.ped",
+            {
+                "2024D00001": {'family': 'U000001', 'parents': [], 'children': []},
+                "2024D00002": {'family': 'U000002', 'parents': [], 'children': []}
+            },
+        ),
+        (
+            "single_family.ped",
+            {
+                "2024D00001": {'family': 'U000001', 'parents': [], 'children': ["2024D00003"]},
+                "2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00003"]},
+                "2024D00003": {'family': 'U000001', 'parents': ["2024D00001", "2024D00002"], 'children': []}
+            },
+        ),
+    ])
+    def test_parse_ped_ok(self, input_file, exp_dict_samples, datadir):
+        dict_samples = check_kinship.parse_ped(f"{datadir}/{input_file}")
+        assert dict_samples.keys() == unordered(exp_dict_samples.keys())
+        for sample, meta in dict_samples.items():
+            exp_meta = exp_dict_samples.get(sample)
+            assert meta.get("family") == exp_meta.get("family")
+            # The order of parents / children is not part of the contract, only the members are.
+            assert meta.get("parents") == unordered(exp_meta.get("parents"))
+            assert meta.get("children") == unordered(exp_meta.get("children"))
+
+    @pytest.mark.parametrize("input_file,exp_data_line,exp_error", [
+        ("wrong_separator.ped", "U000001,2024D00001,0,0,1,1", "not enough values to unpack (expected 6, got 1)"),
+        ("missing_field.ped", "U000001\t2024D00002\t0\t0\t2", "not enough values to unpack (expected 6, got 5)"),
+    ])
+    def test_incorrect_ped(self, input_file, exp_data_line, exp_error, datadir, capsys):
+        with pytest.raises(SystemExit) as pytest_wrapped_e:
+            check_kinship.parse_ped(f"{datadir}/{input_file}")
+        assert pytest_wrapped_e.type is SystemExit
+        assert pytest_wrapped_e.value.code == 1
+
+        # Both the offending data line and the unpack error are expected on stdout.
+        out, _ = capsys.readouterr()
+        assert exp_data_line in out
+        assert exp_error in out
+
+
+def test_read_and_modify_kinship_trio(datadir, kinship_settings):
+    """Check the columns that read_and_modify_kinship renames and adds for trio.kinship."""
+    kinship_min, kinship_max = kinship_settings
+    df_out = check_kinship.read_and_modify_kinship(f"{datadir}/trio.kinship", kinship_min, kinship_max)
+    columns = set(df_out.columns)
+    # Assert succeeded read_table
+    assert not df_out.empty
+    # Assert renaming columns: the new names are present and none of the old names is left.
+    assert {"sample_1", "sample_2", "kinship"} < columns
+    assert columns.isdisjoint({"FID1", "FID2", "Kinship"})
+    # Assert added columns
+    assert {"related", "type", "status", "thresholds", "message"} < columns
+    # Assert threshold added as string
+    threshold = df_out.thresholds.iloc[0]
+    assert isinstance(threshold, str)
+    assert threshold == f"{kinship_min},{kinship_max}"
+
+
+class TestCheckAndAnnotateKinship():
+    """Tests for check_kinship.check_and_annotate_kinship on single-row and trio kinship files."""
+
+    @pytest.mark.parametrize("samples,file,related,exp_type", [
+        # Type parent_child with kinship values between min and max threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': ["2024D00001-2024D00002"], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00003-2024D00004"]},
+            },
+            "single_row_kin.kinship", True, 'parent_child'
+        ),
+        # Type parent_parent with kinship values lower than min threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': ["2024D00005-2024D00006"]},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00005-2024D00006"]},
+            },
+            "single_row.kinship", True, 'parent_parent'
+        ),
+        # Type sibling_sibling with kinship values between min and max threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': []},
+            },
+            "single_row_kin.kinship", True, 'sibling_sibling'
+        ),
+        # Type unrelated with kinship values lower than min threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000002', 'parents': [], 'children': []},
+            },
+            "single_row.kinship", False, 'unrelated'
+        ),
+    ])
+    def test_single_rows_ok(self, samples, file, related, exp_type, datadir, kinship_settings):
+        kinship_min, kinship_max = kinship_settings
+        df_in = check_kinship.read_and_modify_kinship(f"{datadir}/{file}", kinship_min, kinship_max)
+        df_out = check_kinship.check_and_annotate_kinship(df_in, samples, kinship_min, kinship_max)
+        assert df_out.shape == (1, 8)
+        assert df_out.loc[0, "related"] == related
+        assert df_out.loc[0, "type"] == exp_type
+        assert df_out.loc[0, "status"] == "OK"
+        assert df_out.loc[0, "message"] == ''
+
+    @pytest.mark.parametrize("samples,file,related,exp_type,msg", [
+        # Type parent_child with kinship values lower than min threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': ["2024D00001-2024D00002"], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00003-2024D00004"]},
+            },
+            "single_row.kinship", True, 'parent_child', '> 0.177 and < 0.354'
+        ),
+        # Type parent_child with kinship values higher than max threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': ["2024D00001-2024D00002"], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00003-2024D00004"]},
+            },
+            "single_row_duplo.kinship", True, 'parent_child', '> 0.177 and < 0.354'
+        ),
+        # Type parent_parent with kinship values between min and max threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': ["2024D00005-2024D00006"]},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': ["2024D00005-2024D00006"]},
+            },
+            "single_row_kin.kinship", True, 'parent_parent', '<= 0.177'
+        ),
+        # Type sibling_sibling with kinship values lower than min threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': []},
+            },
+            "single_row.kinship", True, 'sibling_sibling', '> 0.177 and < 0.354'
+        ),
+        # Type sibling_sibling with kinship values higher than max threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000001', 'parents': [], 'children': []},
+            },
+            "single_row_duplo.kinship", True, 'sibling_sibling', '> 0.177 and < 0.354'
+        ),
+        # Type unrelated with kinship values between min and max threshold
+        (
+            {
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': []},
+                "2024D00001-2024D00002": {'family': 'U000002', 'parents': [], 'children': []},
+            },
+            "single_row_kin.kinship", False, 'unrelated', '<= 0.177'
+        ),
+    ])
+    def test_single_rows_fail(self, samples, file, related, exp_type, msg, datadir, kinship_settings):
+        kinship_min, kinship_max = kinship_settings
+        df_in = check_kinship.read_and_modify_kinship(f"{datadir}/{file}", kinship_min, kinship_max)
+        df_out = check_kinship.check_and_annotate_kinship(df_in, samples, kinship_min, kinship_max)
+        assert df_out.shape == (1, 8)
+        assert df_out.loc[0, "related"] == related
+        assert df_out.loc[0, "type"] == exp_type
+        assert df_out.loc[0, "status"] == "FAIL"
+
+        # The failure message must name the kinship value, both samples, their families,
+        # the relation type and the violated threshold expression.
+        msg_out = df_out.loc[0, "message"]
+        assert str(df_in.loc[0, "kinship"]) in msg_out
+        for sample, meta in samples.items():
+            assert sample in msg_out
+            assert meta.get("family") in msg_out
+        assert exp_type in msg_out
+        assert msg in msg_out
+
+    @pytest.mark.parametrize("samples,file", [
+        # OK: trio
+        (
+            {
+                "2024D00005-2024D00006": {'family': 'U000001', 'parents': [], 'children': ["2024D00001-2024D00002"]},
+                "2024D00003-2024D00004": {'family': 'U000001', 'parents': [], 'children': ["2024D00001-2024D00002"]},
+                "2024D00001-2024D00002": {
+                    'family': 'U000001', 'parents': ["2024D00003-2024D00004", "2024D00005-2024D00006"], 'children': []
+                },
+            },
+            "trio.kinship"
+        ),
+    ])
+    def test_multi_rows_trio_ok(self, samples, file, datadir, kinship_settings):
+        kinship_min, kinship_max = kinship_settings
+        df_in = check_kinship.read_and_modify_kinship(f"{datadir}/{file}", kinship_min, kinship_max)
+        df_out = check_kinship.check_and_annotate_kinship(df_in, samples, kinship_min, kinship_max)
+        assert df_out.shape == (3, 8)
+        assert "parent_child" in df_out["type"].values
+        assert "parent_parent" in df_out["type"].values
+        # Compare element-wise and reduce with all(): asserting on the array returned by unique()
+        # raises "truth value of an array is ambiguous" as soon as more than one value is present.
+        assert (df_out["status"] == "OK").all()
+        assert (df_out["message"] == "").all()
+
+
+class TestWriteKinship():
+    """Tests for check_kinship.write_kinship, writing to a file and to stdout."""
+
+    @pytest.mark.parametrize("input_dict,error_comment", [
+        (
+            {
+                "sample_1": ["2024D00001-2024D00002"],
+                "sample_2": ["2024D00003-2024D00004"],
+                "status": ["OK"],
+                "thresholds": ["0.177, 0.354"]
+            },
+            "# No kinship errors found."
+        ),
+        (
+            {
+                "sample_1": ["2024D00001-2024D00002"],
+                "sample_2": ["2024D00003-2024D00004"],
+                "status": ["FAIL"],
+                "thresholds": ["0.177, 0.354"]
+            },
+            "# WARNING: Kinship errors found."
+        ),
+    ])
+    def test_file(self, input_dict, error_comment, setup_test_path):
+        df_in = DataFrame(input_dict)
+        # Use the status value itself; formatting the list would put "['OK']" in the file name.
+        prefix = f"fake_prefix_{input_dict['status'][0]}"
+        check_kinship.write_kinship(df_in, setup_test_path, prefix)
+
+        output_file = Path(setup_test_path, f"{prefix}.kinship_check.out")
+        assert output_file.is_file()
+        file_content = output_file.read_text().rstrip().splitlines()
+        assert len(file_content) == 4
+        assert file_content[0] == error_comment
+        assert file_content[1] == "# Used kinship check settings: 0.177, 0.354"
+        assert file_content[2].split("\t") == df_in.columns.to_list()
+        assert file_content[3].split("\t") == df_in.iloc[0].tolist()
+
+    def test_stdout(self, kinship_settings, capsys):
+        kinship_min, kinship_max = kinship_settings
+        df_in = DataFrame({
+            "sample_1": ["2024D00001-2024D00002"],
+            "sample_2": ["2024D00003-2024D00004"],
+            "status": ["OK"],
+            "thresholds": [f"{kinship_min}, {kinship_max}"],
+        })
+
+        # Without an output path and prefix the table is expected on stdout.
+        check_kinship.write_kinship(df_kinship_out=df_in, output_path=None, output_prefix=None)
+        out, _ = capsys.readouterr()
+        lst_lines = out.rstrip().splitlines()
+        # Check number of expected lines
+        assert len(lst_lines) == 4
+        # Check comments / headers
+        assert lst_lines[0] == "# No kinship errors found."
+        assert lst_lines[1] == f"# Used kinship check settings: {kinship_min}, {kinship_max}"
+        # Check table
+        assert lst_lines[2].split("\t") == df_in.columns.to_list()
+        assert lst_lines[3].split("\t") == df_in.iloc[0].tolist()
diff --git a/Kinship/test_check_kinship/missing_field.ped b/Kinship/test_check_kinship/missing_field.ped new file mode 100644 index 0000000..1fe53df --- /dev/null +++ b/Kinship/test_check_kinship/missing_field.ped @@ -0,0 +1,3 @@ +U000001 2024D00001 0 0 1 1 +U000001 2024D00002 0 0 2 +U000001 2024D00003 2024D00001 2024D00002 2 2 diff --git a/Kinship/test_check_kinship/multi_family.ped b/Kinship/test_check_kinship/multi_family.ped new file mode 100644 index 0000000..8c720f0 --- /dev/null +++ b/Kinship/test_check_kinship/multi_family.ped @@ -0,0 +1,6 @@ +U000001 2024D00001 0 0 1 1 +U000001 2024D00002 0 0 2 1 +U000001 2024D00003 2024D00001 2024D00002 2 2 +U000002 2024D00004 0 0 1 1 +U000002 2024D00005 0 0 2 1 +U000002 2024D00006 2024D00004 2024D00005 2 2 diff --git a/Kinship/test_check_kinship/multi_siblings.ped b/Kinship/test_check_kinship/multi_siblings.ped new file mode 100644 index 0000000..1136c6b --- /dev/null +++ b/Kinship/test_check_kinship/multi_siblings.ped @@ -0,0 +1,4 @@ +U000001 2024D00001 0 0 1 1 +U000001 2024D00002 0 0 2 1 +U000001 2024D00003 2024D00001 2024D00002 2 2 +U000001 2024D00004 2024D00001 2024D00002 2 2 diff --git a/Kinship/test_check_kinship/multi_unrelated_samples.ped b/Kinship/test_check_kinship/multi_unrelated_samples.ped new file mode 100644 index 0000000..9181d7a --- /dev/null +++ b/Kinship/test_check_kinship/multi_unrelated_samples.ped @@ -0,0 +1,2 @@ +U000001 2024D00001 0 0 1 1 +U000002 2024D00002 0 0 2 1 diff --git a/Kinship/test_check_kinship/single_family.ped
b/Kinship/test_check_kinship/single_family.ped new file mode 100644 index 0000000..ffd5a3d --- /dev/null +++ b/Kinship/test_check_kinship/single_family.ped @@ -0,0 +1,3 @@ +U000001 2024D00001 0 0 1 1 +U000001 2024D00002 0 0 2 1 +U000001 2024D00003 2024D00001 2024D00002 2 2 diff --git a/Kinship/test_check_kinship/single_row.kinship b/Kinship/test_check_kinship/single_row.kinship new file mode 100644 index 0000000..0afc22e --- /dev/null +++ b/Kinship/test_check_kinship/single_row.kinship @@ -0,0 +1,2 @@ +FID1 ID1 FID2 ID2 N_SNP HetHet IBS0 Kinship +2024D00003-2024D00004 2024D00003-2024D00004 2024D00001-2024D00002 2024D00001-2024D00002 285313 0.1303 0.0521 0.0199 diff --git a/Kinship/test_check_kinship/single_row_duplo.kinship b/Kinship/test_check_kinship/single_row_duplo.kinship new file mode 100644 index 0000000..d3ca8e9 --- /dev/null +++ b/Kinship/test_check_kinship/single_row_duplo.kinship @@ -0,0 +1,2 @@ +FID1 ID1 FID2 ID2 N_SNP HetHet IBS0 Kinship +2024D00003-2024D00004 2024D00003-2024D00004 2024D00001-2024D00002 2024D00001-2024D00002 284880 0.1716 0.0065 0.451 diff --git a/Kinship/test_check_kinship/single_row_kin.kinship b/Kinship/test_check_kinship/single_row_kin.kinship new file mode 100644 index 0000000..525af0f --- /dev/null +++ b/Kinship/test_check_kinship/single_row_kin.kinship @@ -0,0 +1,2 @@ +FID1 ID1 FID2 ID2 N_SNP HetHet IBS0 Kinship +2024D00003-2024D00004 2024D00003-2024D00004 2024D00001-2024D00002 2024D00001-2024D00002 284880 0.1716 0.0065 0.2435 diff --git a/Kinship/test_check_kinship/trio.kinship b/Kinship/test_check_kinship/trio.kinship new file mode 100755 index 0000000..1de24eb --- /dev/null +++ b/Kinship/test_check_kinship/trio.kinship @@ -0,0 +1,4 @@ +FID1 ID1 FID2 ID2 N_SNP HetHet IBS0 Kinship +2024D00003-2024D00004 2024D00003-2024D00004 2024D00001-2024D00002 2024D00001-2024D00002 284880 0.1716 0.0065 0.2435 +2024D00003-2024D00004 2024D00003-2024D00004 2024D00005-2024D00006 2024D00005-2024D00006 285313 0.1303 0.0521 0.0199 
+2024D00001-2024D00002 2024D00001-2024D00002 2024D00005-2024D00006 2024D00005-2024D00006 284396 0.1863 0.0063 0.2582 diff --git a/Kinship/test_check_kinship/wrong_separator.ped b/Kinship/test_check_kinship/wrong_separator.ped new file mode 100644 index 0000000..e0622b8 --- /dev/null +++ b/Kinship/test_check_kinship/wrong_separator.ped @@ -0,0 +1,3 @@ +U000001,2024D00001,0,0,1,1 +U000001,2024D00002,0,0,2,1 +U000001,2024D00003,2024D00001,2024D00002,2,2 diff --git a/README.md b/README.md index dcef68d..8072aa7 100644 --- a/README.md +++ b/README.md @@ -8,17 +8,17 @@ This repository contains custom nextflow processes and their dependent/linked fi Utility functions are dinstinguished and can be found in Utils. ### Designated folder -When nextflow processes or their dependent files are linked to another git repository, +When nextflow processes or their dependent files are linked to another git repository, the files will be placed in a designated folder. For example: ClarityEpp folder and https://github.com/UMCUGenetics/clarity_epp If a set of files is not per se a utility and doesnot have a separate repository, it is allowed to create a designated folder as well. ## Docker files -Build docker image for software dependencies. +Build docker image for software dependencies. - [Install Docker Desktop](https://docs.docker.com/desktop/mac/apple-silicon/) ```bash docker build -t organization_or_username/toolname:version -f path_to_dockerfile docker push organization_or_username/toolname:version ``` -- If changes are required to the dockerfile, manually update label version. \ No newline at end of file +- If changes are required to the dockerfile, manually update label version.