diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml
index 0b9fa90..9a9c7dd 100644
--- a/.github/workflows/CI.yaml
+++ b/.github/workflows/CI.yaml
@@ -27,6 +27,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
+      # - name: Install poetry
+      #   run: pipx install poetry
+
       - name: Additional info about the build
         shell: bash
         run: |
@@ -42,6 +45,7 @@ jobs:
           environment-name: test
          # conda-forge is the default channel now and does not need to be specified
           channels: conda-forge,defaults
+          # cache: 'poetry'
           extra-specs: |
             python=${{ matrix.python-version }}
@@ -50,6 +54,10 @@ jobs:
         shell: bash -l {0}
         run: |
           python -m pip install . --no-deps
+          # python -m pip install poetry
+          # python -m venv .venv --copies
+          # poetry config virtualenvs.create false
+          # poetry install
           micromamba list
 
       - name: Run tests
@@ -57,6 +65,7 @@ jobs:
         shell: bash -l {0}
         run: |
           pytest -v --cov=missense_kinase_toolkit --cov-report=xml --color=yes tests/
+          # poetry run pytest -v --cov=missense_kinase_toolkit --cov-report=xml --color=yes tests/
 
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4.0.1
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 7efc6de..7210450 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -17,3 +17,10 @@ dependencies:
   # Pip-only installs
   #- pip:
   #  - codecov
+
+  # Other
+  - pandas
+  - bravado
+  - requests-cache
+  - beautifulsoup4
+  - numpy
diff --git a/readthedocs.yml b/readthedocs.yml
index 95b50ae..319a3be 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -1,15 +1,33 @@
-# readthedocs.yml
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+# taken from https://education.molssi.org/python-package-best-practices/10-documentation.html#read-the-docs
+
+# Required
 version: 2
 
+# Set the OS, Python version and other tools you might need
 build:
-  image: latest
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
 
-python:
-  version: 3.8
-  install:
-    - method: pip
-      path: .
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#   - pdf
+#   - epub
 
-conda:
-  environment: docs/requirements.yaml
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: docs/requirements.yaml
diff --git a/src/__init__.py b/src/__init__.py
index e69de29..091ac24 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1,3 @@
+"""An ETL pipeline package to facilitate structure-based ML for human kinase missense-variant property prediction."""
+
+from missense_kinase_toolkit import *
diff --git a/src/missense_kinase_toolkit/cbioportal.py b/src/missense_kinase_toolkit/cbioportal.py
index 67403ef..4cbe0cc 100644
--- a/src/missense_kinase_toolkit/cbioportal.py
+++ b/src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,19 @@
-#!/usr/bin/env python3
-
-import os
+import logging
+from typing import Iterable
 
 import pandas as pd
 from bravado.client import SwaggerClient
 from bravado.requests_client import RequestsClient
+# from pydantic import BaseModel
+# from typing import ClassVar
 
-from missense_kinase_toolkit import config, io_utils
+from missense_kinase_toolkit import config, io_utils, utils_requests
 
-def get_all_mutations_by_study(
-    study_id: str,
-) -> list | None:
-    """Get mutations cBioPortal data
-
-    Returns
-    -------
-    list | None
-        cBioPortal data of Abstract Base Classes objects if successful, otherwise None
-    """
-    instance = config.get_cbioportal_instance()
-    url = f"https://{instance}/api/v2/api-docs"
-    token = config.maybe_get_cbioportal_token()
-
-    if token is not None:
-        http_client = RequestsClient()
-        http_client.set_api_key(
-            instance,
-            f"Bearer {token}",
-            param_name='Authorization',
-            param_in='header'
-        )
-        cbioportal = SwaggerClient.from_url(
-            url,
-            http_client=http_client,
-            config={
-                "validate_requests": False,
-                "validate_responses": False,
-                "validate_swagger_spec": False
-            }
-        )
-    else:
-        cbioportal = SwaggerClient.from_url(
-            url,
-            config={
-                "validate_requests": False,
-                "validate_responses": False,
-                "validate_swagger_spec": False
-            }
-        )
-
-    studies = cbioportal.Studies.getAllStudiesUsingGET().result()
-    study_ids = [study.studyId for study in studies]
-
-    if study_id in study_ids:
-        #TODO: add error handling
-        #TODO: extract multiple studies
-        muts = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
-            molecularProfileId=f"{study_id}_mutations",
-            sampleListId=f"{study_id}_all",
-            projection="DETAILED"
-        ).result()
-    else:
-        raise ValueError(f"Study {study_id} not found in cBioPortal instance {instance}")
-
-    return muts
+logger = logging.getLogger(__name__)
 
 def parse_iterabc2dataframe(
-    list_input: iter,
+    input_object: Iterable,
 ) -> pd.DataFrame:
     """Parse an iterable containing Abstract Base Classes into a dataframe
@@ -82,11 +27,11 @@ def parse_iterabc2dataframe(
     pd.DataFrame
         Dataframe for the input list of Abstract Base Classes objects
     """
-    list_dir = [dir(entry) for entry in list_input]
+    list_dir = [dir(entry) for entry in input_object]
     set_dir = {item for sublist in list_dir for item in sublist}
     dict_dir = {attr: [] for attr in set_dir}
 
-    for entry in list_input:
+    for entry in input_object:
         for attr in dict_dir.keys():
             try:
                 dict_dir[attr].append(getattr(entry, attr))
@@ -99,16 +44,106 @@ def parse_iterabc2dataframe(
     return df
 
-def get_and_save_cbioportal_cohort(
-    study_id: str,
-) -> None:
-    muts = get_all_mutations_by_study(study_id)
+class cBioPortal():
+    # instance: ClassVar[str] = f"{config.get_cbioportal_instance()}"
+    # url: ClassVar[str] = f"https://{instance}/api/v2/api-docs"
+    # cbioportal: ClassVar[SwaggerClient | None] = None
+
+    def __init__(self):
+        self.instance = config.get_cbioportal_instance()
+        self.url = f"https://{self.instance}/api/v2/api-docs"
+        self._cbioportal = self.get_cbioportal_api()
+
+    def _set_api_key(self):
+        token = config.maybe_get_cbioportal_token()
+        http_client = RequestsClient()
+        if token is not None:
+            http_client.set_api_key(
+                self.instance,
+                f"Bearer {token}",
+                param_name="Authorization",
+                param_in="header"
+            )
+        else:
+            logger.warning("No API token provided")
+        return http_client
+
+    def get_cbioportal_api(self):
+        http_client = self._set_api_key()
+
+        cbioportal_api = SwaggerClient.from_url(
+            self.url,
+            http_client=http_client,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False
+            }
+        )
+
+        # response = cbioportal_api.Studies.getAllStudiesUsingGET().response().incoming_response
+        # logger.error(utils_requests.print_status_code(response.status_code))
+
+        return cbioportal_api
+
+    def get_instance(self):
+        return self.instance
+
+    def get_url(self):
+        return self.url
+
+    def get_cbioportal(self):
+        return self._cbioportal
+
+
+class Mutations(cBioPortal):
+    def __init__(
+        self,
+        study_id: str,
+    ) -> None:
+        super().__init__()
+        self.study_id = study_id
+        self._mutations = self.get_all_mutations_by_study()
+
+    def get_all_mutations_by_study(
+        self,
+    ) -> list | None:
+        """Get mutations cBioPortal data
+
+        Returns
+        -------
+        list | None
+            cBioPortal data of Abstract Base Classes objects if successful, otherwise None
+        """
+        # initialize so a missing study returns None instead of raising UnboundLocalError
+        muts = None
+        studies = self._cbioportal.Studies.getAllStudiesUsingGET().result()
+        study_ids = [study.studyId for study in studies]
+
+        if self.study_id in study_ids:
+            # TODO: add incremental error handling beyond missing study
+            muts = self._cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
+                molecularProfileId=f"{self.study_id}_mutations",
+                sampleListId=f"{self.study_id}_all",
+                projection="DETAILED"
+            ).result()
+        else:
+            logger.error(f"Study {self.study_id} not found in cBioPortal instance {self.instance}")
+
+        return muts
+
+    def get_and_save_cbioportal_cohort_mutations(
+        self,
+    ) -> None:
+        df_muts = parse_iterabc2dataframe(self._mutations)
+        df_genes = parse_iterabc2dataframe(df_muts["gene"])
+        df_combo = pd.concat([df_muts, df_genes], axis=1)
+        df_combo = df_combo.drop(["gene"], axis=1)
+
+        filename = f"{self.study_id}_mutations.csv"
 
-    df_muts = parse_iterabc2dataframe(muts)
-    df_genes = parse_iterabc2dataframe(df_muts["gene"])
-    df_combo = pd.concat([df_muts, df_genes], axis=1)
-    df_combo = df_combo.drop(["gene"], axis=1)
+        io_utils.save_dataframe_to_csv(df_combo, filename)
 
-    filename = f"{study_id}_mutations.csv"
+    def get_study_id(self):
+        return self.study_id
 
-    io_utils.save_dataframe_to_csv(df_combo, filename)
+    def get_mutations(self):
+        return self._mutations
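Note on the `cBioPortal`/`Mutations` refactor above: the free functions are now methods, and the API is queried when `Mutations` is constructed. A minimal usage sketch, assuming the `config` setters used by the CLI scripts below are how the instance is pointed at the public `www.cbioportal.org` server (which needs no token):

```python
# Sketch only, not part of the diff; config setter names come from the CLI scripts below.
from missense_kinase_toolkit import config
from missense_kinase_toolkit.cbioportal import Mutations, parse_iterabc2dataframe

config.set_output_dir(".")                            # where the CSV will land
config.set_cbioportal_instance("www.cbioportal.org")  # public instance, no token

muts = Mutations("msk_impact_2017")                   # queries the API in __init__
df = parse_iterabc2dataframe(muts.get_mutations())    # ABC objects -> DataFrame
muts.get_and_save_cbioportal_cohort_mutations()       # writes msk_impact_2017_mutations.csv
```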
diff --git a/src/missense_kinase_toolkit/cli/extract_cbioportal.py b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
index c5df5ea..24485ab 100755
--- a/src/missense_kinase_toolkit/cli/extract_cbioportal.py
+++ b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -2,24 +2,31 @@
 
 import argparse
 
-from missense_kinase_toolkit import config, cbioportal
+from missense_kinase_toolkit import config, io_utils, cbioportal
 
 def parsearg_utils():
     parser = argparse.ArgumentParser(
-        description="Get mutations from cBioPortal cohort and instance"
+        description="Get mutations from cBioPortal instance for all specified studies."
     )
 
     parser.add_argument(
-        "--cohort",
+        "--outDir",
         type=str,
-        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
-        default="msk_impact_2017",
+        help="Required: Output directory path (str)",
     )
 
     parser.add_argument(
-        "--outDir",
+        "--requestsCache",
         type=str,
-        help="Required: Output directory path (str)",
+        default="requests_cache",
+        help="Optional: Requests cache; default: `requests_cache` (str)",
+    )
+
+    parser.add_argument(
+        "--cohort",
+        type=str,
+        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
+        default="msk_impact_2017",
     )
 
     parser.add_argument(
@@ -37,21 +44,15 @@
     )
 
     # TODO: add logging functionality
-    # TODO: cache requests for cBioPortal API
 
     return parser
 
 def main():
     args = parsearg_utils().parse_args()
 
-    str_studies = args.cohort
-    list_studies = str_studies.split(",")
-    list_studies = [study.strip() for study in list_studies]
+    list_studies = io_utils.convert_str2list(args.cohort)
 
-    # required argument
     config.set_output_dir(args.outDir)
-
-    # optional arguments
     config.set_cbioportal_instance(args.instance)
 
     try:
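The reordered flags above can be sanity-checked without touching the network, since `parse_args` accepts an explicit argv list (paths here are placeholders):

```python
# Hypothetical parse check for the reordered CLI flags.
from missense_kinase_toolkit.cli.extract_cbioportal import parsearg_utils
from missense_kinase_toolkit import io_utils

args = parsearg_utils().parse_args(
    ["--outDir", "/tmp/out", "--cohort", "msk_impact_2017, mskimpact"]
)
assert args.requestsCache == "requests_cache"  # default of the new flag
assert io_utils.convert_str2list(args.cohort) == ["msk_impact_2017", "mskimpact"]
```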
diff --git a/src/missense_kinase_toolkit/cli/extract_kinase_annotations.py b/src/missense_kinase_toolkit/cli/extract_kinase_annotations.py
new file mode 100755
index 0000000..8434dce
--- /dev/null
+++ b/src/missense_kinase_toolkit/cli/extract_kinase_annotations.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+import argparse
+
+import pandas as pd
+
+from missense_kinase_toolkit import config, io_utils, scrapers, klifs
+
+
+def parsearg_utils():
+    parser = argparse.ArgumentParser(
+        description="Get kinase annotations from KinHub and KLIFS databases."
+    )
+
+    parser.add_argument(
+        "--outDir",
+        type=str,
+        help="Required: Output directory path (str)",
+    )
+
+    parser.add_argument(
+        "--requestsCache",
+        type=str,
+        default="requests_cache",
+        help="Optional: Requests cache; default: `requests_cache` (str)",
+    )
+
+    parser.add_argument(
+        "--csvKinhub",
+        type=str,
+        default="kinhub.csv",
+        help="Optional: CSV file in outDir that contains KinHub kinase info; default: `kinhub.csv` (str)",
+    )
+
+    parser.add_argument(
+        "--csvKLIFS",
+        type=str,
+        default="klifs.csv",
+        help="Optional: CSV file in outDir that contains KLIFS kinase info; default: `klifs.csv` (str)",
+    )
+
+    # TODO: add logging functionality
+    return parser
+
+
+def main():
+    args = parsearg_utils().parse_args()
+
+    config.set_output_dir(args.outDir)
+    config.set_request_cache(args.requestsCache)
+
+    # get KinHub list of kinases
+    df_kinhub = scrapers.kinhub()
+    io_utils.save_dataframe_to_csv(df_kinhub, args.csvKinhub)
+
+    # get KLIFS annotations
+    list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()
+    dict_kinase_info = {}
+    for kinase in list_kinase_hgnc:
+        dict_kinase_info[kinase] = klifs.HumanKinaseInfo(kinase)._kinase_info
+    df_klifs = pd.DataFrame(dict_kinase_info).T
+    df_klifs = df_klifs.rename_axis("HGNC Name").reset_index()
+    io_utils.save_dataframe_to_csv(df_klifs, args.csvKLIFS)
diff --git a/src/missense_kinase_toolkit/cli/transform_cbioportal.py b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
index d8e51be..de524a1 100755
--- a/src/missense_kinase_toolkit/cli/transform_cbioportal.py
+++ b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
@@ -2,18 +2,18 @@
 
 import argparse
 
-from missense_kinase_toolkit import config, scrapers, io_utils
+from missense_kinase_toolkit import config, io_utils
 
 def parsearg_utils():
     parser = argparse.ArgumentParser(
-        description="Concatenate, remove duplicates, and extract genes and mutation types of interest"
+        description="Concatenate, remove duplicates, and extract genes and mutation types of interest from cBioPortal data."
     )
 
     parser.add_argument(
-        "--mutations",
+        "--mutationTypes",
         type=str,
-        help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
+        help="Optional: Mutation type(s) to extract, separated by commas; default: `Missense_Mutation` (str)",
         default="Missense_Mutation",
     )
 
@@ -25,9 +25,23 @@ def parsearg_utils():
     parser.add_argument(
         "--requestsCache",
-        type=bool,
-        default=False,
-        help="Optional: Requests cache; default False (bool)",
+        type=str,
+        default="requests_cache",
+        help="Optional: Requests cache; default: `requests_cache` (str)",
+    )
+
+    parser.add_argument(
+        "--listCols",
+        type=str,
+        default="HGNC Name, UniprotID",
+        help="Optional: Comma-separated list of columns from csvRef to merge into the mutation data; the first entry must be the column of HGNC gene names used as the merge key; default: `HGNC Name, UniprotID` (str)",
+    )
+
+    parser.add_argument(
+        "--csvRef",
+        type=str,
+        default="kinhub.csv",
+        help="Optional: CSV file in outDir that contains the kinase reference info; default: `kinhub.csv` (str)",
     )
 
     # TODO: add logging functionality
@@ -37,31 +51,29 @@
 
 def main():
     args = parsearg_utils().parse_args()
 
-    str_mutations = args.mutations
-    list_mutations = str_mutations.split(",")
-    list_mutations = [mutation.strip() for mutation in list_mutations]
-
-    # required argument
     config.set_output_dir(args.outDir)
-
-    # optional argument
     config.set_request_cache(args.requestsCache)
 
+    list_mutations = io_utils.convert_str2list(args.mutationTypes)
+    list_cols = io_utils.convert_str2list(args.listCols)
+
     df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
 
-    df_kinhub = scrapers.kinhub()
-    io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")
+    df_kinhub = io_utils.load_csv_to_dataframe(args.csvRef)
 
-    list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()
+    list_kinase_hgnc = df_kinhub[list_cols[0]].to_list()
 
     df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations), ].reset_index(drop=True)
     df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc), ].reset_index(drop=True)
 
-    list_cols = ["HGNC Name", "UniprotID"]
     df_subset_merge = df_subset.merge(df_kinhub[list_cols],
                                       how="left",
                                       left_on="hugoGeneSymbol",
-                                      right_on="HGNC Name")
-    df_subset_merge = df_subset_merge.drop(["HGNC Name"], axis=1)
+                                      right_on=list_cols[0])
+    df_subset_merge = df_subset_merge.drop([list_cols[0]], axis=1)
 
     io_utils.save_dataframe_to_csv(df_subset_merge, "transformed_mutations.csv")
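The merge above now keys on the first entry of `--listCols` rather than a hard-coded `"HGNC Name"`. The same pattern on toy data (hypothetical rows, not from the toolkit):

```python
import pandas as pd

# stand-ins for the concatenated cBioPortal mutations and the KinHub reference
df_subset = pd.DataFrame({"hugoGeneSymbol": ["EGFR", "BRAF"],
                          "proteinChange": ["L858R", "V600E"]})
df_kinhub = pd.DataFrame({"HGNC Name": ["EGFR", "BRAF"],
                          "UniprotID": ["P00533", "P15056"]})

list_cols = ["HGNC Name", "UniprotID"]  # first entry is the merge key
df_merge = df_subset.merge(df_kinhub[list_cols],
                           how="left",
                           left_on="hugoGeneSymbol",
                           right_on=list_cols[0])
df_merge = df_merge.drop([list_cols[0]], axis=1)  # key column is now redundant
# remaining columns: hugoGeneSymbol, proteinChange, UniprotID
```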
diff --git a/src/missense_kinase_toolkit/io_utils.py b/src/missense_kinase_toolkit/io_utils.py
index c005461..a93b9f0 100644
--- a/src/missense_kinase_toolkit/io_utils.py
+++ b/src/missense_kinase_toolkit/io_utils.py
@@ -23,6 +23,50 @@ def check_outdir_exists(
     return path_data
 
+def convert_str2list(
+    input_str: str
+) -> list[str]:
+    """Convert a comma-separated string to a list
+
+    Parameters
+    ----------
+    input_str : str
+        String to convert to list
+
+    Returns
+    -------
+    list[str]
+        List of strings
+    """
+    list_str = input_str.split(",")
+    list_str = [str_in.strip() for str_in in list_str]
+    return list_str
+
+
+def load_csv_to_dataframe(
+    filename: str,
+) -> pd.DataFrame | None:
+    """Load a CSV file as a dataframe
+
+    Parameters
+    ----------
+    filename : str
+        Filename to load (either with or without "csv" suffix)
+
+    Returns
+    -------
+    pd.DataFrame | None
+        Dataframe loaded from the CSV file, or None if the file was not found
+    """
+    filename = filename.replace(".csv", "") + ".csv"
+    path_data = check_outdir_exists()
+    df = None
+    try:
+        df = pd.read_csv(os.path.join(path_data, filename))
+    except FileNotFoundError:
+        print(f"File {filename} not found in {path_data}...")
+    return df
+
+
 def save_dataframe_to_csv(
     df: pd.DataFrame,
     filename: str,
@@ -36,7 +80,6 @@ def save_dataframe_to_csv(
     filename : str
         Filename to save (either with or without "csv" suffix)
-
     Returns
     -------
     None
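A quick round trip through the new `io_utils` helpers, assuming the output directory is resolved from the `OUTPUT_DIR` environment variable the same way the tests at the bottom of this diff do:

```python
import os
import pandas as pd
from missense_kinase_toolkit import io_utils

os.environ["OUTPUT_DIR"] = "."  # same convention as test_io_utils below

assert io_utils.convert_str2list("a, b , c") == ["a", "b", "c"]

df = pd.DataFrame({"A": [1, 2]})
io_utils.save_dataframe_to_csv(df, "demo")            # ".csv" suffix optional on save...
df_back = io_utils.load_csv_to_dataframe("demo.csv")  # ...and normalized again on load
assert df.equals(df_back)
os.remove("demo.csv")
```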
diff --git a/src/missense_kinase_toolkit/klifs.py b/src/missense_kinase_toolkit/klifs.py
new file mode 100644
index 0000000..468c642
--- /dev/null
+++ b/src/missense_kinase_toolkit/klifs.py
@@ -0,0 +1,113 @@
+# from bravado.requests_client import RequestsClient
+from bravado.client import SwaggerClient
+
+
+class KLIFS():
+    def __init__(self):
+        self.url = "https://dev.klifs.net/swagger_v2/swagger.json"
+        self._klifs = self.get_klifs_api()
+
+    def get_klifs_api(self):
+        klifs_api = SwaggerClient.from_url(
+            self.url,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False
+            }
+        )
+        return klifs_api
+
+    def get_url(self):
+        return self.url
+
+    def get_klifs(self):
+        return self._klifs
+
+
+class HumanKinaseInfo(KLIFS):
+    species: str = "Human"
+
+    def __init__(
+        self,
+        kinase_name: str,
+    ) -> None:
+        super().__init__()
+        self.kinase_name = kinase_name
+        self._kinase_info = self.get_kinase_info()
+
+    def get_kinase_info(
+        self,
+    ) -> dict[str, str | int | None]:
+        try:
+            kinase_info = (
+                self._klifs.Information.get_kinase_ID(
+                    kinase_name=[self.kinase_name],
+                    species=self.species)
+                .response()
+                .result[0]
+            )
+
+            list_key = dir(kinase_info)
+            list_val = [getattr(kinase_info, key) for key in list_key]
+
+            dict_kinase_info = dict(zip(list_key, list_val))
+
+        except Exception as e:
+            # fall back to a dict of None values if the kinase is not found
+            print(e)
+            list_key = [
+                'family',
+                'full_name',
+                'gene_name',
+                'group',
+                'iuphar',
+                'kinase_ID',
+                'name',
+                'pocket',
+                'species',
+                'subfamily',
+                'uniprot'
+            ]
+            dict_kinase_info = dict(zip(list_key, [None]*len(list_key)))
+
+        return dict_kinase_info
+
+    def get_kinase_name(self):
+        return self.kinase_name
+
+    def get_species(self):
+        return self.species
+
+
+# def load_af2active(url, path_save):
+#     import os
+#     import wget
+#     import tarfile
+
+#     if not os.path.exists(path_save):
+#         os.makedirs(path_save)
+#     else:
+#         if os.path.exists(os.path.join(path_save, "Kincore_AF2_HumanCatalyticKinases")):
+#             print("File already exists...")
+#             return
+
+#     wget.download(url, path_save)
+
+# def get_tdc_dti(source_name="DAVIS"):
+#     from tdc.multi_pred import DTI
+
+#     data = DTI(name=source_name)
+#     data_davis = DTI(name='DAVIS')
+#     data_davis.get_data()
+#     data_davis.entity1_idx.unique().tolist()
+
+#     data_kiba = DTI(name='KIBA')
+#     data_kiba.get_data()
+
+#     print(data.label_distribution())
+#     data.print_stats()
+#     data.entity2_name
+#     len(data.entity1_idx.unique())
+#     data.entity2_idx.unique()
+#     split = data.get_split()
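The KLIFS wrapper can be exercised directly against the live `dev.klifs.net` API; the expected EGFR values here mirror `test_klifs_HumanKinaseInfo` at the bottom of this diff:

```python
from missense_kinase_toolkit import klifs

info = klifs.HumanKinaseInfo("EGFR")._kinase_info  # dict of KLIFS fields
print(info["group"], info["uniprot"])              # "TK", "P00533" per the tests
# Unknown names fall back to a dict of None values instead of raising:
assert klifs.HumanKinaseInfo("NOT_A_KINASE")._kinase_info["group"] is None
```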
diff --git a/src/missense_kinase_toolkit/scrapers.py b/src/missense_kinase_toolkit/scrapers.py
index 21ca8c0..4bbfea2 100644
--- a/src/missense_kinase_toolkit/scrapers.py
+++ b/src/missense_kinase_toolkit/scrapers.py
@@ -18,7 +18,6 @@ def kinhub(
     pd.DataFrame
         DataFrame of kinase information
     """
-    import requests
     from bs4 import BeautifulSoup
     import numpy as np  # TODO: to fix ImportError
 
@@ -52,10 +51,9 @@ def kinhub(
     df_kinhub = pd.DataFrame.from_dict(dict_kinhub)
     # df_kinhub = clean_names(df_kinhub)
 
-    # for kinases with 2 kinase domains, entries are duplicated despite same UniProt ID
-    # drop these
-    df_kinhub_drop = df_kinhub.loc[~df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x)), ]
-    # list_uniprot = df_kinhub["UniprotID"][df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x))].to_list()
-    # assert df_kinhub.shape[0] - df_kinhub_drop.shape[0] == df_kinhub_drop["UniprotID"].isin(list_uniprot).sum()
+    # kinases with two kinase domains appear twice; aggregate to one row per HGNC name
+    list_cols = df_kinhub.columns.to_list()
+    list_cols.remove("HGNC Name")
+    df_kinhub_agg = df_kinhub.groupby(["HGNC Name"], as_index=False, sort=False).agg(set)
+    df_kinhub_agg[list_cols] = df_kinhub_agg[list_cols].map(lambda x: ', '.join(str(s) for s in x))
 
-    return df_kinhub_drop
+    return df_kinhub_agg
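The new KinHub aggregation keeps multi-domain kinases as a single row per HGNC name instead of dropping the `Domain2_` entries. The same pattern on toy data (hypothetical rows):

```python
import pandas as pd

df = pd.DataFrame({
    "HGNC Name":    ["JAK1", "JAK1"],          # one gene, two kinase domains
    "Manning Name": ["JAK1", "Domain2_JAK1"],
    "UniprotID":    ["P23458", "P23458"],
})
list_cols = [c for c in df.columns if c != "HGNC Name"]
df_agg = df.groupby(["HGNC Name"], as_index=False, sort=False).agg(set)
# DataFrame.map requires pandas >= 2.1 (it replaced applymap)
df_agg[list_cols] = df_agg[list_cols].map(lambda x: ", ".join(str(s) for s in x))
# one row remains: UniprotID collapses to "P23458"; Manning Name keeps both
# domain entries (set order is not guaranteed)
```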
diff --git a/src/nextflow/modules/extract_cbioportal.nf b/src/nextflow/modules/extract_cbioportal.nf
new file mode 100644
index 0000000..b9462ca
--- /dev/null
+++ b/src/nextflow/modules/extract_cbioportal.nf
@@ -0,0 +1,16 @@
+process PROCESS_CBIOPORTAL {
+    input:
+    tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache)
+
+    output:
+    path("${out_dir}/cbioportal")
+
+    script:
+    """
+    export PYTHONHASHSEED=0
+    process_cbioportal \
+        --cohort ${cbio_cohort} \
+        --outDir ${out_dir} \
+        --instance ${cbio_inst} \
+        --token ${cbio_token} \
+        --requestsCache ${request_cache}
+    """
+}
diff --git a/src/nextflow/nextflow.config b/src/nextflow/nextflow.config
new file mode 100644
index 0000000..78f477f
--- /dev/null
+++ b/src/nextflow/nextflow.config
@@ -0,0 +1,27 @@
+profiles {
+    conda {
+        conda.enabled = true
+    }
+    mamba {
+        conda.enabled = true
+        conda.useMamba = true
+    }
+    micromamba {
+        conda.enabled = true
+        conda.useMicromamba = true
+    }
+    juno {
+        // singularity {
+        //     enabled = true
+        //     autoMounts = true
+        // }
+        process {
+            executor = 'lsf'
+        }
+        executor {
+            name = 'lsf'
+            perJobMemLimit = true
+            queueSize = 25
+        }
+    }
+}
diff --git a/src/nextflow/process_cbioportal.nf b/src/nextflow/process_cbioportal.nf
new file mode 100644
index 0000000..5625530
--- /dev/null
+++ b/src/nextflow/process_cbioportal.nf
@@ -0,0 +1,24 @@
+process PROCESS_CBIOPORTAL {
+    // tag "$meta.id"
+    // label 'process_medium'
+    // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    //     'docker://jeffquinnmsk/pan_preclinical_etl:latest' :
+    //     'docker.io/jeffquinnmsk/pan_preclinical_etl:latest' }"
+
+    input:
+    tuple val(meta), path(raw_data), path(studies), path(source_files), path(request_cache), val(study_name)
+
+    output:
+    tuple val(meta), path("${prefix}/per_study_results/${study_name}"), emit: etl_results
+
+    script:
+    // prefix was referenced but never defined; default it to meta.id per the usual nf-core pattern
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    export PYTHONHASHSEED=0
+    mkdir -p "${prefix}/per_study_results/${study_name}"
+    process_cbioportal \
+        --data-dir ${raw_data} \
+        --output-dir ${prefix}/per_study_results/${study_name} \
+        --studies ${studies} \
+        --source-files ${source_files} \
+        --study-id ${study_name}
+    """
+}
diff --git a/tests/test_missense_kinase_toolkit.py b/tests/test_missense_kinase_toolkit.py
index c2e7887..13ff2c4 100644
--- a/tests/test_missense_kinase_toolkit.py
+++ b/tests/test_missense_kinase_toolkit.py
@@ -13,3 +13,53 @@ def test_missense_kinase_toolkit_imported():
     """Sample test, will always pass so long as import statement worked."""
     assert "missense_kinase_toolkit" in sys.modules
+
+
+def test_kinhub_scraper():
+    from missense_kinase_toolkit import scrapers
+
+    df_kinhub = scrapers.kinhub()
+
+    assert df_kinhub.shape[0] == 517
+    assert df_kinhub.shape[1] == 8
+    assert "HGNC Name" in df_kinhub.columns
+    assert "UniprotID" in df_kinhub.columns
+
+
+def test_klifs_HumanKinaseInfo():
+    from missense_kinase_toolkit import klifs
+
+    dict_egfr = klifs.HumanKinaseInfo("EGFR")._kinase_info
+
+    assert dict_egfr["family"] == "EGFR"
+    assert dict_egfr["full_name"] == "epidermal growth factor receptor"
+    assert dict_egfr["gene_name"] == "EGFR"
+    assert dict_egfr["group"] == "TK"
+    assert dict_egfr["iuphar"] == 1797
+    assert dict_egfr["kinase_ID"] == 406
+    assert dict_egfr["name"] == "EGFR"
+    assert dict_egfr["pocket"] == "KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLITQLMPFGCLLDYVREYLEDRRLVHRDLAARNVLVITDFGLA"
+    assert dict_egfr["species"] == "Human"
+    assert dict_egfr["uniprot"] == "P00533"
+
+
+def test_io_utils():
+    from missense_kinase_toolkit import io_utils
+    import pandas as pd
+    import os
+
+    os.environ["OUTPUT_DIR"] = "."
+    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+    io_utils.save_dataframe_to_csv(df, "test1.csv")
+    df_read = io_utils.load_csv_to_dataframe("test1.csv")
+    assert df.equals(df_read)
+
+    io_utils.save_dataframe_to_csv(df, "test2.csv")
+    df_concat = io_utils.concatenate_csv_files_with_glob("*test*.csv")
+    assert df_concat.equals(pd.concat([df, df]))
+
+    os.remove("test1.csv")
+    os.remove("test2.csv")
+
+    assert io_utils.convert_str2list("a,b,c") == ["a", "b", "c"]
+    assert io_utils.convert_str2list("a, b, c") == ["a", "b", "c"]