diff --git a/.gitignore b/.gitignore
index 6fe9c3e..cb6f6c4 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,7 @@
 *.gaf
 **/*.tar.gz
 **/*.whl
-**/*.pyc
+*.pyc
+
+.idea/
+__pycache__/
diff --git a/docs/.nojekyll b/docs/.nojekyll
deleted file mode 100644
index e69de29..0000000
diff --git a/docsource/conf.py b/docsource/conf.py
deleted file mode 100644
index d092eae..0000000
--- a/docsource/conf.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath('../../'))
-extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon']
-html_theme = 'sphinx_rtd_theme'
-
-
-
-# Project information
-project = 'Hogprof'
-author = 'Dave Moi'
-
-# Extensions to use
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.coverage',
-    'sphinx.ext.napoleon',
-    'sphinx_rtd_theme'
-]
-
-# Theme settings
-html_theme = 'sphinx_rtd_theme'
-html_theme_options = {
-    'collapse_navigation': False,
-    'sticky_navigation': True,
-    'navigation_depth': 3,
-    'style_external_links': True
-}
-
-# Add any additional options for autodoc
-autodoc_default_options = {
-    'member-order': 'bysource'
-}
-
-# Add any modules to be excluded from the documentation
-exclude_patterns = []
-
-# The master toctree document.
-master_doc = 'index'
-
-# Mock import modules that may not be available in the documentation build environment
-autodoc_mock_imports = ['requests' , 'Bio' , 'numpy' , 'pandas' , 'matplotlib' , 'seaborn' , 'scipy' , 'wget' , 'statsmodels' , 'toytree' , 'pandas' , '' ]
-
diff --git a/docsource/index.rst b/docsource/index.rst
deleted file mode 100644
index 3c12ad8..0000000
--- a/docsource/index.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-HogProf
-=====================
-
-
- HogProf is an extensible and tunable approach to phylogenetic profiling using orthology data. It is powered by minhash based datastructures and computationally efficient.
-
- Still under major development and may change.
-
-.. toctree::
-   :maxdepth: 2
-   :caption: Contents:
-
-   installation
-   usage
-   troubleshooting
-   credits
-
-Installation
-------------
-
-To install My Project, run the following command:
-
-Using pip
-
-
-.. code-block:: bash
-    $ pip install hogprof
-
-Or from github
-.. code-block:: bash
-$ git clone https://github.com/DessimozLab/HogProf.git
- $ pip install -r pipreqs.txt .
-
-Quickstart
------
-
-To use the library we need an OMA instance's HDF5 file containing HOG info and some accesory files.
-
-.. code-block:: bash
-    $ mkdir YourOmaDirectory
-    $ cd YourOmaDirectory
-    $ wget https://omabrowser.org/All/OmaServer.h5
-    $ wget https://omabrowser.org/All/oma-go.txt.gz
-
-Let's create a directory for the phylogenetic rpfiling database were going to make.
-
-
-.. code-block:: bash
-    $ mkdir YourDBDirectory
-
-Ok. We're ready! Now let's compile a database containing all HOGs and our desired taxonomic levels using default settings. Launch the lshbuilder.
-dbtypes available on the command line are : all , plants , archaea, bacteria , eukarya , protists , fungi , metazoa and vertebrates. These will use the NCBI taxonomy as a tree to annotate events in different gene family's histories.
-
-.. code-block:: bash
-    $python lshbuilder.py --outpath YourHogProfDirectory --dbtype all --OMA YourOmaDirectory/OmaServer.h5 --nthreads numberOfCPUcores
-
-This should build a taxonomic tree for the genomes contained in the release and then calculate enhanced phylogenies for all HOGs in OMA.
-Once the database is completed it can be interogated using a profiler object. Construction and usage of this object should be done using a python script or notebook.
-
-
-.. code-block:: python
-    import HogProf
-    myproject.do_x()
-
-
-
-
-Troubleshooting
----------------
-
-
-If you encounter any issues while using My Project, please file a bug report on our GitHub repository: https://github.com/user/repo/issues
-
-
-Credits
--------
-
-My Project was created by John Doe.
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
deleted file mode 100755
index 1b644f4..0000000
--- a/environment.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-name: Hogprof
-channels:
-  - defaults
-dependencies:
-  - _libgcc_mutex=0.1=main
-  - ca-certificates=2021.5.25=h06a4308_1
-  - certifi=2020.12.5=py38h06a4308_0
-  - ld_impl_linux-64=2.33.1=h53a641e_7
-  - libffi=3.3=he6710b0_2
-  - libgcc-ng=9.1.0=hdf63c60_0
-  - libstdcxx-ng=9.1.0=hdf63c60_0
-  - ncurses=6.2=he6710b0_1
-  - openssl=1.1.1k=h27cfd23_0
-  - pip=21.1.1=py38h06a4308_0
-  - python=3.8.10=hdb3f193_7
-  - readline=8.1=h27cfd23_0
-  - setuptools=52.0.0=py38h06a4308_0
-  - sqlite=3.35.4=hdfb4753_0
-  - tk=8.6.10=hbc83047_0
-  - wheel=0.36.2=pyhd3eb1b0_0
-  - xz=5.2.5=h7b6447c_0
-  - zlib=1.2.11=h7b6447c_3
-  - pip:
-    - biopython==1.78
-    - chardet==4.0.0
-    - datasketch==1.5.3
-    - ete3==3.1.2
-    - future==0.18.2
-    - goatools==1.1.6
-    - h5py==3.2.1
-    - idna==2.10
-    - lxml==4.6.3
-    - numexpr==2.7.3
-    - numpy==1.20.3
-    - pandas==1.2.4
-    - pyham==1.1.10
-    - pyoma==0.11.1
-    - pyopa==0.8.0
-    - python-dateutil==2.8.1
-    - pytz==2021.1
-    - requests==2.25.1
-    - scipy==1.6.3
-    - six==1.16.0
-    - tables==3.6.1
-    - urllib3==1.26.5
\ No newline at end of file
diff --git a/pipreqs.txt b/pipreqs.txt
deleted file mode 100644
index 4879288..0000000
--- a/pipreqs.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-appdirs==1.4.4
-arrow==1.1.1
-attrs==21.2.0
-beautifulsoup4==4.9.3
-biopython==1.79
-bioservices==1.7.12
-certifi==2020.12.5
-chardet==4.0.0
-colorama==0.4.4
-colorlog==6.4.1
-csb==1.2.5
-custom-inherit==2.4.0
-cycler==0.10.0
-datasketch==1.5.3
-easydev==0.11.2
-ete3==3.1.2
-future==0.18.2
-gevent==21.8.0
-goatools==1.1.6
-greenlet==1.1.1
-grequests==0.6.0
-h5py==3.2.1
-idna==2.10
-itsdangerous==2.0.1
-joblib==1.0.1
-kiwisolver==1.3.2
-lxml==4.6.3
-matplotlib==3.4.3
-multipledispatch==0.6.0
-networkx==2.6.2
-numexpr==2.7.3
-numpy==1.21.2
-pandas==1.3.2
-pexpect==4.8.0
-Pillow==8.3.2
-ptyprocess==0.7.0
-pyham==1.1.10
-pyoma==0.11.1
-pyopa==0.8.0
-pyparsing==2.4.7
-pypng==0.0.21
--e git+git@github.com:DessimozLab/HogProf.git@a76d8cae9c3f3f5799da031c1aab632ac81cb409#egg=PyProfiler&subdirectory=pyprofiler
-python-dateutil==2.8.2
-pytz==2021.1
-PyYAML==5.4.1
-reportlab==3.6.1
-requests==2.25.1
-requests-cache==0.7.4
-scikit-learn==0.24.2
-scipy==1.7.1
-seaborn==0.11.2
-six==1.16.0
-sklearn==0.0
-soupsieve==2.2.1
-suds-jurko==0.6
-tables==3.6.1
-threadpoolctl==2.2.0
-toyplot==0.19.0
-url-normalize==1.4.3
-urllib3==1.26.5
-wget==3.2
-wrapt==1.12.1
-xmltodict==0.12.0
-zope.event==4.5.0
-zope.interface==5.4.0
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000..057d516
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,1263 @@
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+
+[[package]]
+name = "biopython"
+version = "1.81"
+description = "Freely available tools for computational molecular biology."
+optional = false +python-versions = ">=3.7" +files = [ + {file = "biopython-1.81-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef7c79b65b0b3f3c7dc59e20a7f8ae5758d8e852cb8b9cace590dc5617e348ba"}, + {file = "biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ebfbce0d91796c7aef422ee9dffe8827e07e5abaa94545e006f1f20e965c80b"}, + {file = "biopython-1.81-cp310-cp310-win32.whl", hash = "sha256:919a2c583cabf9c96d2ae4e1245a6b0376932fb342aca302a0fc198b71ab3275"}, + {file = "biopython-1.81-cp310-cp310-win_amd64.whl", hash = "sha256:b37c0d24191e5c96ca02415a5188551980c83a0d518bbc4ffe3c9a5d1fe0ee81"}, + {file = "biopython-1.81-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7a168709694e10b338718c18d967edd5b56c237dc88642c22275796007a70000"}, + {file = "biopython-1.81-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a51d9c1d1b4b634447535da74a644fae59bc234fbbf9001e2dc6b6fbabb98019"}, + {file = "biopython-1.81-cp311-cp311-win32.whl", hash = "sha256:2f9cfaf16d55ab80d514e7aebe5710dabe4e4ff47ede851031202e33b3249da3"}, + {file = "biopython-1.81-cp311-cp311-win_amd64.whl", hash = "sha256:e41b55edcfd448630e77bf4de66a7235324a8a149621499891da6bd1d5085b9a"}, + {file = "biopython-1.81-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3b36ba1bf6395c09a365c53530c9d71f3617763fa2c1d452b3d8948368c0f1de"}, + {file = "biopython-1.81-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c5c07123ff5f44c9e6b5369df854a38afd3c0c50ef58498a0ae8f7eb799f3e8"}, + {file = "biopython-1.81-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97cbdbed01b2512471f36c74b91658d1dfbdcbf39bc038f6ce5a41c3e60a8fc6"}, + {file = "biopython-1.81-cp37-cp37m-win32.whl", hash = "sha256:35506e39822c52d11cf09a3951e82375ca1bb9303960b4286acf02c9a6f6c4cc"}, + {file = "biopython-1.81-cp37-cp37m-win_amd64.whl", hash = "sha256:793c42a376cd63f62f8a088ce39b7dc6b5c55e4e9031d887c434de1595bfa4b8"}, + {file = "biopython-1.81-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:11d673698b3d0d6589292ea951fb62cb24ea27d273eca0d08dbbd956690f97f5"}, + {file = "biopython-1.81-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:655df416936662c0c8a06a549cb25e1560e1fea5067d850f34fb714b8a3fae6c"}, + {file = "biopython-1.81-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:762c6c43a8486b5fcd07f136a3217b87d24755618b9ea9da1f17124ff44c2ad6"}, + {file = "biopython-1.81-cp38-cp38-win32.whl", hash = "sha256:ee51bb1cd7decffd24da6b76d5e01b7e2fd818ab85cf0c180226cbb5793a3abd"}, + {file = "biopython-1.81-cp38-cp38-win_amd64.whl", hash = "sha256:ccd729249fd5f586dd4c2a3507c2ea2456825d7e615e97c07c409c850eaf4594"}, + {file = "biopython-1.81-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9ba33244f0eff830beaa7240065bdb5095d96fded6599b76bbb9ddab45cd2bbd"}, + {file = "biopython-1.81-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bb0c690c7368f255ed45236bf0f5464b476b8c083c8f634533921af78278261"}, + {file = "biopython-1.81-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65b93b513ce9dd7b2ce058720eadf42cd03f312db3409356efeb93123d1320aa"}, + {file = "biopython-1.81-cp39-cp39-win32.whl", hash = "sha256:811796f8d222aa3869a50e31e54ce62b69106b47cd8bb06934867c0d843297b5"}, + {file = "biopython-1.81-cp39-cp39-win_amd64.whl", hash = 
"sha256:b09efcb4733c8770f25eab5fe555a96a08f5ab9e1bc36939e08ebf2ffbf3e0f1"}, + {file = "biopython-1.81.tar.gz", hash = "sha256:2cf38112b6d8415ad39d6a611988cd11fb5f33eb09346666a87263beba9614e0"}, +] + +[package.dependencies] +numpy = "*" + +[[package]] +name = "blosc2" +version = "2.2.9" +description = "Python wrapper for the C-Blosc2 library" +optional = false +python-versions = "<4,>=3.8" +files = [ + {file = "blosc2-2.2.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:135afe34913cd43b02186fb400f30e2c9bdbfe3752470d9b6b00a20e7293fb9f"}, + {file = "blosc2-2.2.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:562828192e3c6f4629823d836bec1d129dfdad38a7e6d2e84f52dcaf9979633b"}, + {file = "blosc2-2.2.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f9413d6926d7442847b115680567fd4ad4ddcdf46e2419cd2f5e82ee8d00f6c"}, + {file = "blosc2-2.2.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74a24b4efb8b608b71d8af51d5c8f16dc63f45c2145240e7d313472fa720a68e"}, + {file = "blosc2-2.2.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73c7a7afd5390d60ad8ecd1e0e5de2492c60a24cce748b8ae2da83ceda0649ad"}, + {file = "blosc2-2.2.9-cp310-cp310-win32.whl", hash = "sha256:49f3b3951764ddf6d7ad3c1c0800adef2b7780348b1fe5126b6e0970f3ea6c2f"}, + {file = "blosc2-2.2.9-cp310-cp310-win_amd64.whl", hash = "sha256:e24335d97ae43558d222b15141d8499c3b220b3d166350441a6d2a4470997921"}, + {file = "blosc2-2.2.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f774b0c20b86c99fe1ba4fa7737add60d71930662192fdf66a547707a1e3a37"}, + {file = "blosc2-2.2.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7746244318adeb552cfb45c95b329eb12e146159ae6506b06b4854dec4c3b2c1"}, + {file = "blosc2-2.2.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e82b6280107b9ec05aa0ae7d86a3f73d14bd99767901cec95dab622d37cb0d7e"}, + {file = "blosc2-2.2.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c11ace31c542aa6eed11708e7b92cf5d3dbbb3c1b8a691919c3bb6130caf1746"}, + {file = "blosc2-2.2.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5a4db24030be00e8ccc9ff0645716504e4caf7525b70c7976ad8434b47f04f4f"}, + {file = "blosc2-2.2.9-cp311-cp311-win32.whl", hash = "sha256:ebfc1e9736d83bffa16e49f53278de6caa7b5469c44a4448800fc40009efbbba"}, + {file = "blosc2-2.2.9-cp311-cp311-win_amd64.whl", hash = "sha256:368b12e43249e55137a05506e747cc4656539afc73bf82a85b896a2f13a529d8"}, + {file = "blosc2-2.2.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8504a92404b2ba5112db83bebdfbe7eb3c286514acb658191434f020ea084c7a"}, + {file = "blosc2-2.2.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e38cc441798595f05e70d620f1124cd4c472003f9b58c17e79dd0477a4d151fb"}, + {file = "blosc2-2.2.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f10e14c7f3b9f14431df58f9891e490af83ae6fb3d7c2a7d05722560273a2da8"}, + {file = "blosc2-2.2.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adaef04627713e22bc7883a35afd499266762f700d8644a65cfafbf2879d4350"}, + {file = "blosc2-2.2.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a46f9216d63958572514354b94eaedaa2052b60b3301ec7c41c8f30c6825c718"}, + {file = "blosc2-2.2.9-cp312-cp312-win32.whl", hash = "sha256:658443f639975d29eaa3feea269a2f971d2da5cab736bb6462561d7efe261cc3"}, + {file = "blosc2-2.2.9-cp312-cp312-win_amd64.whl", hash = "sha256:0eb8ae893b60743a31feb4ed02dd96039400fb8e7fc5ff4d9adea8d70acde204"}, + {file = 
"blosc2-2.2.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:82ec6d1a4343868ce833380c82f60e9799794e04d35f630af948f0f3d28c3577"}, + {file = "blosc2-2.2.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:555468f4c77a45e35a7a878fab7679bf4705585a84b81649fc423eba293cf17b"}, + {file = "blosc2-2.2.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e788170a2e80cac38f15d723f7397a87d3c522980fc4f8d96c6fa9f5a74dd3"}, + {file = "blosc2-2.2.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31c0ee147f5f78ceeb65b601c47b0431a0f6111b8443aeb1485547394725895"}, + {file = "blosc2-2.2.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:25f27b50b2823e6a2e142eff02840979c19f629eb7833b45a98332a2d728543f"}, + {file = "blosc2-2.2.9-cp39-cp39-win32.whl", hash = "sha256:fa36fa18b8d41aee7db975a318b481304e6e3558b48641ec53933287274a4ec3"}, + {file = "blosc2-2.2.9-cp39-cp39-win_amd64.whl", hash = "sha256:c840bdfd97e25cd61d6e048f8d9ee6478133f3e70c880c2cb3054db93e142bba"}, + {file = "blosc2-2.2.9.tar.gz", hash = "sha256:63606498aaa72d58215b618d4512d5d3de29000a7b01a870edce8cb21d237c40"}, +] + +[package.dependencies] +msgpack = "*" +ndindex = ">=1.4" +numpy = ">=1.20.3" +py-cpuinfo = "*" + +[[package]] +name = "certifi" +version = "2023.7.22" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.3.0.tar.gz", hash = "sha256:63563193aec44bce707e0c5ca64ff69fa72ed7cf34ce6e11d5127555756fd2f6"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:effe5406c9bd748a871dbcaf3ac69167c38d72db8c9baf3ff954c344f31c4cbe"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4162918ef3098851fcd8a628bf9b6a98d10c380725df9e04caf5ca6dd48c847a"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0570d21da019941634a531444364f2482e8db0b3425fcd5ac0c36565a64142c8"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5707a746c6083a3a74b46b3a631d78d129edab06195a92a8ece755aac25a3f3d"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:278c296c6f96fa686d74eb449ea1697f3c03dc28b75f873b65b5201806346a69"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a4b71f4d1765639372a3b32d2638197f5cd5221b19531f9245fcc9ee62d38f56"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5969baeaea61c97efa706b9b107dcba02784b1601c74ac84f2a532ea079403e"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3f93dab657839dfa61025056606600a11d0b696d79386f974e459a3fbc568ec"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:db756e48f9c5c607b5e33dd36b1d5872d0422e960145b08ab0ec7fd420e9d649"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:232ac332403e37e4a03d209a3f92ed9071f7d3dbda70e2a5e9cff1c4ba9f0678"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e5c1502d4ace69a179305abb3f0bb6141cbe4714bc9b31d427329a95acfc8bdd"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:2502dd2a736c879c0f0d3e2161e74d9907231e25d35794584b1ca5284e43f596"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23e8565ab7ff33218530bc817922fae827420f143479b753104ab801145b1d5b"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-win32.whl", hash = "sha256:1872d01ac8c618a8da634e232f24793883d6e456a66593135aeafe3784b0848d"}, + {file = "charset_normalizer-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:557b21a44ceac6c6b9773bc65aa1b4cc3e248a5ad2f5b914b91579a32e22204d"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d7eff0f27edc5afa9e405f7165f85a6d782d308f3b6b9d96016c010597958e63"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a685067d05e46641d5d1623d7c7fdf15a357546cbb2f71b0ebde91b175ffc3e"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d3d5b7db9ed8a2b11a774db2bbea7ba1884430a205dbd54a32d61d7c2a190fa"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2935ffc78db9645cb2086c2f8f4cfd23d9b73cc0dc80334bc30aac6f03f68f8c"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fe359b2e3a7729010060fbca442ca225280c16e923b37db0e955ac2a2b72a05"}, + {file = 
"charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:380c4bde80bce25c6e4f77b19386f5ec9db230df9f2f2ac1e5ad7af2caa70459"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0d1e3732768fecb052d90d62b220af62ead5748ac51ef61e7b32c266cac9293"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b2919306936ac6efb3aed1fbf81039f7087ddadb3160882a57ee2ff74fd2382"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f8888e31e3a85943743f8fc15e71536bda1c81d5aa36d014a3c0c44481d7db6e"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:82eb849f085624f6a607538ee7b83a6d8126df6d2f7d3b319cb837b289123078"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7b8b8bf1189b3ba9b8de5c8db4d541b406611a71a955bbbd7385bbc45fcb786c"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5adf257bd58c1b8632046bbe43ee38c04e1038e9d37de9c57a94d6bd6ce5da34"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c350354efb159b8767a6244c166f66e67506e06c8924ed74669b2c70bc8735b1"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-win32.whl", hash = "sha256:02af06682e3590ab952599fbadac535ede5d60d78848e555aa58d0c0abbde786"}, + {file = "charset_normalizer-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:86d1f65ac145e2c9ed71d8ffb1905e9bba3a91ae29ba55b4c46ae6fc31d7c0d4"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3b447982ad46348c02cb90d230b75ac34e9886273df3a93eec0539308a6296d7"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:abf0d9f45ea5fb95051c8bfe43cb40cda383772f7e5023a83cc481ca2604d74e"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b09719a17a2301178fac4470d54b1680b18a5048b481cb8890e1ef820cb80455"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3d9b48ee6e3967b7901c052b670c7dda6deb812c309439adaffdec55c6d7b78"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:edfe077ab09442d4ef3c52cb1f9dab89bff02f4524afc0acf2d46be17dc479f5"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3debd1150027933210c2fc321527c2299118aa929c2f5a0a80ab6953e3bd1908"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f63face3a527284f7bb8a9d4f78988e3c06823f7bea2bd6f0e0e9298ca0403"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24817cb02cbef7cd499f7c9a2735286b4782bd47a5b3516a0e84c50eab44b98e"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c71f16da1ed8949774ef79f4a0260d28b83b3a50c6576f8f4f0288d109777989"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9cf3126b85822c4e53aa28c7ec9869b924d6fcfb76e77a45c44b83d91afd74f9"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b3b2316b25644b23b54a6f6401074cebcecd1244c0b8e80111c9a3f1c8e83d65"}, + {file = 
"charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:03680bb39035fbcffe828eae9c3f8afc0428c91d38e7d61aa992ef7a59fb120e"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cc152c5dd831641e995764f9f0b6589519f6f5123258ccaca8c6d34572fefa8"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-win32.whl", hash = "sha256:b8f3307af845803fb0b060ab76cf6dd3a13adc15b6b451f54281d25911eb92df"}, + {file = "charset_normalizer-3.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:8eaf82f0eccd1505cf39a45a6bd0a8cf1c70dcfc30dba338207a969d91b965c0"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dc45229747b67ffc441b3de2f3ae5e62877a282ea828a5bdb67883c4ee4a8810"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f4a0033ce9a76e391542c182f0d48d084855b5fcba5010f707c8e8c34663d77"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ada214c6fa40f8d800e575de6b91a40d0548139e5dc457d2ebb61470abf50186"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b1121de0e9d6e6ca08289583d7491e7fcb18a439305b34a30b20d8215922d43c"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1063da2c85b95f2d1a430f1c33b55c9c17ffaf5e612e10aeaad641c55a9e2b9d"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70f1d09c0d7748b73290b29219e854b3207aea922f839437870d8cc2168e31cc"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:250c9eb0f4600361dd80d46112213dff2286231d92d3e52af1e5a6083d10cad9"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:750b446b2ffce1739e8578576092179160f6d26bd5e23eb1789c4d64d5af7dc7"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:fc52b79d83a3fe3a360902d3f5d79073a993597d48114c29485e9431092905d8"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:588245972aca710b5b68802c8cad9edaa98589b1b42ad2b53accd6910dad3545"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e39c7eb31e3f5b1f88caff88bcff1b7f8334975b46f6ac6e9fc725d829bc35d4"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:abecce40dfebbfa6abf8e324e1860092eeca6f7375c8c4e655a8afb61af58f2c"}, + {file = "charset_normalizer-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:24a91a981f185721542a0b7c92e9054b7ab4fea0508a795846bc5b0abf8118d4"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:67b8cc9574bb518ec76dc8e705d4c39ae78bb96237cb533edac149352c1f39fe"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac71b2977fb90c35d41c9453116e283fac47bb9096ad917b8819ca8b943abecd"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3ae38d325b512f63f8da31f826e6cb6c367336f95e418137286ba362925c877e"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:542da1178c1c6af8873e143910e2269add130a299c9106eef2594e15dae5e482"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:30a85aed0b864ac88309b7d94be09f6046c834ef60762a8833b660139cfbad13"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aae32c93e0f64469f74ccc730a7cb21c7610af3a775157e50bbd38f816536b38"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b26ddf78d57f1d143bdf32e820fd8935d36abe8a25eb9ec0b5a71c82eb3895"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f5d10bae5d78e4551b7be7a9b29643a95aded9d0f602aa2ba584f0388e7a557"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:249c6470a2b60935bafd1d1d13cd613f8cd8388d53461c67397ee6a0f5dce741"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:c5a74c359b2d47d26cdbbc7845e9662d6b08a1e915eb015d044729e92e7050b7"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:b5bcf60a228acae568e9911f410f9d9e0d43197d030ae5799e20dca8df588287"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:187d18082694a29005ba2944c882344b6748d5be69e3a89bf3cc9d878e548d5a"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:81bf654678e575403736b85ba3a7867e31c2c30a69bc57fe88e3ace52fb17b89"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-win32.whl", hash = "sha256:85a32721ddde63c9df9ebb0d2045b9691d9750cb139c161c80e500d210f5e26e"}, + {file = "charset_normalizer-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:468d2a840567b13a590e67dd276c570f8de00ed767ecc611994c301d0f8c014f"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e0fc42822278451bc13a2e8626cf2218ba570f27856b536e00cfa53099724828"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09c77f964f351a7369cc343911e0df63e762e42bac24cd7d18525961c81754f4"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:12ebea541c44fdc88ccb794a13fe861cc5e35d64ed689513a5c03d05b53b7c82"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:805dfea4ca10411a5296bcc75638017215a93ffb584c9e344731eef0dcfb026a"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96c2b49eb6a72c0e4991d62406e365d87067ca14c1a729a870d22354e6f68115"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aaf7b34c5bc56b38c931a54f7952f1ff0ae77a2e82496583b247f7c969eb1479"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:619d1c96099be5823db34fe89e2582b336b5b074a7f47f819d6b3a57ff7bdb86"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a0ac5e7015a5920cfce654c06618ec40c33e12801711da6b4258af59a8eff00a"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93aa7eef6ee71c629b51ef873991d6911b906d7312c6e8e99790c0f33c576f89"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7966951325782121e67c81299a031f4c115615e68046f79b85856b86ebffc4cd"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:02673e456dc5ab13659f85196c534dc596d4ef260e4d86e856c3b2773ce09843"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:c2af80fb58f0f24b3f3adcb9148e6203fa67dd3f61c4af146ecad033024dde43"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:153e7b6e724761741e0974fc4dcd406d35ba70b92bfe3fedcb497226c93b9da7"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-win32.whl", hash = "sha256:d47ecf253780c90ee181d4d871cd655a789da937454045b17b5798da9393901a"}, + {file = "charset_normalizer-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:d97d85fa63f315a8bdaba2af9a6a686e0eceab77b3089af45133252618e70884"}, + {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "datasketch" +version = "1.6.4" +description = "Probabilistic data structures for processing and searching very large datasets" +optional = false +python-versions = "*" +files = [ + {file = "datasketch-1.6.4-py3-none-any.whl", hash = "sha256:0982712115139348c21217b8ca83b8d3b342f2556f2686eeda2972604cc68532"}, + {file = "datasketch-1.6.4.tar.gz", hash = "sha256:fe5a3545885c4c84eeb49d53a8bd82414c9c26948f7b0271cfe51cf16944c81a"}, +] + +[package.dependencies] +numpy = ">=1.11" +scipy = ">=1.0.0" + +[package.extras] +benchmark = ["SetSimilaritySearch (>=0.1.7)", "matplotlib (>=3.1.2)", "nltk (>=3.4.5)", "pandas (>=0.25.3)", "pyfarmhash (>=0.2.2)", "pyhash (>=0.9.3)", "scikit-learn (>=0.21.3)", "scipy (>=1.3.3)"] +cassandra = ["cassandra-driver (>=3.20)"] +experimental-aio = ["aiounittest", "motor"] +redis = ["redis (>=2.10.0)"] +test = ["cassandra-driver (>=3.20)", "coverage", "mock (>=2.0.0)", "mockredispy", "nose (>=1.3.7)", "nose-exclude (>=0.5.0)", "pymongo (>=3.9.0)", "pytest", "redis (>=2.10.0)"] + +[[package]] +name = "docopt" +version = "0.6.2" +description = "Pythonic argument parser, that will make you smile" +optional = false +python-versions = "*" +files = [ + {file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"}, +] + +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + +[[package]] +name = "ete3" +version = "3.1.3" +description = "A Python Environment for (phylogenetic) Tree Exploration" +optional = false +python-versions = "*" +files = [ + {file = "ete3-3.1.3.tar.gz", hash = "sha256:06a3b7fa8ed90187b076a8dbbe5b1b62acee94201d3c6e822f55f449601ef6f2"}, +] + +[[package]] +name = "future" +version = "0.18.3" +description = "Clean single-source support for Python 3 and 2" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = 
"future-0.18.3.tar.gz", hash = "sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"}, +] + +[[package]] +name = "fuzzyset2" +version = "0.2.2" +description = "A simple python fuzzyset implementation." +optional = false +python-versions = ">=3.6" +files = [ + {file = "fuzzyset2-0.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:778c60881834ed5ddb789feb8d3f274774e0c429e46801af9ac426353b95c0fe"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:70e9d03315eb0e1c9c07648baa957bcfc164584179283f0c2f5bafac04798192"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54ae6f676190f5ecd3dd912e10a53537710bc47361f3208fcd78d0781d2a1595"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-win32.whl", hash = "sha256:023d22f20e60a264fe5f340fbf2d8cb4438180c08def9ed32e25abbf2582fa6f"}, + {file = "fuzzyset2-0.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:3c32fdf5d605715f802c766b4ddb2cf5abcd4e59fb7d203da673cd471e8e3247"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:61cfa257b41aa900c1a63e71ae07f9e96467d233db6f0bef2128f5fe644d19e2"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0301a9d57dabec30fc9d91abbb5d8e395374e72219ebc72a0d6abae9d5421810"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a01640140b341c196df55c910f93dfaf07558a1fe082b0e0511ac2b220eb10d5"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-win32.whl", hash = "sha256:1f9a8d1f7bd51129a10f17910698f52e4d6da08be1f17cd3a1039c23fd2fc7e8"}, + {file = "fuzzyset2-0.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:394d307742219e0fc6854773040dc0d592ccead2f779c1586753b70792469f85"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d6a8233f6463a2091b0e5bb5349a2bb527a8dd1e335814034217aadbb42e455b"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:395820970f77d6694224f98c5837758b6f9d7d33c3526980ca7e366b051e5844"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-win32.whl", hash = "sha256:ac0e5e9a52778ef1acc708a9d0e97d3ddbe7aa6a12b50a2431f54da481de31c1"}, + {file = "fuzzyset2-0.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:a660a7a19d3af2898845e3ce4b7e77662bbd8b77c549d3bd48106e13ba92356f"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:edb70b7e79e22d0e6467ca6b121778a14dd67181cc6657cf07da1c647b044372"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:06493e858892e5c2306f17a0f8e2c34f9a6c2020fa659fc31278aff82870a309"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6133a52ce7023f0a31540e8bb072b465ad0ff25126f5454aae0d02f88bcf6b5f"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-win32.whl", hash = "sha256:ff5313a9096f53096c8972bda186c0ffb7b78b45e68ac9b589b6d900bfdcda63"}, + {file = "fuzzyset2-0.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:6c32b048b11e45b0f49fd8bb92061a8289f03d722026d966bcb2202a7af0f590"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3c033ecf4ea6327de89b07cfe5fe4017e585d1a7bc4c204cc7fc2c3dae984d68"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11e0cbbd96db35b19c0911c26a8ee5c763ecd93a4e02cbcb8a736fb3ae9e6473"}, + {file = 
"fuzzyset2-0.2.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8db31967266d780ecf20d5dd88f6aeece3b7c752b9da6b3b040ab00275b20e4"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-win32.whl", hash = "sha256:3020e804d3967c9620499b0722250b0610f5a41774dd701d2d9d8c7c2e86d687"}, + {file = "fuzzyset2-0.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:82a46ddd5aab26675da6b1b41221c810ea97d1d4ffdfcabbb2557028c3855715"}, + {file = "fuzzyset2-0.2.2.tar.gz", hash = "sha256:71f08c69ece31e73631f402ee532f74115255290819747d25e55661b5029cfb5"}, +] + +[package.dependencies] +rapidfuzz = ">=2.0" + +[[package]] +name = "goatools" +version = "1.3.9" +description = "Python scripts to find enrichment of GO terms" +optional = false +python-versions = "*" +files = [ + {file = "goatools-1.3.9-py3-none-any.whl", hash = "sha256:200732531e9cd897584c04347ccf86f481ee90526133f795b42cc8e59f548dc5"}, + {file = "goatools-1.3.9.tar.gz", hash = "sha256:a9c2fe7d4735d725dc685b1d4ef5df489f37506ada1c8198f0786c92559c348c"}, +] + +[package.dependencies] +docopt = "*" +numpy = "*" +openpyxl = "*" +pandas = "*" +pydot = "*" +requests = "*" +scipy = "*" +statsmodels = "*" +xlsxwriter = "*" + +[[package]] +name = "h5py" +version = "3.9.0" +description = "Read and write HDF5 files from Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "h5py-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eb7bdd5e601dd1739698af383be03f3dad0465fe67184ebd5afca770f50df9d6"}, + {file = "h5py-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:78e44686334cbbf2dd21d9df15823bc38663f27a3061f6a032c68a3e30c47bf7"}, + {file = "h5py-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68b41efd110ce9af1cbe6fa8af9f4dcbadace6db972d30828b911949e28fadd"}, + {file = "h5py-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12aa556d540f11a2cae53ea7cfb94017353bd271fb3962e1296b342f6550d1b8"}, + {file = "h5py-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:d97409e17915798029e297a84124705c8080da901307ea58f29234e09b073ddc"}, + {file = "h5py-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:551e358db05a874a0f827b22e95b30092f2303edc4b91bb62ad2f10e0236e1a0"}, + {file = "h5py-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6822a814b9d8b8363ff102f76ea8d026f0ca25850bb579d85376029ee3e73b93"}, + {file = "h5py-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54f01202cdea754ab4227dd27014bdbd561a4bbe4b631424fd812f7c2ce9c6ac"}, + {file = "h5py-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64acceaf6aff92af091a4b83f6dee3cf8d3061f924a6bb3a33eb6c4658a8348b"}, + {file = "h5py-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:804c7fb42a34c8ab3a3001901c977a5c24d2e9c586a0f3e7c0a389130b4276fc"}, + {file = "h5py-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8d9492391ff5c3c80ec30ae2fe82a3f0efd1e750833739c25b0d090e3be1b095"}, + {file = "h5py-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9da9e7e63376c32704e37ad4cea2dceae6964cee0d8515185b3ab9cbd6b947bc"}, + {file = "h5py-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e20897c88759cbcbd38fb45b507adc91af3e0f67722aa302d71f02dd44d286"}, + {file = "h5py-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbf5225543ca35ce9f61c950b73899a82be7ba60d58340e76d0bd42bf659235a"}, + {file = "h5py-3.9.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:36408f8c62f50007d14e000f9f3acf77e103b9e932c114cbe52a3089e50ebf94"}, + {file = "h5py-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:23e74b878bbe1653ab34ca49b83cac85529cd0b36b9d625516c5830cc5ca2eac"}, + {file = "h5py-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f457089c5d524b7998e3649bc63240679b8fb0a3859ea53bbb06841f3d755f1"}, + {file = "h5py-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6284061f3214335e1eec883a6ee497dbe7a79f19e6a57fed2dd1f03acd5a8cb"}, + {file = "h5py-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7a745efd0d56076999b52e8da5fad5d30823bac98b59c68ae75588d09991a"}, + {file = "h5py-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:79bbca34696c6f9eeeb36a91776070c49a060b2879828e2c8fa6c58b8ed10dd1"}, + {file = "h5py-3.9.0.tar.gz", hash = "sha256:e604db6521c1e367c6bd7fad239c847f53cc46646f2d2651372d05ae5e95f817"}, +] + +[package.dependencies] +numpy = ">=1.17.3" + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + +[[package]] +name = "lxml" +version = "4.9.3" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ + {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, + {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, + {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, + {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, + {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, + {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, + {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, + {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, + {file = 
"lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, + {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1247694b26342a7bf47c02e513d32225ededd18045264d40758abeb3c838a51f"}, + {file = "lxml-4.9.3-cp310-cp310-win32.whl", hash = "sha256:cdb650fc86227eba20de1a29d4b2c1bfe139dc75a0669270033cb2ea3d391b85"}, + {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, + {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, + {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, + {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e28c51fa0ce5674be9f560c6761c1b441631901993f76700b1b30ca6c8378d6"}, + {file = "lxml-4.9.3-cp311-cp311-win32.whl", hash = "sha256:0bfd0767c5c1de2551a120673b72e5d4b628737cb05414f03c3277bf9bed3305"}, + {file = "lxml-4.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:25f32acefac14ef7bd53e4218fe93b804ef6f6b92ffdb4322bb6d49d94cad2bc"}, + {file = "lxml-4.9.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:d3ff32724f98fbbbfa9f49d82852b159e9784d6094983d9a8b7f2ddaebb063d4"}, + {file = "lxml-4.9.3-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48d6ed886b343d11493129e019da91d4039826794a3e3027321c56d9e71505be"}, + {file = "lxml-4.9.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9a92d3faef50658dd2c5470af249985782bf754c4e18e15afb67d3ab06233f13"}, + {file = "lxml-4.9.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b4e4bc18382088514ebde9328da057775055940a1f2e18f6ad2d78aa0f3ec5b9"}, + {file = "lxml-4.9.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fc9b106a1bf918db68619fdcd6d5ad4f972fdd19c01d19bdb6bf63f3589a9ec5"}, + {file = "lxml-4.9.3-cp312-cp312-win_amd64.whl", hash = "sha256:d37017287a7adb6ab77e1c5bee9bcf9660f90ff445042b790402a654d2ad81d8"}, + {file = "lxml-4.9.3-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:56dc1f1ebccc656d1b3ed288f11e27172a01503fc016bcabdcbc0978b19352b7"}, + {file = "lxml-4.9.3-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:578695735c5a3f51569810dfebd05dd6f888147a34f0f98d4bb27e92b76e05c2"}, + {file = "lxml-4.9.3-cp35-cp35m-win32.whl", hash = "sha256:704f61ba8c1283c71b16135caf697557f5ecf3e74d9e453233e4771d68a1f42d"}, + {file = "lxml-4.9.3-cp35-cp35m-win_amd64.whl", hash = "sha256:c41bfca0bd3532d53d16fd34d20806d5c2b1ace22a2f2e4c0008570bf2c58833"}, + {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, + {file = 
"lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, + {file = "lxml-4.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0c0850c8b02c298d3c7006b23e98249515ac57430e16a166873fc47a5d549287"}, + {file = "lxml-4.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:aca086dc5f9ef98c512bac8efea4483eb84abbf926eaeedf7b91479feb092458"}, + {file = "lxml-4.9.3-cp36-cp36m-win32.whl", hash = "sha256:50baa9c1c47efcaef189f31e3d00d697c6d4afda5c3cde0302d063492ff9b477"}, + {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, + {file = "lxml-4.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:081d32421db5df44c41b7f08a334a090a545c54ba977e47fd7cc2deece78809a"}, + {file = "lxml-4.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:23eed6d7b1a3336ad92d8e39d4bfe09073c31bfe502f20ca5116b2a334f8ec02"}, + {file = "lxml-4.9.3-cp37-cp37m-win32.whl", hash = "sha256:1509dd12b773c02acd154582088820893109f6ca27ef7291b003d0e81666109f"}, + {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, + {file = 
"lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, + {file = "lxml-4.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3e9bdd30efde2b9ccfa9cb5768ba04fe71b018a25ea093379c857c9dad262c40"}, + {file = "lxml-4.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fcdd00edfd0a3001e0181eab3e63bd5c74ad3e67152c84f93f13769a40e073a7"}, + {file = "lxml-4.9.3-cp38-cp38-win32.whl", hash = "sha256:57aba1bbdf450b726d58b2aea5fe47c7875f5afb2c4a23784ed78f19a0462574"}, + {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, + {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b6420a005548ad52154c8ceab4a1290ff78d757f9e5cbc68f8c77089acd3c432"}, + {file = "lxml-4.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bb3bb49c7a6ad9d981d734ef7c7193bc349ac338776a0360cc671eaee89bcf69"}, + {file = "lxml-4.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d27be7405547d1f958b60837dc4c1007da90b8b23f54ba1f8b728c78fdb19d50"}, + {file = "lxml-4.9.3-cp39-cp39-win32.whl", hash = "sha256:8df133a2ea5e74eef5e8fc6f19b9e085f758768a16e9877a60aec455ed2609b2"}, + {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, + {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, + {file = 
"lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, + {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=0.29.35)"] + +[[package]] +name = "msgpack" +version = "1.0.7" +description = "MessagePack serializer" +optional = false +python-versions = ">=3.8" +files = [ + {file = "msgpack-1.0.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:04ad6069c86e531682f9e1e71b71c1c3937d6014a7c3e9edd2aa81ad58842862"}, + {file = "msgpack-1.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cca1b62fe70d761a282496b96a5e51c44c213e410a964bdffe0928e611368329"}, + {file = "msgpack-1.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e50ebce52f41370707f1e21a59514e3375e3edd6e1832f5e5235237db933c98b"}, + {file = "msgpack-1.0.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b4f35de6a304b5533c238bee86b670b75b03d31b7797929caa7a624b5dda6"}, + {file = "msgpack-1.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28efb066cde83c479dfe5a48141a53bc7e5f13f785b92ddde336c716663039ee"}, + {file = "msgpack-1.0.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4cb14ce54d9b857be9591ac364cb08dc2d6a5c4318c1182cb1d02274029d590d"}, + {file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b573a43ef7c368ba4ea06050a957c2a7550f729c31f11dd616d2ac4aba99888d"}, + {file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ccf9a39706b604d884d2cb1e27fe973bc55f2890c52f38df742bc1d79ab9f5e1"}, + {file = "msgpack-1.0.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cb70766519500281815dfd7a87d3a178acf7ce95390544b8c90587d76b227681"}, + {file = "msgpack-1.0.7-cp310-cp310-win32.whl", hash = 
"sha256:b610ff0f24e9f11c9ae653c67ff8cc03c075131401b3e5ef4b82570d1728f8a9"}, + {file = "msgpack-1.0.7-cp310-cp310-win_amd64.whl", hash = "sha256:a40821a89dc373d6427e2b44b572efc36a2778d3f543299e2f24eb1a5de65415"}, + {file = "msgpack-1.0.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:576eb384292b139821c41995523654ad82d1916da6a60cff129c715a6223ea84"}, + {file = "msgpack-1.0.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:730076207cb816138cf1af7f7237b208340a2c5e749707457d70705715c93b93"}, + {file = "msgpack-1.0.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:85765fdf4b27eb5086f05ac0491090fc76f4f2b28e09d9350c31aac25a5aaff8"}, + {file = "msgpack-1.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3476fae43db72bd11f29a5147ae2f3cb22e2f1a91d575ef130d2bf49afd21c46"}, + {file = "msgpack-1.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d4c80667de2e36970ebf74f42d1088cc9ee7ef5f4e8c35eee1b40eafd33ca5b"}, + {file = "msgpack-1.0.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b0bf0effb196ed76b7ad883848143427a73c355ae8e569fa538365064188b8e"}, + {file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f9a7c509542db4eceed3dcf21ee5267ab565a83555c9b88a8109dcecc4709002"}, + {file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:84b0daf226913133f899ea9b30618722d45feffa67e4fe867b0b5ae83a34060c"}, + {file = "msgpack-1.0.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ec79ff6159dffcc30853b2ad612ed572af86c92b5168aa3fc01a67b0fa40665e"}, + {file = "msgpack-1.0.7-cp311-cp311-win32.whl", hash = "sha256:3e7bf4442b310ff154b7bb9d81eb2c016b7d597e364f97d72b1acc3817a0fdc1"}, + {file = "msgpack-1.0.7-cp311-cp311-win_amd64.whl", hash = "sha256:3f0c8c6dfa6605ab8ff0611995ee30d4f9fcff89966cf562733b4008a3d60d82"}, + {file = "msgpack-1.0.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f0936e08e0003f66bfd97e74ee530427707297b0d0361247e9b4f59ab78ddc8b"}, + {file = "msgpack-1.0.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:98bbd754a422a0b123c66a4c341de0474cad4a5c10c164ceed6ea090f3563db4"}, + {file = "msgpack-1.0.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b291f0ee7961a597cbbcc77709374087fa2a9afe7bdb6a40dbbd9b127e79afee"}, + {file = "msgpack-1.0.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebbbba226f0a108a7366bf4b59bf0f30a12fd5e75100c630267d94d7f0ad20e5"}, + {file = "msgpack-1.0.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e2d69948e4132813b8d1131f29f9101bc2c915f26089a6d632001a5c1349672"}, + {file = "msgpack-1.0.7-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdf38ba2d393c7911ae989c3bbba510ebbcdf4ecbdbfec36272abe350c454075"}, + {file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:993584fc821c58d5993521bfdcd31a4adf025c7d745bbd4d12ccfecf695af5ba"}, + {file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:52700dc63a4676669b341ba33520f4d6e43d3ca58d422e22ba66d1736b0a6e4c"}, + {file = "msgpack-1.0.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e45ae4927759289c30ccba8d9fdce62bb414977ba158286b5ddaf8df2cddb5c5"}, + {file = "msgpack-1.0.7-cp312-cp312-win32.whl", hash = "sha256:27dcd6f46a21c18fa5e5deed92a43d4554e3df8d8ca5a47bf0615d6a5f39dbc9"}, + {file = "msgpack-1.0.7-cp312-cp312-win_amd64.whl", hash = 
"sha256:7687e22a31e976a0e7fc99c2f4d11ca45eff652a81eb8c8085e9609298916dcf"}, + {file = "msgpack-1.0.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5b6ccc0c85916998d788b295765ea0e9cb9aac7e4a8ed71d12e7d8ac31c23c95"}, + {file = "msgpack-1.0.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:235a31ec7db685f5c82233bddf9858748b89b8119bf4538d514536c485c15fe0"}, + {file = "msgpack-1.0.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cab3db8bab4b7e635c1c97270d7a4b2a90c070b33cbc00c99ef3f9be03d3e1f7"}, + {file = "msgpack-1.0.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bfdd914e55e0d2c9e1526de210f6fe8ffe9705f2b1dfcc4aecc92a4cb4b533d"}, + {file = "msgpack-1.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36e17c4592231a7dbd2ed09027823ab295d2791b3b1efb2aee874b10548b7524"}, + {file = "msgpack-1.0.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38949d30b11ae5f95c3c91917ee7a6b239f5ec276f271f28638dec9156f82cfc"}, + {file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ff1d0899f104f3921d94579a5638847f783c9b04f2d5f229392ca77fba5b82fc"}, + {file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dc43f1ec66eb8440567186ae2f8c447d91e0372d793dfe8c222aec857b81a8cf"}, + {file = "msgpack-1.0.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dd632777ff3beaaf629f1ab4396caf7ba0bdd075d948a69460d13d44357aca4c"}, + {file = "msgpack-1.0.7-cp38-cp38-win32.whl", hash = "sha256:4e71bc4416de195d6e9b4ee93ad3f2f6b2ce11d042b4d7a7ee00bbe0358bd0c2"}, + {file = "msgpack-1.0.7-cp38-cp38-win_amd64.whl", hash = "sha256:8f5b234f567cf76ee489502ceb7165c2a5cecec081db2b37e35332b537f8157c"}, + {file = "msgpack-1.0.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfef2bb6ef068827bbd021017a107194956918ab43ce4d6dc945ffa13efbc25f"}, + {file = "msgpack-1.0.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:484ae3240666ad34cfa31eea7b8c6cd2f1fdaae21d73ce2974211df099a95d81"}, + {file = "msgpack-1.0.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3967e4ad1aa9da62fd53e346ed17d7b2e922cba5ab93bdd46febcac39be636fc"}, + {file = "msgpack-1.0.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dd178c4c80706546702c59529ffc005681bd6dc2ea234c450661b205445a34d"}, + {file = "msgpack-1.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6ffbc252eb0d229aeb2f9ad051200668fc3a9aaa8994e49f0cb2ffe2b7867e7"}, + {file = "msgpack-1.0.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:822ea70dc4018c7e6223f13affd1c5c30c0f5c12ac1f96cd8e9949acddb48a61"}, + {file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:384d779f0d6f1b110eae74cb0659d9aa6ff35aaf547b3955abf2ab4c901c4819"}, + {file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f64e376cd20d3f030190e8c32e1c64582eba56ac6dc7d5b0b49a9d44021b52fd"}, + {file = "msgpack-1.0.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5ed82f5a7af3697b1c4786053736f24a0efd0a1b8a130d4c7bfee4b9ded0f08f"}, + {file = "msgpack-1.0.7-cp39-cp39-win32.whl", hash = "sha256:f26a07a6e877c76a88e3cecac8531908d980d3d5067ff69213653649ec0f60ad"}, + {file = "msgpack-1.0.7-cp39-cp39-win_amd64.whl", hash = "sha256:1dc93e8e4653bdb5910aed79f11e165c85732067614f180f70534f056da97db3"}, + {file = "msgpack-1.0.7.tar.gz", hash = "sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87"}, +] + +[[package]] +name = 
"ndindex" +version = "1.7" +description = "A Python library for manipulating indices of ndarrays." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ndindex-1.7-py3-none-any.whl", hash = "sha256:4c0555d352ac9947b0f022562aea9f5d57fa06743ea069669138f75a88b42884"}, + {file = "ndindex-1.7.tar.gz", hash = "sha256:bf9bd0b76eeada1c8275e04091f8291869ed2b373b7af48e56faf7579fd2efd2"}, +] + +[package.extras] +arrays = ["numpy"] + +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + +[[package]] +name = "numexpr" +version = "2.8.7" +description = "Fast numerical expression evaluator for NumPy" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numexpr-2.8.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d88531ffea3ea9287e8a1665c6a2d0206d3f4660d5244423e2a134a7f0ce5fba"}, + {file = "numexpr-2.8.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db1065ba663a854115cf1f493afd7206e2efcef6643129e8061e97a51ad66ebb"}, + {file = "numexpr-2.8.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4546416004ff2e7eb9cf52c2d7ab82732b1b505593193ee9f93fa770edc5230"}, + {file = "numexpr-2.8.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb2f473fdfd09d17db3038e34818d05b6bc561a36785aa927d6c0e06bccc9911"}, + {file = "numexpr-2.8.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5496fc9e3ae214637cbca1ab556b0e602bd3afe9ff4c943a29c482430972cda8"}, + {file = "numexpr-2.8.7-cp310-cp310-win32.whl", hash = "sha256:d43f1f0253a6f2db2f76214e6f7ae9611b422cba3f7d4c86415d7a78bbbd606f"}, + {file = "numexpr-2.8.7-cp310-cp310-win_amd64.whl", hash = "sha256:cf5f112bce5c5966c47cc33700bc14ce745c8351d437ed57a9574fff581f341a"}, + {file = "numexpr-2.8.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:32934d51b5bc8a6636436326da79ed380e2f151989968789cf65b1210572cb46"}, + {file = "numexpr-2.8.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f021ac93cb3dd5d8ba2882627b615b1f58cb089dcc85764c6fbe7a549ed21b0c"}, + {file = "numexpr-2.8.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dccf572763517db6562fb7b17db46aacbbf62a9ca0a66672872f4f71aee7b186"}, + {file = "numexpr-2.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11121b14ee3179bade92e823f25f1b94e18716d33845db5081973331188c3338"}, + {file = "numexpr-2.8.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:81451962d4145a46dba189df65df101d4d1caddb6efe6ebfe05982cd9f62b2cf"}, + {file = "numexpr-2.8.7-cp311-cp311-win32.whl", hash = "sha256:da55ba845b847cc33c4bf81cee4b1bddfb0831118cabff8db62888ab8697ec34"}, + {file = "numexpr-2.8.7-cp311-cp311-win_amd64.whl", hash = 
"sha256:fd93b88d5332069916fa00829ea1b972b7e73abcb1081eee5c905a514b8b59e3"}, + {file = "numexpr-2.8.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5340d2c86d83f52e1a3e7fd97c37d358ae99af9de316bdeeab2565b9b1e622ca"}, + {file = "numexpr-2.8.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3bdf8cbc00c77a46230c765d242f92d35905c239b20c256c48dbac91e49f253"}, + {file = "numexpr-2.8.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d46c47e361fa60966a3339cb4f463ae6151ce7d78ed38075f06e8585d2c8929f"}, + {file = "numexpr-2.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a371cfc1670a18eea2d5c70abaa95a0e8824b70d28da884bad11931266e3a0ca"}, + {file = "numexpr-2.8.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:47a249cecd1382d482a5bf1fac0d11392fb2ed0f7d415ebc4cd901959deb1ec9"}, + {file = "numexpr-2.8.7-cp312-cp312-win32.whl", hash = "sha256:b8a5b2c21c26b62875bf819d375d798b96a32644e3c28bd4ce7789ed1fb489da"}, + {file = "numexpr-2.8.7-cp312-cp312-win_amd64.whl", hash = "sha256:f29f4d08d9b0ed6fa5d32082971294b2f9131b8577c2b7c36432ed670924313f"}, + {file = "numexpr-2.8.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ecaa5be24cf8fa0f00108e9dfa1021b7510e9dd9d159b8d8bc7c7ddbb995b31"}, + {file = "numexpr-2.8.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3a84284e0a407ca52980fd20962e89aff671c84cd6e73458f2e29ea2aa206356"}, + {file = "numexpr-2.8.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e838289e3b7bbe100b99e35496e6cc4cc0541c2207078941ee5a1d46e6b925ae"}, + {file = "numexpr-2.8.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0983052f308ea75dd232eb7f4729eed839db8fe8d82289940342b32cc55b15d0"}, + {file = "numexpr-2.8.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8bf005acd7f1985c71b1b247aaac8950d6ea05a0fe0bbbbf3f96cd398b136daa"}, + {file = "numexpr-2.8.7-cp39-cp39-win32.whl", hash = "sha256:56ec95f8d1db0819e64987dcf1789acd500fa4ea396eeabe4af6efdcb8902d07"}, + {file = "numexpr-2.8.7-cp39-cp39-win_amd64.whl", hash = "sha256:c7bf60fc1a9c90a9cb21c4c235723e579bff70c8d5362228cb2cf34426104ba2"}, + {file = "numexpr-2.8.7.tar.gz", hash = "sha256:596eeb3bbfebc912f4b6eaaf842b61ba722cebdb8bc42dfefa657d3a74953849"}, +] + +[package.dependencies] +numpy = ">=1.13.3" + +[[package]] +name = "numpy" +version = "1.26.0" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ + {file = "numpy-1.26.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8db2f125746e44dce707dd44d4f4efeea8d7e2b43aace3f8d1f235cfa2733dd"}, + {file = "numpy-1.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0621f7daf973d34d18b4e4bafb210bbaf1ef5e0100b5fa750bd9cde84c7ac292"}, + {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51be5f8c349fdd1a5568e72713a21f518e7d6707bcf8503b528b88d33b57dc68"}, + {file = "numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:767254ad364991ccfc4d81b8152912e53e103ec192d1bb4ea6b1f5a7117040be"}, + {file = "numpy-1.26.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:436c8e9a4bdeeee84e3e59614d38c3dbd3235838a877af8c211cfcac8a80b8d3"}, + {file = "numpy-1.26.0-cp310-cp310-win32.whl", hash = "sha256:c2e698cb0c6dda9372ea98a0344245ee65bdc1c9dd939cceed6bb91256837896"}, + {file = "numpy-1.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:09aaee96c2cbdea95de76ecb8a586cb687d281c881f5f17bfc0fb7f5890f6b91"}, + {file = 
"numpy-1.26.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:637c58b468a69869258b8ae26f4a4c6ff8abffd4a8334c830ffb63e0feefe99a"}, + {file = "numpy-1.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:306545e234503a24fe9ae95ebf84d25cba1fdc27db971aa2d9f1ab6bba19a9dd"}, + {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6adc33561bd1d46f81131d5352348350fc23df4d742bb246cdfca606ea1208"}, + {file = "numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e062aa24638bb5018b7841977c360d2f5917268d125c833a686b7cbabbec496c"}, + {file = "numpy-1.26.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:546b7dd7e22f3c6861463bebb000646fa730e55df5ee4a0224408b5694cc6148"}, + {file = "numpy-1.26.0-cp311-cp311-win32.whl", hash = "sha256:c0b45c8b65b79337dee5134d038346d30e109e9e2e9d43464a2970e5c0e93229"}, + {file = "numpy-1.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:eae430ecf5794cb7ae7fa3808740b015aa80747e5266153128ef055975a72b99"}, + {file = "numpy-1.26.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:166b36197e9debc4e384e9c652ba60c0bacc216d0fc89e78f973a9760b503388"}, + {file = "numpy-1.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f042f66d0b4ae6d48e70e28d487376204d3cbf43b84c03bac57e28dac6151581"}, + {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5e18e5b14a7560d8acf1c596688f4dfd19b4f2945b245a71e5af4ddb7422feb"}, + {file = "numpy-1.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f6bad22a791226d0a5c7c27a80a20e11cfe09ad5ef9084d4d3fc4a299cca505"}, + {file = "numpy-1.26.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4acc65dd65da28060e206c8f27a573455ed724e6179941edb19f97e58161bb69"}, + {file = "numpy-1.26.0-cp312-cp312-win32.whl", hash = "sha256:bb0d9a1aaf5f1cb7967320e80690a1d7ff69f1d47ebc5a9bea013e3a21faec95"}, + {file = "numpy-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:ee84ca3c58fe48b8ddafdeb1db87388dce2c3c3f701bf447b05e4cfcc3679112"}, + {file = "numpy-1.26.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4a873a8180479bc829313e8d9798d5234dfacfc2e8a7ac188418189bb8eafbd2"}, + {file = "numpy-1.26.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:914b28d3215e0c721dc75db3ad6d62f51f630cb0c277e6b3bcb39519bed10bd8"}, + {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c78a22e95182fb2e7874712433eaa610478a3caf86f28c621708d35fa4fd6e7f"}, + {file = "numpy-1.26.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86f737708b366c36b76e953c46ba5827d8c27b7a8c9d0f471810728e5a2fe57c"}, + {file = "numpy-1.26.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b44e6a09afc12952a7d2a58ca0a2429ee0d49a4f89d83a0a11052da696440e49"}, + {file = "numpy-1.26.0-cp39-cp39-win32.whl", hash = "sha256:5671338034b820c8d58c81ad1dafc0ed5a00771a82fccc71d6438df00302094b"}, + {file = "numpy-1.26.0-cp39-cp39-win_amd64.whl", hash = "sha256:020cdbee66ed46b671429c7265cf00d8ac91c046901c55684954c3958525dab2"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0792824ce2f7ea0c82ed2e4fecc29bb86bee0567a080dacaf2e0a01fe7654369"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d484292eaeb3e84a51432a94f53578689ffdea3f90e10c8b203a99be5af57d8"}, + {file = "numpy-1.26.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:186ba67fad3c60dbe8a3abff3b67a91351100f2661c8e2a80364ae6279720299"}, + 
{file = "numpy-1.26.0.tar.gz", hash = "sha256:f93fc78fe8bf15afe2b8d6b6499f1c73953169fad1e9a8dd086cdff3190e7fdf"}, +] + +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + +[[package]] +name = "packaging" +version = "23.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, + {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, +] + +[[package]] +name = "pandas" +version = "2.1.1" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58d997dbee0d4b64f3cb881a24f918b5f25dd64ddf31f467bb9b67ae4c63a1e4"}, + {file = "pandas-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02304e11582c5d090e5a52aec726f31fe3f42895d6bfc1f28738f9b64b6f0614"}, + {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffa8f0966de2c22de408d0e322db2faed6f6e74265aa0856f3824813cf124363"}, + {file = "pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1f84c144dee086fe4f04a472b5cd51e680f061adf75c1ae4fc3a9275560f8f4"}, + {file = "pandas-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:75ce97667d06d69396d72be074f0556698c7f662029322027c226fd7a26965cb"}, + {file = "pandas-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:4c3f32fd7c4dccd035f71734df39231ac1a6ff95e8bdab8d891167197b7018d2"}, + {file = "pandas-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9e2959720b70e106bb1d8b6eadd8ecd7c8e99ccdbe03ee03260877184bb2877d"}, + {file = "pandas-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:25e8474a8eb258e391e30c288eecec565bfed3e026f312b0cbd709a63906b6f8"}, + {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8bd1685556f3374520466998929bade3076aeae77c3e67ada5ed2b90b4de7f0"}, + {file = "pandas-2.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc3657869c7902810f32bd072f0740487f9e030c1a3ab03e0af093db35a9d14e"}, + {file = "pandas-2.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:05674536bd477af36aa2effd4ec8f71b92234ce0cc174de34fd21e2ee99adbc2"}, + {file = "pandas-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:b407381258a667df49d58a1b637be33e514b07f9285feb27769cedb3ab3d0b3a"}, + {file = "pandas-2.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c747793c4e9dcece7bb20156179529898abf505fe32cb40c4052107a3c620b49"}, + {file = "pandas-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3bcad1e6fb34b727b016775bea407311f7721db87e5b409e6542f4546a4951ea"}, + {file = "pandas-2.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5ec7740f9ccb90aec64edd71434711f58ee0ea7f5ed4ac48be11cfa9abf7317"}, + {file = 
"pandas-2.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29deb61de5a8a93bdd033df328441a79fcf8dd3c12d5ed0b41a395eef9cd76f0"}, + {file = "pandas-2.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4f99bebf19b7e03cf80a4e770a3e65eee9dd4e2679039f542d7c1ace7b7b1daa"}, + {file = "pandas-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:84e7e910096416adec68075dc87b986ff202920fb8704e6d9c8c9897fe7332d6"}, + {file = "pandas-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366da7b0e540d1b908886d4feb3d951f2f1e572e655c1160f5fde28ad4abb750"}, + {file = "pandas-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e50e72b667415a816ac27dfcfe686dc5a0b02202e06196b943d54c4f9c7693e"}, + {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1ab6a25da197f03ebe6d8fa17273126120874386b4ac11c1d687df288542dd"}, + {file = "pandas-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0dbfea0dd3901ad4ce2306575c54348d98499c95be01b8d885a2737fe4d7a98"}, + {file = "pandas-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0489b0e6aa3d907e909aef92975edae89b1ee1654db5eafb9be633b0124abe97"}, + {file = "pandas-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:4cdb0fab0400c2cb46dafcf1a0fe084c8bb2480a1fa8d81e19d15e12e6d4ded2"}, + {file = "pandas-2.1.1.tar.gz", hash = "sha256:fecb198dc389429be557cde50a2d46da8434a17fe37d7d41ff102e3987fd947b"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +aws = ["s3fs (>=2022.05.0)"] +clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] +compression = ["zstandard (>=0.17.0)"] +computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2022.05.0)"] +gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] +hdf5 = ["tables (>=3.7.0)"] +html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] +mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] +spss = ["pyreadstat (>=1.1.5)"] +sql-other = ["SQLAlchemy 
(>=1.4.36)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.8.0)"] + +[[package]] +name = "patsy" +version = "0.5.3" +description = "A Python package for describing statistical models and for building design matrices." +optional = false +python-versions = "*" +files = [ + {file = "patsy-0.5.3-py2.py3-none-any.whl", hash = "sha256:7eb5349754ed6aa982af81f636479b1b8db9d5b1a6e957a6016ec0534b5c86b7"}, + {file = "patsy-0.5.3.tar.gz", hash = "sha256:bdc18001875e319bc91c812c1eb6a10be4bb13cb81eb763f466179dca3b67277"}, +] + +[package.dependencies] +numpy = ">=1.4" +six = "*" + +[package.extras] +test = ["pytest", "pytest-cov", "scipy"] + +[[package]] +name = "py-cpuinfo" +version = "9.0.0" +description = "Get CPU info with pure Python" +optional = false +python-versions = "*" +files = [ + {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, + {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, +] + +[[package]] +name = "pydot" +version = "1.4.2" +description = "Python interface to Graphviz's Dot" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pydot-1.4.2-py2.py3-none-any.whl", hash = "sha256:66c98190c65b8d2e2382a441b4c0edfdb4f4c025ef9cb9874de478fb0793a451"}, + {file = "pydot-1.4.2.tar.gz", hash = "sha256:248081a39bcb56784deb018977e428605c1c758f10897a339fce1dd728ff007d"}, +] + +[package.dependencies] +pyparsing = ">=2.1.4" + +[[package]] +name = "pyham" +version = "1.1.12" +description = "A tool to analyse Hierarchical Orthologous Groups (HOGs)" +optional = false +python-versions = "*" +files = [ + {file = "pyham-1.1.12-py2.py3-none-any.whl", hash = "sha256:a26b47bb9a9f5b961cb31ba1f271eef7b46dafa8829913e0aace207fd2aef37f"}, + {file = "pyham-1.1.12.tar.gz", hash = "sha256:c2ec409ddae705670b2e0ed6870f0c8eb8ba8eb69f036ecdeb3c559ab9f7eb68"}, +] + +[package.dependencies] +ete3 = ">=3.1" +future = "*" +lxml = "*" +requests = "*" +scipy = "*" +six = "*" + +[package.extras] +dev = ["fabric", "fabric3", "nose", "sphinx", "twine", "wheel"] +test = ["nose"] + +[[package]] +name = "pyoma" +version = "0.12.1" +description = "library to interact and build OMA hdf5 files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyoma-0.12.1-py3-none-any.whl", hash = "sha256:3f04310ed62a049b758763005e8bc23591c34f3626809291a518d22fc27b1252"}, + {file = "pyoma-0.12.1.tar.gz", hash = "sha256:27667492eb0b465c5e2c561bf94121dee74ae4c986533cb3d475a6fc78aebfba"}, +] + +[package.dependencies] +biopython = {version = ">=1.76", markers = "python_version >= \"3.6\""} +datasketch = "*" +ete3 = "*" +future = "*" +fuzzyset2 = ">=0.1.1" +networkx = "*" +numpy = ">=1.16" +pandas = ">=0.22" +pyopa = ">=0.8" +tables = ">=3.5.1" +tqdm = "*" + +[package.extras] +create-db = ["PySAIS", "familyanalyzer (>=0.7.3)", "lark-parser", "matplotlib", "pebble", "pyham", "scikit-fuzzy", "scikit-learn"] +docs = ["sphinx"] +notebooks = ["jupyter", "matplotlib", "seaborn"] + +[[package]] +name = "pyopa" +version = "0.8.4" +description = "PyOPA - optimal pairwise sequence alignments" +optional = false +python-versions = "*" +files = [ + {file = "pyopa-0.8.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1fd0b6dbfcd0397390065f0cecb973cc0daf1d98e2473bc4d4d0bc5cb7aa8b30"}, + {file = "pyopa-0.8.4-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:bdea0dc270fcd6496b1c932e8762b0f7919cd6a9a26dd152bd40041300684a17"}, + {file = "pyopa-0.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:067e0f692077c83b5b3344fbbd64f5d7439054ee7e986796cd1ec090885e1482"}, + {file = "pyopa-0.8.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f184f4b36fc8e4dd03c59a600b3256f12d39d02fb790469309931c87f78306f4"}, + {file = "pyopa-0.8.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1d08b21177d97bfdaee90148ece8e2b3c7579c4a211638ea5852b6607b464da3"}, + {file = "pyopa-0.8.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7699a06132a614915a8927d7d1367ef9e83f8707b17880a091be9b4638332d7a"}, + {file = "pyopa-0.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40517c93fcaa44608b94dd58b446e4a6a8d2ad52efa42b6c7bc99288474479"}, + {file = "pyopa-0.8.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:23a4627a018de0eb9539e158d77d4016af93c147bcbcb07cb20f7d8be7ccc824"}, + {file = "pyopa-0.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:db24b34516d04ad51ec38391f92f7a3900dadd2ffaff43294be6b1405e028be3"}, + {file = "pyopa-0.8.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37261da0339e77bb1c80b01844906db3df19817e277e334681621d4e03e7c951"}, + {file = "pyopa-0.8.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:37eaf805ce193eb288a38e4b4731f0a0f7f1a1b8b7c904efdfb8ed53cc084fbe"}, + {file = "pyopa-0.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:029fcacb44d2bcd1f5dced906037b4e17c6eb583cb15c6b52a208f3d28177520"}, + {file = "pyopa-0.8.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d897e5d38c86b6e0b1418449c420ac84f18baf000278ea0bfd47568a85b02463"}, + {file = "pyopa-0.8.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:be72a0a6a97f023bb8ffe1612cbe82487f08502ae4a315644abe3c8918fb2250"}, + {file = "pyopa-0.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:67b8204023ee0926bd755f735946dc1fc11ada09ec7d4e2835cf80f300bf043c"}, + {file = "pyopa-0.8.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:841e8fb227d1305853f87e454b6e186962b5bb8efcb82880fee1ebe4f43a5fc6"}, + {file = "pyopa-0.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0b07a8d01cc83ad99b2b958f2f829f2c62b04f2ff7b31c0fc9380714c03ba6b"}, + {file = "pyopa-0.8.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f006a1eb5fa848b5cecc2fa8bcbffb2201933504ea3fff08608b210210c778bd"}, + {file = "pyopa-0.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ff16bc7bb7a3f488c06f1aa0a5e7aa0268afe5bec77830276a36ef873802d4d2"}, + {file = "pyopa-0.8.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:274301ea0bad35f16a66484b9c2fb0a10cf3302abbad565454447f3846f12156"}, + {file = "pyopa-0.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc7784364e255bfad3bc102a4b88c891216db03d7104fe9f26f2b8e488f9cac6"}, + {file = "pyopa-0.8.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d375a5092e22a5635387270b2fbbc5aa380810ea58993f82d1b81d0e4950be51"}, + {file = "pyopa-0.8.4.tar.gz", hash = "sha256:f83d1a7fddeb8e5d4abd63c38c9f638b2330ef323b2501031b4f9efa67abe819"}, +] + +[package.dependencies] +numpy = "*" + +[[package]] +name = "pyparsing" +version = "3.1.1" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.1-py3-none-any.whl", hash = 
"sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, + {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2023.3.post1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2023.3.post1-py2.py3-none-any.whl", hash = "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7"}, + {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, +] + +[[package]] +name = "rapidfuzz" +version = "3.4.0" +description = "rapid fuzzy string matching" +optional = false +python-versions = ">=3.7" +files = [ + {file = "rapidfuzz-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1438e68fe8869fe6819a313140e98641b34bfc89234b82486d8fd02044a067e8"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:59f851c7a54a9652b9598553547e0940244bfce7c9b672bac728efa0b9028d03"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6286510910fcd649471a7f5b77fcc971e673729e7c84216dbf321bead580d5a1"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87409e12f9a82aa33a5b845c49dd8d5d4264f2f171f0a69ddc638e100fcc50de"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1d81d380ceabc8297880525c9d8b9e93fead38d3d2254e558c36c18aaf2553f"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a716efcfc92659d8695291f07da4fa60f42a131dc4ceab583931452dd5662e92"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83387fb81c4c0234b199110655779762dd5982cdf9de4f7c321110713193133e"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55efb3231bb954f3597313ebdf104289b8d139d5429ad517051855f84e12b94e"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:51d47d52c890cbdb2d8b2085d747e557f15efd9c990cb6ae624c8f6948c4aa3a"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3db79070888d0dcd4f6a20fd30b8184dd975d6b0f7818acff5d7e07eba19b71f"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:46efc5e4675e2bd5118427513f86eaf3689e1482ebd309ad4532bcefae78179d"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:d15c364c5aa8f032dadf5b82fa02b7a4bd9688a961a27961cd5b985203f58037"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f1e91460baa42f5408f3c062913456a24b2fc1a181959b58a9c06b5eef700ca6"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c7f4f6dac25c120de8845a65a97090658c8a976827ac22b6b86e2a16a60bb820"}, + {file = 
"rapidfuzz-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:124578029d926b2be32d60b748be95ee0de6cb2753eb49d6d1d6146269b428b9"}, + {file = "rapidfuzz-3.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:3af0384132e79fe6f6370d49347649382e04f689277525903bef84d30f3992fd"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:66ff93b81b382269dc7c2d46c839ce72e2d2331ad46a06321770bc94016fe236"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:da2764604a31fd1e3f1cacf226b43a871cc9f28844a3196c2a6b1ba52ae12922"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8eb33895353bfcc33ccf4b4bae837c0afb4eaf20a0361aa6f0800cef12505e91"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed3da08830c08c8bcd49414cc06b704a760d3067804775facc0df725b52085a4"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b38c7021f6114cfacba5717192fb3e1e50053261d49a774e645021a2f77e20a3"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5ea97886d2ec7b2b9a8172812a76e1d243f2ce705c2f24baf46f9ef5d3951"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b9a7ab061c1b75b274fc2ebd1d29cfa2e510c36e2f4cd9518a6d56d589003c8"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23b07685c21c93cdf6d68b49eccacfe975651b8d99ea8a02687400c60315e5bc"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c2a564f748497b6a5e08a1dc0ac06655f65377cf072c4f0e2c73818acc655d36"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ef30b5f2720f0acbcfba0e0661a4cc118621c47cf69b5fe92531dfed1e369e1c"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:ab981f9091ae8bd32bca9289fa1019b4ec656543489e7e13e64882d57d989282"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:a80f9aa4245a49e0677896d1b51b2b3bc36472aff7cec31c4a96f789135f03fe"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0d8c6cb80b5d2edf88bf6a88ac6827a353c974405c2d7e3025ed9527a5dbe1a6"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-win32.whl", hash = "sha256:c0150d521199277b5ad8bd3b060a5f3c1dbdf11df0533b4d79f458ef11d07e8c"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:bd50bc90167601963e2a90b820fb862d239ecb096a991bf3ce33ffaa1d6eedee"}, + {file = "rapidfuzz-3.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:bd10d68baabb63a3bb36b683f98fc481fcc62230e493e4b31e316bd5b299ef68"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7f497f850d46c5e08f3340343842a28ede5d3997e5d1cadbd265793cf47417e5"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a7d6a9f04ea1277add8943d4e144e59215009f54f2668124ff26dee18a875343"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b6fe2aff0d9b35191701714e05afe08f79eaea376a3a6ca802b72d9e5b48b545"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b81b8bc29114ca861fed23da548a837832b85495b0c1b2600e6060e3cf4d50aa"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:805dc2aa3ac295dcbf2df8c1e420e8a73b1f632d6820a5a1c8506d22c11e0f27"}, + {file = 
"rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1276c7f50cd90a48b00084feb25256135c9ace6c599295dd5932949ec30c0e70"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0b9197656a6d71483959bf7d216e7fb7a6b80ca507433bcb3015fb92abc266f8"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3456f4df5b8800315fd161045c996479016c112228e4da370d09ed80c24853e5"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:734046d557550589edb83d5ad1468a1341d1092f1c64f26fd0b1fc50f9efdce1"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:37d5f0fbad6c092c89840eea2c4c845564d40849785de74c5e6ff48b47b0ecf6"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:bfe14711b9a7b744e242a482c6cabb696517a1a9946fc1e88d353cd3eb384788"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a733c10b1fcc47f837c23ab4a255cc4021a88939ff81baa64d6738231cba33d"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:929e6b71e5b36caee2ee11c209e75a0fcbd716a1b76ae6162b89ee9b591b63b1"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-win32.whl", hash = "sha256:c56073ba1d1b25585359ad9769163cb2f3183e7a03c03b914a0667fcbd95dc5c"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:bf58ba21df06fc8aeef3056fd137eca0a593c2f5c82923a4524d251dc5f3df5d"}, + {file = "rapidfuzz-3.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:f3effbe9c677658b3149da0d2778a740a6b7d8190c1407fd0c0770a4e223cfe0"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ed0d5761b44d9dd87278d5c32903bb55632346e4d84ea67ba2e4a84afc3b7d45"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bafbd3e2e9e0b5f740f66155cc7e1e23eee1e1f2c44eff12daf14f90af0e8ab"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2543fd8d0fb3b1ac065bf94ee54c0ea33343c62481d8e54b6117a88c92c9b721"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93ceb62ade1a0e62696487274002157a58bb751fc82cd25016fc5523ba558ca5"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76f4162ce5fe08609455d318936ed4aa709f40784be61fb4e200a378137b0230"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f723197f2dbce508a7030dcf6d3fc940117aa54fc876021bf6f6feeaf3825ba1"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:cfdc74afd93ac71270b5be5c25cb864b733b9ae32b07495705a6ac294ac4c390"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:273c7c7f5b405f2f54d41e805883572d57e1f0a56861f93ca5a6733672088acb"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:712dd91d429afaddbf7e86662155f2ad9bc8135fca5803a01035a3c1d76c5977"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:9814905414696080d8448d6e6df788a0148954ab34d7cd8d75bcb85ba30e0b25"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:01013ee67fb15608c8c5961af3bc2b1f242cff94c19f53237c9b3f0edb8e0a2d"}, + {file = "rapidfuzz-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:8f5d2adc48c181486125d42230e80479a1e0568942e883d1ebdeb76cd3f83470"}, + {file = 
"rapidfuzz-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c92d847c997c384670e3b4cf6727cb73a4d7a7ba6457310e2083cf06d56013c4"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d0bda173b0ec1fa546f123088c0d42c9096304771b4c0555d4e08a66a246b3f6"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bbb05b1203f683b341f44ebe8fe38afed6e56f606094f9840d6406e4a7bf0eab"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f0075ff8990437923da42202b60cf04b5c122ee2856f0cf2344fb890cadecf57"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f295842c282fe7fe93bfe7a20e78f33f43418f47fb601f2f0a05df8a8282b43"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ebee7313719dfe652debb74bdd4024e8cf381a59adc6d065520ff927f3445f4"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f71454249ddd29d8ba5415ed7307e7b7493fc7e9018f1ff496127b8b9a8df94b"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:52c6b7a178f0e800488fa1aede17b00f6397cab0b79d48531504b0d89e45315f"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d38596c804a9f2bd49360c15e1f4afbf016f181fe37fc4f1a4ddd247d3e91e5"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8756461e7ee79723b8f762fc6db226e65eb453bf9fa64b14fc0274d4aaaf9e21"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e14799297f194a4480f373e45142ef16d5dc68a42084c0e2018e0bdba56a8fef"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f813fb663d90038c1171d30ea1b6b275e09fced32f1d12b972c6045d9d4233f2"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:0df66e07e42e2831fae84dea481f7803bec7cfa53c31d770e86ac47bb18dcd57"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b05c7d4b4ddb617e977d648689013e50e5688140ee03538d3760a3a11d4fa8a2"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-win32.whl", hash = "sha256:74b9a1c1fc139d325fb0b89ccc85527d27096a76f6ed690ee3378143cc38e91d"}, + {file = "rapidfuzz-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5fe3ef7daecd79f852936528e37528fd88818bc000991e0fea23b9ac5b79e875"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:61f16bb0f3026853500e7968261831a2e1a35d56947752bb6cf6953afd70b9de"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d188e8fb5a9709931c6a48cc62c4ac9b9d163969333711e426d9dbd134c1489b"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c006aa481d1b91c2600920ce16e42d208a4b6f318d393aef4dd2172d568f2641"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02afbe7ed12e9191082ed7bda43398baced1d9d805302b7b010d397de3ae973f"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01d64710060bc3241c08ac1f1a9012c7184f3f4c3d6e2eebb16c6093a03f6a67"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3198f70b97127e52a4f96bb2f7de447f89baa338ff398eb126930c8e3137ad1"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50ad7bac98a0f00492687eddda73d2c0bdf71c78b52fddaa5901634ae323d3ce"}, + {file = 
"rapidfuzz-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc3efc06db79e818f4a6783a4e001b3c8b2c61bd05c0d5c4d333adaf64ed1b34"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:75d1365387ec8ef2128fd7e2f7436aa1a04a1953bc6d7068835bb769cd07c146"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a0750278693525b5ce58d3b313e432dfa5d90f00d06ae54fa8cde87f2a397eb0"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2e49151572b842d290dcee2cc6f9ce7a7b40b77cc20d0f6d6b54e7afb7bafa5c"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:8b38d7677b2f20b137bb7aaf0dcd3d8ac2a2cde65f09f5621bf3f57d9a1e5d6e"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d904ac97f2e370f91e8170802669c8ad68641bf84d742968416b53c5960410c6"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-win32.whl", hash = "sha256:53bbef345644eac1c2d7cc21ade4fe9554fa289f60eb2c576f7fdc454dbc0641"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:233bf022938c38060a93863ec548e624d69a56d7384634d8bea435b915b88e52"}, + {file = "rapidfuzz-3.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:63933792146f3d333680d415cecc237e6275b42ad948d0a798f9a81325517666"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e182ea5c809e7ed36ebfbcef4bb1808e213d27b33c036007a33bcbb7ba498356"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e1142c8d35fa6f3af8150d02ff8edcbea3723c851d889e8b2172e0d1b99f3f7"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b8258846e56b03230fa733d29bb4f9fb1f4790ac97d1ebe9faa3ff9d2850999"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:950d1dfd2927cd45c9bb2927933926718f0a17792841e651d42f4d1cb04a5c1d"}, + {file = "rapidfuzz-3.4.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:dd54dd0355225dc3c1d55e233d510adcccee9bb25d656b4cf1136114b92e7bf3"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f5921780e7995e9ac3cea41fa57b623159d7295788618d3f2946d61328c25c25"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc4b1b69a64d337c40fa07a721dae1b1550d90f17973fb348055f6440d597e26"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f5c8b901b6d3be63591c68e2612f76ad85af27193d0a88d4d87bb047aeafcb3"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c67f5ced39aff6277dd772b239ef8aa8fc810200a3b42f69ddbb085ea0e18232"}, + {file = "rapidfuzz-3.4.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4fd94acab871afbc845400814134a83512a711e824dc2c9a9776d6123464a221"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:437508ec1ea6e71a77126715ac6208cb9c3e74272536ebfa79be9dd008cfb85f"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7215f7c5de912b364d5cf7c4c66915ccf4acf71aafbb8da62ad346569196e15"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:698488002eb7be2f737e48679ed0cd310b76291f26d8ec792db8345d13eb6573"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e77873126eb07e7461f0b675263e6c5d42c8a952e88e4a44eeff96f237b2b024"}, + {file = "rapidfuzz-3.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:28d03cd33817f6e0bea9b618b460f85ff9c9c3fedc6c19cfa0992f719a0d1801"}, + {file = "rapidfuzz-3.4.0.tar.gz", hash = "sha256:a74112e2126b428c77db5e96f7ce34e91e750552147305b2d361122cbede2955"}, +] + +[package.extras] +full = ["numpy"] + +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "scipy" +version = "1.11.3" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ + {file = "scipy-1.11.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:370f569c57e1d888304052c18e58f4a927338eafdaef78613c685ca2ea0d1fa0"}, + {file = "scipy-1.11.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:9885e3e4f13b2bd44aaf2a1a6390a11add9f48d5295f7a592393ceb8991577a3"}, + {file = "scipy-1.11.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e04aa19acc324a1a076abb4035dabe9b64badb19f76ad9c798bde39d41025cdc"}, + {file = "scipy-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1a8a4657673bfae1e05e1e1d6e94b0cabe5ed0c7c144c8aa7b7dbb774ce5c1"}, + {file = "scipy-1.11.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7abda0e62ef00cde826d441485e2e32fe737bdddee3324e35c0e01dee65e2a88"}, + {file = "scipy-1.11.3-cp310-cp310-win_amd64.whl", hash = "sha256:033c3fd95d55012dd1148b201b72ae854d5086d25e7c316ec9850de4fe776929"}, + {file = "scipy-1.11.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:925c6f09d0053b1c0f90b2d92d03b261e889b20d1c9b08a3a51f61afc5f58165"}, + {file = "scipy-1.11.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5664e364f90be8219283eeb844323ff8cd79d7acbd64e15eb9c46b9bc7f6a42a"}, + {file = "scipy-1.11.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00f325434b6424952fbb636506f0567898dca7b0f7654d48f1c382ea338ce9a3"}, + {file = "scipy-1.11.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f290cf561a4b4edfe8d1001ee4be6da60c1c4ea712985b58bf6bc62badee221"}, + {file = "scipy-1.11.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:91770cb3b1e81ae19463b3c235bf1e0e330767dca9eb4cd73ba3ded6c4151e4d"}, + {file = "scipy-1.11.3-cp311-cp311-win_amd64.whl", hash = "sha256:e1f97cd89c0fe1a0685f8f89d85fa305deb3067d0668151571ba50913e445820"}, + {file = "scipy-1.11.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dfcc1552add7cb7c13fb70efcb2389d0624d571aaf2c80b04117e2755a0c5d15"}, + {file = "scipy-1.11.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:0d3a136ae1ff0883fffbb1b05b0b2fea251cb1046a5077d0b435a1839b3e52b7"}, + {file = "scipy-1.11.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bae66a2d7d5768eaa33008fa5a974389f167183c87bf39160d3fefe6664f8ddc"}, + {file = "scipy-1.11.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:d2f6dee6cbb0e263b8142ed587bc93e3ed5e777f1f75448d24fb923d9fd4dce6"}, + {file = "scipy-1.11.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:74e89dc5e00201e71dd94f5f382ab1c6a9f3ff806c7d24e4e90928bb1aafb280"}, + {file = "scipy-1.11.3-cp312-cp312-win_amd64.whl", hash = "sha256:90271dbde4be191522b3903fc97334e3956d7cfb9cce3f0718d0ab4fd7d8bfd6"}, + {file = "scipy-1.11.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a63d1ec9cadecce838467ce0631c17c15c7197ae61e49429434ba01d618caa83"}, + {file = "scipy-1.11.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:5305792c7110e32ff155aed0df46aa60a60fc6e52cd4ee02cdeb67eaccd5356e"}, + {file = "scipy-1.11.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ea7f579182d83d00fed0e5c11a4aa5ffe01460444219dedc448a36adf0c3917"}, + {file = "scipy-1.11.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c77da50c9a91e23beb63c2a711ef9e9ca9a2060442757dffee34ea41847d8156"}, + {file = "scipy-1.11.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:15f237e890c24aef6891c7d008f9ff7e758c6ef39a2b5df264650eb7900403c0"}, + {file = "scipy-1.11.3-cp39-cp39-win_amd64.whl", hash = "sha256:4b4bb134c7aa457e26cc6ea482b016fef45db71417d55cc6d8f43d799cdf9ef2"}, + {file = "scipy-1.11.3.tar.gz", hash = "sha256:bba4d955f54edd61899776bad459bf7326e14b9fa1c552181f0479cc60a568cd"}, +] + +[package.dependencies] +numpy = ">=1.21.6,<1.28.0" + +[package.extras] +dev = ["click", "cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupytext", "matplotlib (>2)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-design (>=0.2.0)"] +test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "statsmodels" +version = "0.14.0" +description = "Statistical computations and models for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "statsmodels-0.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16bfe0c96a53b20fa19067e3b6bd2f1d39e30d4891ea0d7bc20734a0ae95942d"}, + {file = "statsmodels-0.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5a6a0a1a06ff79be8aa89c8494b33903442859add133f0dda1daf37c3c71682e"}, + {file = "statsmodels-0.14.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77b3cd3a5268ef966a0a08582c591bd29c09c88b4566c892a7c087935234f285"}, + {file = "statsmodels-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c64ebe9cf376cba0c31aed138e15ed179a1d128612dd241cdf299d159e5e882"}, + {file = "statsmodels-0.14.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:229b2f676b4a45cb62d132a105c9c06ca8a09ffba060abe34935391eb5d9ba87"}, + {file = "statsmodels-0.14.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb471f757fc45102a87e5d86e87dc2c8c78b34ad4f203679a46520f1d863b9da"}, + {file = "statsmodels-0.14.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:582f9e41092e342aaa04920d17cc3f97240e3ee198672f194719b5a3d08657d6"}, + {file = "statsmodels-0.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7ebe885ccaa64b4bc5ad49ac781c246e7a594b491f08ab4cfd5aa456c363a6f6"}, + {file = "statsmodels-0.14.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b587ee5d23369a0e881da6e37f78371dce4238cf7638a455db4b633a1a1c62d6"}, + {file = "statsmodels-0.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef7fa4813c7a73b0d8a0c830250f021c102c71c95e9fe0d6877bcfb56d38b8c"}, + {file = "statsmodels-0.14.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afe80544ef46730ea1b11cc655da27038bbaa7159dc5af4bc35bbc32982262f2"}, + {file = "statsmodels-0.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:a6ad7b8aadccd4e4dd7f315a07bef1bca41d194eeaf4ec600d20dea02d242fce"}, + {file = "statsmodels-0.14.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0eea4a0b761aebf0c355b726ac5616b9a8b618bd6e81a96b9f998a61f4fd7484"}, + {file = "statsmodels-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4c815ce7a699047727c65a7c179bff4031cff9ae90c78ca730cfd5200eb025dd"}, + {file = "statsmodels-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:575f61337c8e406ae5fa074d34bc6eb77b5a57c544b2d4ee9bc3da6a0a084cf1"}, + {file = "statsmodels-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8be53cdeb82f49c4cb0fda6d7eeeb2d67dbd50179b3e1033510e061863720d93"}, + {file = "statsmodels-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:6f7d762df4e04d1dde8127d07e91aff230eae643aa7078543e60e83e7d5b40db"}, + {file = "statsmodels-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc2c7931008a911e3060c77ea8933f63f7367c0f3af04f82db3a04808ad2cd2c"}, + {file = "statsmodels-0.14.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3757542c95247e4ab025291a740efa5da91dc11a05990c033d40fce31c450dc9"}, + {file = "statsmodels-0.14.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:de489e3ed315bdba55c9d1554a2e89faa65d212e365ab81bc323fa52681fc60e"}, + {file = "statsmodels-0.14.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e290f4718177bffa8823a780f3b882d56dd64ad1c18cfb4bc8b5558f3f5757"}, + {file = "statsmodels-0.14.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71054f9dbcead56def14e3c9db6f66f943110fdfb19713caf0eb0f08c1ec03fd"}, + {file = "statsmodels-0.14.0-cp38-cp38-win_amd64.whl", hash = "sha256:d7fda067837df94e0a614d93d3a38fb6868958d37f7f50afe2a534524f2660cb"}, + {file = "statsmodels-0.14.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1c7724ad573af26139a98393ae64bc318d1b19762b13442d96c7a3e793f495c3"}, + {file = "statsmodels-0.14.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3b0a135f3bfdeec987e36e3b3b4c53e0bb87a8d91464d2fcc4d169d176f46fdb"}, + {file = "statsmodels-0.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce28eb1c397dba437ec39b9ab18f2101806f388c7a0cf9cdfd8f09294ad1c799"}, + {file = "statsmodels-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68b1c768dd94cc5ba8398121a632b673c625491aa7ed627b82cb4c880a25563f"}, + {file = "statsmodels-0.14.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d1e3e10dfbfcd58119ba5a4d3c7d519182b970a2aebaf0b6f539f55ae16058d"}, + {file = "statsmodels-0.14.0.tar.gz", hash = "sha256:6875c7d689e966d948f15eb816ab5616f4928706b180cf470fd5907ab6f647a4"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.18", 
markers = "python_version != \"3.10\" or platform_system != \"Windows\" or platform_python_implementation == \"PyPy\""}, + {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, +] +packaging = ">=21.3" +pandas = ">=1.0" +patsy = ">=0.5.2" +scipy = ">=1.4,<1.9.2 || >1.9.2" + +[package.extras] +build = ["cython (>=0.29.26)"] +develop = ["colorama", "cython (>=0.29.26)", "cython (>=0.29.28,<3.0.0)", "flake8", "isort", "joblib", "matplotlib (>=3)", "oldest-supported-numpy (>=2022.4.18)", "pytest (>=7.0.1,<7.1.0)", "pytest-randomly", "pytest-xdist", "pywinpty", "setuptools-scm[toml] (>=7.0.0,<7.1.0)"] +docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "numpydoc", "pandas-datareader", "sphinx"] + +[[package]] +name = "tables" +version = "3.9.1" +description = "Hierarchical datasets for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "tables-3.9.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:784c1ffe7f972e69a9c97c0f164064e43617727668df4333802a7f23cfb06ee3"}, + {file = "tables-3.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af92f1e63b9fcadea621ab544540b7312553ea4f9456cf3d2728b48346fa557c"}, + {file = "tables-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f725f69d49f414736de24616b4ffa400127b86417bd14a11854aacd2a505b4d"}, + {file = "tables-3.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:e346249116b2eb95dd9277336c12f0d10d5328a5a3e8e16c74faa3c815817dc3"}, + {file = "tables-3.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f49e899247b541ed69d12fef10b5505b97243317a91b93927328c19a15d38671"}, + {file = "tables-3.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d1f2c947d63019db20728c6ecec39a1c900be00a65cae8025ac770148b641e8"}, + {file = "tables-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb89fab4a3c3cd98bd781913234e1f67464ff6e17662180cf718e67645a09271"}, + {file = "tables-3.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:aa176e1c72b0f935b0e607218ea8302378a39ed4fef5a544ebbd8d0523b56b86"}, + {file = "tables-3.9.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f482aaaa4b12d394421013cd4617d3e8a53a8d4a7a872454f7a13fb16c51a68e"}, + {file = "tables-3.9.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1813c0eced77540598987db32ce9e619d02b6032acdc3f59590d83c13bdb910c"}, + {file = "tables-3.9.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a64ce39652a2e2934f6d41500b2c6f8d4922e2022f1361e2302f3e85df4e2393"}, + {file = "tables-3.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:b49015aa8f576c6d5108c4aeb4d430bfcfc91ee8d0cca4d03e574e5485ffdc8b"}, + {file = "tables-3.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50140091af9d60eb3f806d3ee43f542beae569888c37ae96d6a1c887c389d8c8"}, + {file = "tables-3.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:282a0747b3ce4e3108bcd443361e031c9817bf7e84358317723a51b9c02c5655"}, + {file = "tables-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0295123272bb49efbebdc9b1e2b72baa99c5761b78fccacedbf44c52a5fa51ac"}, + {file = "tables-3.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:22084019437c504917ba8c0b2af75419e3d5c8ffc6d2ef4cd44031f06939518c"}, + {file = "tables-3.9.1.tar.gz", hash = "sha256:48331503cd509c9f1f95cf2f5c64a57c48c0aa5141423f0eca352965c4f9bf81"}, +] 
+ +[package.dependencies] +blosc2 = ">=2.2.8" +numexpr = ">=2.6.2" +numpy = ">=1.19.0" +packaging = "*" +py-cpuinfo = "*" + +[package.extras] +doc = ["ipython", "numpydoc", "sphinx (>=1.1,<6)", "sphinx-rtd-theme"] + +[[package]] +name = "tqdm" +version = "4.66.1" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + +[[package]] +name = "urllib3" +version = "2.0.6" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.7" +files = [ + {file = "urllib3-2.0.6-py3-none-any.whl", hash = "sha256:7a7c7003b000adf9e7ca2a377c9688bbc54ed41b985789ed576570342a375cd2"}, + {file = "urllib3-2.0.6.tar.gz", hash = "sha256:b19e1a85d206b56d7df1d5e683df4a7725252a964e3993648dd0fb5a1c157564"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "xlsxwriter" +version = "3.1.6" +description = "A Python module for creating Excel XLSX files." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.1.6-py3-none-any.whl", hash = "sha256:fc3838232f9f50763c1e81a3b381c6ad559dcdcd0983ee239bf54556392b4f3f"}, + {file = "XlsxWriter-3.1.6.tar.gz", hash = "sha256:2087abdaa4a5e981a3ae50b5c21ff1adae59c8fecb6157808585fc169a6bfcd9"}, +] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.10,<3.13" +content-hash = "929e4f186404a38e3a3b0ca423cad177b5271758033b5e306e33e25dae89387b" diff --git a/pyproject.toml b/pyproject.toml index 6ce4573..08b57d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,59 +1,21 @@ -[build-system] -requires = ["setuptools>=61.0"] - - -build-backend = "setuptools.build_meta" -[project] -name = "HogProf" -dynamic = ["entry-points"] -version = "0.0.8" -authors = [ - { name="Dave Moi", email="dmoi@unil.ch" }, -] -dependencies = ["biopython", -"certifi", -"chardet", -"datasketch", -"ete3", -"future", -"goatools", -"h5py", -"idna", -"lxml", -"numexpr", -"numpy", -"pandas", -"pyham>=1.1.10", -"pyoma", -"pyopa", -"python-dateutil", -"pytz", -"requests", -"scipy", -"six", -"tables", -"urllib3", -"tqdm" -] -description = "Phylogenetic Profiling with OMA and minhashing" +[tool.poetry] +name = "hogprof" +version = "0.1.0" +description = "" +authors = ["Your Name "] readme = "README.md" -license = { file="LICENSE" } +[tool.poetry.dependencies] +python = ">=3.10,<3.13" +tables = "^3.9.1" +pyoma = "^0.12.1" +pandas = "^2.1.1" +h5py = "^3.9.0" +ete3 = "^3.1.3" +pyham = "^1.1.12" +goatools = "^1.3.9" -requires-python = ">=3.7" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] -[project.urls] -"Homepage" = "https://github.com/DessimozLab/HogProf" -"Bug Tracker" = "https://github.com/DessimozLab/HogProf/issues" -"Docs" = "https://dessimozlab.github.io/HogProf/" - -[tool.setuptools] -package-dir = {"" = "src"} - -[tool.setuptools.packages.find] -where = ["src"] \ No newline at end of file +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/src/HogProf.egg-info/PKG-INFO b/src/HogProf.egg-info/PKG-INFO deleted file mode 100644 index 38ed72e..0000000 --- a/src/HogProf.egg-info/PKG-INFO +++ /dev/null @@ -1,113 +0,0 @@ -Metadata-Version: 2.1 -Name: HogProf -Version: 0.0.8 -Summary: Phylogenetic Profiling with OMA and minhashing -Author-email: Dave Moi -License: MIT License - - Copyright (c) 2019 David Moi - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
- -Project-URL: Homepage, https://github.com/DessimozLab/HogProf -Project-URL: Bug Tracker, https://github.com/DessimozLab/HogProf/issues -Project-URL: Docs, https://dessimozlab.github.io/HogProf/ -Classifier: Programming Language :: Python :: 3 -Classifier: License :: OSI Approved :: MIT License -Classifier: Operating System :: OS Independent -Requires-Python: >=3.7 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: biopython -Requires-Dist: certifi -Requires-Dist: chardet -Requires-Dist: datasketch -Requires-Dist: ete3 -Requires-Dist: future -Requires-Dist: goatools -Requires-Dist: h5py -Requires-Dist: idna -Requires-Dist: lxml -Requires-Dist: numexpr -Requires-Dist: numpy -Requires-Dist: pandas -Requires-Dist: pyham>=1.1.10 -Requires-Dist: pyoma -Requires-Dist: pyopa -Requires-Dist: python-dateutil -Requires-Dist: pytz -Requires-Dist: requests -Requires-Dist: scipy -Requires-Dist: six -Requires-Dist: tables -Requires-Dist: urllib3 -Requires-Dist: tqdm - -# HogProf - - HogProf is an extensible and tunable approach to phylogenetic profiling using orthology data. It is powered by minhash-based data structures and is computationally efficient. - - Still under major development and may change. - -# Features - - - Using orthoxml files and a taxonomy, enhanced phylogenies are calculated for each gene family - - These are transformed into minhash signatures and a locality-sensitive hashing forest object for search and comparison of profiles - - Taxonomic levels and evolutionary event types (presence, loss, duplication) can have custom weights in profile construction - - Optimization of weights using machine learning - -If you run into any problems, feel free to contact me at [dmoi@unil.ch](dmoi@unil.ch) - -# Quickstart - -To install from GitHub: -``` -$ git clone https://github.com/DessimozLab/HogProf.git -$ pip install -r pipreqs.txt . -``` -Or to install from PyPI: -``` -$ pip install hogprof -``` - - -Let's get a current version of the OMA HDF5 file and GAF. This will allow us to use the HOGs and study the functional enrichment of our search results. - -``` -$ cd ../.. -$ mkdir YourOmaDirectory -$ cd YourOmaDirectory -$ wget https://omabrowser.org/All/OmaServer.h5 -$ wget https://omabrowser.org/All/oma-go.txt.gz -``` - -We also need to make a location to store our pyprofiler databases. - -``` -$ cd .. -$ mkdir YourHogProfDirectory -``` - -Ok, we're ready! Now let's compile a database containing all HOGs and our desired taxonomic levels using default settings. Launch the lshbuilder. -The dbtypes available on the command line are: all, plants, archaea, bacteria, eukarya, protists, fungi, metazoa and vertebrates. These use the NCBI taxonomy as a tree to annotate events in each gene family's history. -``` -$ python lshbuilder.py --outpath YourHogProfDirectory --dbtype all --OMA YourOmaDirectory/OmaServer.h5 --nthreads numberOfCPUcores - -``` -This should build a taxonomic tree for the genomes contained in the release and then calculate enhanced phylogenies for all HOGs in OMA. - -Once the database is complete, it can be interrogated using a profiler object. Construction and usage of this object should be done from a Python script or notebook. This is shown in the example notebook searchenrich.ipynb found in the examples. Please feel free to modify it to suit the needs of your own research.
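To make the interrogation step above concrete, here is a minimal sketch of what such a script might look like. The module path, the `Profiler` constructor arguments, the file names and the `hog_query` method shown here are assumptions made for illustration, not the package's confirmed API; the searchenrich.ipynb notebook in the examples remains the authoritative reference.

```
# Minimal sketch only: constructor arguments, file names and the query method
# are assumptions for illustration, not HogProf's documented API.
# See examples/searchenrich.ipynb for the authoritative workflow.
from HogProf import profiler  # assumed module layout

# Point the profiler at the database built by lshbuilder.py and at the OMA HDF5 file.
p = profiler.Profiler(
    lshforestpath='YourHogProfDirectory/newlshforest.pkl',  # assumed output file name
    hashes_path='YourHogProfDirectory/hashes.h5',           # assumed output file name
    oma='YourOmaDirectory/OmaServer.h5',
)

# Query the LSH forest with a HOG family id and retrieve the k most similar profiles.
results = p.hog_query(hog_id=12345, k=10)  # assumed method name and signature
print(results)
```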
diff --git a/src/HogProf/__pycache__/__init__.cpython-310.pyc b/src/HogProf/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 324adb6..0000000 Binary files a/src/HogProf/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/src/HogProf/__pycache__/profiler.cpython-310.pyc b/src/HogProf/__pycache__/profiler.cpython-310.pyc deleted file mode 100644 index bfa1130..0000000 Binary files a/src/HogProf/__pycache__/profiler.cpython-310.pyc and /dev/null differ diff --git a/src/HogProf/build/lib/notebooks/__init__.py b/src/HogProf/build/lib/notebooks/__init__.py deleted file mode 100755 index c049efd..0000000 --- a/src/HogProf/build/lib/notebooks/__init__.py +++ /dev/null @@ -1 +0,0 @@ -name = "PyProfiler" diff --git a/src/HogProf/build/lib/pyoma/__init__.py b/src/HogProf/build/lib/pyoma/__init__.py deleted file mode 100755 index f6327cb..0000000 --- a/src/HogProf/build/lib/pyoma/__init__.py +++ /dev/null @@ -1 +0,0 @@ -name = "pyoma" diff --git a/src/HogProf/build/lib/pyoma/browser/KmerEncoder.py b/src/HogProf/build/lib/pyoma/browser/KmerEncoder.py deleted file mode 100755 index d8e4144..0000000 --- a/src/HogProf/build/lib/pyoma/browser/KmerEncoder.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python -''' - DNA / AA to integer packer. i.e., base 5/21 <-> base 10. - Heavily adapted, but based on https://gist.github.com/bitsandbooks/2649444 - - NOTE: any sequences that are being decoded should be sanitised first. - - -- Alex Warwick Vesztrocy, May-June 2017 -''' -import numpy as np - - -# "digits" -DIGITS_AA = np.fromstring('ACDEFGHIKLMNPQRSTVWXY', dtype='S1') -DIGITS_DNA = np.fromstring('ACGTX', dtype='S1') - - -class KmerEncoder(object): - def __init__(self, k, is_protein=True): - ''' - Initialise the kmer converter. k is the kmer length. - If is_dna=True then DNA else AA. - ''' - self.digits = DIGITS_AA if is_protein else DIGITS_DNA - self.k = int(k) # Cast incase np-type for n below. - self.max = (len(self.digits) ** self.k) - 1 - self.n = self.decode(self.digits[-1] * self.k) + 1 - self._prot = np.zeros((self.k,), dtype='S1') - - def __len__(self): - ''' - Return the maximum integer-representation of the kmer length - in this converter. - ''' - return self.n - - def encode(self, seq): - ''' - Encode integer kmer in protein chars. - ''' - if seq <= self.max: - self._prot[:] = self.digits[0] - i = -1 - while seq > 0: - self._prot[i] = self.digits[seq % self.digits.size] - seq //= self.digits.size - i -= 1 - return self._prot.tostring() - else: - raise ValueError('{} Larger than largest kmer of size {}' - .format(seq, self.k)) - - def decode(self, seq): - ''' - Decode a protein kmer -> integer. NOTE: sanitisation to a byte - string required first. - ''' - x = 0 - for digit in seq[:self.k].decode('ascii'): - x = (x * self.digits.size) + np.searchsorted(self.digits, digit) - return x - - def decompose(self, seq): - ''' - Decompose a sequence into counts of its constituent (decoded) kmers. 
- ''' - for i in range(len(seq) - self.k + 1): - yield self.decode(seq[i:(i + self.k)]) diff --git a/src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py b/src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py deleted file mode 100755 index e16541e..0000000 --- a/src/HogProf/build/lib/pyoma/browser/OrthoXMLSplitter.py +++ /dev/null @@ -1,195 +0,0 @@ -from __future__ import unicode_literals, division -from builtins import str -import lxml.etree as etree -import os -import errno -import logging - -logger = logging.getLogger(__name__) - - -class OrthoXMLSplitter(object): - """Convert orthoxml files with several families. - - This class provides the means to extract a subset of root HOGs (i.e. - families) into a new output orthoxml file, or to split it and create - for each family an individual file. - - The object should be instantiated with the input orthoxml file and - optionally a cache_dir argument where the output orthoxml files will - be stored. This later parameter can be overwritten in the __call__ - method call that does the work. - - .. note:: - - Calls to the splitter will remove the created families from the - loaded input file, so subsequent calls that contain a family in - common will miss them from the second call onwards. - - - :Example: - - splitter = OrthoXMLSplitter("data.orthoxml", cache_dir="./splits") - splitter() - - will create files HOGxxxxxx.orthoxml in the ./splits directory.""" - - def __init__(self, xml_file, cache_dir=None): - self.xml_file = xml_file - if cache_dir is not None: - self._assert_cache_dir(cache_dir) - logger.info('loading xml file {}...'.format(xml_file)) - parser = etree.XMLParser(remove_blank_text=True) - self.Etree_XML = etree.parse(self.xml_file, parser=parser) - self.Etree_root = self.Etree_XML.getroot() - logger.info('building lookup table for genes') - self.gene_lookup = {gene.get('id'): gene for gene in self._iter_gene_elements()} - logger.info('init of OrthoXMLSplitter finished') - - def _assert_cache_dir(self, cache_dir): - # Ensure existance of cache directory (py2 compat) - try: - os.makedirs(cache_dir) - except OSError as exc: - if exc.errno == errno.EEXIST and os.path.isdir(cache_dir): - pass - else: - raise - self.cache_dir = cache_dir - - def _iter_gene_elements(self): - """This method is a faster version of xpath '//ns:gene'. - - It iterates the element in sequential order""" - for node in self.Etree_root: - if node.tag == "{http://orthoXML.org/2011/}species": - for gene in node.iter('{http://orthoXML.org/2011/}gene'): - yield gene - - def _iter_toplevel_groups(self): - """This method yields all the root hogs sequentially.""" - for node in self.Etree_root: - if node.tag == "{http://orthoXML.org/2011/}groups": - for root_hog in node: - yield root_hog - - def __call__(self, hogs_to_extract=None, single_hog_files=False, basename=None, cache_dir=None): - """Split/extract hogs from orthoxml file based on root hogs ids. - - Split the input orthoxml or extract a subset of root hogs. If no - argument is passed, one orthoxml file per root hog is created, - named as 'HOGxxxxxx.orthoxml', where xxxxxx is the numeric id of - each hog. - - The set of root hogs to be extracted can be limited by specifying - a subset of hog ids in the hogs_to_extract parameter. If - single_hog_files is set to true, each of these hogs will be converted - into a single orthoxml file named as explained above. If single_hog_files - is set to false, the whole subset of hogs will be stored in one - orthoxml file named as specified in `basename`. 
- - The file(s) will be stored in the cache_dir folder which can be - specified in the constructor or overwritten as an argument in - this method. - - :param hogs_to_extract: list or set that contains the set of root - hogs to be extracted. If set to None, all hogs are extracted. - :param bool single_hog_files: whether or not to build one orthoxml - file for all the selected hogs or individual ones. - :param str basename: name of the output file if a subset of hogs - is extracted into a single file. - :param str cache_dir: folder where to store the output files. - """ - if cache_dir is not None: - self._assert_cache_dir(cache_dir) - elif self.cache_dir is None: - raise RuntimeError("cache dir to output files to is not set") - - if single_hog_files: - if hogs_to_extract is None: - raise RuntimeError('useless to extract all hogs into single output file') - if basename is None or not isinstance(basename, (str, bytes)): - raise ValueError('basename needs to be specified: {}'.format(basename)) - ogs = [og for og in self._iter_toplevel_groups() if int(og.get("id")) in hogs_to_extract] - fn = os.path.join(self.cache_dir, basename) - logger.info("extracting {:d} hogs into {:s}".format(len(ogs), fn)) - self.create_new_orthoxml(fn, ogs) - else: - for og in self._iter_toplevel_groups(): - if hogs_to_extract is None or int(og.get('id')) in hogs_to_extract: - hog_nr = int(og.get("id")) - hog_id = "HOG{:06d}.orthoxml".format(hog_nr) - fname = os.path.join(self.cache_dir, hog_id) - logger.info("extracting {} into {}".format(hog_id, fname)) - self.create_new_orthoxml(fname, [og]) - - def iter_generefs_in_og(self, og_node): - for node in og_node.iterdescendants('{http://orthoXML.org/2011/}geneRef'): - yield node - - def get_gene_via_generef(self, genesref_ids): - genesref_ids = set(genesref_ids) - return [self.gene_lookup[gene_id] for gene_id in genesref_ids] - - def create_new_orthoxml(self, fn, OGs): - """create a new orthoxml file for the passed orthologGroup elements. - - :param fn: the filename of the output file. The path needs to exists - prior to calling this method. 
- :param OGs: the orthologGroup elements that should be included in the - new output file.""" - # Get element to store - for og_node in OGs: - gene_ids = [gene_ref_elem.get("id") for gene_ref_elem in self.iter_generefs_in_og(og_node)] - gene_els = self.get_gene_via_generef(gene_ids) - - # Get all information to store - zoo = {} # <- {key:sp_etree || value: {key:db_el || values:[list_genes]}} - for gene_el in gene_els: # <- for all gene el - db_el = gene_el.getparent().getparent() - sp_el = db_el.getparent() - if sp_el in zoo.keys(): # <- if species already visited - if db_el in zoo[sp_el].keys(): # <- if db already visited so add gene - zoo[sp_el][db_el].append(gene_el) - else: # <- if db not visited so add db,genes - zoo[sp_el][db_el] = [] - zoo[sp_el][db_el].append(gene_el) - else: # <- if species not visited so add sp,db,gene - zoo[sp_el] = {} - zoo[sp_el][db_el] = [] - zoo[sp_el][db_el].append(gene_el) - - etree_2_dump = etree.Element("orthoXML", nsmap=self.Etree_root.nsmap) - for attr, value in self.Etree_root.items(): - etree_2_dump.set(attr, value) - - for species_el in zoo.keys(): - species_xml = etree.Element("species") - for attr, value in species_el.items(): - species_xml.set(attr, value) - etree_2_dump.insert(0, species_xml) - - for db_el in zoo[species_el].keys(): - # Add into - database_xml = etree.SubElement(species_xml, "database") - for attr, value in db_el.items(): - database_xml.set(attr, value) - - # Add TAG into - genes_xml = etree.SubElement(database_xml, "genes") - - # Fill with - for gene_el in zoo[species_el][db_el]: - gene_xml = etree.SubElement(genes_xml, "gene") - for attr, value in gene_el.attrib.items(): - gene_xml.set(attr, value) - - groupsxml = etree.SubElement(etree_2_dump, "groups") - for og_et in OGs: - if not og_et.get('id').startswith('HOG:'): - og_et.set('id', 'HOG:{:07d}'.format(int(og_et.get('id')))) - groupsxml.append(og_et) - - tree = etree.ElementTree(etree_2_dump) - tree.write(fn, xml_declaration=True, encoding='utf-8', method="xml", pretty_print=True) - diff --git a/src/HogProf/build/lib/pyoma/browser/__init__.py b/src/HogProf/build/lib/pyoma/browser/__init__.py deleted file mode 100755 index dd4c206..0000000 --- a/src/HogProf/build/lib/pyoma/browser/__init__.py +++ /dev/null @@ -1 +0,0 @@ -name = "browser" diff --git a/src/HogProf/build/lib/pyoma/browser/check_db_consistency.py b/src/HogProf/build/lib/pyoma/browser/check_db_consistency.py deleted file mode 100755 index 264de1d..0000000 --- a/src/HogProf/build/lib/pyoma/browser/check_db_consistency.py +++ /dev/null @@ -1,82 +0,0 @@ -import random -import unittest -import os -import Bio.Seq -import Bio.Data.CodonTable -import pyoma.browser.db as pyomadb -import tables -import numpy - - -class DatabaseChecks(unittest.TestCase): - - @classmethod - def setUpClass(cls): - try: - path = os.environ['PYOMA_DB2CHECK'] - except KeyError: - raise unittest.SkipTest("No database specified in PYOMA_DB2CHECK") - - cls.db = pyomadb.Database(path) - - def translated_cdna_match_protein_sequence(self, cdna, prot): - cdna = cdna.replace('X', 'N') - for tab in Bio.Data.CodonTable.generic_by_id.keys(): - tab_ok = True - trans = Bio.Seq.translate(cdna, table=tab) - if not 3 >= len(trans) - len(prot) >= 0: - return False - for pos, (trans_aa, prot_aa) in enumerate(zip(trans, prot)): - if trans_aa == prot_aa or trans_aa == 'X' or prot_aa == 'X': - continue - elif prot_aa == 'M' and pos == 0 and trans_aa != '*': - continue - else: - tab_ok = False - break - if tab_ok: - return True - - def 
test_cdna_and_protein_sequence_match(self): - """test translated cdna sequence and protein sequence match. - - This is done for a random sample of 1000 entries""" - SAMPLES = 1000 - nr_entries = self.db.id_resolver.max_entry_nr - for entry_nr in random.sample(range(nr_entries+1), SAMPLES): - with self.subTest(entry_nr=entry_nr): - cdna = self.db.get_cdna(entry_nr).decode() - prot = self.db.get_sequence(entry_nr).decode() - self.assertTrue(self.translated_cdna_match_protein_sequence(cdna, prot)) - - def test_increasing_offsets(self): - entry_tab = self.db.get_hdf5_handle().get_node('/Protein/Entries') - seq_off = -1 - cds_off = -1 - for row in entry_tab: - self.assertLess(seq_off, row['SeqBufferOffset'], "SeqBufferOffset decreases in row {}: {} vs {}" - .format(row.nrow, seq_off, row['SeqBufferOffset'])) - self.assertLess(cds_off, row['CDNABufferOffset'], "CDNABufferOffset decreases in row {}: {} vs {}" - .format(row.nrow, seq_off, row['CDNABufferOffset'])) - seq_off = row['SeqBufferOffset'] - cds_off = row['CDNABufferOffset'] - - def test_homeology_flag(self): - genome_tab = self.db.get_hdf5_handle().get_node('/Genome') - for g in (b'WHEAT', b'GOSHI', b'BRANA'): - for row in genome_tab.read_where('UniProtSpeciesCode == g'): - self.assertTrue(row['IsPolyploid'], "{} is not recorded as polyploid genome".format(g)) - for g in (b'YEAST', b'HUMAN', b'PLAF7', b'ARATH', b'MOUSE'): - for row in genome_tab.read_where('UniProtSpeciesCode == g'): - self.assertFalse(row['IsPolyploid'], "{} is recorded to be a ployploid genome".format(g)) - - def test_synteny_scores_exist(self): - for g in ('WHEAT', 'BRANA', 'GOSHI'): - try: - t = self.db.get_hdf5_handle().get_node('/PairwiseRelation/{}/within'.format(g)) - except tables.NoSuchNodeError: - # if species does not exist, we skip - not all datasets will have these genomes - continue - syn_col = t.col('SyntenyConservationLocal') - computed_pairs = numpy.where(syn_col >= 0) - self.assertLess(0, len(computed_pairs[0]), "No synteny values computed for {}".format(g)) diff --git a/src/HogProf/build/lib/pyoma/browser/convert.py b/src/HogProf/build/lib/pyoma/browser/convert.py deleted file mode 100755 index a2814e6..0000000 --- a/src/HogProf/build/lib/pyoma/browser/convert.py +++ /dev/null @@ -1,1910 +0,0 @@ -from __future__ import division, print_function -from builtins import str, chr, range, object, super, bytes - -import pandas -from future.standard_library import hooks -from PySAIS import sais -from tempfile import NamedTemporaryFile -from tqdm import tqdm -import csv -import resource -import tables -import numpy -import numpy.lib.recfunctions -import os -import subprocess -import errno -import json -import time -import familyanalyzer -import re -import multiprocessing as mp -import lxml.html -import collections -import gzip -import hashlib -import itertools -import operator -import fileinput - -from .. import common -from . import locus_parser -from . import tablefmt -from .KmerEncoder import KmerEncoder -from .OrthoXMLSplitter import OrthoXMLSplitter -from .geneontology import GeneOntology, OntologyParser -from .synteny import SyntenyScorer -from .homoeologs import HomeologsConfidenceCalculator - -with hooks(): - import urllib.request - - -class DarwinException(Exception): - pass - - -def callDarwinExport(func, drwfile=None): - """Function starts a darwin session, loads convert.drw file - and calls the darwin function passed as argument. The output - is expected to be written by darwin in json format into the - file specified by 'outfn'. 
- This function returns the parsed json datastructure""" - - with NamedTemporaryFile(suffix='.dat') as tmpfile: - if drwfile is None: - drwfile = os.path.abspath(os.path.splitext(__file__)[0] + ".drw") - # with open(os.devnull, 'w') as DEVNULL: - stacksize = resource.getrlimit(resource.RLIMIT_STACK) - common.package_logger.info('current stacklimit: {}'.format(stacksize)) - common.package_logger.info('setting stacklimit: {}'.format((max(stacksize)-1, stacksize[1]))) - resource.setrlimit(resource.RLIMIT_STACK, (min(stacksize), stacksize[1])) - p = subprocess.Popen(['darwin', '-q', '-E', '-B'], stdin=subprocess.PIPE, - stderr=subprocess.PIPE, stdout=subprocess.PIPE) - drw_cmd = "outfn := '{}': ReadProgram('{}'): {}; done;".format( - tmpfile.name, - drwfile, - func).encode('utf-8') - common.package_logger.debug('calling darwin function: {}'.format(func)) - (stdout, stderr) = p.communicate(input=drw_cmd) - if p.returncode > 0: - raise DarwinException(p.stderr.read()) - - trans_tab = "".join(str(chr(x)) for x in range(128)) + " " * 128 - with open(tmpfile.name, 'r') as jsonData: - rawdata = jsonData.read() - return json.loads(rawdata.translate(trans_tab)) - - -def uniq(seq): - """return uniq elements of a list, preserving order - - :param seq: an iterable to be analyzed - """ - seen = set() - return [x for x in seq if not (x in seen or seen.add(x))] - - -def silentremove(filename): - """Function to remove a given file. No exception is raised if the - file does not exist. Other errors are passed to the user. - :param filename: the path of the file to be removed""" - try: - os.remove(filename) - except OSError as e: - if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory - raise # re-raise exception if a different error occured - - -def gz_is_empty(fname): - """Test if gzip file fname is empty - - Return True if the uncompressed data in fname has zero length - or if fname itself has zero length - Raises OSError if fname has non-zero length and is not a gzip file - """ - with gzip.open(fname, 'rb') as f: - data = f.read(1) - return len(data) == 0 - - -def load_tsv_to_numpy(args): - fn, off1, off2, swap = args - rel_desc = tablefmt.PairwiseRelationTable - # we need to get the enum as a dict to be able to extend it - # with the reversed labels, i.e. 
n:1 - relEnum = rel_desc.columns['RelType'].enum._names - relEnum['n:1'] = relEnum['m:1'] - relEnum['1:m'] = relEnum['1:n'] - relEnum['n:m'] = relEnum['m:n'] - read_dir = -1 if swap else 1 - tsv_dtype = [('EntryNr1', 'i4'), ('EntryNr2', 'i4'), ('Score', 'f4'), ('RelType', 'i1'), - ('AlignmentOverlap', 'f2'), ('Distance', 'f4')] - for curNr, curFn in enumerate([fn, fn.replace('.ext.', '.')]): - try: - if gz_is_empty(curFn): - return numpy.empty(0, dtype=tables.dtype_from_descr(rel_desc)) - with gzip.GzipFile(curFn) as fh: - data = numpy.genfromtxt(fh, dtype=tsv_dtype, - names=[_[0] for _ in tsv_dtype], - delimiter='\t', - usecols=(0, 1, 2, 3, 4, 5), - converters={'EntryNr1': lambda nr: int(nr) + off1, - 'EntryNr2': lambda nr: int(nr) + off2, - 'RelType': lambda rel: (relEnum[rel[::read_dir].decode()] - if len(rel) <= 3 - else relEnum[rel.decode()]), - 'Score': lambda score: float(score) / 100}) - break - except OSError as e: - if curNr < 1: - common.package_logger.info('tried to load {}'.format(curFn)) - pass - else: - raise e - - if swap: - reversed_cols = tuple(data.dtype.names[z] for z in (1, 0, 2, 3, 4, 5)) - data.dtype.names = reversed_cols - full_table = numpy.empty(data.size, dtype=tables.dtype_from_descr(rel_desc)) - common_cols = list(data.dtype.names) - full_table[common_cols] = data[common_cols] - for col_not_in_tsv in set(full_table.dtype.names) - set(data.dtype.names): - full_table[col_not_in_tsv] = rel_desc.columns[col_not_in_tsv].dflt - return full_table - - -def read_vps_from_tsv(gs, ref_genome): - ref_genome_idx = gs.get_where_list('(UniProtSpeciesCode=={!r})'. - format(ref_genome))[0] - job_args = [] - for g in range(len(gs)): - if g == ref_genome_idx: - continue - g1, g2 = sorted((g, ref_genome_idx,)) - off1, off2 = gs.read_coordinates(numpy.array((g1, g2)), 'EntryOff') - fn = os.path.join(os.environ['DARWIN_OMADATA_PATH'], 'Phase4', - gs.cols.UniProtSpeciesCode[g1].decode(), - gs.cols.UniProtSpeciesCode[g2].decode() + ".orth.txt.gz") - tup = (fn, off1, off2, g1 != ref_genome_idx) - common.package_logger.info('adding job: {}'.format(tup)) - job_args.append(tup) - - pool = mp.Pool(processes=min(os.cpu_count(), 10)) - all_pairs = pool.map(load_tsv_to_numpy, job_args) - pool.close() - return numpy.lib.recfunctions.stack_arrays(all_pairs, usemask=False) - - -class DataImportError(Exception): - pass - - -def _load_taxonomy_without_ref_to_itselfs(data): - dtype = tables.dtype_from_descr(tablefmt.TaxonomyTable) - arr = numpy.array([tuple(x) for x in data], dtype=dtype) - clean = arr[numpy.where(arr['NCBITaxonId'] != arr['ParentTaxonId'])] - return clean - - -def compute_ortholog_types(data, genome_offs): - """this function computes the type of orthologs from the data and sets in - the RelType column. - - :param data: a numpy recarray corresponding to the `numpy.dtype` of - `tablefmt.PairwiseRelationTable` - :param genome_offs: a numpy array with the genome offsets, i.e. the entry - numbers where the next genome starts - - :returns: a modified version of data - """ - typEnum = tablefmt.PairwiseRelationTable.columns.get('RelType').enum - query_type = {val: 'm' if cnt > 1 else '1' - for val, cnt in zip(*numpy.unique(data['EntryNr2'], - return_counts=True))} - - def genome_idx(enr): - return numpy.searchsorted(genome_offs, enr - 1, side='right') - - g0 = genome_idx(data[0]['EntryNr2']) - it = numpy.nditer(data, flags=['c_index'], op_flags=['readwrite']) - while not it.finished: - row0 = it[0] - i1 = it.index + 1 - # we move i1 forward to the row where the next genome starts, i.e. 
the - # current query changes the species or the query itself changes - while i1 < len(data): - row1 = data[i1] - g1 = genome_idx(row1['EntryNr2']) - if g1 != g0 or row0['EntryNr1'] != row1['EntryNr1']: - break - i1 += 1 - subj_type = 'n' if i1 - it.index > 1 else '1' - while not it.finished and it.index < i1: - typ = '{}:{}'.format(query_type[int(it[0]['EntryNr2'])], subj_type) - it[0]['RelType'] = typEnum[typ] - it.iternext() - g0 = g1 - - -def get_or_create_tables_node(h5, path, desc=None): - """return the node of a given path from the h5 file - - If the node does not yet exist, it is created (including potential - inexistant internal nodes). - - :param h5: Handle to the hdf5 object - :param str path: Path of the node to return - :param str desc: Description to be added to the node""" - try: - grp = h5.get_node(path) - except tables.NoSuchNodeError: - base, name = os.path.split(path) - grp = h5.create_group(base, name, title=desc, createparents=True) - return grp - - -class DarwinExporter(object): - DB_SCHEMA_VERSION = '3.2' - DRW_CONVERT_FILE = os.path.abspath(os.path.splitext(__file__)[0] + '.drw') - - def __init__(self, path, logger=None, mode=None): - self.logger = logger if logger is not None else common.package_logger - fn = os.path.normpath(os.path.join( - os.getenv('DARWIN_BROWSERDATA_PATH', ''), - path)) - if mode is None: - mode = 'append' if os.path.exists(fn) else 'write' - self._compr = tables.Filters(complevel=6, complib='zlib', fletcher32=True) - self.h5 = tables.open_file(fn, mode=mode[0], filters=self._compr) - self.logger.info("opened {} in {} mode, options {}".format( - fn, mode, str(self._compr))) - if mode == 'write': - self.h5.root._f_setattr('convertion_start', time.strftime("%c")) - - def call_darwin_export(self, func): - return callDarwinExport(func, self.DRW_CONVERT_FILE) - - def _get_or_create_node(self, path, desc=None): - return get_or_create_tables_node(self.h5, path, desc) - - def create_table_if_needed(self, parent, name, drop_data=False, **kwargs): - """create a table if needed. - - The function only checks whether a table exists with that name, - but not if it is compatible with the passed arguments. - if you pass data with the `obj` argument, this data is appended - to the table. If you set `drop_data` to True, data that was - previously in the existing table is dropped prior to adding new - data.""" - try: - tab = self.h5.get_node(parent, name=name) - if drop_data: - tab.remove_rows(0, tab.nrows) - if 'obj' in kwargs: - tab.append(kwargs['obj']) - except tables.NoSuchNodeError: - tab = self.h5.create_table(parent, name, **kwargs) - return tab - - def get_version(self): - """return version of the dataset. - - Default implementation searches for 'mname' in Matrix or matrix_stats.drw files. 
- """ - for fname in ('Matrix', 'matrix_stats.drw'): - with open(os.path.join(os.environ['DARWIN_BROWSERDATA_PATH'], fname), 'r') as fh: - for i, line in enumerate(fh): - if line.startswith('mname :='): - match = re.match(r'mname := \'(?P[^\']*)\'', line) - return match.group('version') - if i > 1000: - break - raise DataImportError('No version information found') - - def add_version(self): - version = self.get_version() - self.h5.set_node_attr('/', 'oma_version', version) - self.h5.set_node_attr('/', 'pytables', tables.get_pytables_version()) - self.h5.set_node_attr('/', 'hdf5_version', tables.get_hdf5_version()) - self.h5.set_node_attr('/', 'db_schema_version', self.DB_SCHEMA_VERSION) - - def add_species_data(self): - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'gs.json') - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - else: - data = self.call_darwin_export('GetGenomeData();') - gstab = self.h5.create_table('/', 'Genome', tablefmt.GenomeTable, - expectedrows=len(data['GS'])) - gs_data = self._parse_date_columns(data['GS'], gstab) - self._write_to_table(gstab, gs_data) - gstab.cols.NCBITaxonId.create_csindex(filters=self._compr) - gstab.cols.UniProtSpeciesCode.create_csindex(filters=self._compr) - gstab.cols.EntryOff.create_csindex(filters=self._compr) - - taxtab = self.h5.create_table('/', 'Taxonomy', tablefmt.TaxonomyTable, - expectedrows=len(data['Tax'])) - self._write_to_table(taxtab, _load_taxonomy_without_ref_to_itselfs(data['Tax'])) - taxtab.cols.NCBITaxonId.create_csindex(filters=self._compr) - - def _parse_date_columns(self, data, tab): - """convert str values in a date column to epoch timestamps""" - time_cols = [i for i, col in enumerate(tab.colnames) if tab.coldescrs[col].kind == 'time'] - dflts = [tab.coldflts[col] for col in tab.colnames] - - def map_data(col, data): - try: - val = data[col] - if col in time_cols and isinstance(val, str): - for fmt in ('%b %d, %Y', '%B %d, %Y', '%d.%m.%Y', '%Y%m%d'): - try: - date = time.strptime(val, fmt) - return time.mktime(date) - except ValueError: - pass - raise ValueError("Cannot parse date of '{}'".format(val)) - return val - except IndexError: - return dflts[col] - - arr = numpy.empty(len(data), dtype=tab.dtype) - for i, row in enumerate(data): - as_tup = tuple(map_data(c, row) for c in range(len(dflts))) - arr[i] = as_tup - return arr - - def _convert_to_numpyarray(self, data, tab): - """convert a list of list dataset into a numpy rec array that - corresponds to the table definition of `tab`. - - :param data: the data to be converted. 
- :param tab: a pytables table node.""" - - enum_cols = {i: tab.get_enum(col) for (i, col) in enumerate(tab.colnames) - if tab.coltypes[col] == 'enum'} - dflts = [tab.coldflts[col] for col in tab.colnames] - - def map_data(col, data): - try: - val = data[col] - return enum_cols[col][val] - except IndexError: - return dflts[col] - except KeyError: - return val - - arr = numpy.empty(len(data), dtype=tab.dtype) - for i, row in enumerate(data): - as_tup = tuple(map_data(c, row) for c in range(len(dflts))) - arr[i] = as_tup - return arr - - def add_orthologs(self): - genome_offs = self.h5.root.Genome.col('EntryOff') - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'VPairs' not in rel_node_for_genome: - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'vps', '{}.json'.format(genome)) - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - elif ((not os.getenv('DARWIN_OMADATA_PATH') is None) and - os.path.exists(os.path.join( - os.environ['DARWIN_OMADATA_PATH'], 'Phase4'))): - # try to read from Phase4 in parallel. - data = read_vps_from_tsv(self.h5.root.Genome, - genome.encode('utf-8')) - else: - # fallback to read from VPsDB - data = self.call_darwin_export('GetVPsForGenome({})'.format(genome)) - - vp_tab = self.h5.create_table(rel_node_for_genome, 'VPairs', tablefmt.PairwiseRelationTable, - expectedrows=len(data)) - if isinstance(data, list): - data = self._convert_to_numpyarray(data, vp_tab) - if numpy.any(data['RelType'] >= tablefmt.PairwiseRelationTable.columns.get('RelType').enum['n/a']): - compute_ortholog_types(data, genome_offs) - self._write_to_table(vp_tab, data) - vp_tab.cols.EntryNr1.create_csindex() - - def add_same_species_relations(self): - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'within' not in rel_node_for_genome: - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'cps', '{}.json'.format(genome)) - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - else: - # fallback to read from VPsDB - data = self.call_darwin_export('GetSameSpeciesRelations({})'.format(genome)) - - ss_tab = self.h5.create_table(rel_node_for_genome, 'within', tablefmt.PairwiseRelationTable, - expectedrows=len(data)) - if isinstance(data, list): - data = self._convert_to_numpyarray(data, ss_tab) - self._write_to_table(ss_tab, data) - ss_tab.cols.EntryNr1.create_csindex() - - def add_synteny_scores(self): - """add synteny scores of pairwise relations to database. - - Current implementation only computes synteny scores for - homoeologs, but easy to extend. Question is rather if we - need synteny scores for all genome pairs, and if not, how - to select. - - The computations of the scores are done using :mod:`synteny` - module of this package.""" - # TODO: compute for non-homoeologs relation as well. 
- self.logger.info("Adding synteny scores for polyploid genomes") - polyploid_genomes = self.h5.root.Genome.where('IsPolyploid==True') - for genome in polyploid_genomes: - genome_code = genome['UniProtSpeciesCode'].decode() - self.logger.info('compute synteny score for {}'.format(genome_code)) - synteny_scorer = SyntenyScorer(self.h5, genome_code) - rels = synteny_scorer.compute_scores() - self._callback_store_rel_data( - genome_code, rels, [('SyntenyConservationLocal', 'mean_synteny_score')]) - - def add_homoeology_confidence(self): - """adds the homoeology confidence scores to the database. - - This method should be called only after the synteny scores have - been computed and added to the database. - - The computations are done using :mod:`homoeologs` module.""" - self.logger.info("Adding homoeolog confidence scores") - polyploid_genomes = self.h5.root.Genome.where('IsPolyploid==True') - for genome in polyploid_genomes: - genome_code = genome['UniProtSpeciesCode'].decode() - self.logger.info("compute homoeolog confidence for {}".format(genome_code)) - homoeolg_scorer = HomeologsConfidenceCalculator(self.h5, genome_code) - rels = homoeolg_scorer.calculate_scores() - self._callback_store_rel_data( - genome_code, rels, [("Confidence", "fuzzy_confidence_scaled")]) - - def _callback_store_rel_data(self, genome, rels_df, assignments): - tab = self.h5.get_node('/PairwiseRelation/{}/within'.format(genome)) - df_all = pandas.DataFrame(tab.read()) - if 'entry_nr1' in list(rels_df): - enr_col_names = ['entry_nr1', 'entry_nr2'] - else: - enr_col_names = ['EntryNr1', 'EntryNr2'] - merged = pandas.merge(df_all, rels_df, how="left", left_on=['EntryNr1', 'EntryNr2'], - right_on=enr_col_names, validate='one_to_one') - - for target, source in assignments: - # replace NaN in column from rels_df by the default value of the target column - merged.loc[merged[source].isnull(), source] = tab.coldescrs[target].dflt - # update the data in the target hdf5 column by the source column data - tab.modify_column(column=merged[source].as_matrix(), colname=target) - tab.flush() - - def _add_sequence(self, sequence, row, sequence_array, off, typ="Seq"): - # add ' ' after each sequence (Ascii is smaller than - # any AA, allows to build PAT array with split between - # sequences. 
- seqLen = len(sequence) + 1 - row[typ + 'BufferOffset'] = off - row[typ + 'BufferLength'] = seqLen - seqNumpyObj = numpy.ndarray((seqLen,), - buffer=(sequence + " ").encode('utf-8'), - dtype=tables.StringAtom(1)) - sequence_array.append(seqNumpyObj) - if typ == "Seq": - row['MD5ProteinHash'] = hashlib.md5(sequence.encode('utf-8')).hexdigest() - return seqLen - - def add_proteins(self): - gsNode = self.h5.get_node('/Genome') - nrProt = sum(gsNode.cols.TotEntries) - nrAA = sum(gsNode.cols.TotAA) - protGrp = self._get_or_create_node('/Protein', "Root node for protein (oma entries) information") - protTab = self.h5.create_table(protGrp, 'Entries', tablefmt.ProteinTable, - expectedrows=nrProt) - seqArr = self.h5.create_earray(protGrp, 'SequenceBuffer', - tables.StringAtom(1), (0,), 'concatenated protein sequences', - expectedrows=nrAA + nrProt) - cdnaArr = self.h5.create_earray(protGrp, 'CDNABuffer', - tables.StringAtom(1), (0,), 'concatenated cDNA sequences', - expectedrows=3 * nrAA + nrProt) - seqOff = cdnaOff = 0 - loc_parser = locus_parser.LocusParser() - for gs in gsNode.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'prots', '{}.json'.format(genome)) - if os.path.exists(cache_file): - with open(cache_file, 'r') as fd: - data = json.load(fd) - else: - data = self.call_darwin_export('GetProteinsForGenome({})'.format(genome)) - - if len(data['seqs']) != gs['TotEntries']: - raise DataImportError('number of entries ({:d}) does ' - 'not match number of seqs ({:d}) for {}' - .format(len(data['seqs']), gs['TotEntries'], genome)) - - locTab = self.h5.create_table('/Protein/Locus', - genome, tablefmt.LocusTable, createparents=True, - expectedrows=gs['TotEntries'] * 4) - - for nr in range(gs['TotEntries']): - eNr = data['off'] + nr + 1 - protTab.row['EntryNr'] = eNr - protTab.row['OmaGroup'] = data['ogs'][nr] - - seqOff += self._add_sequence(data['seqs'][nr], protTab.row, seqArr, seqOff) - cdnaOff += self._add_sequence(data['cdna'][nr], protTab.row, cdnaArr, cdnaOff, 'CDNA') - - protTab.row['Chromosome'] = data['chrs'][nr] - protTab.row['AltSpliceVariant'] = data['alts'][nr] - protTab.row['OmaHOG'] = b" " # will be assigned later - protTab.row['CanonicalId'] = b" " # will be assigned later - - locus_str = data['locs'][nr] - try: - locus_tab = loc_parser.parse(locus_str, eNr) - locTab.append(locus_tab) - len_cds = sum(z['End'] - z['Start']+1 for z in locus_tab) - if len_cds != protTab.row['CDNABufferLength']-1: - self.logger.warning("sum of exon lengths differ with cdna sequence for {}: {} vs {}" - .format(eNr, len_cds, protTab.row['CDNABufferLength']-1)) - - protTab.row['LocusStart'] = locus_tab['Start'].min() - protTab.row['LocusEnd'] = locus_tab['End'].max() - protTab.row['LocusStrand'] = locus_tab[0]['Strand'] - except ValueError as e: - self.logger.warning(e) - protTab.row['SubGenome'] = data['subgenome'][nr].encode('ascii') - protTab.row.append() - protTab.flush() - seqArr.flush() - for n in (protTab, seqArr, locTab): - if n.size_in_memory != 0: - self.logger.info('worte %s: compression ratio %3f%%' % - (n._v_pathname, 100 * n.size_on_disk / n.size_in_memory)) - protTab.cols.EntryNr.create_csindex(filters=self._compr) - protTab.cols.MD5ProteinHash.create_csindex(filters=self._compr) - - def _write_to_table(self, tab, data): - if len(data)>0: - tab.append(data) - self.logger.info('wrote %s : compression ratio %.3f%%' % - (tab._v_pathname, 100 * tab.size_on_disk / tab.size_in_memory)) - - def 
add_hogs(self): - hog_path = os.path.normpath(os.path.join( - os.environ['DARWIN_NETWORK_SCRATCH_PATH'], - 'pyoma', 'split_hogs')) - entryTab = self.h5.get_node('/Protein/Entries') - tree_filename = os.path.join( - os.environ['DARWIN_BROWSERDATA_PATH'], - 'speciestree.nwk') - if not os.path.exists(hog_path): - hog_file = os.path.join(os.environ['DARWIN_BROWSERDATA_PATH'], - '..', 'downloads', 'oma-hogs.orthoXML.gz') - splitter = OrthoXMLSplitter(hog_file, cache_dir=hog_path) - splitter() - hog_converter = HogConverter(entryTab) - hog_converter.attach_newick_taxonomy(tree_filename) - hogTab = self.h5.create_table('/', 'HogLevel', tablefmt.HOGsTable, - 'nesting structure for each HOG', expectedrows=1e8) - self.orthoxml_buffer = self.h5.create_earray('/OrthoXML', 'Buffer', - tables.StringAtom(1), (0,), 'concatenated orthoxml files', - expectedrows=1e9, createparents=True) - self.orthoxml_index = self.h5.create_table('/OrthoXML', 'Index', tablefmt.OrthoXmlHogTable, - 'Range index per HOG into OrthoXML Buffer', expectedrows=5e6) - for root, dirs, filenames in os.walk(hog_path): - for fn in filenames: - try: - levels = hog_converter.convert_file(os.path.join(root, fn)) - hogTab.append(levels) - fam_nrs = set([z[0] for z in levels]) - self.add_orthoxml(os.path.join(root, fn), fam_nrs) - except Exception as e: - self.logger.error('an error occured while processing ' + fn + ':') - self.logger.exception(e) - - hog_converter.write_hogs() - - def add_orthoxml(self, orthoxml_path, fam_nrs): - """append orthoxml file content to orthoxml_buffer array and add index for the HOG family""" - if len(fam_nrs) > 1: - self.logger.warning('expected only one family per HOG file, but found {}: {}' - .format(len(fam_nrs), fam_nrs)) - self.logger.warning(' --> the orthoxml files per family will be not correct, ' - 'i.e. 
they will contain all families of this file.') - with open(orthoxml_path, 'r') as fh: - orthoxml = fh.read().encode('utf-8') - offset = len(self.orthoxml_buffer) - length = len(orthoxml) - self.orthoxml_buffer.append(numpy.ndarray((length,), - buffer=orthoxml, dtype=tables.StringAtom(1))) - for fam in fam_nrs: - row = self.orthoxml_index.row - row['Fam'] = fam - row['HogBufferOffset'] = offset - row['HogBufferLength'] = length - offset += length - row.append() - - def xref_databases(self): - return os.path.join(os.environ['DARWIN_BROWSERDATA_PATH'], 'ServerIndexed.db') - - def add_xrefs(self): - self.logger.info('start extracting XRefs, EC and GO annotations') - db_parser = DarwinDbEntryParser() - xref_tab = self.h5.create_table('/', 'XRef', tablefmt.XRefTable, - 'Cross-references of proteins to external ids / descriptions', - expectedrows=1e8) - - ec_tab = self.h5.create_table('/Annotations', 'EC', tablefmt.ECTable, 'Enzyme Commission annotations', - expectedrows=1e7, createparents=True) - gs = self.h5.get_node('/Genome').read() - with DescriptionManager(self.h5, '/Protein/Entries', '/Protein/DescriptionBuffer') as de_man, \ - GeneOntologyManager(self.h5, '/Annotations/GeneOntology', '/Ontologies/GO') as go_man: - xref_importer = XRefImporter(db_parser, gs, xref_tab, ec_tab, go_man, de_man) - files = self.xref_databases() - dbs_iter = fileinput.input(files=files) - db_parser.parse_entrytags(dbs_iter) - xref_importer.flush_buffers() - xref_importer.build_suffix_index() - - def add_group_metadata(self): - m = OmaGroupMetadataLoader(self.h5) - m.add_data() - - def close(self): - self.h5.root._f_setattr('conversion_end', time.strftime("%c")) - self.h5.close() - self.logger.info('closed {}'.format(self.h5.filename)) - - def create_indexes(self): - self.logger.info('creating indexes for HogLevel table') - hogTab = self.h5.get_node('/HogLevel') - for col in ('Fam', 'ID', 'Level'): - if not hogTab.colindexed[col]: - hogTab.colinstances[col].create_csindex() - orthoxmlTab = self.h5.get_node('/OrthoXML/Index') - orthoxmlTab.cols.Fam.create_csindex() - - self.logger.info('creating missing indexes for Entries table') - entryTab = self.h5.get_node('/Protein/Entries') - for col in ('EntryNr', 'OmaHOG', 'OmaGroup', 'MD5ProteinHash'): - if not entryTab.colindexed[col]: - entryTab.colinstances[col].create_csindex() - - self.logger.info('creating index for xrefs (EntryNr and XRefId)') - xrefTab = self.h5.get_node('/XRef') - xrefTab.cols.EntryNr.create_csindex() - xrefTab.cols.XRefId.create_csindex() - - self.logger.info('creating index for go (EntryNr and TermNr)') - goTab = self.h5.get_node('/Annotations/GeneOntology') - goTab.cols.EntryNr.create_csindex() - goTab.cols.TermNr.create_index() - - self.logger.info('creating index for EC (EntryNr)') - ec_tab = self.h5.get_node('/Annotations/EC') - ec_tab.cols.EntryNr.create_csindex() - - self.logger.info('creating index for domains (EntryNr)') - domtab = self.h5.get_node('/Annotations/Domains') - domtab.cols.EntryNr.create_csindex() - - self.logger.info('creating indexes for HOG to prevalent domains ' - '(Fam and DomainId)') - dom2hog_tab = self.h5.get_node('/HOGAnnotations/Domains') - dom2hog_tab.cols.DomainId.create_csindex() - domprev_tab = self.h5.get_node('/HOGAnnotations/DomainArchPrevalence') - domprev_tab.cols.Fam.create_csindex() - - def _iter_canonical_xref(self): - """extract one canonical xref id for each protein. 
- - We take the first valid xref per gene with the ordering of xrefsources - as given in the xrefsource_order.""" - xrefsource_order = ('UniProtKB/SwissProt', 'UniProtKB/TrEMBL', - 'Ensembl Gene', 'Ensembl Protein', 'FlyBase', - 'WormBase', 'EnsemblGenomes', 'RefSeq', 'SourceID') - - xrefs = self.h5.get_node('/XRef') - source_enum = xrefs.get_enum('XRefSource') - canonical_sources = [source_enum[z] for z in xrefsource_order] - current_protein = None - past_proteins = set([]) - for xref in xrefs: - if xref['EntryNr'] != current_protein: - if current_protein: - past_proteins.add(current_protein) - yield (current_protein, current_xref[1]) - current_protein = xref['EntryNr'] - current_xref = (1000, b'') # init with a sentinel - if current_protein in past_proteins: - raise DataImportError('Data in /XRef is not grouped w.r.t. EntryNr') - try: - rank = canonical_sources.index(xref['XRefSource']) - if rank < current_xref[0]: - current_xref = (rank, xref['XRefId']) - except ValueError: - pass - if current_protein: - yield (current_protein, current_xref[1]) - - def add_canonical_id(self): - """add one canonical xref id to the /Protein/Entries table.""" - self.logger.info('adding canonical ids for each protein...') - prot_tab = self.h5.get_node('/Protein/Entries') - canonical_ids = numpy.chararray(shape=(len(prot_tab),), itemsize=prot_tab.cols.CanonicalId.dtype.itemsize) - for eNr, canonical_id in self._iter_canonical_xref(): - row_nr = eNr - 1 - row = prot_tab[row_nr] - if row['EntryNr'] != eNr: - self.logger.warn('Entries table not properly sorted: {}, expected {}'.format(row['EntryNr'], eNr)) - raise DataImportError('Entries table not properly sorted') - canonical_ids[row_nr] = canonical_id - prot_tab.modify_column(0, len(prot_tab), 1, column=canonical_ids, colname='CanonicalId') - prot_tab.flush() - - def add_domain_info(self, domains): - self.logger.info('adding domain information...') - domtab = self.h5.create_table('/Annotations', 'Domains', tablefmt.DomainTable, createparents=True, - expectedrows=1e7) - entrytab = self.h5.get_node('/Protein/Entries') - md5_to_enr = collections.defaultdict(list) - for e in entrytab: - md5_to_enr[e['MD5ProteinHash']].append(e['EntryNr']) - - buffer = [] - for i, domain in enumerate(domains): - for entry_nr in md5_to_enr[domain.md5.encode('utf-8')]: - buffer.append((entry_nr, domain.id, domain.coords)) - if len(buffer) > 5000: - domtab.append(buffer) - buffer = [] - if i % 50000 == 0: - self.logger.info('processed {:d} domain annotations so far'.format(i)) - if len(buffer) > 0: - domtab.append(buffer) - domtab.flush() - - def add_domainname_info(self, domainname_infos): - self.logger.info('adding domain name information...') - dom_name_tab = self.h5.create_table('/Annotations', 'DomainDescription', tablefmt.DomainDescriptionTable, - createparents=True, expectedrows=2e5) - buffer = [] - for i, dom_info in enumerate(domainname_infos): - buffer.append(dom_info) - if len(buffer) > 5000: - self._write_to_table(dom_name_tab, buffer) - buffer = [] - if i % 50000 == 0: - self.logger.info('processed {:d} domain name descriptions so far'.format(i)) - if len(buffer) > 0: - self._write_to_table(dom_name_tab, buffer) - dom_name_tab.flush() - - def update_summary_stats(self): - """update the summary statistics of xrefs & go. - - The function analyses the well-known xref sources as well as - GO annotations and computes aggregated counts for - all / in OMA Group / in HOGs for all of them. 
- """ - for tab_name, sum_fun in [('/Annotations/GeneOntology', self.count_xref_summary), - ('/XRef', self.count_xref_summary)]: - summary = sum_fun() - tab = self.h5.get_node(tab_name) - for attr, val in summary.items(): - tab.set_attr(attr, val) - - group_sizes = self.collect_group_sizes() - summary = self._get_or_create_node('/Summary', 'Various Summary Statistics') - for group_type in group_sizes.keys(): - grp_size_tab = self.create_table_if_needed( - summary, '{}_size_hist'.format(group_type), - description=tablefmt.GroupsizeHistogram, - drop_data=True) - data = sorted(group_sizes[group_type].items()) - grp_size_tab.append(data) - - cov_fracs = self.add_domain_covered_sites_counts() - cov_hist, bins = numpy.histogram(cov_fracs[cov_fracs > 0], bins=numpy.linspace(0, 1, 51)) - cov_hist_data = numpy.zeros(50, dtype=[('BinEndValue', 'f4'), ('Counts', 'i4')]) - cov_hist_data['BinEndValue'] = bins[1:] - cov_hist_data['Counts'] = cov_hist - dom_cov_hist_tab = self.create_table_if_needed(summary, 'Domain_coverage_hist', - drop_data=True, obj=cov_hist_data) - dom_cov_hist_tab.set_attr('frac_genes_w_domain', len(cov_fracs[cov_fracs > 0]) / len(cov_fracs)) - dom_cov_hist_tab.set_attr('mean_coverage_overall', numpy.mean(cov_fracs)) - dom_cov_hist_tab.set_attr('mean_coverage_w_domain', numpy.mean(cov_fracs[cov_fracs > 0])) - - def count_gene_ontology_summary(self): - self.logger.info('Bulding gene ontology annotations summary info') - go_tab = self.h5.get_node('/Annotations/GeneOntology') - prot_tab = self.h5.get_node('/Protein/Entries') - exp_codes = frozenset([b'EXP', b'IDA', b'IPI', b'IMP', b'IGI' b'IEP']) - cnts = collections.Counter() - cur_enr = None - for (enr, term), row_iter in itertools.groupby(go_tab, operator.itemgetter('EntryNr','TermNr')): - evidences = {row['Evidence'] for row in row_iter} - is_iea = b'IEA' in evidences - evidences.discard(b'IEA') - is_exp = not exp_codes.isdisjoint(evidences) - is_cur = len(evidences.difference(exp_codes)) > 0 - cnts['annotations_any'] += 1 - if is_exp: - cnts['annotations_exp'] += 1 - if is_cur: - cnts['annotations_currated'] += 1 - if is_iea: - cnts['annotations_iea'] += 1 - if cur_enr != enr: - e = next(prot_tab.where('EntryNr == {}'.format(enr))).fetch_all_fields() - cnts['proteins_any'] += 1 - if e['OmaGroup'] != 0: - cnts['protein_OmaGroup'] += 1 - if len(e['OmaHOG']) > 0: - cnts['protein_HOG'] += 1 - cur_enr = enr - return cnts - - def count_xref_summary(self): - self.logger.info('Building cross-ref summary info') - xref_tab = self.h5.get_node('/XRef') - prot_tab_iter = iter(self.h5.get_node('/Protein/Entries')) - source = xref_tab.get_enum('XRefSource') - trusted = frozenset(['UniProtKB/SwissProt', 'UniProtKB/TrEMBL', 'RefSeq', 'EntrezGene', 'Ensembl Gene', 'Ensembl Protein']) - if len(trusted.difference(source._names.keys())) > 0: - raise ValueError('set of trusted xrefs is invalid') - cnts = collections.Counter() - - entry = next(prot_tab_iter) - for enr, xref_it in itertools.groupby(xref_tab, operator.itemgetter('EntryNr')): - while entry['EntryNr'] < enr: - entry = next(prot_tab_iter) - sources_all = [source._values[x['XRefSource']] for x in xref_it] - cnts += collections.Counter(sources_all) - has_trusted_xref = len(trusted.intersection(sources_all)) > 0 - if has_trusted_xref: - cnts['trusted_all'] += 1 - if entry['OmaGroup'] != 0: - cnts['trusted_OmaGroup'] += 1 - if len(entry['OmaHOG']) > 0: - cnts['trusted_HOG'] += 1 - return cnts - - def collect_group_sizes(self): - self.logger.info("Building grouping size histograms") - groupings 
= ('OmaHOG', 'OmaGroup') - memb_cnts = {grp: collections.defaultdict(int) for grp in groupings} - fam_re = re.compile(br'([A-Z]+:)?(?P[0-9]+).*') - prot_tab = self.h5.get_node('/Protein/Entries') - for row in prot_tab: - for grp in groupings: - if grp == 'OmaHOG': - m = fam_re.match(row[grp]) - if m is None: - continue - grp_id = int(m.group('fam')) - else: - grp_id = int(row[grp]) - if grp_id == 0: - continue - memb_cnts[grp][grp_id] += 1 - sizes = {grp: collections.defaultdict(int) for grp in groupings} - for grp in groupings: - for grp_size in memb_cnts[grp].values(): - sizes[grp][grp_size] += 1 - return sizes - - def compute_domaincovered_sites(self): - dom_tab = self.h5.get_node('/Annotations/Domains') - domains = pandas.DataFrame.from_records(dom_tab[:]) - - def dlen(coords): - doms = [int(pos) for pos in coords.split(b':')] - return sum((doms[i + 1] - doms[i] + 1 for i in range(0, len(doms), 2))) - - # sum all parts of each domain region and store total length in DLen column - domains = domains.assign(DLen=domains['Coords'].apply(dlen)) - # sum over all domains per protein - cov_sites = domains.groupby('EntryNr').agg({'DLen': sum}) - return cov_sites - - def add_domain_covered_sites_counts(self): - """Stores the number of AA covered by a DomainAnnotation. - - This method adds to the hdf5 file a /Protein/DomainCoverage array that - contains the number of AA sites covered by a domain. The position - corresponds to the protein entry numbers in /Protein/Entries. - - :Note: The method assumes that the domains are all non-overlapping. - If they are not, the reported coverage will be too high! - - :return: covered fractions by domains for each protein - :rtype: numpy.array""" - self.logger.info("Counting covered sites by domains") - cov_sites_df = self.compute_domaincovered_sites() - - prot_tab = self.h5.get_node('/Protein/Entries') - enr_col = prot_tab.col('EntryNr') - assert numpy.all(numpy.equal(enr_col, numpy.arange(1, len(prot_tab)+1))) - - cov_sites = numpy.zeros(len(prot_tab), dtype=numpy.uint32) - for eNr, coverage in zip(cov_sites_df.index, cov_sites_df.DLen.values): - cov_sites[eNr-1] = coverage - create_node = False - try: - dom_cov_tab = self.h5.get_node('/Protein/CoveredSitesByDomains') - if len(dom_cov_tab) != len(cov_sites): - self.h5.remove_node('/Protein/CoveredSitesByDomains') - create_node = True - except tables.NoSuchNodeError: - create_node = True - if create_node: - dom_cov_tab = self.h5.create_carray('/Protein', 'CoveredSitesByDomains', - tables.UInt32Atom(), (len(cov_sites),)) - dom_cov_tab[0:len(cov_sites)] = cov_sites - return cov_sites / (prot_tab.col('SeqBufferLength') - 1) - - def add_sequence_suffix_array(self, k=6, fn=None, sa=None): - ''' - Adds the sequence suffix array to the database. NOTE: this - (obviously) requires A LOT of memory for large DBs. - ''' - # Ensure we're run in correct order... - assert ('Protein' in self.h5.root), 'Add proteins before calc. SA!' - idx_compr = tables.Filters(complevel=6, complib='blosc', fletcher32=True) - - # Add to separate file if fn is set. - if fn is None: - db = self.h5 - else: - fn = os.path.normpath(os.path.join( - os.getenv('DARWIN_BROWSERDATA_PATH', ''), - fn)) - db = tables.open_file(fn, 'w', filters=idx_compr) - db.create_group('/', 'Protein') - db.root._f_setattr('conversion_start', time.strftime("%c")) - self.logger.info('opened {}'.format(db.filename)) - - # Load sequence buffer to memory - this is required to calculate the SA. 
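# A minimal sketch of the coverage computation in compute_domaincovered_sites above:
# a 'Coords' value is a colon-separated list of alternating start/end positions
# (e.g. b'5:20:40:60') and the covered length is the sum of inclusive segment lengths.
# Values are made-up, and overlapping segments would be double counted, as the
# docstring of add_domain_covered_sites_counts warns.
def covered_length(coords: bytes) -> int:
    positions = [int(p) for p in coords.split(b':')]
    return sum(positions[i + 1] - positions[i] + 1 for i in range(0, len(positions), 2))

assert covered_length(b'5:20:40:60') == 16 + 21
seq_len = 100
print('coverage fraction:', covered_length(b'5:20:40:60') / seq_len)   # 0.37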
- # Do it here (instead of in PySAIS) so that we can use it for computing - # the split points later. - seqs = self.h5.get_node('/Protein/SequenceBuffer')[:].tobytes() - n = len(self.h5.get_node('/Protein/Entries')) - - # Compute & save the suffix array to DB. TODO: work out what compression - # works best! - if sa is None: - sa = sais(seqs) - sa[:n].sort() # Sort delimiters by position. - db.create_carray('/Protein', - name='SequenceIndex', - title='concatenated protein sequences suffix array', - obj=sa, - filters=idx_compr) - - # Create lookup table for fa2go - dtype = (numpy.uint32 if (n < numpy.iinfo(numpy.uint32).max) else - numpy.uint64) - idx = numpy.zeros(sa.shape, dtype=dtype) - mask = numpy.zeros(sa.shape, dtype=numpy.bool) - - # Compute mask and entry index for sequence buff - for i in range(n): - s = (sa[i - 1] if i > 0 else -1) + 1 - e = (sa[i] + 1) - idx[s:e] = i + 1 - mask[(e - k):e] = True # (k-1) invalid and delim. - - # Mask off those we don't want... - sa = sa[~mask[sa]] - - # Reorder the necessary elements of entry index - idx = idx[sa] - - # Initialise lookup array - atom = (tables.UInt32Atom if dtype is numpy.uint32 else tables.UInt64Atom) - kmers = KmerEncoder(k, is_protein=True) - kmer_lookup_arr = db.create_vlarray('/Protein', - name='KmerLookup', - atom=atom(shape=()), - title='kmer entry lookup table', - filters=idx_compr, - expectedrows=len(kmers)) - kmer_lookup_arr._f_setattr('k', k) - - # Now find the split points and construct lookup ragged array. - ii = 0 - for kk in tqdm(range(len(kmers)), desc='Constructing kmer lookup'): - kmer = kmers.encode(kk) - if (ii < len(sa)) and (seqs[sa[ii]:(sa[ii] + k)] == kmer): - jj = ii + 1 - while (jj < len(sa)) and (seqs[sa[jj]:(sa[jj] + k)] == kmer): - jj += 1 - kmer_lookup_arr.append(idx[ii:jj]) - # New start - ii = jj - else: - # End or not found - kmer_lookup_arr.append([]) - - if db.filename != self.h5.filename: - self.logger.info('storing external links to SequenceIndex and KmerLookup') - self.h5.create_external_link('/Protein', 'KmerLookup', - self._relative_path_to_external_node(kmer_lookup_arr)) - self.h5.create_external_link('/Protein', 'SequenceIndex', - self._relative_path_to_external_node(db.root.Protein.SequenceIndex)) - db.root._f_setattr('conversion_end', time.strftime("%c")) - db.close() - self.logger.info('closed {}'.format(db.filename)) - - def _relative_path_to_external_node(self, node): - rel_path = os.path.relpath(node._v_file.filename, os.path.dirname(self.h5.filename)) - return str(rel_path + ":" + node._v_pathname) - - def add_hog_domain_prevalence(self): - # Check that protein entries / domains are added already to the DB - assert True # TODO - - # Used later - hl_tab = self.h5.get_node('/HogLevel') - if not hl_tab.colindexed['Fam']: - hl_tab.colinstances['Fam'].create_csindex() - - # Load the HOG -> Entry table to memory - prot_tab = self.h5.root.Protein.Entries - # TODO: work out how to do this in a neater way - df = pandas.DataFrame.from_records(((z['EntryNr'], z['OmaHOG'], z['SeqBufferLength']) - for z in prot_tab.iterrows()), - columns=['EntryNr', 'OmaHOG', 'SeqBufferLength']) - # Strip singletons - df = df[~(df['OmaHOG'] == b'')] - - # Reformat HOG ID to plain-integer for top-level grouping only - df['OmaHOG'] = df['OmaHOG'].apply(lambda i: int(i[4:].split(b'.')[0])) - - # Load domains - domains = pandas.DataFrame.from_records(self.h5.root.Annotations.Domains[:]) - - # Ensure sorted by coordinate - TODO: move this to DA import function - domains['start'] = domains['Coords'].apply(lambda c: - 
int(c.split(b':')[0])) - domains.sort_values(['EntryNr', 'start'], inplace=True) - domains = domains[['EntryNr', 'DomainId']] - - # Merge domains / entry-hog tables. Keep entries with no domains - # so that we can count the size of the HOGs. - df = pandas.merge(df, domains, on='EntryNr', how='left') - - # Gather entry-domain for each HOG. - hog2dom = [] - hog2info = [] - for (hog_id, hdf) in tqdm(df.groupby('OmaHOG')): - size = len(set(hdf['EntryNr'])) - - hdf = hdf[~hdf['DomainId'].isnull()] - cov = len(set(hdf['EntryNr'])) # Coverage with any DA - - if (size > 2) and (cov > 1): - # There are some annotations - da = collections.defaultdict(list) - for (enum, edf) in hdf.groupby('EntryNr'): - d = edf['DomainId'] - d = tuple(d) if (type(d) != bytes) else (d,) - da[d].append(enum) - - da = sorted(da.items(), key=lambda i: len(i[1]), reverse=True) - c = len(da[0][1]) # Count of prev. DA - if c > 1: - # DA exists in more than one member. - cons_da = da[0][0] - repr_entry = da[0][1][0] - tl = hl_tab.read_where('Fam == {}'.format(hog_id))[0]['Level'].decode('ascii') - rep_len = hdf[hdf['EntryNr'] == repr_entry]['SeqBufferLength'] - rep_len = int(rep_len if len(rep_len) == 1 else list(rep_len)[0]) - - # Save the consensus DA - off = len(hog2info) # Offset in the information table. - hog2dom += [(off, d) for d in cons_da] - - # Save required information about this group for the web - # view. - hog2info.append((hog_id, # HOG ID - repr_entry, # Repr. entry - rep_len, # Repr. entry length - tl, # Top level of HOG - size, # HOG size - c)) # Prevalence - - # Create tables in file -- done this way as these end up being pretty - # small tables (<25MB) - tab = self.h5.create_table('/HOGAnnotations', - 'DomainArchPrevalence', - tablefmt.HOGDomainArchPrevalenceTable, - createparents=True, - expectedrows=len(hog2info)) - self._write_to_table(tab, hog2info) - tab.flush() # Required? - - # HOG <-> Domain table - tab = self.h5.create_table('/HOGAnnotations', - 'Domains', - tablefmt.HOGDomainPresenceTable, - createparents=True, - expectedrows=len(hog2dom)) - self._write_to_table(tab, hog2dom) - tab.flush() # Required? - - -def download_url_if_not_present(url, force_copy=False): - if url.startswith('file://') and not force_copy: - fname = url[len('file://'):] - if os.path.exists(fname): - common.package_logger.info('using file "{}" directly from source without copying.'.format(url)) - return fname - tmpfolder = os.path.join(os.getenv('DARWIN_NETWORK_SCRATCH_PATH', '/tmp'), "Browser", "xref") - basename = url.split('/')[-1] - fname = os.path.join(tmpfolder, basename) - if not os.path.exists(tmpfolder): - os.makedirs(tmpfolder) - if not os.path.exists(fname): - common.package_logger.info("downloading {} into {}".format(url, fname)) - try: - urllib.request.urlretrieve(url, fname) - except urllib.request.URLError: - common.package_logger.warn('cannot download {}'.format(url)) - return fname - - -def iter_domains(url): - DomainTuple = collections.namedtuple('DomainTuple', ('md5', 'id', 'coords')) - - fname = download_url_if_not_present(url) - with gzip.open(fname, 'rt') as uncompressed: - dialect = csv.Sniffer().sniff(uncompressed.read(4096)) - uncompressed.seek(0) - csv_reader = csv.reader(uncompressed, dialect) - col_md5, col_id, col_coord = (None,) * 3 - coord_fromat_trans = str.maketrans('-,', '::') - - for lineNr, row in enumerate(csv_reader): - if col_md5 is None: - # identify which tuples to use. - if len(row) >= 9: - # representative_proteins format. 
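# A small sketch of the "prevalent domain architecture" selection performed by
# add_hog_domain_prevalence above: members of a HOG are grouped by their ordered
# tuple of domain IDs and the architecture shared by most members wins. The
# entry-to-domains mapping below is an invented example.
import collections

def consensus_architecture(entry_domains):
    """entry_domains: dict mapping entry_nr -> ordered tuple of domain ids."""
    by_arch = collections.defaultdict(list)
    for entry_nr, arch in entry_domains.items():
        by_arch[arch].append(entry_nr)
    arch, members = max(by_arch.items(), key=lambda kv: len(kv[1]))
    return arch, members[0], len(members)     # consensus DA, representative entry, prevalence

demo = {11: ('PF00069',), 12: ('PF00069', 'PF07714'), 13: ('PF00069',)}
print(consensus_architecture(demo))           # (('PF00069',), 11, 2)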
use columns 5-7 - col_md5, col_id, col_coord = 4, 5, 6 - elif len(row) == 3: - # additionally created ones, minimal format - col_md5, col_id, col_coord = 0, 1, 2 - else: - raise DataImportError("Unknown Domain Annotation format in {}".format(uncompressed.filename)) - try: - dom = DomainTuple(row[col_md5], row[col_id], row[col_coord].translate(coord_fromat_trans)) - if lineNr < 10: - # do some sanity checks on the first few lines - if re.match(r'[0-9a-f]{32}$', dom.md5) is None: - raise DataImportError("md5 hash of line {:d} has unexpected values: {}" - .format(lineNr, dom.md5)) - if re.match(r'([1-4]\.\d+\.\d+\.\d+|PF\d+)$', dom.id) is None: - raise DataImportError("Domain-ID of line {:d} has unexpected value: {}" - .format(lineNr, dom.id)) - if re.match(r'\d+:\d+', dom.coords) is None: - raise DataImportError("Domain coordinates in line {:d} has unexpected value: {}" - .format(lineNr, dom.coords)) - yield dom - except Exception: - common.package_logger.exception('cannot create tuple from line {}'.format(lineNr)) - - -def only_pfam_or_cath_domains(iterable): - cath_re = re.compile(r'[1-4]\.') - for dom in iterable: - if dom.id.startswith('PF') or cath_re.match(dom.id) is not None: - yield dom - - -def filter_duplicated_domains(iterable): - """filter duplicated domain annotations that come from different proteins - with the exact same sequence.""" - seen = set([]) - ignored = 0 - for dom in iterable: - if not dom in seen: - seen.add(dom) - yield dom - else: - ignored += 1 - common.package_logger.info("skipped {} duplicated domains. {} distinct domains yielded" - .format(ignored, len(seen))) - - -class OmaGroupMetadataLoader(object): - """OMA Group Meta data extractor. - - This class provides the means to import the Keywords and Fingerprints - of the OMA Groups into the hdf5 database. The data is stored under - in the node defined by :attr:`meta_data_path`, which defaults to - /OmaGroups/MetaData. 
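# A sketch of the domain-ID filtering shown in only_pfam_or_cath_domains above:
# Pfam accessions start with 'PF', CATH/Gene3D IDs start with a class digit 1-4
# followed by a dot. The example IDs are illustrative only.
import re

CATH_RE = re.compile(r'[1-4]\.')

def keep(domain_id: str) -> bool:
    return domain_id.startswith('PF') or CATH_RE.match(domain_id) is not None

print([d for d in ('PF00069', '3.40.50.300', 'SSF56112', 'SM00220') if keep(d)])
# ['PF00069', '3.40.50.300']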
- """ - keyword_name = "Keywords.drw" - finger_name = "Fingerprints" - - meta_data_path = '/OmaGroups/MetaData' - - def __init__(self, db): - self.db = db - - def add_data(self): - common.package_logger.info('adding OmaGroup Metadata') - nr_groups = self._get_nr_of_groups() - has_meta_data = self._check_textfiles_avail() - if has_meta_data: - data = self._load_data() - fingerprints = data['Fingerprints'] - keywords = data['Keywords'] - else: - common.package_logger.warning('No fingerprint nor keyword information available') - fingerprints = [b'n/a'] * nr_groups - keywords = [b''] * nr_groups - if nr_groups != len(fingerprints) or nr_groups != len(keywords): - raise DataImportError('nr of oma groups does not match the number of fingerprints and keywords') - - grptab, keybuf = self._create_db_objects(nr_groups) - self._fill_data_into_db(fingerprints, keywords, grptab, keybuf) - grptab.modify_column(column=self._get_group_member_counts(), colname='NrMembers') - self._create_indexes(grptab) - - def _create_db_objects(self, nrows): - key_path = os.path.join(os.path.dirname(self.meta_data_path), 'KeywordBuffer') - try: - self.db.get_node(self.meta_data_path) - self.db.remove_node(self.meta_data_path) - self.db.remove_node(key_path) - except tables.NoSuchNodeError: - pass - root, name = self.meta_data_path.rsplit('/', 1) - grptab = self.db.create_table(root, name, tablefmt.OmaGroupTable, - expectedrows=nrows, createparents=True) - buffer = self.db.create_earray(root, "KeywordBuffer", tables.StringAtom(1), (0,), - 'concatenated group keywords descriptions', - expectedrows=500 * nrows) - return grptab, buffer - - def _fill_data_into_db(self, stable_ids, keywords, grp_tab, key_buf): - row = grp_tab.row - buf_pos = 0 - for i in range(len(stable_ids)): - row['GroupNr'] = i+1 - row['Fingerprint'] = stable_ids[i] - row['KeywordOffset'] = buf_pos - row['KeywordLength'] = len(keywords[i]) - row.append() - key = numpy.ndarray((len(keywords[i]),), buffer=keywords[i], - dtype=tables.StringAtom(1)) - key_buf.append(key) - buf_pos += len(keywords[i]) - grp_tab.flush() - key_buf.flush() - - def _create_indexes(self, grp_tab): - grp_tab.cols.Fingerprint.create_csindex() - grp_tab.cols.GroupNr.create_csindex() - - def _parse_darwin_string_list_file(self, fh): - data = fh.read() - start, end = data.find(b'['), data.rfind(b', NULL]') - if end == -1: - end = data.rfind(b']:') - part = data[start:end] + b']' - as_json = part.replace(b"''", b"__apos__").replace(b"'", b'"')\ - .replace(b'__apos__', b"'") - as_list = json.loads(as_json.decode()) - return [el.encode('utf8') for el in as_list] - - def _load_data(self): - return callDarwinExport('GetGroupData()') - - def _get_nr_of_groups(self): - etab = self.db.get_node('/Protein/Entries') - try: - return etab[etab.colindexes['OmaGroup'][-1]]['OmaGroup'] - except KeyError: - return max(etab.col('OmaGroup')) - - def _get_group_member_counts(self): - grp_nr, cnts = numpy.unique(self.db.get_node('/Protein/Entries').col('OmaGroup'), return_counts=True) - if grp_nr[0] == 0: - cnts = cnts[1:] - assert(len(cnts) == self._get_nr_of_groups()) - return cnts - - def _check_textfiles_avail(self): - rootdir = os.getenv('DARWIN_BROWSERDATA_PATH','') - fn1 = os.path.join(rootdir, self.keyword_name) - fn2 = os.path.join(rootdir, self.finger_name) - return os.path.exists(fn1) and os.path.exists(fn2) - - -class DescriptionManager(object): - def __init__(self, db, entry_path, buffer_path): - self.db = db - self.entry_path = entry_path - self.buffer_path = buffer_path - - def 
__enter__(self): - self.entry_tab = self.db.get_node(self.entry_path) - if not numpy.all(numpy.equal(self.entry_tab.col('EntryNr'), - numpy.arange(1, len(self.entry_tab) + 1))): - raise RuntimeError('entry table is not sorted') - - root, name = os.path.split(self.buffer_path) - self.desc_buf = self.db.create_earray(root, name, - tables.StringAtom(1), (0,), 'concatenated protein descriptions', - expectedrows=len(self.entry_tab) * 100) - self.cur_eNr = None - self.cur_desc = [] - bufindex_dtype = numpy.dtype([(col, self.entry_tab.coldtypes[col]) - for col in ('DescriptionOffset', 'DescriptionLength')]) - # columns to be stored in entry table with buffer index data - self.buf_index = numpy.zeros(len(self.entry_tab), dtype=bufindex_dtype) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.cur_eNr: - self._store_description() - self.desc_buf.flush() - self.entry_tab.modify_columns(columns=self.buf_index, - names=self.buf_index.dtype.names) - self.entry_tab.flush() - - def add_description(self, eNr, desc): - """stages a description for addition. Note that the descriptions - must be ordered according to the entryNr, i.e. all descriptions - related to eNr X must be staged before changeing to another eNr.""" - if self.cur_eNr and self.cur_eNr != eNr: - self._store_description() - self.cur_desc = [] - self.cur_eNr = eNr - self.cur_desc.append(desc) - - def _store_description(self): - buf = "; ".join(self.cur_desc).encode('utf-8') - buf = buf[0:2 ** 16 - 1] # limit to max value of buffer length field - len_buf = len(buf) - idx = self.cur_eNr - 1 - self.buf_index[idx]['DescriptionOffset'] = len(self.desc_buf) - self.buf_index[idx]['DescriptionLength'] = len_buf - self.desc_buf.append(numpy.ndarray((len_buf,), buffer=buf, dtype=tables.StringAtom(1))) - - -class GeneOntologyManager(object): - ontology_url = "http://purl.obolibrary.org/obo/go/go-basic.obo" - - def __init__(self, db, annotation_path, ontology_path): - self.db = db - self.annotation_path = annotation_path - self.ontology_path = ontology_path - self._go_buf = [] - self.quote_re = re.compile(r'([[,])([\w_:]+)([,\]])') - - def __enter__(self): - go_obo_file = download_url_if_not_present(self.ontology_url) - # check that ontology file is not broken. 
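# A pure-Python sketch of the buffer-plus-index pattern used by DescriptionManager
# (and by the keyword buffer of OmaGroupMetadataLoader) above: variable-length
# strings are concatenated into one byte buffer and each record only stores its
# offset and length. The descriptions are invented examples.
def build_buffer(descriptions):
    buf, index = bytearray(), []
    for text in descriptions:
        raw = text.encode('utf-8')[:2 ** 16 - 1]   # length field is bounded, as in the original
        index.append((len(buf), len(raw)))
        buf.extend(raw)
    return bytes(buf), index

buf, index = build_buffer(['kinase, putative', 'hypothetical protein'])
offset, length = index[1]
print(buf[offset:offset + length].decode())        # 'hypothetical protein'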
if we can build it, it should be ok - self.go = GeneOntology(OntologyParser(go_obo_file)) - self.go.parse() - - with open(go_obo_file, 'rb') as fh: - go_obo = fh.read() - root, name = os.path.split(self.ontology_path) - obo = self.db.create_carray(root, name, title='Gene ontology hierarchy definition', createparents=True, - obj=numpy.ndarray(len(go_obo), buffer=go_obo, dtype=tables.StringAtom(1))) - obo._f_setattr('ontology_release', self._get_obo_version(obo)) - - root, name = os.path.split(self.annotation_path) - self.go_tab = self.db.create_table(root, name, tablefmt.GeneOntologyTable, - 'Gene Ontology annotations', expectedrows=1e8, createparents=True) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self._flush_buffers() - self.go_tab.flush() - - def _get_obo_version(self, obo_arr): - header = obo_arr[0:1000].tobytes() - rel_info = re.search(b'data-version:\s*(?P[\w/_ -]+)', header) - if rel_info is not None: - rel_info = rel_info.group('version').decode() - return rel_info - - def _flush_buffers(self): - common.package_logger.info('flushing go annotations buffers') - if len(self._go_buf) > 0: - self.go_tab.append(self._go_buf) - self._go_buf = [] - - def add_annotations(self, enr, gos): - """parse go annotations and add them to the go buffer""" - if not (isinstance(enr, int) and isinstance(gos, str)): - raise ValueError('input data invalid') - for t in gos.split('; '): - t = t.strip() - try: - term, rem = t.split('@') - except ValueError as e: - common.package_logger.warning('cannot parse GO annotation: ' + t) - continue - - try: - term_nr = self.go.term_by_id(term).id - except ValueError: - common.package_logger.warning('invalid GO term for entry {:d}: {:s} (likely obsolete)' - .format(enr, term)) - continue - rem = rem.replace('{', '[') - rem = rem.replace('}', ']') - rem = self.quote_re.sub('\g<1>"\g<2>"\g<3>', rem) - for evi, refs in eval(rem): - for ref in refs: - self._go_buf.append((enr, term_nr, evi, ref.encode('utf-8'))) - if len(self._go_buf) > 2e6: - self._flush_buffers() - - -class GroupAnnotatorInclGeneRefs(familyanalyzer.GroupAnnotator): - def _annotateGroupR(self, node, og, idx=0): - if familyanalyzer.OrthoXMLQuery.is_geneRef_node(node): - node.set('og', og) - else: - super()._annotateGroupR(node, og, idx) - - -class HogConverter(object): - def __init__(self, entry_tab): - self.fam_re = re.compile(r'HOG:(?P\d+)') - self.hogs = numpy.zeros(shape=(len(entry_tab) + 1,), dtype=entry_tab.cols.OmaHOG.dtype) - self.entry_tab = entry_tab - - def attach_newick_taxonomy(self, tree): - self.taxonomy = familyanalyzer.NewickTaxonomy(tree) - - def _assert_hogid_has_correct_prefix(self, fa_parser): - for grp in fa_parser.getToplevelGroups(): - if not grp.get('id').startswith('HOG:'): - grp.set('id', 'HOG:{:07d}'.format(int(grp.get('id')))) - - def convert_file(self, fn): - p = familyanalyzer.OrthoXMLParser(fn) - self._assert_hogid_has_correct_prefix(p) - if hasattr(self, 'taxonomy'): - p.augmentTaxonomyInfo(self.taxonomy) - else: - p.augmentTaxonomyInfo(familyanalyzer.TaxonomyFactory.newTaxonomy(p)) - GroupAnnotatorInclGeneRefs(p).annotateDoc() - - levs = [] - for fam in p.getToplevelGroups(): - m = self.fam_re.match(fam.get('og')) - fam_nr = int(m.group('fam_nr')) - levs.extend([(fam_nr, n.getparent().get('og'), n.get('value'),) + self.get_hog_scores(n.getparent()) - for n in p._findSubNodes('property', root=fam) - if n.get('name') == "TaxRange"]) - - geneNodes = p.root.findall('.//{{{ns0}}}geneRef'. 
- format(**familyanalyzer.OrthoXMLParser.ns)) - for x in geneNodes: - self.hogs[int(x.get('id'))] = x.get('og') - - return levs - - def write_hogs(self): - """update the Entry Table with the newly collected OmaHOG values for all - the proteins at once. - - .. note: This method will overwrite any previous value of the OmaHOG column""" - self.entry_tab.modify_column(0, len(self.entry_tab), 1, self.hogs[1:], 'OmaHOG') - self.entry_tab.flush() - - def get_hog_scores(self, og_node): - """extract the scores associated with an orthologGroup node - - only scores that are defined in HOGsTable are extract. The method - returns a tuple with the scores in the order of the score fields.""" - scores = collections.OrderedDict([(score, tablefmt.HOGsTable.columns[score].dflt) - for score in ('CompletenessScore', 'ImpliedLosses')]) - for score in og_node.iterfind('{*}score'): - score_id = score.get("id") - if score_id == "CompletenessScore": - scores['CompletenessScore'] = float(score.get('value')) - elif score_id == "ImpliedLosses": - scores['ImpliedLosses'] = int(score.get('value')) - return tuple(scores.values()) - - -class XRefImporter(object): - """Object to import various types of crossreferences into hdf5. - - The XRefImporter registers at a db_parser object various handlers - to import the various types of xrefs, namely ids, go-terms, - EC annotations and descriptions.""" - def __init__(self, db_parser, genomes_tab, xref_tab, ec_tab, go_manager, desc_manager): - self.xrefs = [] - self.ec = [] - self.xref_tab = xref_tab - self.ec_tab = ec_tab - self.go_manager = go_manager - self.desc_manager = desc_manager - - self.verif_enum = tablefmt.XRefTable.columns.get('Verification').enum - xrefEnum = tablefmt.XRefTable.columns.get('XRefSource').enum - tag_to_enums = { - 'GI': (xrefEnum['GI'], 'exact'), - 'EntrezGene': (xrefEnum['EntrezGene'], 'exact'), - 'WikiGene': (xrefEnum['WikiGene'], 'unchecked'), - 'IPI': (xrefEnum['IPI'], 'unchecked'), - 'Refseq_ID': (xrefEnum['RefSeq'], 'exact'), - 'SwissProt': (xrefEnum['UniProtKB/SwissProt'], 'exact'), - 'GeneName': (xrefEnum['Gene Name'], 'unchecked'), - 'ORFNames': (xrefEnum['ORF Name'], 'unchecked'), - 'OrderedLocusNames': (xrefEnum['Ordered Locus Name'], 'unchecked'), - 'ProtName': (xrefEnum['Protein Name'], 'unchecked'), - 'Synonyms': (xrefEnum['Synonym'], 'unchecked'), - 'HGNC_Id': (xrefEnum['HGNC'], 'unchecked'), - 'PMP': (xrefEnum['PMP'], 'exact'), - 'PDB': (xrefEnum['PDB'], 'unchecked'), - 'EMBL': (xrefEnum['EMBL'], 'unchecked'), - 'ID': (xrefEnum['SourceID'], 'exact'), - 'AC': (xrefEnum['SourceAC'], 'exact'), - } - for tag, enumval in tag_to_enums.items(): - db_parser.add_tag_handler( - tag, - lambda key, enr, typ=enumval: self.multi_key_handler(key, enr, typ[0], typ[1])) - db_parser.add_tag_handler('DE', - lambda key, enr: self.description_handler(key, enr)) - db_parser.add_tag_handler('GO', self.go_handler) - db_parser.add_tag_handler('ID', self.assign_source_handler) - db_parser.add_tag_handler('AC', self.assign_source_handler) - db_parser.add_tag_handler('EC', self.ec_handler) - - for tag in ['SwissProt_AC', 'UniProt']: # UniProt/TrEMBL tag is cut to UniProt! 
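# A sketch of the handler-registration pattern used in XRefImporter.__init__ above.
# The `typ=enumval` default argument is what binds the current enum value to each
# lambda; a plain closure over the loop variable would make every handler see only
# the last value of the loop. Tag names and source ids below are placeholders.
handlers = {}
for tag, source_id in [('SwissProt', 1), ('Refseq_ID', 2), ('EntrezGene', 3)]:
    handlers[tag] = lambda key, enr, typ=source_id: (enr, typ, key)

print(handlers['SwissProt']('P12345', 7))    # (7, 1, 'P12345')
print(handlers['EntrezGene']('672', 7))      # (7, 3, '672')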
- db_parser.add_tag_handler(tag, - lambda key, enr, typ=xrefEnum['UniProtKB/TrEMBL']: - self.remove_uniprot_code_handler(key, enr, typ)) - - # register the potential_flush as end_of_entry_notifier - db_parser.add_end_of_entry_notifier(self.potential_flush) - - self.db_parser = db_parser - self.xrefEnum = xrefEnum - self.ENS_RE = re.compile(r'ENS(?P[A-Z]{0,3})(?P[GTP])(?P\d{11})') - self.FB_RE = re.compile(r'FB(?P[gnptr]{2})(?P\d{7})') - self.NCBI_RE = re.compile(r'[A-Z]{3}\d{5}\.\d$') - self.WB_RE = re.compile(r'WBGene\d{8}$') - self.EC_RE = re.compile(r'\d+\.(\d+|-)\.(\d+|-)\.(\d+|-)') - self.ENSGENOME_RE = re.compile(b'Ensembl (Metazoa|Plant|Fungi|Protist|Bacteria)', re.IGNORECASE) - - self.FLUSH_SIZE = 5e6 - - # info about current genome - self.genomes_tab = genomes_tab - self._cur_genome = None - - def _get_genome_info(self, entry_nr): - if not (self._cur_genome is not None and self._cur_genome['EntryOff'] < entry_nr <= - self._cur_genome['EntryOff'] + self._cur_genome['TotEntries']): - self._cur_genome = self.genomes_tab[self.genomes_tab['EntryOff'].searchsorted(entry_nr+1)-1] - return self._cur_genome - - def from_EnsemblGenome(self, entry_nr): - genome_info = self._get_genome_info(entry_nr) - return self.ENSGENOME_RE.search(genome_info['Release']) is not None - - def flush_buffers(self): - common.package_logger.info('flushing xrefs and ec buffers') - if len(self.xrefs) > 0: - self.xref_tab.append(sorted(uniq(self.xrefs))) - self.xrefs = [] - if len(self.ec) > 0: - self.ec_tab.append(sorted(uniq(self.ec))) - self.ec = [] - - def potential_flush(self): - if len(self.xrefs) > self.FLUSH_SIZE: - self.flush_buffers() - - def _add_to_xrefs(self, eNr, enum_nr, key, verif='unchecked'): - if not isinstance(eNr, int): - raise ValueError('eNr is of wrong type:' + str(eNr)) - self.xrefs.append((eNr, enum_nr, key.encode('utf-8'), self.verif_enum[verif], )) - - def key_value_handler(self, key, eNr, enum_nr, verif='unchecked'): - """basic handler that simply adds a key (the xref) under a given enum_nr""" - self._add_to_xrefs(eNr, enum_nr, key, verif) - - def multi_key_handler(self, multikey, eNr, enum_nr, verif='unchecked'): - """try to split the myltikey field using '; ' as a delimiter and add each - part individually under the passed enum_nr id type.""" - for key in multikey.split('; '): - if key.startswith('Rep'): - continue - pos = key.find('.Rep') - if pos > 0: - key = key[0:pos] - self._add_to_xrefs(eNr, enum_nr, key, verif) - - def assign_source_handler(self, multikey, eNr): - """handler that splits the multikey field at '; ' locations and - tries to guess for each part the id_type. 
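# A sketch of the regex-based id-type guessing done by assign_source_handler; the
# patterns are modelled on the ones set up in XRefImporter.__init__ above (with the
# named groups written out) and the accessions are made up.
import re

PATTERNS = {
    'Ensembl': re.compile(r'ENS(?P<species>[A-Z]{0,3})(?P<typ>[GTP])(?P<num>\d{11})'),
    'FlyBase': re.compile(r'FB(?P<typ>[gnptr]{2})(?P<num>\d{7})'),
    'WormBase': re.compile(r'WBGene\d{8}$'),
    'NCBI': re.compile(r'[A-Z]{3}\d{5}\.\d$'),
}

def guess_source(key):
    return [name for name, rx in PATTERNS.items() if rx.match(key)]

for key in ('ENSG00000139618', 'FBgn0003731', 'WBGene00006789', 'ABC12345.1', 'foo'):
    print(key, guess_source(key))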
If a type could be - identified, it is added under with this id type, otherwise left out.""" - for key in multikey.split('; '): - ens_match = self.ENS_RE.match(key) - if ens_match is not None: - typ = ens_match.group('typ') - if typ == 'P': - enum_nr = self.xrefEnum['Ensembl Protein'] - elif typ == 'G': - enum_nr = self.xrefEnum['Ensembl Gene'] - elif typ == 'T': - enum_nr = self.xrefEnum['Ensembl Transcript'] - common.package_logger.debug( - 'ensembl: ({}, {}, {})'.format(key, typ, enum_nr)) - self._add_to_xrefs(eNr, enum_nr, key, 'exact') - - for enum, regex in {'FlyBase': self.FB_RE, 'NCBI': self.NCBI_RE, 'WormBase': self.WB_RE}.items(): - match = regex.match(key) - if match is not None: - enum_nr = self.xrefEnum[enum] - self._add_to_xrefs(eNr, enum_nr, key, 'unchecked') - if self.from_EnsemblGenome(eNr): - self._add_to_xrefs(eNr, self.xrefEnum.EnsemblGenomes, key, 'exact') - - def go_handler(self, gos, enr): - self.go_manager.add_annotations(enr, gos) - - def ec_handler(self, ecs, enr): - for t in ecs.split('; '): - t = t.strip() - acc_match = self.EC_RE.match(t) - if acc_match is not None: - self.ec.append((enr, acc_match.group(0))) - - def description_handler(self, de, eNr): - self.desc_manager.add_description(eNr, de) - - def remove_uniprot_code_handler(self, multikey, eNr, enum_nr): - """remove the species part (sep by '_') of a uniprot long accession to the short acc""" - common.package_logger.debug( - 'remove_uniprot_code_handler called ({}, {},{})'.format(multikey, eNr, enum_nr)) - for key in multikey.split('; '): - pos = key.find('_') - if pos > 0: - self._add_to_xrefs(eNr, enum_nr, key[0:pos], 'exact') - else: - self._add_to_xrefs(eNr, enum_nr, key, 'exact') - - def build_suffix_index(self, force=False): - parent, name = os.path.split(self.xref_tab._v_pathname) - file_ = self.xref_tab._v_file - idx_node = get_or_create_tables_node(file_, os.path.join(parent, "{}_Index".format(name))) - for arr_name, typ in (('buffer', tables.StringAtom(1)), ('offset', tables.UInt32Atom())): - try: - n = idx_node._f_get_child(arr_name) - if not force: - raise tables.NodeError("Suffix index for xrefs does already exist. 
Use 'force' to overwrite") - n.remove() - except tables.NoSuchNodeError: - pass - file_.create_earray(idx_node, arr_name, typ, (0,), expectedrows=100e6) - buf, off = (idx_node._f_get_child(node) for node in ('buffer', 'offset')) - self._build_lowercase_xref_buffer(buf, off) - sa = sais(buf) - try: - idx_node._f_get_child('suffix').remove() - except tables.NoSuchNodeError: - pass - file_.create_carray(idx_node, 'suffix', obj=sa) - - def _build_lowercase_xref_buffer(self, buf, off): - cur_pos = 0 - for xref_row in tqdm(self.xref_tab): - lc_ref = xref_row['XRefId'].lower() - ref = numpy.ndarray((len(lc_ref),), buffer=lc_ref, dtype=tables.StringAtom(1)) - buf.append(ref) - off.append([cur_pos]) - cur_pos += len(lc_ref) - - -class DarwinDbEntryParser: - def __init__(self): - """Initializes a Parser for SGML formatted darwin database file - """ - self.tag_handlers = collections.defaultdict(list) - self.end_of_entry_notifier = [] - - def add_tag_handler(self, tag, handler): - """add a callback handler for a certain tag""" - self.tag_handlers[tag].append(handler) - common.package_logger.debug('# handlers for {}: {}'.format(tag, len(self.tag_handlers[tag]))) - - def add_end_of_entry_notifier(self, handler): - self.end_of_entry_notifier.append(handler) - - def parse_entrytags(self, fh): - """ AC, CHR, DE, E, EMBL, EntrezGene, GI, GO, HGNC_Name, HGNC_Sym, - ID, InterPro, LOC, NR , OG, OS, PMP, Refseq_AC, Refseq_ID, SEQ, - SwissProt, SwissProt_AC, UniProt/TrEMBL, WikiGene, flybase_transcript_id - - :param fh: an already opened file handle to the darwin database - file to be parsed.""" - eNr = 0 - for line in fh: - line = line.strip() - if not line.startswith(''): - common.package_logger.debug('skipping line:' + line) - continue - - eNr += 1 - common.package_logger.debug('entry {}: {}'.format(eNr, line.encode('utf-8'))) - entry = lxml.html.fragment_fromstring(line) - for tag, handlers in self.tag_handlers.items(): - common.package_logger.debug('tag {} ({} handlers)'.format(tag, len(handlers))) - tag_text = [t.text for t in entry.findall('./' + tag.lower())] - for value in tag_text: - # common.package_logger.debug('value of tag: {}'.format(value.encode('utf-8'))) - if value is None: - continue - for handler in handlers: - handler(value, eNr) - # common.package_logger.debug('called handler {} with ({},{})'.format( - # handler, value.encode('utf-8'), eNr)) - for notifier in self.end_of_entry_notifier: - notifier() - - -DomainDescription = collections.namedtuple('DomainDescription', - tables.dtype_from_descr(tablefmt.DomainDescriptionTable).names) - - -class CathDomainNameParser(object): - re_pattern = re.compile(r'(?P[0-9.]*)\s{3,}\w{7}\s{3,}:\s*(?P.*)') - source = b'CATH/Gene3D' - - def __init__(self, url): - self.fname = download_url_if_not_present(url) - - def parse(self): - open_lib = gzip.open if self.fname.endswith('.gz') else open - with open_lib(self.fname, 'rt') as fh: - for line in fh: - match = self.re_pattern.match(line) - if match is not None: - yield DomainDescription(DomainId=match.group('id').encode('utf-8'), - Source=self.source, - Description=match.group('desc').encode('utf-8')) - - -class PfamDomainNameParser(CathDomainNameParser): - re_pattern = re.compile(r'(?P\w*)\t\w*\t\w*\t\w*\t(?P.*)') - source = b'Pfam' - - -def augment_genomes_json_download_file(fpath, h5, backup='.bak'): - """Augment the genomes.json file in the download section with additional info - - This function stores the ncbi taxonomy identifiers of internal nodes and adds - the number of ancestral genes to the internal 
nodes. - - :param fpath: path to genomes.json file - :param h5: hdf5 database handle.""" - common.package_logger.info("Augmenting genomes.json file with Nr of HOGs per level") - # load nr of ancestral genomes at each level - ancestral_hogs = collections.Counter() - step = 2**15 - hog_tab = h5.get_node('/HogLevel') - for start in range(0, len(hog_tab), step): - ancestral_hogs.update((l.decode() for l in hog_tab.read(start, stop=start+step, field='Level'))) - # load taxonomy and sorter by Name - tax = h5.get_node('/Taxonomy').read() - sorter = numpy.argsort(tax['Name']) - with open(fpath, 'rt') as fh: - genomes = json.load(fh) - os.rename(fpath, fpath + '.bak') - - def traverse(node): - if 'children' not in node: - return - for child in node['children']: - traverse(child) - try: - node['nr_hogs'] = ancestral_hogs[node['name']] - except KeyError as e: - common.package_logger.warning('no ancestral hog counts for '+node['name']) - node['nr_hogs'] = 0 - - try: - n = node['name'].encode('utf-8') - idx = numpy.searchsorted(tax['Name'], n, sorter=sorter) - if tax['Name'][sorter[idx]] == n: - node['taxid'] = int(tax['NCBITaxonId'][sorter[idx]]) - else: - raise ValueError('not in taxonomy: {}'.format(n)) - except Exception: - common.package_logger.exception('Cannot identify taxonomy id') - - traverse(genomes) - with open(fpath, 'wt') as fh: - json.dump(genomes, fh) - - -def getLogger(level='DEBUG'): - import logging - - log = logging.getLogger('pyoma') - if isinstance(level, str): - level = logging.getLevelName(level.upper()) - if not isinstance(level, int): - level = logging.DEBUG - log.setLevel(level) - logHandler = logging.StreamHandler() - logHandler.setLevel(level) - logHandler.setFormatter(logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) - log.addHandler(logHandler) - return log - - -def main(name="OmaServer.h5", k=6, idx_name=None, domains=None, log_level='INFO'): - idx_name = (name + '.idx') if idx_name is None else idx_name - - log = getLogger(log_level) - x = DarwinExporter(name, logger=log) - x.add_version() - x.add_species_data() - x.add_orthologs() - x.add_same_species_relations() - x.add_proteins() - x.add_hogs() - x.add_xrefs() - x.add_synteny_scores() - x.add_homoeology_confidence() - if domains is None: - domains = ["file://dev/null"] - x.add_domain_info(filter_duplicated_domains(only_pfam_or_cath_domains(itertools.chain( - iter_domains('ftp://orengoftp.biochem.ucl.ac.uk/gene3d/CURRENT_RELEASE/' + - 'representative_uniprot_genome_assignments.csv.gz'), - iter_domains('file://{}/additional_domains.mdas.csv.gz'.format(os.getenv('DARWIN_BROWSERDATA_PATH', ''))) - )))) - x.add_domainname_info(itertools.chain( - CathDomainNameParser('http://download.cathdb.info/cath/releases/latest-release/' - 'cath-classification-data/cath-names.txt').parse(), - PfamDomainNameParser('ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz').parse())) - x.add_canonical_id() - x.add_group_metadata() - x.add_hog_domain_prevalence() - x.close() - - x = DarwinExporter(name, logger=log) - x.create_indexes() - x.add_sequence_suffix_array(k=k, fn=idx_name) - x.update_summary_stats() - - genomes_json_fname = os.path.normpath(os.path.join( - os.path.dirname(name), '..', 'downloads', 'genomes.json')) - augment_genomes_json_download_file(genomes_json_fname, x.h5) - x.close() diff --git a/src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py b/src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py deleted file mode 100755 index 1857f3e..0000000 --- 
a/src/HogProf/build/lib/pyoma/browser/convert_omastandalone.py +++ /dev/null @@ -1,139 +0,0 @@ -from .convert import * -from pyoma.browser import OrthoXMLSplitter -import os - - -class StandaloneExporter(DarwinExporter): - DRW_CONVERT_FILE = os.path.abspath(os.path.splitext(__file__)[0] + ".drw") - - def __init__(self, root, name, **kwargs): - os.environ['DARWIN_BROWSERDATA_PATH'] = os.path.abspath(root) - super(StandaloneExporter, self).__init__(name, **kwargs) - self.transformed = False - self.cache_dir = os.path.join(os.getenv('DARWIN_BROWSERDATA_PATH'), 'pyoma') - - def add_homologs(self): - self.assert_cached_results() - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'homologs' not in rel_node_for_genome: - pass - - def get_version(self): - # TODO: obtain real version - return "OmaStandalone; 1.0.x" - - def assert_cached_results(self): - if not self.transformed: - res = self.call_darwin_export("TransformDataToCache('{}');".format( - self.cache_dir)) - if res != 'success': - raise DarwinException('could not transform data from darwin') - self.transformed = True - os.environ['DARWIN_NETWORK_SCRATCH_PATH'] = os.getenv('DARWIN_BROWSERDATA_PATH') - - def add_orthologs(self): - self.assert_cached_results() - for gs in self.h5.root.Genome.iterrows(): - genome = gs['UniProtSpeciesCode'].decode() - rel_node_for_genome = self._get_or_create_node('/PairwiseRelation/{}'.format(genome)) - if 'VPairs' not in rel_node_for_genome: - cache_file = os.path.join( - os.getenv('DARWIN_NETWORK_SCRATCH_PATH', ''), - 'pyoma', 'vps', '{}.txt.gz'.format(genome)) - if os.path.exists(cache_file): - data = load_tsv_to_numpy((cache_file, 0, 0, False,)) - else: - # fallback to read from VPsDB - data = self.call_darwin_export('GetVPsForGenome({})'.format(genome)) - - vp_tab = self.h5.create_table(rel_node_for_genome, 'VPairs', tablefmt.PairwiseRelationTable, - expectedrows=len(data)) - if isinstance(data, list): - data = self._convert_to_numpyarray(data, vp_tab) - self._write_to_table(vp_tab, data) - vp_tab.cols.EntryNr1.create_csindex() - - def add_hogs(self): - hog_path = os.path.join( - os.environ['DARWIN_BROWSERDATA_PATH'], 'Output') - - entryTab = self.h5.get_node('/Protein/Entries') - - tree_filename = os.path.join( - os.environ['DARWIN_BROWSERDATA_PATH'], - 'EstimatedSpeciesTree.nwk') - - hog_converter = HogConverter(entryTab) - - if os.path.exists(tree_filename): - hog_converter.attach_newick_taxonomy(tree_filename) - - fn = 'HierarchicalGroups.orthoxml' - - # Split the OrthoXML up (puts in cache_dir/split_hog). 
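# A sketch of the cache-or-recompute pattern used by StandaloneExporter.add_orthologs
# above: a per-genome result is read from a cached gzipped TSV when it exists,
# otherwise it is produced by the expensive exporter call. The path and the
# `compute` callable below are placeholders.
import gzip
import os

def load_or_compute(cache_file, compute):
    if os.path.exists(cache_file):
        with gzip.open(cache_file, 'rt') as fh:
            return [line.rstrip('\n').split('\t') for line in fh]
    return compute()

rows = load_or_compute('/tmp/pyoma/vps/HUMAN.txt.gz', lambda: [['1', '2', 'n/a']])
print(rows[:1])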
- hog_cache_dir = os.path.join(self.cache_dir, 'split_hogs') - ortho_splitter = OrthoXMLSplitter.OrthoXMLSplitter(os.path.join(hog_path, fn), cache_dir=hog_cache_dir) - ortho_splitter() - - hogTab = self.h5.create_table('/', 'HogLevel', tablefmt.HOGsTable, - 'nesting structure for each HOG', expectedrows=1e8) - self.orthoxml_buffer = self.h5.create_earray('/OrthoXML', 'Buffer', - tables.StringAtom(1), (0,), 'concatenated orthoxml files', - expectedrows=1e9, createparents=True) - self.orthoxml_index = self.h5.create_table('/OrthoXML', 'Index', tablefmt.OrthoXmlHogTable, - 'Range index per HOG into OrthoXML Buffer', expectedrows=5e6) - - try: - levels = hog_converter.convert_file(os.path.join(hog_path, fn)) - hogTab.append(levels) - fam_nrs = set([z[0] for z in levels]) - for fam_nr in fam_nrs: - hog_fn = "HOG{:06d}.orthoxml".format(fam_nr) - self.add_orthoxml(os.path.join(hog_cache_dir, hog_fn), [fam_nr]) - except Exception as e: - self.logger.error('an error occured while processing ' + fn + ':') - self.logger.exception(e) - - hog_converter.write_hogs() - - def _get_genome_database_paths(self): - return self.call_darwin_export('GetGenomeFileNames();') - - def xref_databases(self): - return self._get_genome_database_paths() - - -def import_oma_run(path, outfile, add_domains=True, log_level='INFO'): - log = getLogger(log_level) - x = StandaloneExporter(path, outfile, logger=log, mode='write') - x.add_version() - x.add_species_data() - x.add_orthologs() - x.add_proteins() - x.add_hogs() - x.add_xrefs() - domain_url = ('ftp://orengoftp.biochem.ucl.ac.uk/gene3d/CURRENT_RELEASE/'+ - 'representative_uniprot_genome_assignments.csv.gz') - if not add_domains: - domain_url = 'file:///dev/null' - x.add_domain_info(only_pfam_or_cath_domains(iter_domains(domain_url))) - x.add_domainname_info(itertools.chain( - CathDomainNameParser('http://download.cathdb.info/cath/releases/latest-release/' - 'cath-classification-data/cath-names.txt').parse(), - PfamDomainNameParser('ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz').parse())) - x.add_canonical_id() - x.add_group_metadata() - x.add_hog_domain_prevalence() - x.close() - - x = StandaloneExporter(path, outfile, logger=log) - x.create_indexes() - x.add_sequence_suffix_array() - x.update_summary_stats() - x.close() - - -if __name__ == "__main__": - import_oma_run('~/Repositories/OmaStandalone', 'oma.h5') diff --git a/src/HogProf/build/lib/pyoma/browser/db.py b/src/HogProf/build/lib/pyoma/browser/db.py deleted file mode 100755 index 17315fe..0000000 --- a/src/HogProf/build/lib/pyoma/browser/db.py +++ /dev/null @@ -1,1770 +0,0 @@ -from __future__ import division, print_function -from builtins import chr, range, object, zip, bytes -import io -import itertools -import time -from Bio.UniProt import GOA -from bisect import bisect_left -import dateutil -import pandas as pd -import pyopa -import tables -import threading -import numpy -import numpy.lib.recfunctions -import re -import json -import os -import collections -import logging -from .KmerEncoder import KmerEncoder -from .models import LazyProperty, KeyWrapper, ProteinEntry, Genome -from .geneontology import GeneOntology, OntologyParser, AnnotationParser, GOAspect -from xml.etree import ElementTree as et - -logger = logging.getLogger(__name__) - -# Raise stack limit for PyOPA ~400MB -threading.stack_size(4096*100000) - -# Global initialisations -GAF_VERSION = '2.1' - - -def count_elements(iterable): - """return the number of elements in an iterator in the most efficient way. 
- - Be aware that for unbound iterators, this method won't terminate! - :param iterable: an iterable object. - """ - counter = itertools.count() - collections.deque(zip(iterable, counter), maxlen=0) # (consume at C speed) - return next(counter) - - -_first_cap_re = re.compile('(.)([A-Z][a-z]+)') -_all_cap_re = re.compile('([a-z0-9])([A-Z])') -def to_snail_case(name): - """function to convert from CamelCase to snail_case""" - s1 = _first_cap_re.sub(r'\1_\2', name) - return _all_cap_re.sub(r'\1_\2', s1).lower() - - -class Database(object): - """This is the main interface to the oma database. Queries - will typically be issued by methods of this object. Typically - the result of queries will be :py:class:`numpy.recarray` objects.""" - EXPECTED_DB_SCHEMA = "3.2" - - def __init__(self, db): - if isinstance(db, str): - logger.info('opening {} for read-only'.format(db)) - self.db = tables.open_file(db, 'r') - elif isinstance(db, tables.File): - self.db = db - else: - raise ValueError(str(db) + ' is not a valid database type') - - try: - db_version = self.db.get_node_attr('/', 'db_schema_version') - except AttributeError: - db_version = "1.0" - - logger.info('database version: {}'.format(db_version)) - if db_version != self.EXPECTED_DB_SCHEMA: - exp_tup = self.EXPECTED_DB_SCHEMA.split('.') - db_tup = db_version.split('.') - if db_tup[0] != exp_tup[0]: - raise DBVersionError('Unsupported database version: {} != {} ({})' - .format(db_version, self.EXPECTED_DB_SCHEMA, self.db.filename)) - else: - logger.warning("outdated database version, but only minor version change: " - "{} != {}. Some functions might fail" - .format(db_version, self.EXPECTED_DB_SCHEMA)) - self.db_schema_version = tuple(int(z) for z in db_version.split(".")) - - try: - self.seq_search = SequenceSearch(self) - except DBConsistencyError as e: - logger.exception("Cannot load SequenceSearch. Any future call to seq_search will fail!") - self.seq_search = object() - self.id_resolver = IDResolver(self) - self.id_mapper = IdMapperFactory(self) - genomes = [Genome(self, g) for g in self.db.root.Genome.read()] - self.tax = Taxonomy(self.db.root.Taxonomy.read(), - genomes={g.ncbi_taxon_id: g for g in genomes}) - self._re_fam = None - self.format_hogid = None - self._set_hogid_schema() - - @LazyProperty - def gene_ontology(self): - """returns GeneOntology object containing hierarchy - of terms using the is_a and part_of relations. See - :meth:`load_gene_ontology` to parametrize the - creation of GeneOntology object.""" - return self.load_gene_ontology(GeneOntology) - - def load_gene_ontology(self, factory=None, rels=None): - """Instantiate GeneOntology object - - By default, a GeneOntology object is returned based on - the default relations (which are defined in :mod:`.gene_ontology`) - - The factory parameter allows to specify an subtype of - GeneOntology, e.g. :class:`.gene_ontology.FreqAwareGeneOntology`, - - The rels parameter should be a list of relation strings that - should be used as parents relations. 
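# A quick illustration of the count_elements and to_snail_case helpers defined above;
# both are restated so the snippet runs on its own, mirroring the logic shown in the
# surrounding code.
import collections
import itertools
import re

def count_elements(iterable):
    counter = itertools.count()
    collections.deque(zip(iterable, counter), maxlen=0)   # consume the pairs at C speed
    return next(counter)

_first_cap_re = re.compile('(.)([A-Z][a-z]+)')
_all_cap_re = re.compile('([a-z0-9])([A-Z])')

def to_snail_case(name):
    s1 = _first_cap_re.sub(r'\1_\2', name)
    return _all_cap_re.sub(r'\1_\2', s1).lower()

print(count_elements(x for x in range(10) if x % 3 == 0))   # 4
print(to_snail_case('AltSpliceVariant'))                    # 'alt_splice_variant'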
- - :param factory: GeneOntology factory - :param rels: list of rels for parent relations - - :returns: GeneOntology object""" - try: - fp = io.StringIO(self.db.root.Ontologies.GO.read().tobytes().decode('utf-8')) - except tables.NoSuchNodeError: - p = os.path.join(os.path.dirname(self.db.filename), 'go-basic.obo') - fp = open(p, 'rt') - if factory is None: - factory = GeneOntology - go = factory(OntologyParser(fp), rels=rels) - go.parse() - fp.close() - return go - - def get_hdf5_handle(self): - """return the handle to the database hdf5 file""" - return self.db - - def get_conversion_date(self): - """return the conversion end date from the DB attributes""" - return dateutil.parser.parse(self.db.root._v_attrs['conversion_end']) - - def ensure_entry(self, entry): - """This method allows to use an entry or an entry_nr. - - If necessary it will load the entry from the entry_nr, - otherwise returning the same object again. - - :param entry: the entry_nr of a protein to be loaded or a - protein entry.""" - try: - t = entry['AltSpliceVariant'] - return entry - except (TypeError, AttributeError, IndexError): - if isinstance(entry, (int, numpy.number)): - return self.entry_by_entry_nr(entry) - raise TypeError('Invalid type to retrieve an Entry') - except Exception: - raise TypeError('Invalid type to retrieve an Entry') - - def entry_by_entry_nr(self, entry_nr): - """Returns the entry from the /Protein/Entries table - corresponding to entry_nr. - - :param int entry_nr: a numeric identifier for the protein - entry""" - entry = self.db.root.Protein.Entries[entry_nr - 1] - if entry['EntryNr'] != entry_nr: - logger.warning('EntryNr {} not at position {}. Using index instead'.format(entry_nr, entry_nr - 1)) - entry = self.db.root.Protein.Entries.read_where( - 'EntryNr == {:d}'.format(entry_nr)) - if len(entry) != 1: - raise ValueError("there are {} entries with entry_nr {}".format(len(entry), entry_nr)) - entry = entry[0] - return entry - - def _set_hogid_schema(self): - """Determines the used HOG ID schema - - Some versions of the database have HOG IDs of the form - "HOG:0000001" and others without the prefix (e.g. standalone) - or with the prefix, but without padding. This method checks - which schema is used and sets the appropriate member vars - """ - re_id = re.compile(b'(?PHOG:)(?P\d+)') - for entry in self.db.root.Protein.Entries: - m = re_id.match(entry['OmaHOG']) - if m is None: - continue - nr = m.group('nr') - if len(nr) >= 7 and not nr.startswith(b'0'): - continue # a case where we cannot determine if padded nr - is_padded = nr.startswith(b'0') - prefix = m.group('prefix').decode() - if prefix is None: - prefix = '' - fmt = "{}{{:{}d}}".format(prefix, "07" if is_padded else "") - self._re_fam = re.compile('{}(?P\d{})' - .format(prefix, "{7,}" if is_padded else "+") - .encode('ascii')) - self.format_hogid = lambda fam: fmt.format(fam) - logger.info("setting HOG ID schema: re_fam: {}, hog_fmt: {}" - .format(self._re_fam, fmt)) - return - raise DBConsistencyError('no protein in a hog') - - def all_proteins_of_genome(self, genome): - """return all protein entries of a genome""" - rng = self.id_mapper['OMA'].genome_range(genome) - prot_tab = self.get_hdf5_handle().get_node('/Protein/Entries') - return prot_tab.read_where('(EntryNr >= {}) & (EntryNr <= {})'.format(rng[0], rng[1])) - - def main_isoforms(self, genome): - """returns the proteins that are the main isoforms of a genome. - - The main isoform is in this context the isoform that we used in OMA to - infer the orthologs. 
It is the one variant that has the most alignment - matches to all other gnomes. - - The genome parameter should be the UniProtSpeciesCode of the species of - interest. If it is a numeric value, the genome parameter is interpreted - as the protein entrynr. The method returns then the main isoforms for - the species to which this protein belongs. - - :Note: OMA only predicts orthologs for the main isoform, so there is no - difference if you work with only the main isoforms or all proteins of - a genome in terms of orthologs. - - :param genome: UniProtSpeciesCode of the genome of interest, or a gene - number (EntryNr) from the genome of interest. - """ - rng = self.id_mapper['OMA'].genome_range(genome) - prot_tab = self.get_hdf5_handle().get_node('/Protein/Entries') - return prot_tab.read_where( - '(EntryNr >= {}) & (EntryNr <= {}) & ((AltSpliceVariant == EntryNr) | (AltSpliceVariant == 0))' - .format(rng[0], rng[1])) - - def get_splicing_variants(self, entry): - e = self.ensure_entry(entry) - if e['AltSpliceVariant'] == 0: - return numpy.array([e], dtype=e.dtype) - # TODO: create index on AltSpliceVariant column?! - return self.get_hdf5_handle().get_node('/Protein/Entries').read_where( - '(EntryNr >= {:d}) & (EntryNr < {:d}) & (AltSpliceVariant == {:d})' - .format(e['EntryNr']-100, e['EntryNr']+100, e['AltSpliceVariant'])) - - def _get_vptab(self, entry_nr): - return self._get_pw_tab(entry_nr, 'VPairs') - - def _get_pw_tab(self, entry_nr, subtab): - genome = self.id_mapper['OMA'].genome_of_entry_nr(entry_nr)['UniProtSpeciesCode'].decode() - return self.db.get_node('/PairwiseRelation/{}/{}'.format(genome, subtab)) - - def count_vpairs(self, entry_nr): - vptab = self._get_vptab(entry_nr) - try: - cnt = count_elements(vptab.where('(EntryNr1=={:d})'.format(entry_nr))) - except (TypeError, ValueError): - cnt = 0 - return cnt - - def count_homoeologs(self, entry_nr): - pwtab = self._get_pw_tab(entry_nr, 'within') - homolog_typ_nr = pwtab.get_enum('RelType')['homeolog'] - try: - cnt = count_elements(pwtab.where('(EntryNr1=={:d}) & (RelType == {:d})'.format(entry_nr, homolog_typ_nr))) - except (TypeError, ValueError): - cnt = 0 - return cnt - - def _get_pw_data(self, entry_nr, tab, typ_filter=None, extra_cols=None): - query = "(EntryNr1 == {:d})".format(entry_nr) - if typ_filter is not None: - query += " & (RelType == {:d})".format(typ_filter) - dat = tab.read_where(query) - typ = tab.get_enum('RelType') - cols = ['EntryNr1', 'EntryNr2', 'Score', 'Distance'] - if extra_cols is not None: - cols.extend(extra_cols) - res = numpy.lib.recfunctions.append_fields( - dat[cols], - names='RelType', - data=[typ(x) for x in dat['RelType']], - usemask=False) - return res - - def get_vpairs(self, entry_nr): - """returns the verified pairs of a query protein. - - This method returns an instance of a :class:`numpy.recarray` class - containing the verified pairs of a query protein entry. - The returned array contains columns with EntryNr1 and EntryNr2 to - identify the pair together with RelType (indicating the subtype of - orthology), the alignment score and the distance. The score and - distance will be set to -1 if unknown. - - :param int entry_nr: the numeric entry_nr of the query protein.""" - vp_tab = self._get_vptab(entry_nr) - return self._get_pw_data(entry_nr, vp_tab) - - def get_within_species_paralogs(self, entry_nr): - """returns the within species paralogs of a given entry - - This method returns a :class:`numpy.recarray` instance - containing the close paralogs. 
Close paralogs are within - species paralogs that are inparalogs to at least one - ortholog of the query gene in OMA. - - The returned array contains columns with EntryNr1 and EntryNr2 to - identify the pair together with RelType (indicating the subtype of - paralogy), the alignment score and the distance. The score and - distance will be set to -1 if unknown. - - :param int entry_nr: the numeric entry_id of the query protein""" - within_species_paralogs = self._get_pw_tab(entry_nr, 'within') - return self._get_pw_data(entry_nr, within_species_paralogs) - - def get_homoeologs(self, entry_nr): - within_species = self._get_pw_tab(entry_nr, 'within') - homolog_typ_nr = within_species.get_enum('RelType')['homeolog'] - return self._get_pw_data(entry_nr, within_species, - typ_filter=homolog_typ_nr, - extra_cols=['SyntenyConservationLocal', 'Confidence']) - - def neighbour_genes(self, entry_nr, window=1): - """Returns neighbor genes around a query gene. - - This method returns a tuple containing a numpy recarray with - gene entries located around the query gene, and an index - pointing to the query gene. The genes are sorted according to - their position on the chromosome. - - The *windows* parameter specifies the number of genes up- and - downstream of the query gene that should be reported. Note - that the actual number can be smaller if the query gene is close - to a chromosome start or end. - - :param entry_nr: the entry number of the query gene - :param window: the number of neighboring genes on each - side to return""" - if window <= 0 or not isinstance(window, int): - raise ValueError('windows parameters must be a positive integer value') - - dat = self.entry_by_entry_nr(entry_nr) - target_chr = dat['Chromosome'] - genome_range = self.id_mapper['OMA'].genome_range(entry_nr) - f = 5 - data = self.db.root.Protein.Entries.read_where( - '(EntryNr >= {:d}) & (EntryNr <= {:d}) & ' - '(Chromosome == {!r}) & ' - '((AltSpliceVariant == 0) |' - ' (AltSpliceVariant == EntryNr))'.format( - max(genome_range[0], entry_nr - f * window), - min(genome_range[1], entry_nr + f * window), - target_chr)) - data.sort(order=['EntryNr']) - idx = data['EntryNr'].searchsorted(entry_nr) - res = data[max(0, idx - window):min(len(data), idx + window + 1)] - idx = res['EntryNr'].searchsorted(entry_nr) - return res, idx - - def parse_hog_id(self, hog_id): - hog_id = hog_id if isinstance(hog_id, bytes) else hog_id.encode('ascii') - m = self._re_fam.match(hog_id) - if m is not None: - return int(m.group('fam')) - else: - raise ValueError('invalid hog id format') - - def hog_family(self, entry): - entry = self.ensure_entry(entry) - m = self._re_fam.match(entry['OmaHOG']) - if m is None: - raise Singleton(entry) - return int(m.group('fam')) - - def hog_levels_of_fam(self, fam_nr): - """get all taxonomic levels covered by a family. - - The family coresponds to the toplevel numeric id of a HOG, - i.e. for HOG:002421 the fam_nr should be 2421. If a HOG - covers a certain level more than once, it will be returned - several times. - - :param fam_nr: the numeric id of the family (== Toplevel HOG) - """ - return self.db.root.HogLevel.read_where( - '(Fam=={})'.format(fam_nr))['Level'] - - def get_subhogids_at_level(self, fam_nr, level): - """get all the hog ids within a given family at a given taxonomic - level of interest. - - After a duplication in an ancestor lineage, there exists multiple - sub-hogs for any taxonomic level after the duplication. This method - allows to get the list of hogids at the requested taxonomic level. 
- - E.g. assume in family 1 (HOG:0000001) there has been a duplication - between Eukaryota and Metazoa. this method would return for - get_subhogids_at_level(1, 'Eukaryota') --> ['HOG:0000001'] - and for - get_subhogids_at_level(1, 'Metazoa') --> ['HOG:0000001.1a', 'HOG:0000001.1b'] - - :param fam_nr: the numeric family id - :param level: the taxonomic level of interest""" - lev = level if isinstance(level, bytes) else level.encode('ascii') - return self.db.root.HogLevel.read_where( - '(Fam=={}) & (Level=={!r})'.format(fam_nr, lev))['ID'] - - def member_of_hog_id(self, hog_id, level=None): - """return an array of protein entries which belong to a given hog_id. - - E.g. if hog_id = 'HOG122.1a', the method returns all the proteins that - have either exactly this hog id or an inparalogous id such a HOG122.1a.4b.2a - - If you are only interested in the members of a specific lineage (identified - through its taxonomic range), you can pass the taxonomic range as an - additional argument. Only the proteins of genomes belonging to this clade - will be returned. Otherwise, all proteins with having this specific hog_id - will be returned. - - :param str hog_id: the requested hog_id. - :param level: the taxonomic level of interest - :type level: str or None - - :return: a numpy.array with the protein entries belonging to the requested hog. - :rtype: :class:`numpy.ndarray` - - :Note: Even if you obtained a certain hog_id using - :py:meth:`get_subhogids_at_level` - using a certain level, if you do not specify the level in - :meth:`member_of_hog_id` again, you will likely get proteins from other - clades. Only if it happens that the deepest level of the hog_id - coincides with the taxonomic range of interest, the two will be identical. - """ - hog_range = self._hog_lex_range(hog_id) - # get the proteins which have that HOG number - memb = self.db.root.Protein.Entries.read_where( - '({!r} <= OmaHOG) & (OmaHOG < {!r})'.format(*hog_range)) - if level is not None: - memb = [x for x in memb if level.encode('ascii') in self.tax.get_parent_taxa( - self.id_mapper['OMA'].genome_of_entry_nr(x['EntryNr'])['NCBITaxonId'])['Name']] - - return memb - - def iter_members_of_hog_id(self, hog_id): - """iterates over all proteins that belong to a specific hog_id. - - A hog_id might be an ID of the following form: HOG:0000212.1a - This method will yield all proteins in the form of - :class:`ProteinEntry` instances that are part of this hog_id. - - :param str hog_id: the requested HOG ID. - :return: :py:class:`ProteinEntry` objects - :rtype: iter(:class:`ProteinEntry`)""" - hog_range = self._hog_lex_range(hog_id) - it = self.db.root.Protein.Entries.where( - '({!r} <= OmaHOG) & (OmaHOG < {!r})'.format(*hog_range)) - for row in it: - yield ProteinEntry(self, row.fetch_all_fields()) - - def member_of_fam(self, fam): - """returns an array of protein entries which belong to a given fam""" - if not isinstance(fam, (int, numpy.number)): - raise ValueError('expect a numeric family id') - return self.member_of_hog_id(self.format_hogid(fam)) - - def hog_members(self, entry, level): - """get hog members with respect to a given taxonomic level. - - The method will return a list of protein entries that are all - member of the same hog with respect to the taxonomic range - of interest. 
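# A sketch combining get_subhogids_at_level() and member_of_hog_id() from above;
# `db` as before, family 1 and the level 'Metazoa' mirror the docstring example.
for sub_hog in db.get_subhogids_at_level(1, 'Metazoa'):
    members = db.member_of_hog_id(sub_hog, level='Metazoa')   # restrict members to the Metazoa clade
    print(sub_hog, len(members))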
- - :param entry: an entry or entry_nr of a query protein - :param level: the taxonomic level of interest""" - query = self.ensure_entry(entry) - members = self.hog_members_from_hog_id(query['OmaHOG'], level) - if query not in members: - raise ValueError(u"Level '{0:s}' undefined for query gene".format(level)) - return members - - def hog_members_from_hog_id(self, hog_id, level): - """get hog members with respect to a given taxonomic level. - - The method will return a list of protein entries that are all - member of the same hog with respect to the taxonomic range - of interest. - - :param bytes hog_id: the query hog id - :param str level: the taxonomic level of interest""" - if isinstance(hog_id, str): - hog_id = hog_id.encode('ascii') - query_fam = self.parse_hog_id(hog_id) - hoglev = None - for hog_candidate in self.db.root.HogLevel.where( - '(Fam == {:d}) & (Level == {!r})'.format(query_fam, level.encode('ascii'))): - if hog_id.startswith(hog_candidate['ID']): - hoglev = hog_candidate - break - if hoglev is None: - raise ValueError(u'Level "{0:s}" undefined for query gene'.format(level)) - # get the entries which have this hogid (or a sub-hog) - members = self.member_of_hog_id(hoglev['ID']) - if level != 'LUCA': - # last, we need to filter the proteins to the tax range of interest - members = [x for x in members if level.encode('ascii') in self.tax.get_parent_taxa( - self.id_mapper['OMA'].genome_of_entry_nr(x['EntryNr'])['NCBITaxonId'])['Name']] - return members - - def get_orthoxml(self, fam): - """returns the orthoxml of a given toplevel HOG family - - :param fam: numeric id of requested toplevel hog""" - idx = self.db.root.OrthoXML.Index.read_where('Fam == {:d}'.format(fam)) - if len(idx) < 1: - raise ValueError('cannot retrieve orthoxml for {}'.format(fam)) - idx = idx[0] - return self.db.root.OrthoXML.Buffer[ - idx['HogBufferOffset']:idx['HogBufferOffset'] + idx['HogBufferLength']].tostring() - - def _hog_lex_range(self, hog): - """return the lexographic range of a hog. - - This can be used to search of sub-hogs which are nested in - the query hog. The semantics is such that - _hog_lex_range[0] <= hog < _hog_lex_range[1]. - This is equivalent to say that a sub-hog starts with the - query hog.""" - hog_str = hog.decode() if isinstance(hog, bytes) else hog - return hog_str.encode('ascii'), (hog_str[0:-1] + chr(1 + ord(hog_str[-1]))).encode('ascii') - - def oma_group_members(self, group_id): - """get the member entries of an oma group. - - This method returns a numpy array of protein entries that form - an oma group. If the group id is invalid (not positive - integer value or a valid Fingerprint), an `InvalidId` Exception - is raised. 
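# A sketch of the OrthoXML export defined above; `db` as before, the family
# number 2 is a placeholder for any root HOG id.
orthoxml = db.get_orthoxml(2)            # bytes holding the OrthoXML of root HOG family 2
print(orthoxml[:100])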
- - :param group_id: numeric oma group id or Fingerprint""" - group_nr = self.resolve_oma_group(group_id) - members = self.db.root.Protein.Entries.read_where('OmaGroup=={:d}'.format(group_nr)) - return members - - def resolve_oma_group(self, group_id): - if isinstance(group_id, int) and 0 < group_id <= self.get_nr_oma_groups(): - return group_id - elif isinstance(group_id, numpy.integer): - return self.resolve_oma_group(int(group_id)) - elif isinstance(group_id, (bytes, str)): - if group_id.isdigit(): - return self.resolve_oma_group(int(group_id)) - if isinstance(group_id, str): - group_id = group_id.encode('utf-8') - if group_id == b'n/a': - raise InvalidId('Invalid ID (n/a) for an OMA Group') - if not self.seq_search.contains_only_valid_chars(group_id): - raise InvalidId("Invalid ID: non-amino-accids characters in Fingerprint or sequence pattern") - if len(group_id) == 7: - # most likely a fingerprint. let's check that first - group_meta_tab = self.db.get_node('/OmaGroups/MetaData') - try: - e = next(group_meta_tab.where('(Fingerprint == {!r})' - .format(group_id))) - return int(e['GroupNr']) - except StopIteration: - pass - # search in suffix array - entry_nrs = self.seq_search.exact_search( - group_id.decode(), only_full_length=False) - if len(entry_nrs) == 0: - raise InvalidId('No sequence contains search pattern') - group_nrs = {self.entry_by_entry_nr(nr)['OmaGroup'] for nr in entry_nrs} - group_nrs.discard(0) - if len(group_nrs) == 1: - return int(group_nrs.pop()) - elif len(group_nrs) == 0: - raise InvalidId("Sequence with pattern '{}' does not belong to any group" - .format(group_id.decode())) - else: - raise AmbiguousID("sequence pattern matches several oma groups", candidates=group_nrs) - raise InvalidId('Invalid type to determine OMA Group: {} (type: {})'.format(group_id, type(group_id))) - - def oma_group_metadata(self, group_nr): - """get the meta data associated with a OMA Group - - The meta data contains the fingerprint and the keywords infered for this group. - The method retuns this information as a dictionary. The parameter must be - the numeric oma group nr. - - :param int group_nr: a numeric oma group id.""" - if not isinstance(group_nr, (int, numpy.integer)) or group_nr < 0: - raise InvalidId('Invalid group nr: {} (type: {})'.format(group_nr, type(group_nr))) - meta_tab = self.db.get_node('/OmaGroups/MetaData') - try: - e = next(meta_tab.where('GroupNr == {:d}'.format(group_nr))) - kw_buf = self.db.get_node('/OmaGroups/KeywordBuffer') - res = {'fingerprint': e['Fingerprint'].decode(), - 'group_nr': int(e['GroupNr']), - 'keywords': kw_buf[e['KeywordOffset']:e['KeywordOffset']+e['KeywordLength']].tostring().decode(), - 'size': int(e['NrMembers'])} - return res - except StopIteration: - raise InvalidId('invalid group nr') - - def get_nr_oma_groups(self): - """returns the number of OMA Groups in the database""" - tab = self.db.get_node('/Protein/Entries') - try: - idx = tab.colindexes['OmaGroup'][-1] - return int(tab[idx]['OmaGroup']) - except KeyError: - hist = self.group_size_histogram('oma') - return int(hist['Count'].sum()) - - def get_nr_toplevel_hogs(self): - """returns the number of toplevel hogs, i.e. roothogs""" - hist = self.group_size_histogram('hog') - return int(hist['Count'].sum()) - - def group_size_histogram(self, typ=None): - """returns a table with two columns, e.g. Size and Count. - - if typ is set to 'oma' or not set, then the data for the - oma groups is returned. if it is set to 'hog', the data for - the rootlevel hogs is returned. 
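# A sketch of the OMA Group accessors above; `db` as before, group number 1 is a
# placeholder (a 7-letter fingerprint or a sequence pattern would also be accepted).
grp_nr = db.resolve_oma_group(1)
meta = db.oma_group_metadata(grp_nr)      # dict with fingerprint, keywords and group size
members = db.oma_group_members(grp_nr)
print(meta['fingerprint'], meta['size'], len(members))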
- - :param typ: either 'oma' or 'hog', defaults to 'oma'""" - if typ is None or typ.lower() == 'oma': - tabname = 'OmaGroup' - elif typ.lower() == 'hog': - tabname = 'OmaHOG' - else: - raise ValueError('{} is not a valid group typ'.format(typ)) - tab = self.db.get_node('/Summary/{}_size_hist'.format(tabname)) - return tab.read() - - def get_sequence(self, entry): - """get the protein sequence of a given entry as a string - - :param entry: the entry or entry_nr for which the sequence is requested""" - entry = self.ensure_entry(entry) - seqArr = self.db.get_node('/Protein/SequenceBuffer') - seq = seqArr[entry['SeqBufferOffset']:entry['SeqBufferOffset'] + entry['SeqBufferLength'] - 1] - return seq.tostring() - - def get_cdna(self, entry): - """get the protein sequence of a given entry as a string""" - entry = self.ensure_entry(entry) - seqArr = self.db.get_node('/Protein/CDNABuffer') - seq = seqArr[entry['CDNABufferOffset']:entry['CDNABufferOffset'] + entry['CDNABufferLength'] - 1] - return seq.tostring() - - def get_description(self, entry): - entry = self.ensure_entry(entry) - descArr = self.db.get_node('/Protein/DescriptionBuffer') - desc = descArr[entry['DescriptionOffset']:entry['DescriptionOffset'] + entry['DescriptionLength']] - return desc.tostring() - - def get_release_name(self): - return str(self.db.get_node_attr('/', 'oma_version')) - - def get_exons(self, entry_nr): - genome = self.id_mapper['OMA'].genome_of_entry_nr(entry_nr)['UniProtSpeciesCode'].decode() - locus_tab = self.db.get_node('/Protein/Locus/{}'.format(genome)) - return locus_tab.read_where('EntryNr == {}'.format(entry_nr)) - - def get_domains(self, entry_nr): - try: - return self.db.root.Annotations.Domains.read_where('EntryNr == {:d}'.format(entry_nr)) - except ValueError as e: - raise InvalidId('require a numeric entry id, got {}'.format(entry_nr)) - - def get_representative_entry_of_hog(self, fam): - """Get the information of the representative entry for a given family (roothog). - - For each family we select a represenative entry that has the most prevalent - domain architecture. This method returns the entry_nr that we selected, together - with the domain architecture and its prevalence. In case no representative entry - has been found, the method raises an :class:`NoReprEntry` Exception. - - :param int fam: The numeric family number.""" - domprev_tab = self.db.get_node('/HOGAnnotations/DomainArchPrevalence') - try: - row = next(domprev_tab.where('Fam == {:d}'.format(fam))) - fields = (to_snail_case(z) for z in domprev_tab.dtype.names) - res = dict(zip(fields, row.fetch_all_fields())) - res['domains'] = self.get_domains(int(row['ReprEntryNr'])) - res['prevalence'] = 100.0 * res['prev_count'] / res['fam_size'] - return res - except StopIteration: - raise NoReprEntry() - - def get_prevalent_domains(self, fam): - # Gets the prevalent domains for a particular top level HOG / family. - # returns: (family_row, similar_families) - # family_row contains: family ID, representative entry, DA prevalence. - # similar_families contains: same, with similarity score. Ordered. - domprev_tab = self.db.get_node('/HOGAnnotations/DomainArchPrevalence') - dom2hog_tab = self.db.get_node('/HOGAnnotations/Domains') - - try: - fam_row = self.get_representative_entry_of_hog(fam) - except NoReprEntry: - return None, None - - # Get the family's consensus DA and count them... - fam_da = collections.Counter(fam_row['domains']['DomainId']) - - # Retrieve the relevant other families... 
- sim_fams = collections.defaultdict(collections.Counter) - for d in fam_da: - for hog_with_domain in dom2hog_tab.where('DomainId == {}'.format(d)): - sim_fams[hog_with_domain['Offset']][d] += 1 - - if len(sim_fams) == 0: - return fam_row, None - - # Now get similar families and order them by similarity - sim_fams_df = pd.DataFrame(domprev_tab[list(sim_fams.keys())]) - sim_fams_df['sim'] = list(map(lambda i: sum((sim_fams[i] & fam_da).values()), - sim_fams.keys())) - - # Sort by similarity & family size - sim_fams_df.sort_values(['sim', 'FamSize'], inplace=True, ascending=False) - sim_fams_df.reset_index(drop=True, inplace=True) - - # Prevalence - sim_fams_df['Prev'] = 100.0 * (sim_fams_df['PrevCount'] / sim_fams_df['FamSize']) - - return fam_row, sim_fams_df - - def get_gene_ontology_annotations(self, entry_nr, stop=None, as_dataframe=False, as_gaf=False): - """Retrieve the gene ontology annotations for an entry or entry_range - - The method returns the gene ontology annotations stored in the database - for a given entry (if `stop` parameter is not provided) or for all the - entries between [entry_nr, stop). Like in slices, the stop entry_nr is - not inclusive, where as the entry_nr - the start of the slice - is. - - By default the result are returned as numpy arrays of type - :class:`tablefmt.GeneOntologyTable`. If as_dataframe is set to true, the - result will be a pandas dataframe, and if as_gaf is set to true, a gaf - formatted text file with the annotations is returned. - - :param int entry_nr: numeric protein entry - """ - # function to check if an annotation term is obsolete - def filter_obsolete_terms(term): - try: - self.gene_ontology.term_by_id(term) - return True - except (KeyError, ValueError): - return False - try: - if stop is None: - query = 'EntryNr == {:d}'.format(entry_nr) - else: - if not isinstance(stop, int) or stop < entry_nr: - raise TypeError("stop argument needs to be a entry number that is larger than 'entry_nr'") - query = '(EntryNr >= {:d}) & (EntryNr < {:d})'.format(entry_nr, stop) - annots = self.db.root.Annotations.GeneOntology.read_where(query) - - # for test database we also have some obsolete terms. 
we need to filter those - if len(annots) > 0: - not_obsolete = numpy.vectorize(filter_obsolete_terms)(annots['TermNr']) - annots = annots[not_obsolete] - except ValueError as e: - raise InvalidId('require a numeric entry id, got {}'.format(entry_nr)) - if not as_dataframe and not as_gaf: - return annots - - # early return if no annotations available - if len(annots) == 0: - return '!gaf-version: {}\n'.format(GAF_VERSION) if as_gaf else None - - df = pd.DataFrame(annots) - - # 1R DB - df['DB'] = 'OMA' - # 2R DB Object ID - df['DB_Object_ID'] = df['EntryNr'].apply(self.id_mapper['Oma'].map_entry_nr) - # 3R DB Object Symbol - df['DB_Object_Symbol'] = df['DB_Object_ID'] - # 4O Qualifier - df['Qualifier'] = '' - # 5R GO ID - df['GO_ID'] = df['TermNr'].apply(lambda t: 'GO:{:07d}'.format(t)) - # 6R DB:Reference - df['DB:Reference'] = df['Reference'].apply(lambda x: x.decode('ascii')) - # 7R Evidence code - df['Evidence'] = df['Evidence'].apply(lambda x: x.decode('ascii')) - # 8O With (or) From - df['With'] = '' - # 9R Aspect - df['Aspect'] = df['GO_ID'].apply(lambda t: GOAspect.to_char(self.gene_ontology.term_by_id(t).aspect)) - # 10O DB Object Name - df['DB_Object_Name'] = '' - # 11O DB Object Synonym (|Synonym) - df['Synonym'] = '' - # 12R DB Object Type - df['DB_Object_Type'] = 'protein' - # 13R Taxon (|taxon) - df['Taxon_ID'] = df['EntryNr'].apply(lambda e: 'taxon:{:d}' - .format(self.id_mapper['Oma'].genome_of_entry_nr(e)['NCBITaxonId'])) - # 14R Date - df['Date'] = self.get_conversion_date().strftime('%Y%m%d') - # 15R Assigned by - TODO: FIX FOR NON OMA!!! - df['Assigned_By'] = df['DB'] - # 16O Annotation Extension - df['Annotation_Extension'] = '' - # 17O Gene Product Form ID - df['Gene_Product_Form_ID'] = '' - - df = df[GOA.GAF20FIELDS] - return (df if not as_gaf else - ('!gaf-version: {}\n'.format(GAF_VERSION) + - '\n'.join(df.apply(lambda e: '\t'.join(map(str, e)), axis=1)) + - '\n')) - - -class SuffixSearcher(object): - def __init__(self, suffix_index_node, buffer=None, lookup=None): - if isinstance(suffix_index_node, tables.Group): - self.buffer_arr = buffer if buffer else suffix_index_node._f_get_child('buffer') - self.suffix_arr = suffix_index_node._f_get_child('suffix') - self.lookup_arr = lookup if lookup else suffix_index_node._f_get_child('offset') - else: - self.buffer_arr = buffer - self.suffix_arr = suffix_index_node - self.lookup_arr = lookup - self.lookup_arr = self.lookup_arr[:] - - def find(self, query): - n = len(query) - if n > 0: - slicer = KeyWrapper(self.suffix_arr, - key=lambda i: - self.buffer_arr[i:(i + n)].tobytes()) - ii = bisect_left(slicer, query) - if ii and (slicer[ii] == query): - # Left most found. - jj = ii + 1 - while (jj < len(slicer)) and (slicer[jj] == query): - # zoom to end -> -> -> - jj += 1 - - # Find entry numbers and filter to remove incorrect entries - return numpy.searchsorted(self.lookup_arr, self.suffix_arr[ii:jj]+1) - 1 - return [] - - -class SequenceSearch(object): - ''' - Contains all the methods for searching the sequence - - TODO: implement taxonomic filtering. - ''' - from .KmerEncoder import DIGITS_AA - PROTEIN_CHARS = frozenset(map(lambda x: x.decode(), DIGITS_AA)) - PAM100 = pyopa.generate_env(pyopa.load_default_environments()['log_pam1'], - 100) - - def __init__(self, db): - # Backup reference to used DB method. 
- self.get_sequence = db.get_sequence - - # Assume the index is stored in the main DB if there is no .idx file - self.db = db.get_hdf5_handle() - self.db_idx = (self.db if not os.path.isfile(self.db.filename + '.idx') else - tables.open_file(self.db.filename + '.idx', 'r')) - - # Protein search arrays. - try: - self.seq_idx = self.db_idx.root.Protein.SequenceIndex - if isinstance(self.seq_idx, tables.link.ExternalLink): - self.seq_idx = self.seq_idx() - self.kmer_lookup = self.db_idx.root.Protein.KmerLookup - if isinstance(self.kmer_lookup, tables.link.ExternalLink): - self.kmer_lookup = self.kmer_lookup() - except (AttributeError, OSError) as e: - raise DBConsistencyError("Suffix index for protein sequences is not available: "+str(e)) - self.seq_buff = self.db.root.Protein.SequenceBuffer - self.n_entries = len(self.db.root.Protein.Entries) - - # Kmer lookup arrays / kmer setup - self.k = self.kmer_lookup._f_getattr('k') - self.encoder = KmerEncoder(self.k) - logger.info('KmerLookup of size k={} loaded'.format(self.k)) - - def get_entry_length(self, ii): - """Get length of a particular entry.""" - return self.db.root.Protein.Entries[ii - 1]['SeqBufferLength'] - 1 - - @LazyProperty - def entry_idx(self): - ''' - Caches the index lookup part of the SA. - ''' - return self.seq_idx[:self.n_entries] - - def get_entrynr(self, ii): - ''' - Get the entry number(s) corresponding to a location in the sequence - buffer. - ''' - return (numpy.searchsorted(self.entry_idx, ii) + 1) - - def contains_only_valid_chars(self, seq): - """returns true iff `seq` contains only valid AA chars. - - The method ignores the case of the seq, i.e. upper - or lower case chars both match. - - :param (bytes, str) seq: sequence to be checked - :returns bool - """ - if isinstance(seq, bytes): - seq = seq.decode() - return all(map(lambda c: c in self.PROTEIN_CHARS, seq.upper())) - - def _sanitise_seq(self, seq): - ''' - Sanitise a string protein sequence. Deletes "invalid" characters. - TODO: add functionality for biopython sequence / skbio sequence. - ''' - assert type(seq) == str - return ''.join(filter(lambda c: c in self.PROTEIN_CHARS, - seq.upper())).encode('ascii') - - def search(self, seq, n=None, coverage=None, is_sanitised=None): - ''' - Searches the database for entries that match. If can't find an exact - match performs a kmer + local alignment approach to approximate - search. - ''' - seq = (self._sanitise_seq(seq) if not is_sanitised else seq) - m = self.exact_search(seq, is_sanitised=True) - # TODO: taxonomic filtering. - if len(m) == 0: - # Do approximate search - m = self.approx_search(seq, n=n, coverage=coverage, is_sanitised=True) - # TODO: taxonomic filtering. - return ('approx', m) if m is not [] else None - else: - return 'exact', m - - def exact_search(self, seq, only_full_length=True, is_sanitised=None): - ''' - Performs an exact match search using the suffix array. - ''' - # TODO: work out whether to just use the approximate search and then - # check if any are actually exact matches. Do the counting and then - # do an equality checking on any of the sequences that have the correct - # number of kmer matches. - seq = (seq if is_sanitised else self._sanitise_seq(seq)) - nn = len(seq) - if nn > 0: - z = KeyWrapper(self.seq_idx, - key=lambda i: - self.seq_buff[i:(i + nn)].tobytes()) - ii = bisect_left(z, seq, lo=self.n_entries) - - if ii and (z[ii] == seq): - # Left most found. 
- jj = ii + 1 - while (jj < len(z)) and (z[jj] == seq): - # zoom to end -> -> -> - jj += 1 - - # Find entry numbers and filter to remove incorrect entries - return list(filter(lambda e: (not only_full_length) or self.get_entry_length(e) == nn, - self.get_entrynr(self.seq_idx[ii:jj]))) - - # Nothing found. - return [] - - def approx_search(self, seq, n=None, is_sanitised=None, coverage=None): - ''' - Performs an exact match search using the suffix array. - ''' - seq = (seq if is_sanitised else self._sanitise_seq(seq)) - n = (n if n is not None else 50) - coverage = (0.0 if coverage is None else coverage) - - # 1. Do kmer counting vs entry numbers TODO: switch to np.unique? - c = collections.Counter() - for z in map(lambda kmer: numpy.unique(self.kmer_lookup[int(kmer)], - return_counts=True), - self.encoder.decompose(seq)): - c.update(dict(zip(*z))) - - # 2. Filter to top n if necessary - z = len(seq) - self.k + 1 - cut_off = coverage * z - c = [(x[0], (x[1] / z)) for x in c.items() if x[1] >= cut_off] - c = (sorted(c, - reverse=True, - key=lambda x: x[1])[:n] if n > 0 else c) - - # 3. Do local alignments and return count / score / alignment - if len(c) > 0: - return sorted([(m[0], {'kmer_coverage': m[1], - 'score': a[0], - 'alignment': a[1]}) - for (m, a) in self._align_entries(seq, c)], - key=lambda z: z[1]['score'], - reverse=True) - return [] - - def _align_entries(self, seq, matches): - # Does the alignment for the approximate search - def align(s1, s2s, env, aligned): - for s2 in s2s: - z = pyopa.align_double(s1, s2, env, False, False, True) - a = pyopa.align_strings(s1, s2, env, False, z) - aligned.append((z[0], ((a[0].convert_readable(), - (z[3], z[1])), - (a[1].convert_readable(), - (z[4], z[2]))))) - - aligned = [] - query = pyopa.Sequence(seq.decode('ascii')) - entries = list(map(lambda m: - pyopa.Sequence(self.get_sequence(int(m[0])).decode('ascii')), - matches)) - t = threading.Thread(target=align, - args=(query, entries, self.PAM100, aligned)) - t.start() - t.join() - assert (len(aligned) > 0), 'Alignment thread crashed.' 
- return zip(matches, aligned) - - -class OmaIdMapper(object): - def __init__(self, db): - self.genome_table = db.get_hdf5_handle().root.Genome.read() - self._entry_off_keys = self.genome_table.argsort(order=('EntryOff')) - self._genome_keys = self.genome_table.argsort( - order=('UniProtSpeciesCode')) - self._taxid_keys = self.genome_table.argsort(order=('NCBITaxonId')) - self._omaid_re = re.compile(r'(?P[A-Z][A-Z0-9]{4})(?P\d+)') - self._db = db - - def genome_of_entry_nr(self, e_nr): - """returns the genome code belonging to a given entry_nr""" - idx = self.genome_table['EntryOff'].searchsorted( - e_nr - 1, side='right', - sorter=self._entry_off_keys) - return self.genome_table[self._entry_off_keys[idx - 1]] - - def map_entry_nr(self, entry_nr): - genome = self.genome_of_entry_nr(entry_nr) - return "{0:s}{1:05d}".format(genome['UniProtSpeciesCode'].decode(), - entry_nr - genome['EntryOff']) - - def genome_from_UniProtCode(self, code): - code = code.encode('ascii') - idx = self.genome_table['UniProtSpeciesCode'].searchsorted( - code, sorter=self._genome_keys) - try: - genome = self.genome_table[self._genome_keys[idx]] - except IndexError: - raise UnknownSpecies('{} is unknown'.format(code)) - - if genome['UniProtSpeciesCode'] != code: - raise UnknownSpecies('{} is unknown'.format(code)) - return genome - - def genome_from_taxid(self, taxid): - try: - taxid = int(taxid) - idx = self.genome_table['NCBITaxonId'].searchsorted( - taxid, sorter=self._taxid_keys) - genome = self.genome_table[self._taxid_keys[idx]] - except (IndexError, ValueError): - raise UnknownSpecies('TaxonId "{}" is unknown'.format(taxid)) - if genome['NCBITaxonId'] != taxid: - raise UnknownSpecies('TaxonId "{}" is unknown'.format(taxid)) - return genome - - def identify_genome(self, code): - """identify genome based on either a UniProtSpeciesCode or an - NCBI Taxonomy Id""" - if isinstance(code, int) or code.isdigit(): - return self.genome_from_taxid(code) - else: - return self.genome_from_UniProtCode(code) - - def omaid_to_entry_nr(self, omaid): - """returns the internal numeric entrynr from a - UniProtSpeciesCode+nr id. this is the inverse - function of 'map_entry_nr'.""" - match = self._omaid_re.match(omaid) - if match is None: - raise InvalidOmaId(omaid) - code, nr = match.group('genome'), int(match.group('nr')) - genome = self.genome_from_UniProtCode(code) - if nr <= 0 or nr > genome['TotEntries']: - raise InvalidOmaId(omaid) - return genome['EntryOff'] + int(match.group('nr')) - - def genome_range(self, query): - """returns the internal range of EntryNr associated with - 'query'. 'query' can be either a numeric id of a protein - or a UniProtSpeciesCode of a genome. If 'query' is unknown - by the database, an InvalidOmaId exception is raised. - - The return range is a tuple of length two, and the numbers - indicated the *inclusive* boundaries, e.g. (1,5) indicates - that the entries 1,2,3,4 and 5 belong to the query species""" - if isinstance(query, (int, numpy.integer)): - genome_row = self.genome_of_entry_nr(query) - if query <= 0 or query > genome_row['EntryOff'] + genome_row['TotEntries']: - raise InvalidOmaId(query) - else: - genome_row = self.genome_from_UniProtCode(query) - return (genome_row['EntryOff'] + 1, - genome_row['EntryOff'] + genome_row['TotEntries'],) - - def species_ordering(self, root=None): - """get ordering of the genomes with respect to taxonomy. - - This method returns a linear ordering of all the genomes with - respect to their lineage, i.e. 
genomes that are evolutionary - "close" to each other appear close in the ordering. - Optionally, one can give a root genome, that will be the species - the ordering is going to start with. - - :param root: UniProtSpeciesCode of the root genome. - :returns: a list of species codes in the correct order.""" - if root is None: - root = self.genome_table[0]['UniProtSpeciesCode'] - root_genome = self.genome_from_UniProtCode(root) - lins = {g['UniProtSpeciesCode']: [lev['Name'] for lev in self._db.tax.get_parent_taxa(g['NCBITaxonId'])][::-1] - for g in self.genome_table} - root_lin = lins[root_genome['UniProtSpeciesCode']] - sort_key = {} - for g, lin_g in lins.items(): - for k in range(min(len(root_lin), len(lin_g))): - if root_lin[k] != lin_g[k]: - k -= 1 - break - sort_key[g] = (-k, lin_g) - sorted_genomes = sorted(list(sort_key.keys()), key=lambda g: sort_key[g]) - return {g.decode(): v for v, g in enumerate(sorted_genomes)} - - -class AmbiguousID(Exception): - def __init__(self, message, candidates): - super(AmbiguousID, self).__init__(message, candidates) - self.candidates = candidates - - -class IDResolver(object): - def __init__(self, db): - entry_nr_col = db.get_hdf5_handle().root.Protein.Entries.cols.EntryNr - self.max_entry_nr = entry_nr_col[int(entry_nr_col.index[-1])] - self._db = db - - def _from_numeric(self, e_id): - nr = int(e_id) - if not 0 < nr <= self.max_entry_nr: - raise InvalidId('{0:d} out of protein range: {1:}'.format(nr, e_id)) - return nr - - def _from_omaid(self, e_id): - return int(self._db.id_mapper['OMA'].omaid_to_entry_nr(e_id)) - - def search_xrefs(self, e_id): - """search for all xrefs. TODO: what happens if xref is ambiguous?""" - res = set([x['EntryNr'] for x in self._db.id_mapper['XRef'].search_xref(e_id)]) - if len(res) == 0: - # let's try to mach as substring using suffix array case insensitive - res = set([x['EntryNr'] for x in self._db.id_mapper['XRef'].search_xref(e_id, match_any_substring=True)]) - if len(res) == 0: - raise InvalidId(e_id) - if len(res) > 1: - # check whether its only different isoforms, then return canonical isoform - splice_variants = set([x['AltSpliceVariant'] for x in (self._db.entry_by_entry_nr(eNr) for eNr in res)]) - logger.info('xref {} has {} entries, {} splice variants'.format(e_id, len(res), len(splice_variants))) - if len(splice_variants) > 1 or 0 in splice_variants: - raise AmbiguousID('Cross-ref "{}" is ambiguous'.format(e_id), res) - else: - res = splice_variants - return int(res.pop()) - - def resolve(self, e_id): - """maps an id to the entry_nr of the current OMA release.""" - try: - nr = self._from_numeric(e_id) - except ValueError: - try: - nr = self._from_omaid(e_id) - except (InvalidOmaId, UnknownSpecies) as e: - nr = self.search_xrefs(e_id) - return nr - - -class Taxonomy(object): - """Taxonomy provides an interface to navigate the taxonomy data. - - The input data is the same as what is stored in the Database in - table "/Taxonomy".""" - - def __init__(self, data, genomes=None, _valid_levels=None): - if not isinstance(data, numpy.ndarray): - raise ValueError('Taxonomy expects a numpy table.') - self.genomes = genomes if genomes is not None else {} - self.tax_table = data - self.taxid_key = self.tax_table.argsort(order=('NCBITaxonId')) - self.parent_key = self.tax_table.argsort(order=('ParentTaxonId')) - self.all_hog_levels = _valid_levels - if _valid_levels is None: - self._load_valid_taxlevels() - - def _load_valid_taxlevels(self): - forbidden_chars = re.compile(r'[^A-Za-z. 
-]') - try: - with open(os.environ['DARWIN_BROWSERDATA_PATH'] + '/TaxLevels.drw') as f: - taxStr = f.read() - tax_json = json.loads(("[" + taxStr[14:-3] + "]").replace("'", '"')) - self.all_hog_levels = frozenset([t.encode('ascii') for t in - tax_json if forbidden_chars.search(t) is None]) - except (IOError, KeyError): - self.all_hog_levels = frozenset([l for l in self.tax_table['Name'] - if forbidden_chars.search(l.decode()) is None]) - - def _table_idx_from_numeric(self, tid): - i = self.tax_table['NCBITaxonId'].searchsorted( - tid, sorter=self.taxid_key) - idx = self.taxid_key[i] - if self.tax_table[idx]['NCBITaxonId'] != tid: - raise InvalidTaxonId(u"{0:d} is an invalid/unknown taxonomy id".format(tid)) - return idx - - def _get_root_taxon(self): - i1 = self.tax_table['ParentTaxonId'].searchsorted(0, sorter=self.parent_key) - i2 = self.tax_table['ParentTaxonId'].searchsorted(0, sorter=self.parent_key, side='right') - if i2 - i1 == 0: - raise DBConsistencyError('Not a single root in Taxonomy: {}' - .format(self.tax_table[self.parent_key[i1]])) - elif i2 - i1 == 1: - res = self.tax_table[self.parent_key[i1]] - else: - res = numpy.array([(0, -1, b'LUCA')], dtype=self.tax_table.dtype)[0] - return res - - def _taxon_from_numeric(self, tid): - idx = self._table_idx_from_numeric(tid) - return self.tax_table[idx] - - def _direct_children_taxa(self, tid): - i = self.tax_table['ParentTaxonId'].searchsorted(tid, sorter=self.parent_key) - idx = [] - while i < len(self.parent_key) and self.tax_table[self.parent_key[i]]['ParentTaxonId'] == tid: - idx.append(self.parent_key[i]) - i += 1 - return self.tax_table.take(idx) - - def get_parent_taxa(self, query): - """Get array of taxonomy entries leading towards the - root of the taxonomy. - - :param query: the starting taxonomy level""" - idx = [] - parent = query - count = 0 - while parent != 0: - i = self._table_idx_from_numeric(parent) - idx.append(i) - tmp = self.tax_table[i]['ParentTaxonId'] - if tmp == parent: - raise InvalidTaxonId(u"{0:d} has itself as parent".format(tmp)) - parent = tmp - count += 1 - if count > 100: - raise InvalidTaxonId(u"{0:d} exceeds max depth of 100. Infinite recursion?".format(query)) - return self.tax_table.take(idx) - - def _get_taxids_from_any(self, it, skip_missing=True): - if not isinstance(it, numpy.ndarray): - try: - it = numpy.fromiter(it, dtype='i4') - except ValueError: - it = numpy.fromiter(it, dtype='S255') - if it.dtype.type is numpy.string_: - try: - ns = self.name_key - except AttributeError: - ns = self.name_key = self.tax_table.argsort(order='Name') - idxs = self.tax_table['Name'].searchsorted(it, sorter=ns) - idxs = numpy.clip(idxs, 0, len(ns) - 1) - taxs = self.tax_table[ns[idxs]] - keep = taxs['Name'] == it - if not skip_missing and not keep.all(): - raise KeyError('not all taxonomy names could be found') - res = taxs['NCBITaxonId'][keep] - else: - res = it - return res - - def get_induced_taxonomy(self, members, collapse=True, augment_parents=False): - """Extract the taxonomy induced by a given set of `members`. - - This method allows to extract the part which is induced by a - given set of levels and leaves that should be part of the - new taxonomy. `members` must be an iterable, the levels - must be either numeric taxids or scientific names. - - Unless `augment_parents` is set to true, the resulting sub-taxonomy - will only contain levels that are specified in `members`. 
If - `augment_parents` is set to True, also all parent nodes of the - levels passed in members are considered for the sub-taxonomy. - - :param iter members: an iterable containing the levels - and leaves that should remain in the new taxonomy. can be - either axonomic ids or scientific names. - - :param bool collapse: whether or not levels with only one child - should be skipped or not. This defaults to True - - :param bool augment_parents: whether or not to consider parent - levels of members for the resulting taxonomy.""" - - taxids_to_keep = numpy.sort(self._get_taxids_from_any(members)) - if augment_parents: - # find all the parents of all the members, add them to taxids_to_keep - additional_levels = set([]) - for cur_tax in taxids_to_keep: - try: - additional_levels.update(set(self.get_parent_taxa(cur_tax)['NCBITaxonId'])) - except KeyError: - logger.info("{} seems not to exist in Taxonomy".format(cur_tax)) - pass - # add and remove duplicates - all_levels = numpy.append(taxids_to_keep, list(additional_levels)) - taxids_to_keep = numpy.unique(all_levels) - - idxs = numpy.searchsorted(self.tax_table['NCBITaxonId'], taxids_to_keep, sorter=self.taxid_key) - idxs = numpy.clip(idxs, 0, len(self.taxid_key) - 1) - subtaxdata = self.tax_table[self.taxid_key[idxs]] - if not numpy.alltrue(subtaxdata['NCBITaxonId'] == taxids_to_keep): - raise KeyError('not all levels in members exists in this taxonomy') - - updated_parent = numpy.zeros(len(subtaxdata), 'bool') - for i, cur_tax in enumerate(taxids_to_keep): - if updated_parent[i]: - continue - # get all the parents and check which ones we keep in the new taxonomy. - parents = self.get_parent_taxa(cur_tax)['NCBITaxonId'] - mask = numpy.in1d(parents, taxids_to_keep) - # find the position of them in subtaxdata (note: subtaxdata and - # taxids_to_keep have the same ordering). - new_idx = taxids_to_keep.searchsorted(parents[mask]) - taxids = taxids_to_keep[new_idx] - # parent taxid are ncbitaxonids shifted by one position! - parents = numpy.roll(taxids, -1) - parents[-1] = 0 - subtaxdata['ParentTaxonId'][new_idx] = parents - updated_parent[new_idx] = True - - if collapse: - nr_children = collections.defaultdict(int) - for p in subtaxdata['ParentTaxonId']: - nr_children[p] += 1 - rem = [p for (p, cnt) in nr_children.items() if cnt == 1 and p != 0] - if len(rem) > 0: - idx = taxids_to_keep.searchsorted(rem) - return self.get_induced_taxonomy(numpy.delete(taxids_to_keep, idx)) - return Taxonomy(subtaxdata, genomes=self.genomes, _valid_levels=self.all_hog_levels) - - def newick(self): - """Get a Newick representation of the Taxonomy - - Note: as many newick parsers do not support quoted labels, - the method instead replaces spaces with underscores.""" - def newick_enc(s): - return s.translate({ord(' '): u'_', ord('('): u'[', ord(')'): u']'}) - - def _rec_newick(node): - children = [] - for child in self._direct_children_taxa(node['NCBITaxonId']): - children.append(_rec_newick(child)) - - if len(children) == 0: - return newick_enc(node['Name'].decode()) - else: - t = ",".join(children) - return '(' + t + ')' + newick_enc(node['Name'].decode()) - - return _rec_newick(self._get_root_taxon()) + ';' - - def as_dict(self): - """Encode the Taxonomy as a nested dict. 
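# A sketch of the Taxonomy helpers above: extract the sub-taxonomy induced by a
# few species and export it as Newick. `db` as before; `db.tax` is the attached
# Taxonomy instance referenced throughout this module, and the species names are
# placeholders.
subtax = db.tax.get_induced_taxonomy(
    ['Homo sapiens', 'Mus musculus', 'Danio rerio'], augment_parents=True)
print(subtax.newick())                    # quoted labels avoided; spaces become underscores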
- - This representation can for example be used to serialize - a Taxonomy in json format.""" - - def _rec_phylogeny(node): - res = {'name': node['Name'].decode(), 'id': int(node['NCBITaxonId'])} - children = [] - for child in self._direct_children_taxa(node['NCBITaxonId']): - children.append(_rec_phylogeny(child)) - if len(children) > 0: - res['children'] = children - else: - try: - g = self.genomes[res['id']] - res['code'] = g.uniprot_species_code - except KeyError: - pass - return res - - return _rec_phylogeny(self._get_root_taxon()) - - def as_phyloxml(self): - """Encode the Taxonomy as phyloxml output""" - - def _rec_phyloxml(node): - n = et.Element("clade") - tax = et.SubElement(n, "taxonomy") - id_ = et.SubElement(tax, "id", provider="uniprot") - id_.text = str(node['NCBITaxonId']) - - children = [] - for child in self._direct_children_taxa(node['NCBITaxonId']): - children.append(_rec_phyloxml(child)) - if len(children) == 0: - try: - g = self.genomes[int(node['NCBITaxonId'])] - code = et.SubElement(tax, 'code') - code.text = g.uniprot_species_code - except ValueError: - pass - sci = et.SubElement(tax, 'scientific_name') - sci.text = node['Name'].decode() - n.extend(children) - return n - - root = et.Element('phyloxml', xmlns="http://www.phyloxml.org") - phylo = et.SubElement(root, "phylogeny", rooted="true", rerootable="false") - name = et.SubElement(phylo, "name") - name.text = "(Partial) species phylogeny from OMA Browser" - phylo.append(_rec_phyloxml(self._get_root_taxon())) - - return et.tostring(root, encoding='utf-8') - - -class InvalidTaxonId(Exception): - pass - - -class DBVersionError(Exception): - pass - - -class DBConsistencyError(Exception): - pass - - -class InvalidId(Exception): - pass - - -class InvalidOmaId(InvalidId): - pass - - -class UnknownIdType(Exception): - pass - - -class UnknownSpecies(Exception): - pass - - -class Singleton(Exception): - def __init__(self, entry, msg=None): - super(Singleton, self).__init__(msg) - self.entry = entry - - -class NoReprEntry(Exception): - pass - - -class IdMapperFactory(object): - def __init__(self, db_obj): - self.db = db_obj - self.mappers = {} - - def __getitem__(self, idtype): - return self.get_mapper(idtype) - - def get_mapper(self, idtype): - try: - mapper = self.mappers[idtype] - except KeyError: - try: - mapper = globals()[str(idtype).title() + 'IdMapper'](self.db) - self.mappers[idtype] = mapper - except KeyError: - raise UnknownIdType('{} is unknown'.format(str(idtype))) - return mapper - - -class XrefIdMapper(object): - def __init__(self, db): - self._db = db - self.xref_tab = db.get_hdf5_handle().get_node('/XRef') - self.xrefEnum = self.xref_tab.get_enum('XRefSource') - self.idtype = frozenset(list(self.xrefEnum._values.keys())) - self.xref_index = SuffixSearcher(db.get_hdf5_handle().get_node('/XRef_Index')) - - def map_entry_nr(self, entry_nr): - """returns the XRef entries associated with the query protein. - - The types of XRefs that are returned depends on the idtype - class member variable. In the base-class, idtype contains - all valid xref types. Typically, subclasses of XrefIdMapper - will change this set. - - :param entry_nr: the numeric id of the query protein. 
- :returns: list of dicts with 'source' and 'xref' keys.""" - res = [{'source': self.xrefEnum._values[row['XRefSource']], - 'xref': row['XRefId'].decode()} - for row in self.xref_tab.where('EntryNr=={:d}'.format(entry_nr)) - if row['XRefSource'] in self.idtype] - return res - - def canonical_source_order(self): - """returns the list of xref sources in order of their importance. - - Most important source - in the base class for example UniProtKB/SwissProt - are first. The canonical order is defined in the enum definition. - - :returns: list of source strings""" - return [self.xrefEnum(z) for z in sorted(self.idtype)] - - def iter_xrefs_for_entry_nr(self, entry_nr): - """Iterate over the xrefs of a given entry number. - - This method returns a dict with 'source' and 'xref' fields - (both str) holding the information of the xref record. - - :param entry_nr: the numeric id of the query protein""" - for row in self.xref_tab.where('EntryNr=={:d}'.format(entry_nr)): - if row['XRefSource'] in self.idtype: - yield {'source': self.xrefEnum._values[row['XRefSource']], - 'xref': row['XRefId'].decode()} - - def _combine_query_values(self, field, values): - parts = ['({}=={})'.format(field, z) for z in values] - return '|'.join(parts) - - def map_many_entry_nrs(self, entry_nrs): - """map several entry_nrs with as few db queries as possible - to their cross-references. The function returns a - :class:`numpy.recarray` containing all fields as defined in - the table. - - :param entry_nrs: a list with numeric protein entry ids""" - mapped_junks = [] - junk_size = 32 - len(self.idtype) # respect max number of condition variables. - source_condition = self._combine_query_values('XRefSource', self.idtype) - for start in range(0, len(entry_nrs), junk_size): - condition = "({}) & ({})".format( - self._combine_query_values('EntryNr', - entry_nrs[start:start + junk_size]), - source_condition) - mapped_junks.append(self.xref_tab.read_where(condition)) - return numpy.lib.recfunctions.stack_arrays( - mapped_junks, - usemask=False) - - def search_xref(self, xref, is_prefix=False, match_any_substring=False): - """identify proteins associcated with `xref`. - - The crossreferences are limited to the types in the class - member `idtype`. In the base class, all types are valid - xrefs. The method returns a :class:`numpy.recarry` defined - for the XRef table with all entries pointing to `xref`. - - The method by default returns only exact matches. By setting - `is_prefix` to True, one can indicated that the requested xref - should be interpreted as a prefix and all entries matching this - prefix should be returned. - - :param str xref: an xref to be located - :param bool is_prefix: treat xref as a prefix and return - potentially several matching xrefs""" - if match_any_substring: - query = xref.encode('utf-8').lower() - res = self.xref_tab[self.xref_index.find(query)] - else: - if is_prefix: - up = xref[:-1] + chr(ord(xref[-1])+1) - cond = '(XRefId >= {!r}) & (XRefId < {!r})'.format( - xref.encode('utf-8'), up.encode('utf-8')) - else: - cond = 'XRefId=={!r}'.format(xref.encode('utf-8')) - res = self.xref_tab.read_where(cond) - if len(res) > 0 and len(self.idtype) < len(self.xrefEnum): - res = res[numpy.in1d(res['XRefSource'], list(self.idtype))] - return res - - def source_as_string(self, source): - """string representation of xref source enum value - - this auxiliary method converts the numeric value of - a xref source into a string representation. 
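# A sketch of the cross-reference mapper defined above; `db` as before, entry
# 12345 and the identifier 'P53_HUMAN' are placeholders.
xrefs = db.id_mapper['XRef']              # IdMapperFactory resolves this to XrefIdMapper
print(xrefs.map_entry_nr(12345))          # [{'source': ..., 'xref': ...}, ...]
hits = xrefs.search_xref('P53_HUMAN')     # exact matches over the XRef table
print({int(h['EntryNr']) for h in hits})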
- - :param int source: numeric value of xref source""" - try: - return self.xrefEnum._values[source] - except KeyError: - raise ValueError("'{}' is not a valid xref source value".format(source)) - - def xreftab_to_dict(self, tab): - """convert a xreftable to a dictionary per entry_nr. - - All rows in `tab` are converted into a nested dictionary - where the outer key is a protein entry number and the - inner key the xref source type. - - :param tab: a :class:`numpy.recarray` corresponding to XRef - table definition to be converted""" - xrefdict = collections.defaultdict(dict) - for row in tab: - try: - typ = self.xrefEnum._values[row['XRefSource']] - except IndexError: - logger.warning('invalid XRefSource value in {}'.format(row)) - continue - if typ not in xrefdict[row['EntryNr']]: - xrefdict[row['EntryNr']][typ] = {'id': row['XRefId']} - return xrefdict - - -class UniProtIdMapper(XrefIdMapper): - def __init__(self, db): - super(UniProtIdMapper, self).__init__(db) - self.idtype = frozenset([self.xrefEnum[z] - for z in ['UniProtKB/SwissProt', 'UniProtKB/TrEMBL']]) - - -class LinkoutIdMapper(XrefIdMapper): - def __init__(self, db): - super(LinkoutIdMapper, self).__init__(db) - self.idtype = frozenset([self.xrefEnum[z] - for z in ['UniProtKB/SwissProt', 'UniProtKB/TrEMBL', - 'Ensembl Protein', 'Ensembl Gene', - 'EntrezGene']]) - - def url(self, typ, id_): - # TODO: improve url generator in external module with all xrefs - url = None - try: - id_ = id_.decode() - except AttributeError: - pass - - if typ.startswith('UniProtKB'): - url = 'http://uniprot.org/uniprot/{}'.format(id_) - elif typ == 'EntrezGene': - url = 'http://www.ncbi.nlm.nih.gov/gene/{}'.format(id_) - elif typ.startswith('Ensembl'): - url = 'http://ensembl.org/id/{}'.format(id_) - return url - - def xreftab_to_dict(self, tab): - xref = super(LinkoutIdMapper, self).xreftab_to_dict(tab) - for d in list(xref.values()): - for typ, elem in list(d.items()): - elem['url'] = self.url(typ, elem['id']) - return xref - - def iter_xrefs_for_entry_nr(self, entry_nr): - """same as base clase but includes also the url as a field""" - for xref in super(LinkoutIdMapper, self).iter_xrefs_for_entry_nr(entry_nr): - xref['url'] = self.url(xref['source'], xref['xref']) - yield xref - - -class DomainNameIdMapper(object): - def __init__(self, db): - self.domain_src = db.get_hdf5_handle().root.Annotations.DomainDescription.read() - self.domain_src.sort(order='DomainId') - - def _get_dominfo(self, domain_id): - idx = self.domain_src['DomainId'].searchsorted(domain_id) - if self.domain_src[idx]['DomainId'] != domain_id: - raise KeyError("no domain info available for {}".format(domain_id)) - return self.domain_src[idx] - - def get_info_dict_from_domainid(self, domain_id): - info = self._get_dominfo(domain_id) - return {'name': info['Description'].decode(), 'source': info['Source'].decode(), - 'domainid': domain_id.decode()} - - -class FastMapper(object): - """GO Function projection to sequences from OMA hdf5 file""" - - def __init__(self, db): - self.db = db - - def iter_projected_goannotations(self, records): - # gene ontology fast mapping, uses exact / approximate search. - # todo: implement taxonomic restriction. 
- # Input: iterable of biopython SeqRecords - - for rec in records: - logger.debug('projecting function to {}'.format(rec)) - r = self.db.seq_search.search(str(rec.seq)) - if r is not None: - logger.debug(str(r)) - if r[0] == 'exact': - tdfs1 = [] - for enum in r[1]: - df = self.db.get_gene_ontology_annotations(enum, as_dataframe=True) - if df is not None: - df['With'] = 'Exact:{}'.format(self.db.id_mapper['Oma'].map_entry_nr(enum)) - tdfs1.append(df) - go_df = pd.concat(tdfs1, ignore_index=True) - - else: - # Take best match. TODO: remove those below some level of match. - match_enum = r[1][0][0] - match_score = r[1][0][1]['score'] - logger.debug('match: enum: {}, score:{}'.format(match_enum, match_score)) - go_df = self.db.get_gene_ontology_annotations(match_enum, as_dataframe=True) - if go_df is not None: - go_df['With'] = 'Approx:{}:{}'.format(self.db.id_mapper['Oma'].map_entry_nr(match_enum), - match_score) - if go_df is not None: - go_df['DB'] = 'OMA_FastMap' - go_df['Assigned_By'] = go_df['DB'] - go_df['DB_Object_ID'] = rec.id - go_df['DB_Object_Symbol'] = go_df['DB_Object_ID'] - go_df['Evidence'] = 'IEA' - go_df['DB:Reference'] = 'OMA_Fun:002' - go_df['Taxon_ID'] = 'taxon:-1' - len_with_dupl = len(go_df) - go_df.drop_duplicates(inplace=True) - logger.debug('cleaning duplicates: from {} to {} annotations'.format(len_with_dupl, len(go_df))) - for row in go_df.to_dict('records'): - yield row - - def write_annotations(self, file, seqrecords): - """Project annotations and write them to file - - This method takes a filehandle and an iterable of BioPython - SeqRecords objects as input. The function computes the - projected annotations and writes them to the file in gaf - format. - - :param file: filehandle to write annotations to - :param seqrecords: input sequencs to project functions to - """ - - file.write('!gaf-version: {}\n'.format(GAF_VERSION)) - file.write('!Project Name: OMA Fast Function Projection\n') - file.write('!Date created: {}\n'.format(time.strftime("%c"))) - file.write('!Contact Email: contact@omabrowser.org\n') - for anno in self.iter_projected_goannotations(seqrecords): - GOA.writerec(anno, file, GOA.GAF20FIELDS) diff --git a/src/HogProf/build/lib/pyoma/browser/geneontology.py b/src/HogProf/build/lib/pyoma/browser/geneontology.py deleted file mode 100755 index 7782e97..0000000 --- a/src/HogProf/build/lib/pyoma/browser/geneontology.py +++ /dev/null @@ -1,424 +0,0 @@ -from builtins import int, bytes, str -import collections -import csv -import logging -import math -import re -import numpy - -""" -IMPORTANT NOTE: ---------------- -This module has been copied from the dessimoz zoo library -directly. If you want to add functionality to this module, -make sure it is also integrated into the zoo. At the moment -we don't want to depend with pyoma on the zoo library as it -has many dependencies that are difficult to maintain. - - -Gene ontology module defining classes and methods to parse -and navigate the gene ontology DAG as well as to parse GO -annotations. 
- - -:author: Adrian Altenhoff -:institute: ETH Zurich -""" - -NUM_ONT = 3 -NUM_GO_ID_DIGITS = 7 -UP_RELS = frozenset(['is_a', 'part_of']) -_REV_RELS = {'is_a': 'can_be', 'part_of': 'has_part'} - - -def reverse_name_of_rels(rels): - down = frozenset([_REV_RELS[z] for z in rels]) - return down - - -def validate_go_id(term): - if isinstance(term, (int, numpy.integer)): - return int(term) - - term = term.strip() - if term.startswith('GO:'): - digits = term[3:] - else: - digits = term - if not digits.isdigit() or len(digits) > NUM_GO_ID_DIGITS: - raise ValueError("GO ID {} is not a valid go term".format(term)) - return int(digits) - - -class GOAspect(object): - aspects = dict(molecular_function=0, biological_process=1, cellular_component=2) - aspect2char = {0: 'F', 1: 'P', 2: 'C'} - - @classmethod - def from_string(cls, aspect): - return cls.aspects[aspect] - - @classmethod - def to_string(cls, aspectnr): - for o, i in cls.aspects.items(): - if i == aspectnr: - return o - raise KeyError('aspect number not found: ' + str(aspectnr)) - - @classmethod - def to_char(cls, aspectnr): - # Converts an encoded aspect to the character required for GOA files - return cls.aspect2char[aspectnr] - - -class GOterm(object): - """A class representing a single Gene Ontology term. - - This class can serve as a factory for the OntologyParser. For that, - pass it as a factory on 'Term'. """ - - def __init__(self, stanza): - self.id = validate_go_id(stanza['id'][0]) - self.name = ' '.join(stanza['name']) - self.definition = ' '.join(stanza['def']) - self.aspect = GOAspect.from_string(' '.join(stanza['namespace'])) - self.is_a = [validate_go_id(parent) for parent in stanza['is_a']] - for rel in stanza['relationship']: - reltype, partner = rel.strip().split() - if not reltype in self.__dict__.keys(): - self.__dict__[reltype] = list() - self.__dict__[reltype].append(validate_go_id(partner)) - - def replace_parentnames_by_refs(self, ont): - for rel in [('is_a', 'can_be'), ('part_of', 'has_part')]: - if rel[0] in self.__dict__.keys(): - for i, parent_id in enumerate(self.__dict__[rel[0]]): - parent_obj = ont[parent_id] - self.__dict__[rel[0]][i] = parent_obj - parent_obj._add_relation(self, rel[1]) - - def _add_relation(self, term, rel): - if rel not in self.__dict__.keys(): - self.__dict__[rel] = list() - self.__dict__[rel].append(term) - - def get_parents(self, rels=None): - """iterate over the direct parent GO terms. - - by default "is_a" and "part_of" relations are followed. This can be overwritten - with the `rels`. - - :param rels: a set of relations to follow.""" - if rels is None: - rels = UP_RELS - for rel in rels: - try: - for term in getattr(self, rel): - yield term - except AttributeError: - pass - - def __str__(self): - fmt = "GO:{{0:0{}d}}".format(NUM_GO_ID_DIGITS) - return fmt.format(self.id) - - -class AbstractParser(object): - def __init__(self, fp): - self.close_fp = False - if isinstance(fp, str): - if fp.endswith('.gz'): - from gzip import GzipFile - fp = GzipFile(fp, 'rb') - else: - fp = open(fp, 'r') - self.close_fp = True - self.fp = fp - self._read_headers() - - def _read_headers(self): - pass - - def close_if_opened(self): - if self.close_fp: - self.fp.close() - - -class OntologyParser(AbstractParser): - """A general purpose Ontolgoy parser - - Any ontology in the OBO format can be parsed with this object. 
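# A sketch of the OBO parser above; 'go-basic.obo' is a placeholder file name
# (plain or gzipped OBO both work) and the import path mirrors the module's
# location under pyoma.browser.
from pyoma.browser.geneontology import OntologyParser

parser = OntologyParser('go-basic.obo')
for term in parser:                       # yields GOterm objects built from [Term] stanzas
    print(term, term.name, term.aspect)   # GO id, human-readable name, encoded aspect
    break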
The - stanzas are converted to objects using the factories passed in the - initializer.""" - def __init__(self, fp, factories=None): - """creates an ontology parser - - :param fp: a filehandle or path to file (either plaintext or - gzipped) containing the ontology. - :param factories: a dictionary containing per stanza class - (e.g. [Term]) a factory that returns an object from the - data. The data is passed as dict to the factory""" - if not factories: - factories = dict(Term=GOterm) - super(OntologyParser, self).__init__(fp) - self.factories = factories - self.tag_value_pair_re = re.compile(r"\s*(?P[^:]+):\s*(?P[^!]*)") - self.stanza_name_re = re.compile(r"\[(?P[^]]*)\]") - - def stanzas(self): - """iterates over the stanzas in the ontology yielding - objects according to the factory parameter provided - in the constructor.""" - curStanza = None - for line in self.fp: - line = line.strip() - if not line or line[0] == '!': - continue - - # check whether a new stanza starts - match = self.stanza_name_re.match(line) - if match is not None: - obj = self._create_obj_from_stanza(curStanza) - if obj is not None: - yield obj - curStanza = collections.defaultdict(list) - curStanza['_name'] = match.group("name") - elif curStanza is not None: - # it has to be value-pair line. add it to the stanza - match = self.tag_value_pair_re.match(line) - curStanza[match.group('tag')].append(match.group('value')) - obj = self._create_obj_from_stanza(curStanza) - if obj is not None: - yield obj - self.close_if_opened() - - def _create_obj_from_stanza(self, stanza): - """method which creates the appropriate object from a given - stanza or returns None, e.g. for stanza types without a provided - factory or obsolete terms or ...""" - res = None - if stanza is not None: - try: - factory = self.factories[stanza['_name']] - except KeyError: - # no factory for this stanza type. ignore - pass - else: - if ((not "is_obsolete" in stanza) or - (not 'true' in stanza['is_obsolete'])): - res = factory(stanza) - return res - - def __iter__(self): - return self.stanzas() - - -class GeneOntology(object): - """The GeneOntology object contains the whole ontology in an internal - format and allows to traverse it efficiently.""" - def __init__(self, parser, rels=None): - if rels is None: - rels = UP_RELS - if not isinstance(parser, OntologyParser): - raise Exception('requires an OntologyParser instance') - self.parser = parser - self.up_rels = UP_RELS.intersection(rels) - self.down_rels = reverse_name_of_rels(self.up_rels) - - def parse(self): - """parse the ontology data. - - This method should be called after instantiation and before any traversal""" - self.terms = dict() - for cur_term in self.parser: - self.terms[cur_term.id] = cur_term - - # replace parents nrs by the references to the GO terms objects - # this can be only done once all the terms have been created - for term in self.terms.values(): - term.replace_parentnames_by_refs(self.terms) - - def ensure_term(self, term): - """returns the term object associated with term. if term is already - a GOterm object, it is simply return. term_ids can be either numeric - ids of existing GO-terms or propper GO-terms ids, i.e. GO:000002 - - :param term: a term_id or a GOterm object""" - if isinstance(term, GOterm): - return term - else: - return self.term_by_id(term) - - def term_by_id(self, term_id): - """Returns the term object associated with term_id. 
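# A sketch of loading the whole ontology and looking a term up by id; the OBO
# path is a placeholder as in the previous sketch, and GO:0008150
# (biological_process) is used as a well-known term id.
from pyoma.browser.geneontology import GeneOntology, OntologyParser

go = GeneOntology(OntologyParser('go-basic.obo'))
go.parse()                                # must be called before any traversal
bp = go.term_by_id('GO:0008150')
print(bp.name, [str(p) for p in bp.get_parents()])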
- - :param term_id: a GO-term number or a GO-term id (GO:008150).""" - try: - term = self.terms[validate_go_id(term_id)] - return term - except KeyError: - raise ValueError(str(term_id) + ' is an invalid GO term.') - - def get_superterms_incl_queryterm(self, term, max_steps=-1): - """returns a set with all the superterms of a query term. - - :param max_steps: The search can be limited to contain only - terms that are at most 'max_steps' upwards. If set to -1, no - limit is applied and the search goes up to the root.""" - term = self.ensure_term(term) - return self._traverseGraph(term, max_steps, self.up_rels) - - def get_subterms(self, term, max_steps=-1): - term = self.ensure_term(term) - return self._traverseGraph(term, max_steps, self.down_rels) - - def _traverseGraph(self, node, max_steps, rels): - """_traverseGraph traverses the graph in a breath first manner - and reports all the nodes reachable within max_steps.""" - remain = set([node]) - found = set() - while len(remain) > 0 and max_steps != 0: - novel = set() - for t in remain: - for rel in rels: - try: - novel.update(t.__dict__[rel]) - except KeyError: - pass - found.update(remain) - remain = novel.difference(found) - max_steps -= 1 - return found - - -class FreqAwareGeneOntology(GeneOntology): - """GO hierarchy represents the Gene Ontology vocabulary. - - It gets loaded from the xml file and, in conjunction with - an annotation file (GOA) the relative frequencies per term get - estimated. this estimation respects the hierarchy of the - vocabulary. - Further, this class provides methods to traverse the hierarchy - in an easy way.""" - - def __init__(self, fp, rels=UP_RELS): - super(FreqAwareGeneOntology, self).__init__(fp, rels=rels) - self.reset_freqs() - - def reset_freqs(self): - self.cnts = dict() - self.tot_cnts = [0] * NUM_ONT - - def estimate_freqs(self, annotations): - for anno in annotations: - try: - self._update_counts(self.term_by_id(anno['TermNr'])) - except ValueError: - logging.info("invalid annotation term_id in freq estim:" + - str(anno.term_id)) - - def _update_counts(self, term): - for cur_term in self.get_superterms_incl_queryterm(term): - self.cnts[cur_term.id] = self.cnts.get(cur_term.id, 0) + 1 - self.tot_cnts[cur_term.aspect] += 1 - - def get_term_frequency(self, term): - term = self.ensure_term(term) - try: - freq = self.cnts.get(term.id, 0) / self.tot_cnts[term.aspect] - return freq - except ZeroDivisionError: - return 0 - - def last_common_ancestor(self, *terms): - cand = self.get_superterms_incl_queryterm(terms[0]) - for t in terms[1:]: - cand.intersection_update(self.get_superterms_incl_queryterm(t)) - lca = min(cand, key=self.get_term_frequency) - return lca - - def lin_similarity(self, term1, term2): - term1 = self.ensure_term(term1) - term2 = self.ensure_term(term2) - if term1.aspect != term2.aspect: - # early abort, since the two terms will be by - # definition not similar - sim = 0 - else: - lca = self.last_common_ancestor(term1, term2) - sim = (2 * math.log(self.get_term_frequency(lca)) / - (math.log(self.get_term_frequency(term1)) + - math.log(self.get_term_frequency(term2)))) - return sim - - -class AnnotationFilter(object): - EXP_CODES = frozenset(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']) - TRUST_IEA_REFS = frozenset([ - 'GO_REF:0000002', 'GOA:interpro', 'GOA:interpro|GO_REF:0000002', # InterPro - 'GO_REF:0000003', 'GOA:spec', 'GOA:spec|GO_REF:0000003' # EC number - 'GO_REF:0000004', 'GOA:spkw', 'GOA:spkw|GO_REF:0000004', - 'GO_REF:0000037', 'GO_REF:0000038' # SwissProt Keywords - 
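# Note: 'GOA:spec|GO_REF:0000003' and 'GO_REF:0000038' above carry no trailing comma,
# so Python concatenates each with the string literal that follows; the affected
# reference ids therefore never match individually in is_trusted_electronic().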
'GO_REF:0000023', 'GOA:spsl', 'GOA:spsl|GO_REF:0000023', - 'GO_REF:0000039', 'GO_REF:0000040', # UniProtKB Subcellular Location - ]) - - @staticmethod - def is_negated(a): - return a.qualifier.find('NOT') >= 0 - - @classmethod - def is_exp_annotation(cls, a): - return a.evidence in cls.EXP_CODES and not cls.is_negated(a) - - @classmethod - def is_trusted_electronic(cls, a): - return a.evidence == 'IEA' and a.db_ref in cls.TRUST_IEA_REFS and not cls.is_negated(a) - - @classmethod - def is_exp_or_trusted_electronic(cls, a): - return cls.is_exp_annotation(a) or cls.is_trusted_electronic(a) - - -# definition of the GOA Annotations. for performance reasons, we -# keep this as a namedtuple collection. -GOA_Annotation = collections.namedtuple('GOA_Annotation', - ['db', 'db_obj_id', 'db_obj_sym', 'qualifier', 'term_id', - 'db_ref', 'evidence', 'with_from', 'aspect', 'db_obj_name', - 'db_obj_syn', 'db_obj_typ', 'taxon', 'date', 'assigned_by', - 'ext', 'gene_product_from_id']) - - -class AnnotationParser(object): - def __init__(self, fp, factory=GOA_Annotation._make): - self._needs_close = False - if isinstance(fp, str): - if fp.endswith('.gz'): - from gzip import GzipFile - fp = GzipFile(fp, 'rb') - self._needs_close = True - else: - fp = open(fp, 'rb') - self.fp = fp - self.factory = factory - - self._read_headers() - - def _read_headers(self): - pass - - def annotations(self): - """Iterates over the annotations in the file yielding objects - constructed by the factory argument passed to the constructor - of this class for each annotation.""" - csv_reader = csv.reader((l for l in self.fp if not l.startswith('!')), - delimiter='\t') - for row in csv_reader: - yield self.factory(row) - if self._needs_close: - self.fp.close() - - def __iter__(self): - return self.annotations() - - diff --git a/src/HogProf/build/lib/pyoma/browser/homoeologs.py b/src/HogProf/build/lib/pyoma/browser/homoeologs.py deleted file mode 100755 index ce0177c..0000000 --- a/src/HogProf/build/lib/pyoma/browser/homoeologs.py +++ /dev/null @@ -1,238 +0,0 @@ -import pandas -import logging -import collections -import numpy as np -import matplotlib -matplotlib.use('agg') -from skfuzzy import control as ctrl -from skfuzzy import gaussmf -import sklearn -import sklearn.preprocessing -import tables - -try: - from tqdm import tqdm -except ImportError: - tqdm = lambda x, **kwargs: x -logger = logging.getLogger(__name__) - - -def define_universe(df): - # New Antecedent/Consequent objects hold universe variables and membership functions - distance = ctrl.Antecedent(np.arange(0, df['Distance'].max() + .01, .01), 'distance') - synteny = ctrl.Antecedent(np.arange(0, 1.01, .01), 'synteny_score') - total_nb_hom = ctrl.Antecedent(np.arange(2, df['TotalCopyNr'].max() + 1, 1), 'total_nb_homoeologs') - conf = ctrl.Consequent(np.arange(0, 101, 1), 'conf') - return distance, synteny, total_nb_hom, conf - - -def create_fuzzy_rules(distance, synteny, total_nb_hom, conf): - """Takes the antecedent and consequent objects as input""" - - # very low confidence - rule1 = ctrl.Rule(synteny['low'] & distance['high'] & total_nb_hom['high'], conf['very_low']) - - # low confidence - rule2 = ctrl.Rule((synteny['low'] & distance['high'] & total_nb_hom['low']) | - (synteny['low'] & distance['high'] & total_nb_hom['med']) | - (synteny['low'] & distance['med'] & total_nb_hom['high']) | - (synteny['med'] & distance['high'] & total_nb_hom['high']), - conf['low']) - - # medium confidence - rule3 = ctrl.Rule((synteny['high'] & distance['high'] & total_nb_hom['high']) | - 
- (synteny['low'] & distance['med'] & total_nb_hom['low']) | - (synteny['low'] & distance['med'] & total_nb_hom['med']) | - (synteny['med'] & distance['high'] & total_nb_hom['low']) | - (synteny['med'] & distance['high'] & total_nb_hom['med']) | - (synteny['med'] & distance['med'] & total_nb_hom['high']) | - (synteny['med'] & distance['med'] & total_nb_hom['low']) | - (synteny['med'] & distance['med'] & total_nb_hom['med']) | - (synteny['low'] & distance['low'] & total_nb_hom['low']) | - (synteny['low'] & distance['low'] & total_nb_hom['med']) | - (synteny['low'] & distance['low'] & total_nb_hom['high']), - conf['med']) - - # high confidence - rule4 = ctrl.Rule((synteny['high'] & distance['high'] & total_nb_hom['low']) | - (synteny['high'] & distance['high'] & total_nb_hom['med']) | - (synteny['high'] & distance['low'] & total_nb_hom['high']) | - (synteny['high'] & distance['low'] & total_nb_hom['med']) | - (synteny['high'] & distance['med'] & total_nb_hom['high']) | - (synteny['high'] & distance['med'] & total_nb_hom['low']) | - (synteny['high'] & distance['med'] & total_nb_hom['med']) | - (synteny['med'] & distance['low'] & total_nb_hom['high']) | - (synteny['med'] & distance['low'] & total_nb_hom['med']) | - (synteny['med'] & distance['low'] & total_nb_hom['low']), - conf['high']) - - # very high confidence - rule5 = ctrl.Rule(synteny['high'] & distance['low'] & total_nb_hom['low'], - conf['very_high']) - return [rule1, rule2, rule3, rule4, rule5] - - -def get_distance_mf(df, distance): - # here, the first numnber is the universe, second number the central point, third the standard deviation - distance['low'] = gaussmf(distance.universe, 0, (df['Distance'].max() / 10)) - - distance['med'] = gaussmf(distance.universe, - (df['Distance'].max() / 4), - (df['Distance'].max() / 10)) - - distance['high'] = gaussmf(distance.universe, - df['Distance'].max(), - (df['Distance'].max() / 2.5)) - return distance - - -def get_synteny_mf(df, synteny, view=False): - # synteny (gaussian) - synteny['low'] = gaussmf(synteny.universe, 0, .15) - synteny['med'] = gaussmf(synteny.universe, .3, .15) - synteny['high'] = gaussmf(synteny.universe, .7, .25) - return synteny - - -def get_total_nb_hom_mf(df, total_nb_hom): - copy_nr_median = df['TotalCopyNr'].median() - total_nb_hom['low'] = gaussmf(total_nb_hom.universe, - copy_nr_median, copy_nr_median) - - total_nb_hom['med'] = gaussmf(total_nb_hom.universe, - 4 * copy_nr_median, - 1.5 * copy_nr_median) - - total_nb_hom['high'] = gaussmf(total_nb_hom.universe, - df['TotalCopyNr'].max(), - df['TotalCopyNr'].max() / 2.5) - return total_nb_hom - - -def get_conf_mf(df, conf): - # confidence (gaussian) - conf['very_low'] = gaussmf(conf.universe, 0, 20) - conf['low'] = gaussmf(conf.universe, 50, 10) - conf['med'] = gaussmf(conf.universe, 70, 10) - conf['high'] = gaussmf(conf.universe, 90, 10) - conf['very_high'] = gaussmf(conf.universe, 100, 10) - return conf - - -def get_conf_score(simulation, input_dic): - """This function takes the simulation and outputs confidence score - 'input_dic' is a dictionary of the inputs for a homoeolog pair""" - - simulation.inputs(input_dic) - simulation.compute() - return simulation.output['conf'] - - -class HomeologsConfidenceCalculator(object): - def __init__(self, h5_handle, genome): - self.h5_handle = h5_handle - self.genome = genome - if isinstance(h5_handle, tables.File): - self.h5_handle = h5_handle - elif isinstance(h5_handle, (str, bytes)): - self.h5_handle = tables.open_file(h5_handle, 'r') - else: - raise TypeError("expected 
h5_handle to be either h5-file handle or a path to file") - - genome_row = next(self.h5_handle.root.Genome.where('UniProtSpeciesCode == genome')) - self.genome_range = (int(genome_row['EntryOff']) + 1, - int(genome_row['EntryOff'] + genome_row['TotEntries'])) - genome_df = pandas.DataFrame(self.h5_handle.root.Protein.Entries.read_where( - '(EntryNr >= {}) & (EntryNr <= {})'.format(*self.genome_range))) - self.genome_df = genome_df[ - (genome_df['AltSpliceVariant'] == 0) | (genome_df['AltSpliceVariant'] == genome_df['EntryNr'])] - self.genome_df.reset_index(inplace=True) - self.relations_df = self._load_pairwise_relations() - - def _load_pairwise_relations(self): - """load the homoeologous relations of the cannonical splice variants only - The method returns a pandas dataframe with the relations.""" - df = pandas.DataFrame( - self.h5_handle.get_node('/PairwiseRelation/{}/within'.format(self.genome)).read_where('RelType == 5')) - df = df[df['EntryNr1'].isin(self.genome_df['EntryNr']) & df['EntryNr2'].isin(self.genome_df['EntryNr'])] - return df[['EntryNr1', 'EntryNr2', 'SyntenyConservationLocal', 'Distance']] - - def _count_homeologs_per_entry(self, df): - return collections.Counter(df['EntryNr1']) - - def _augment_dataframe_with_all_features(self, df): - counts = self._count_homeologs_per_entry(df) - df['TotalCopyNr'] = df.apply(lambda x: counts[x['EntryNr1']] + counts[x['EntryNr2']], axis=1) - df.loc[df.SyntenyConservationLocal < 0, 'SyntenyConservationLocal'] = 0 - return df - - def calculate_scores(self): - # load dataframe - df = self.relations_df - df = self._augment_dataframe_with_all_features(df) - - distanceObj, syntenyObj, total_nb_homObj, confObj = define_universe(df) - distance = get_distance_mf(df, distanceObj) - synteny = get_synteny_mf(df, syntenyObj) - total_nb_hom = get_total_nb_hom_mf(df, total_nb_homObj) - conf = get_conf_mf(df, confObj) - - # create simulation - rules = create_fuzzy_rules(distance, synteny, total_nb_hom, conf) - control_system = ctrl.ControlSystem(rules) - simulation = ctrl.ControlSystemSimulation(control_system) - - def defuzzify(row): - return get_conf_score(simulation, - {'distance': row['Distance'], - 'synteny_score': row['SyntenyConservationLocal'], - 'total_nb_homoeologs': row['TotalCopyNr']}) - - df['fuzzy_confidence'] = df.apply(defuzzify, axis=1) - - # scale the confidence between minimum value and 100 - min_max_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(df['fuzzy_confidence'].min(), 100)) - df['fuzzy_confidence_scaled'] = min_max_scaler.fit_transform(df['fuzzy_confidence'].values.reshape(-1, 1)) - return df - - -class HomeologsConfidenceCalculatorFromTSV(HomeologsConfidenceCalculator): - def __init__(self, infile): - self.relations_df = pandas.read_csv(infile, sep='\t') - expected_columns = ['EntryNr1', 'EntryNr2', 'SyntenyConservationLocal', 'Distance'] - if len(set(expected_columns) - set(self.relations_df.columns.values)) > 0: - raise KeyError("provided inputfile does not have all expected columns. 
" - "Expected columns are {}".format(expected_columns)) - - -if __name__ == "__main__": - import argparse - # Get arguments from command line - parser = argparse.ArgumentParser( - description='Computes homoeology confidence score using fuzzy logic') - grp = parser.add_mutually_exclusive_group(required=True) - grp.add_argument('--h5', help="name of hdf5 file, full path") - grp.add_argument('--csv', help="tab-separated file with input data as alternative to hdf5 file") - parser.add_argument('--genome', - help="5 letter code of polyploid genome to analyze. " - "Must be specified if used with --h5 option.") - parser.add_argument('--outfile', - help="name where results will be stored (file name created to include parameters)", - default="homoeolog_confidence.tsv") - - args = parser.parse_args() - logging.basicConfig(level=logging.INFO) - - if args.h5 is not None and args.genome is None: - import sys - sys.stderr.write("genomes argument required if using with an hdf5 file as input") - sys.exit(1) - - if args.h5: - import tables - scorer = HomeologsConfidenceCalculator(tables.open_file(args.h5), args.genome) - else: - scorer = HomeologsConfidenceCalculatorFromTSV(args.csv) - data = scorer.calculate_scores() - data.to_csv(args.outfile, sep='\t', header=True, index=True) diff --git a/src/HogProf/build/lib/pyoma/browser/linkout.py b/src/HogProf/build/lib/pyoma/browser/linkout.py deleted file mode 100755 index 98fa0ee..0000000 --- a/src/HogProf/build/lib/pyoma/browser/linkout.py +++ /dev/null @@ -1,276 +0,0 @@ -import collections -import operator -import os -import logging -import math -import re -import ftplib - -from tqdm import tqdm -from lxml import etree -from .db import Database - -logger = logging.getLogger(__name__) - -"""Module to generate external data and crosslinks for NCBI. - -""" - - -class NCBILinkOutXML(object): - root_node = "not_set" - provider_id = "9822" - - def __init__(self): - root = etree.Element(self.root_node) - for key, value in self.root_children().items(): - root.append(self.text_elemement(key, value)) - self.tree = etree.ElementTree(root) - self._add_doctype() - - def root_children(self): - return {} - - def _add_doctype(self): - self.tree.docinfo.public_id = '-//NLM//DTD LinkOut 1.0//EN' - self.tree.docinfo.system_url = 'https://www.ncbi.nlm.nih.gov/projects/linkout/doc/LinkOut.dtd' - - def text_elemement(self, tag, text): - el = etree.Element(tag) - el.text = text - return el - - def write(self, fh): - fh.write(etree.tostring(self.tree, pretty_print=True, xml_declaration=True, encoding='utf-8')) - - -class Provider(NCBILinkOutXML): - root_node = "Provider" - - def root_children(self): - elements = collections.OrderedDict( - [("ProviderId", self.provider_id), - ("Name", "OMA Browser: Orthologous MAtrix"), - ("NameAbbr", "OMA"), - #("SubjectType", "taxonomy/phylogenetic"), - ("Url", "http://omabrowser.org"), - ("Brief", "OMA is a method and database for the inference of orthologs among complete genomes. 
" - "We provide browsable orthology predictions, APIs, flat file downloads among thousands " - "of genomes.")]) - return elements - - -class Resource(NCBILinkOutXML): - root_node = "LinkSet" - link_id = 1 - - def _add_objs(self, accs): - objsel = etree.Element("ObjectSelector") - objsel.append(self.text_elemement("Database", self.database())) - objlst = etree.Element("ObjectList") - objsel.append(objlst) - for acc in accs: - objlst.append(self.object_node(acc)) - return objsel - - def object_node(self, acc): - return self.text_elemement("ObjId", acc) - - def _add_url_section(self, acc): - el = etree.Element('ObjectUrl') - el.append(self.text_elemement('Base', self.base_url())) - nxt = rule = etree.Element("Rule") - for k, rule_part in enumerate(self.rule_url(acc)): - if isinstance(rule_part, str): - if k == 0: - nxt.text = rule_part - else: - nxt.tail = rule_part - elif rule_part.tag == etree.Entity: - nxt.append(rule_part) - nxt = rule_part - el.append(rule) - el.append(self.text_elemement('SubjectType', "taxonomy/phylogenetic")) - return el - - def add_link(self, accs): - lnk = etree.Element("Link") - lnk.append(self.text_elemement('LinkId', str(self.link_id))) - lnk.append(self.text_elemement('ProviderId', self.provider_id)) - lnk.append(self._add_objs(accs)) - lnk.append(self._add_url_section(accs)) - self.tree.getroot().append(lnk) - self._bump_link_id() - - @classmethod - def _bump_link_id(cls): - cls.link_id += 1 - - def database(self): - return "not set" - - def base_url(self): - return "https://omabrowser.org/oma/hogs/" - - def rule_url(self, acc): - return "", - - -class GenesResource(Resource): - DISKSIZE_HEADER = 200 - DISKSIZE_PER_LINK = 435 - base_name = 'resource_genes' - - def base_url(self): - return "https://omabrowser.org/cgi-bin/gateway.pl/" - - def rule_url(self, acc): - return "?f=DisplayEntry&p1=" + next(iter(acc.values())), - - def database(self): - return "Gene" - - -class ProteinResource(Resource): - DISKSIZE_HEADER = 500 - DISKSIZE_PER_LINK = 45 - base_name = 'resource_protein' - - def base_url(self): - return "https://omabrowser.org/oma/hogs/" - - def object_node(self, acc): - return self.text_elemement("Query", "{}[accn]".format(acc)) - - def rule_url(self, acc): - return etree.Entity("lo.pacc"), "/vis/" - - def database(self): - return "Protein" - - -class TaxonomyResource(Resource): - DISKSIZE_HEADER = 200 - DISKSIZE_PER_LINK = 435 - base_name = 'resource_taxonomy' - - def database(self): - return "taxonomy" - - def base_url(self): - return "https://omabrowser.org/cgi-bin/gateway.pl/" - - def rule_url(self, acc): - return "?f=DisplayOS&p1=" + next(iter(acc.values())), - - -class LinkoutBuffer(object): - def __init__(self, resource, outdir='/tmp', bulk_add=True, max_file_size=20*2**20): - self.max_records = math.floor((max_file_size - resource.DISKSIZE_HEADER) / - resource.DISKSIZE_PER_LINK) - self.cur_nr = 0 - self.buf = [] - self.bulk_add = bulk_add - self.resource_type = resource - self.outdir = outdir - logger.info('Setup Linkout buffer for {} with max {} records ({}bytes) per file, bulk_add={}' - .format(resource.__name__, self.max_records, max_file_size, bulk_add)) - - def add(self, obj): - self.buf.append(obj) - if len(self.buf) >= self.max_records: - self.flush() - - def flush(self): - res = self.resource_type() - if self.bulk_add: - res.add_link(self.buf) - else: - for obj in self.buf: - res.add_link(obj) - fn = os.path.join(self.outdir, - '{}_{:02d}.xml'.format(res.base_name, self.cur_nr)) - with open(fn, 'wb') as fh: - res.write(fh) - self.cur_nr += 
1 - self.buf = [] - - -class GenesPriorizationHandler(object): - """Adapter to LinkoutBuffer to select only a limited number of crossrefs. - NCBI linkout caps at 10%""" - - def __init__(self, max_linkouts=None, db=None, **kwargs): - self.max_links = int(max_linkouts) if max_linkouts else 20357436//10 # obtained in Jan2018 - logger.info('Limiting Genes to {} links max'.format(self.max_links)) - self.genes_buffer = LinkoutBuffer(GenesResource, **kwargs) - self.genes = [] - self.db = db - - def add(self, key, value): - self.genes.append((key, value)) - - def _genome_size_map(self): - gs = self.db.get_hdf5_handle().get_node('/Genome').read() - return {row['UniProtSpeciesCode'].decode(): row['TotEntries'] for row in gs} - - def flush(self): - priority_prefixes = ['HUMAN', 'MOUSE', 'RATNO', 'PIGXX', 'DRO', 'SCH', 'YEAST', 'ARA', - 'WHEAT', 'PLAF', 'ECO', 'BAC', 'PANTR', 'ORY', 'GOSHI', 'BRA', - 'DANRE', 'CAE', 'MYC', 'STR', 'MAIZE', 'GORGO', 'PANTR', 'PONAB', - 'MACMU', 'YARLI', 'PEDHC', 'TRICA', 'XENTR', 'YERPE', 'POPTR'] - pat = re.compile(r"^({})".format('|'.join(priority_prefixes))) - if len(self.genes) > self.max_links: - # final sort order will be 'priority genome', genome size and proteins within genome - self.genes.sort(key=operator.itemgetter(1)) - if self.db is not None: - genome_size = self._genome_size_map() - self.genes.sort(key=lambda x: genome_size[x[1][0:5]], reverse=True) - self.genes.sort(key=lambda x: pat.match(x[1]) is None) - for link_acc, link_target in self.genes[0:self.max_links]: - self.genes_buffer.add({link_acc: link_target}) - self.genes_buffer.flush() - - c = collections.defaultdict(int) - for acc, target in self.genes[self.max_links:]: - c[target[0:5]] += 1 - logger.info('Skipping genes link in the following species: {}'.format(c)) - - -def prepare_linkout_files(outdir='/tmp', infile='../pyomabrowser/OmaServer.h5'): - prov = Provider() - with open(os.path.join(outdir, 'provider.xml'), 'wb') as fh: - prov.write(fh) - - db = Database(infile) - xrefs = db.get_hdf5_handle().get_node('/XRef') - xref_source_enum = xrefs.get_enum('XRefSource') - - protein_buffer = LinkoutBuffer(ProteinResource, outdir=outdir, bulk_add=True) - genes_buffer = GenesPriorizationHandler(db=db, outdir=outdir, bulk_add=False) - for xref in tqdm(xrefs): - if xref['XRefSource'] == xref_source_enum['RefSeq']: - protein_buffer.add(xref['XRefId'].decode()) - elif xref['XRefSource'] == xref_source_enum['EntrezGene']: - genes_buffer.add(xref['XRefId'].decode(), - db.id_mapper['OMA'].map_entry_nr(xref['EntryNr'])) - protein_buffer.flush() - genes_buffer.flush() - - with open(os.path.join(outdir, 'resource_taxonomy.xml'), 'wb') as fh: - taxs = TaxonomyResource() - for row in db.id_mapper['OMA'].genome_table: - taxs.add_link({str(row['NCBITaxonId']): row['UniProtSpeciesCode'].decode()}) - taxs.write(fh) - - -def copy_to_ncbi(dir, password, host='ftp-private.ncbi.nlm.nih.gov', user='omabrow'): - with ftplib.FTP(host, user, password) as session: - session.cwd('/holdings') - - for fname in os.listdir(dir): - if fname.endswith('.xml'): - with open(os.path.join(dir, fname), 'rb') as fh: - cmd = "STOR {}".format(fname) - session.storbinary(cmd, fp=fh) - logger.info('finished transfering '+fname) \ No newline at end of file diff --git a/src/HogProf/build/lib/pyoma/browser/locus_parser.py b/src/HogProf/build/lib/pyoma/browser/locus_parser.py deleted file mode 100755 index 51597fd..0000000 --- a/src/HogProf/build/lib/pyoma/browser/locus_parser.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy -import collections 
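# Illustrative usage of the parser defined below; the locus string is an assumed
# example based on the grammar, not data shipped with this module:
#   parser = LocusParser()
#   exons = parser.parse("join(100..200, complement(300..400))", entry_nr=7)
#   # -> numpy record array with Start/End/Strand/EntryNr columns, one row per exon,
#   #    where the complemented exon carries Strand == -1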
-from lark import Lark, Transformer, ParseError -from tables import dtype_from_descr -from .tablefmt import LocusTable -import logging - - -logger = logging.getLogger(__name__) - -"""This package is intended to parse the darwin locus structure -and create a numpy recarray out of it. """ - - -Exon = collections.namedtuple('Exon', ['start', 'end', 'strand']) - -grammar = '''?locus : join | complement | complement_join | location - join : "join" "(" (complement | location ) ("," (complement | location ))+ ")" - complement : "complement" "(" location ")" - complement_join : "complement" "(" "join" "(" location ("," location)+ ")" ")" - location : pos [".." pos ] | "FromElsewhere" "('" _SEQID "'," pos [".." pos] ")" - ?pos : num | "Before" "(" num ")" | "After" "(" num ")" - ?num : NUMBER -> number - _SEQID: /[A-Za-z0-9._-]+/ - - %import common.NUMBER - %import common.WS - %ignore WS''' - - -class LocusTransformer(Transformer): - def number(self, vals): - return int(vals[0]) - - def location(self, value): - return Exon(value[0], value[1] if len(value) > 1 else value[0], 1) - - def complement(self, value): - rev = [e._replace(strand=-1*e.strand) for e in value] - if len(rev) == 1: - return rev[0] - else: - return rev - - def complement_join(self, value): - return self.complement(value) - - def join(self, values): - return values - - -class LocusParser(object): - def __init__(self): - self.parser = Lark(grammar, start='locus') - self.locus_transformer = LocusTransformer() - self.dtype = dtype_from_descr(LocusTable) - - def parse(self, locus_string, entry_nr=0): - try: - tree = self.parser.parse(locus_string) - except ParseError as e: - raise ValueError("cannot parse '{}' locus string".format(locus_string)) - data = self.locus_transformer.transform(tree) - nr_exons = 1 if isinstance(data, Exon) else len(data) - locus_data = numpy.empty(nr_exons, dtype=self.dtype) - locus_data[['Start', 'End', 'Strand']] = data - locus_data['EntryNr'] = entry_nr - return locus_data diff --git a/src/HogProf/build/lib/pyoma/browser/models.py b/src/HogProf/build/lib/pyoma/browser/models.py deleted file mode 100755 index 9ad654e..0000000 --- a/src/HogProf/build/lib/pyoma/browser/models.py +++ /dev/null @@ -1,429 +0,0 @@ -from __future__ import division - -import collections - -import time - - -def format_sciname(sci, short=False): - p = set([sci.find(x) for x in ['(', 'serogroup', 'serotype', 'serovar', - 'biotype', 'subsp', 'pv.', 'bv.']]) - if sci.startswith('Escherichia coli'): - p.add(sci.find('O')) - p.discard(-1) - p = min(p) if len(p) > 0 else len(sci) - return {'species': sci[0:p], 'strain': sci[p:]} - - -class LazyProperty(object): - """Decorator to evaluate a property only on access. - - Compute the attribute value and caches it in the instance. - Python Cookbook (Denis Otkidach) http://stackoverflow.com/users/168352/denis-otkidach - This decorator allows you to create a property which can be computed once and - accessed many times.""" - - def __init__(self, method, name=None): - # record the unbound-method and the name - self.method = method - self.name = name or method.__name__ - self.__doc__ = method.__doc__ - - def __get__(self, inst, cls): - if inst is None: - return self - # compute, cache and return the instance's attribute value - result = self.method(inst) - # setattr redefines the instance's attribute so this doesn't get called again - setattr(inst, self.name, result) - return result - - -class KeyWrapper(object): - ''' - Enables the use of functions, e.g. bisect, with a key function. 
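    Illustrative use with the standard bisect module (the record layout is an assumed
    example): bisect.bisect_left(KeyWrapper(rows, key=lambda r: r['EntryNr']), 42)
    returns the insertion index for entry number 42 without building a separate key list.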
- ''' - def __init__(self, it, key): - self.it = it - self.key = key - - def __getitem__(self, i): - return self.key(self.it[i]) - - def __len__(self): - return len(self.it) - - -class Singleton(type): - """A meta-class to enforce a Singleton, e.g. a class that can be - instantiated only exactly once. - - Modified from Python Cookbook, 3rd Edition, p 357ff. - - :Example: - - class Foo(metaclass=Singleton): - def __init__(self): - pass #This part is executed only once - """ - def __init__(self, *args, **kwargs): - self.__instance = None - super(Singleton, self).__init__(*args, **kwargs) - - def __call__(self, *args, **kwargs): - if self.__instance is None: - self.__instance = super(Singleton, self).__call__(*args, **kwargs) - return self.__instance - - -class ProteinEntry(object): - """Model for a protein object - - This class provides an easy to use interface for a given protein - form the database. - - If instantiated with an entry_nr only, no data is loaded until a - property or method is accessed. Properties that need to access - additional data or loaded lazily and are cached in the object - (but not kept after deletion of object).""" - def __init__(self, db, e): - self._stored_entry = e - self._db = db - - @LazyProperty - def _entry(self): - return (self._db.entry_by_entry_nr(self._stored_entry) - if isinstance(self._stored_entry, int) - else self._stored_entry) - - @classmethod - def from_entry_nr(cls, db, eNr): - # e = db.entry_by_entry_nr(eNr) - return cls(db, int(eNr)) - - @property - def entry_nr(self): - return int(self._entry['EntryNr']) - - @property - def locus_start(self): - return int(self._entry['LocusStart']) - - @property - def locus_end(self): - return int(self._entry['LocusEnd']) - - @property - def strand(self): - return int(self._entry['LocusStrand']) - - @LazyProperty - def exons(self): - return ExonStructure.from_entry_nr(self._db, self.entry_nr) - - @property - def oma_group(self): - return int(self._entry['OmaGroup']) - - @property - def oma_hog(self): - return self._entry['OmaHOG'].decode() - - @property - def chromosome(self): - return self._entry['Chromosome'].decode() - - @property - def canonicalid(self): - return self._entry['CanonicalId'].decode() - - @property - def sequence_md5(self): - return self._entry['MD5ProteinHash'].decode() - - @LazyProperty - def genome(self): - g = self._db.id_mapper['OMA'].genome_of_entry_nr(self._entry['EntryNr']) - return Genome(self._db, g) - - @LazyProperty - def omaid(self): - return self._db.id_mapper['OMA'].map_entry_nr(self._entry['EntryNr']) - - @LazyProperty - def cdna(self): - return self._db.get_cdna(self._entry).decode() - - @property - def gc_content(self): - cdna = self.cdna - cnts = list(map(cdna.count, 'GCAT')) - try: - return sum(cnts[0:2])/sum(cnts) - except ZeroDivisionError: - return 0 - - @LazyProperty - def sequence(self): - return self._db.get_sequence(self._entry).decode() - - @property - def sequence_length(self): - return int(self._entry['SeqBufferLength']) - 1 - - @LazyProperty - def description(self): - return self._db.get_description(self._entry).decode() - - @property - def subgenome(self): - return self._entry['SubGenome'].decode() - - @LazyProperty - def hog_family_nr(self): - from .db import Singleton as HOGSingleton - try: - fam = self._db.hog_family(self._entry) - except HOGSingleton: - fam = 0 - return fam - - @property - def is_main_isoform(self): - return (self._entry['AltSpliceVariant'] == 0 or - self._entry['AltSpliceVariant'] == self._entry['EntryNr']) - - @LazyProperty - def 
alternative_isoforms(self): - return [ProteinEntry(self._db, e) - for e in self._db.get_splicing_variants(self._entry) - if e['EntryNr'] != self.entry_nr] - - def __repr__(self): - return "<{}({}, {})>".format(self.__class__.__name__, self.entry_nr, self.omaid) - - def __len__(self): - return self.sequence_length - - -class Genome(object): - def __init__(self, db, g): - self._genome = g - self._db = db - - @property - def ncbi_taxon_id(self): - return int(self._genome['NCBITaxonId']) - - @property - def uniprot_species_code(self): - return self._genome['UniProtSpeciesCode'].decode() - - @property - def sciname(self): - return self._genome['SciName'].decode() - - @property - def common_name(self): - try: - return self._genome['CommonName'].decode() - except ValueError: - return "" - - @property - def synonym_name(self): - return self._genome['SynName'].decode() - - @LazyProperty - def species_and_strain_as_dict(self): - return format_sciname(self.sciname) - - def species(self): - return self.species_and_strain_as_dict['species'] - - def strain(self): - return self.species_and_strain_as_dict['strain'] - - @property - def url(self): - return self._genome['Url'].decode() - - @property - def source(self): - return self._genome['Source'].decode() - - @property - def release(self): - return self._genome['Release'].decode() - - @property - def last_modfied_timestamp(self): - return self._genome['Date'] - - @property - def last_modified(self): - return self.modification_date("%Y-%b-%d") - - def modification_date(self, fmt): - if self._db.db_schema_version >= (3, 2): - return time.strftime(fmt, time.localtime(self.last_modfied_timestamp)) - else: - return 'n/a' - - @property - def nr_entries(self): - return int(self._genome['TotEntries']) - - @property - def entry_nr_offset(self): - return int(self._genome['EntryOff']) - - @LazyProperty - def kingdom(self): - # TODO: store directly in db - return self._db.tax.get_parent_taxa(self._genome['NCBITaxonId'])[-1]['Name'].decode() - - @property - def is_polyploid(self): - return self._genome['IsPolyploid'] - - @LazyProperty - def lineage(self): - return [lev['Name'].decode() for lev in self._db.tax.get_parent_taxa( - self._genome['NCBITaxonId'])] - - @LazyProperty - def chromosomes(self): - chrs = collections.defaultdict(list) - entry_tab = self._db.get_hdf5_handle().get_node('/Protein/Entries') - for row in entry_tab.where('(EntryNr > {}) & (EntryNr <= {})' - .format(self.entry_nr_offset, self.entry_nr_offset+self.nr_entries)): - chrs[row['Chromosome'].decode()].append(row['EntryNr']) - return chrs - - def __repr__(self): - return "<{}({}, {})>".format(self.__class__.__name__, self.uniprot_species_code, - self.ncbi_taxon_id) - - def __len__(self): - return self.nr_entries - - -class PairwiseRelation(object): - def __init__(self, db, relation): - self._relation = relation - self._db = db - - @property - def distance(self): - return float(self._relation['Distance']) - - @property - def score(self): - return float(self._relation['Score']) - - @property - def alignment_overlap(self): - return float(self._relation['AlignmentOverlap']) - - @property - def synteny_conservation_local(self): - return float(self._relation['SyntenyConservationLocal']) - - @property - def confidence(self): - return float(self._relation['Confidence']) - - @LazyProperty - def rel_type(self): - if not isinstance(self._relation['RelType'], str): - type_map = self._db._get_pw_tab(self._relation['EntryNr1'], 'VPairs').get_enum("RelType") - return type_map(self._relation['RelType']) - else: 
- return self._relation['RelType'] - - @LazyProperty - def entry_1(self): - return ProteinEntry(self._db, self._db.entry_by_entry_nr(self._relation['EntryNr1'])) - - @LazyProperty - def entry_2(self): - return ProteinEntry(self._db, self._db.entry_by_entry_nr(self._relation['EntryNr2'])) - - -class GeneOntologyAnnotation(object): - def __init__(self, db, anno): - self.db = db - self.anno = anno - - @LazyProperty - def term(self): - return self.db.gene_ontology.term_by_id(self.anno['TermNr']) - - @property - def evidence(self): - return self.anno['Evidence'].decode() - - @property - def reference(self): - return self.anno['Reference'].decode() - - @property - def entry_nr(self): - return int(self.anno['EntryNr']) - - @LazyProperty - def aspect(self): - from .geneontology import GOAspect - return GOAspect.to_string(self.term.aspect) - - -class ExonStructure(object): - def __init__(self, db, exons): - self._stored = exons - self._db = db - - @LazyProperty - def _exons(self): - return (self._db.get_exons(self._stored) - if isinstance(self._stored, int) - else self._stored) - - @classmethod - def from_entry_nr(cls, db, eNr): - return cls(db, int(eNr)) - - def _iter_exons(self): - if self._exons['Strand'][0] < 0: - self._exons[::-1].sort(order='Start') - else: - self._exons.sort(order='Start') - for exon in self._exons: - yield Exon(exon) - - def __len__(self): - return len(self._exons) - - def __repr__(self): - return "<{}(entry_nr={}, nr_exons={})>"\ - .format(self.__class__.__name__, - self._exons[0]['EntryNr'], len(self)) - - def __str__(self): - exs = list(str(e) for e in self._iter_exons()) - if len(exs) > 1: - return "join({})".format(", ".join(exs)) - else: - return exs[0] - - -class Exon(object): - def __init__(self, exon): - self.exon = exon - - def __str__(self): - if self.exon['Strand'] < 0: - template = "complement({}..{})" - else: - template = "{}..{}" - return template.format(self.exon['Start'], self.exon['End']) diff --git a/src/HogProf/build/lib/pyoma/browser/synteny.py b/src/HogProf/build/lib/pyoma/browser/synteny.py deleted file mode 100755 index 0abbcfe..0000000 --- a/src/HogProf/build/lib/pyoma/browser/synteny.py +++ /dev/null @@ -1,102 +0,0 @@ -import pandas -import tables -import logging -try: - from tqdm import tqdm -except ImportError: - tqdm = lambda x, **kwargs: x -logger = logging.getLogger(__name__) - - -class SyntenyScorer(object): - def __init__(self, h5_handle, genome, windowsize=10): - self.h5_handle = h5_handle - self.genome = genome - self.windowsize = windowsize - if isinstance(h5_handle, tables.File): - self.h5_handle = h5_handle - elif isinstance(h5_handle, (str, bytes)): - self.h5_handle = tables.open_file(h5_handle, 'r') - else: - raise TypeError("expected h5_handle to be either h5-file handle or a path to file") - - genome_row = next(self.h5_handle.root.Genome.where('UniProtSpeciesCode == genome')) - self.genome_range = (int(genome_row['EntryOff']) + 1, - int(genome_row['EntryOff'] + genome_row['TotEntries'])) - genome_df = pandas.DataFrame(self.h5_handle.root.Protein.Entries.read_where( - '(EntryNr >= {}) & (EntryNr <= {})'.format(*self.genome_range))) - self.genome_df = genome_df[(genome_df['AltSpliceVariant'] == 0) | (genome_df['AltSpliceVariant'] == genome_df['EntryNr'])] - self.genome_df.reset_index(inplace=True) - self.relations_df = self._load_pairwise_relations() - - def _load_pairwise_relations(self): - df = pandas.DataFrame(self.h5_handle.get_node('/PairwiseRelation/{}/within'.format(self.genome)).read_where('RelType == 5')) - return 
df[['EntryNr1', 'EntryNr2', 'SyntenyConservationLocal']] - - def get_neighbor_genes(self, query): - q = self.genome_df[self.genome_df['EntryNr'] == query] - if len(q) == 0: - logger.error("querying neighbor genes for non-primary variant (EntryNr: {})".format(query)) - return [] - query_chr = q['Chromosome'] - neighbor = self.genome_df[max(0, q.index.item() - self.windowsize // 2): q.index.item() + self.windowsize//2 + 1] - return neighbor[neighbor['Chromosome'] == query_chr.item()] - - def score_of_pair(self, entry1, entry2): - neigh1 = self.get_neighbor_genes(entry1) - neigh2 = self.get_neighbor_genes(entry2) - if len(neigh1) <= 1 or len(neigh2) <= 1: - raise TooSmallChromosome("too few genes on chromosome: {}, {}".format(len(neigh1), len(neigh2))) - - rels_among_windows = self.relations_df[ - (self.relations_df['EntryNr1'] >= neigh1.iloc[0]['EntryNr']) & - (self.relations_df['EntryNr1'] <= neigh1.iloc[-1]['EntryNr']) & - (self.relations_df['EntryNr2'] >= neigh2.iloc[0]['EntryNr']) & - (self.relations_df['EntryNr2'] <= neigh2.iloc[-1]['EntryNr'])] - - score1 = (len(set(rels_among_windows['EntryNr1'])) - 1) / (len(neigh1) - 1) - score2 = (len(set(rels_among_windows['EntryNr2'])) - 1) / (len(neigh2) - 1) - res = {'entry_nr1': int(entry1), 'entry_nr2': int(entry2), - 'chr1': neigh1.iloc[0]['Chromosome'].decode(), - 'chr2': neigh2.iloc[0]['Chromosome'].decode(), - 'nr_genes_window1': len(neigh1), 'nr_genes_window2': len(neigh2), - 'synteny_score_1': score1, 'synteny_score_2': score2, - 'mean_synteny_score': (score1 + score2) / 2} - return res - - def compute_scores(self): - res = [] - for idx, rel in tqdm(self.relations_df.iterrows(), total=len(self.relations_df)): - try: - res.append(self.score_of_pair(rel['EntryNr1'], rel['EntryNr2'])) - except TooSmallChromosome as e: - logging.info("Skipping {}/{}: {}".format(int(rel['EntryNr1']), int(rel['EntryNr2']), e)) - pass - return pandas.DataFrame(res) - - -class TooSmallChromosome(Exception): - pass - - -#### MAIN #### -if __name__ == "__main__": - import argparse - # Get arguments from command line - parser = argparse.ArgumentParser( - description='Returns windows and their proportion of homoeolog VPs for a given chromosome') - parser.add_argument('--h5', help='name of h5 file, full path', required=True) - parser.add_argument('--window_genes', help='window size in genes', default=10) - parser.add_argument('--genome', help='5 letter code of polyploid genome to analyze') - parser.add_argument('--outfile', help='name where results will be stored (file name created to include parameters)', \ - default="synteny_results.tsv") - - args = parser.parse_args() - h5file_path = args.h5 - logging.basicConfig(level=logging.INFO) - - scorer = SyntenyScorer(tables.open_file(h5file_path), args.genome) - data = scorer.compute_scores() - columns = ['entry_nr1', 'chr1', 'nr_genes_window1', 'entry_nr2', 'chr2', 'nr_genes_window2', 'synteny_score_1', - 'synteny_score_2', 'mean_synteny_score'] - data[columns].to_csv(args.outfile, sep='\t', header=True, index=True) diff --git a/src/HogProf/build/lib/pyoma/browser/tablefmt.py b/src/HogProf/build/lib/pyoma/browser/tablefmt.py deleted file mode 100755 index eea8c7f..0000000 --- a/src/HogProf/build/lib/pyoma/browser/tablefmt.py +++ /dev/null @@ -1,162 +0,0 @@ -import tables - -"""This module contains the definitions of the database tables -used in the browser database. Some of these tables are used -multiple times, e.g. the PairwiseRelationTable is used -for each genome pair. 
- -From these table definitions one can easily extract the numpy -dtype that can hold the data: - - >>>tables.dtype_from_descr(HOGsTable) - dtype([('Fam', ' 0 and keys != oldkeys ) : - first = False - oldkeys = keys - leaves = set([leaf.name for leaf in tree.get_leaves()]) - orphans = set(genome_ids_list) - leaves - print(len(orphans)) - for orphan in orphans: - if str(orphan_info[orphan][-1]) in newdict: - newdict[str(orphan_info[orphan][-1])].append(orphan) - else: - newdict[str(orphan_info[orphan][-1])] = [orphan] - keys = set(list(newdict.keys())) - for n in tree.traverse(): - if n.name in newdict and n.name not in leaves: - for orph in newdict[n.name]: - n.add_sister(name=orph) - del newdict[n.name] - - for orphan in orphans: - if len(orphan_info[orphan]) > 1: - orphan_info[orphan].pop() - - newdict = {} - nodes = {} - print(orphans) - #clean up duplicates - for n in tree.traverse(): - if n.name not in nodes: - nodes[ n.name] =1 - else: - nodes[ n.name] +=1 - - for n in tree.traverse(): - if nodes[ n.name] >1: - if n.is_leaf()== False: - n.delete() - nodes[ n.name]-= 1 - - - return tree diff --git a/src/HogProf/build/lib/utils/goatools_utils.py b/src/HogProf/build/lib/utils/goatools_utils.py deleted file mode 100755 index d3c7dee..0000000 --- a/src/HogProf/build/lib/utils/goatools_utils.py +++ /dev/null @@ -1,186 +0,0 @@ - -from goatools import semantic -from goatools.obo_parser import GODag - -import json -from utils import hashutils -from utils import config_utils -import pickle -from goatools.go_enrichment import GOEnrichmentStudy - -##############enrichment############################################## - -def return_enrichment_study_obj(gaf_taxfiltered): - ''' - Generate go enrichment study object with a background dataset. - ''' - - obodag = GODag(config_utils.datadir+"/GOData/go-basic.obo") - goeaobj = GOEnrichmentStudy( - gaf_taxfiltered.keys(), # - gaf_taxfiltered, # geneid/GO associations possible with tree used for DB - obodag, # Ontologies - propagate_counts = False, - alpha = 0.15, # default significance cut-off - methods = ['fdr_bh']) # defult multipletest correction method - return goeaobj - -def buildGAF(gaf_file , universe= None): - gaf_filtered = {} - with open(gaf_file, mode='r') as gafin: - for line in gafin: - words = line.split() - if words[0] not in gaf_filtered: - gaf_filtered[words[0]]=set([words[1]]) - else: - gaf_filtered[words[0]].add(words[1]) - - if universe: - gaf_filtered = { prot:gaf_filtered[prot] for prot in universe} - - - return gaf_filtered - -def run_GOEA_onresults(results, db_obj, goeaobj, outname = None): - ''' - Perform enrichment analysis on returned results - grabs all member protein of all hogs in result - returns goe results and HOG composition - ''' - #print(db_obj.member_of_hog_id(int(results[0]))) - hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] - #print( db_obj.member_of_hog_id(hogids[0]) ) - HOGS={} - print('compiling hogs') - prots = [] - for i,result in enumerate(hogids): - if i %10 ==0: - print(i) - HOGS[result]=[] - for member in db_obj.iter_members_of_hog_id(result): - HOGS[result].append(member.omaid) - prots.append(member.omaid) - print('done') - print('running GO enrichment study') - - - goea_results_all = goeaobj.run_study(prots ) - print('done') - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) - - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) - print('DONE!') - return goea_results_all, HOGS - - 
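A minimal sketch of how the helpers above are meant to chain together, assuming this module is importable as utils.goatools_utils as its own imports suggest; the file paths, the example family id and the output prefix are placeholders, not values defined here:

from pyoma.browser import db
from utils import goatools_utils

gaf = goatools_utils.buildGAF('GOData/oma-go.gaf')          # protein id -> set of GO term ids
goeaobj = goatools_utils.return_enrichment_study_obj(gaf)   # background = every annotated protein
db_obj = db.Database('OmaServer.h5')

# 'results' would normally be the HOG family ids returned by a profiler query
goea_results, hog_members = goatools_utils.run_GOEA_onresults(
    ['442312'], db_obj, goeaobj, outname='example_query')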
-def run_GOEA_onresults_tar(results, tar, goeaobj, outname = None): - ''' - Perform enrichment analysis on returned results - grabs all member protein of all hogs in result - returns goe results and HOG composition - ''' - ## TODO: finish this function with tar hog to list of prot IDS - #print(db_obj.member_of_hog_id(int(results[0]))) - #hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] - #print( db_obj.member_of_hog_id(hogids[0]) ) - - - HOGS={} - print('compiling hogs') - prots = [] - for i,result in enumerate(hogids): - if i %10 ==0: - print(i) - HOGS[result]=[] - for member in db_obj.iter_members_of_hog_id(result): - HOGS[result].append(member.omaid) - prots.append(member.omaid) - print('done') - print('running GO enrichment study') - - goea_results_all = goeaobj.run_study(prots ) - print('done') - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) - - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) - print('DONE!') - return goea_results_all, HOGS - - -######################resnik semsim ################################################### - -def resnik_sim_hdf5(go_id1, go_id2, godag, termcounts, hdf5): - ''' - Computes Resnik's similarity measure. - ''' - try: - msca_goid = deepest_common_ancestor_hdf5([goterm2id(go_id1), goterm2id(go_id2)], godag, hdf5) - score = semantic.get_info_content(msca_goid, termcounts) - except: - score = -1 - return score - - -def deepest_common_ancestor_hdf5(go_ids, godag, hdf5): - ''' - Gets the nearest common ancestor - using the above function. - Only returns single most specific - assumes unique exists. - ''' - # Take the element at maximum depth. - return max(common_parent_go_ids_hdf5(go_ids, hdf5), key=lambda t: godag[t].depth) - -def common_parent_go_ids_hdf5(go_ids, hdf5_set): - ''' - Finds the common ancestors in the GO - tree of the list of goids in the input. - ''' - candidates = set(hdf5_set[go_ids[0]].tolist()) - for go_id in go_ids[1:]: - candidates_to_add = set(hdf5_set[go_id].tolist()) - candidates.intersection_update(candidates_to_add) - corrected_candidates = [id2goterm(c) for c in candidates] - return corrected_candidates - -def resnik_sim_pandas(tup, df , termcounts): - ''' - Computes Resnik's similarity measure. 
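    Expects `tup` to be a pair of GO term ids and `df` a dataframe indexed by GO id
    with 'parents' and 'depth' columns; `termcounts` is the goatools term-counts object
    passed to semantic.get_info_content. Returns -1 if either term is missing from df.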
- ''' - go_id1, go_id2 = tup - #print(df.head()) - if go_id1 == go_id2: - return semantic.get_info_content(go_id1, termcounts) - - elif go_id2 in df.index and go_id1 in df.index: - - ancestors = df.loc[str(go_id2)].parents - ancestors += df.loc[str(go_id1)].parents - terms = df.loc[ancestors] - ancestors_set = terms.parents.tolist() - intersection = set(ancestors_set[0]).intersection(* ancestors_set[1:]) - common_ancestors = df.loc[list(intersection)] - common_ancestors = common_ancestors.sort_values('depth', ascending= False) - msca_goid = common_ancestors.index.tolist()[0] - return semantic.get_info_content(msca_goid, termcounts) - - else: - return -1 - - -def get_go_terms_gaf(hog_id, pyoma_dbobj, gaf , genomes = None): - ''' - iterate over hog members and get the go information from a gaf in memory - ''' - fam = hashutils.hogid2fam(hog_id) - go_terms = { mr.omaid:gaf[mr.omaid] for mr in pyoma_dbobj.iter_members_of_hog_id(hog_id) if mr.omaid in gaf } - return go_terms - - -def goterm2id(go_term_to_modif): - - return int(go_term_to_modif.split(':')[1]) - -def id2goterm(go_term_to_modif): - return 'GO:{:07d}'.format(go_term_to_modif) diff --git a/src/HogProf/build/lib/utils/hashutils.py b/src/HogProf/build/lib/utils/hashutils.py deleted file mode 100755 index 6f293b5..0000000 --- a/src/HogProf/build/lib/utils/hashutils.py +++ /dev/null @@ -1,166 +0,0 @@ - - -import datasketch -import itertools -import ete3 -import copy -import math -import numpy as np -import pandas as pd - - -def generate_treeweights( mastertree, taxaIndex , taxfilter, taxmask ): - #weighing function for tax level, masking levels etc. sets all weights to 1 - """ - Generate the weights of each taxonomic level to be applied during the - constructin of weighted minhashes - :param mastertree: full corrected ncbi taxonomy - :param taxaIndex: dict mapping taxa to columns - :param taxfilter: list of branches to delete - :param taxmask: if this is not NONE taxmask, the DB is constructed with this subtree - :return: weights: a vector of weights for each tax level - """ - - weights = { type: np.zeros((len(taxaIndex),1)) for type in ['presence', 'loss', 'dup']} - print(len(taxaIndex)) - newtree = mastertree - for event in weights: - for n in newtree.traverse(): - if taxmask: - if str(n.name) == str(taxmask): - newtree = n - break - if taxfilter: - if n.name in taxfilter: - n.delete() - for event in weights: - for n in newtree.traverse(): - weights[event][taxaIndex[n.name]] = 1 - return weights - -def hash_tree(tp , taxaIndex , treeweights , wmg): - """ - Generate a weighted minhash and binary matrix row for a tree profile - - :param tp: a pyham tree profile - :param taxaIndex: dict mapping taxa to columns - :param treeweights: a vector of weights for each tax levels - :param wmg: Datasketch weighted minhash generator - :return hog_matrix: a vector of weights for each tax level - :return weighted_hash: a weighted minhash of a HOG - - """ - - losses = [ taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in taxaIndex ] - dupl = [ taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in taxaIndex ] - presence = [ taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in taxaIndex ] - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - hog_matrix_weighted = np.zeros((1, 3*len(taxaIndex))) - hog_matrix_binary = np.zeros((1, 3*len(taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = 
np.asarray(indices[event])+i*len(taxaIndex) - hog_matrix_weighted[:,hogindex] = treeweights[hogindex].ravel() - hog_matrix_binary[:,hogindex] = 1 - weighted_hash = wmg.minhash(list(hog_matrix_weighted.flatten())) - - return hog_matrix_binary , weighted_hash - -def tree2str_DCA(tp , taxaIndex ): - """ - Generate a string where each column is a tax level - each letter code corresponds to an event type - each row is a protein family. for use with DCA pipelines - - :param tp: a pyham tree profile - :param taxaIndex: dict mapping taxa to columns - :return dcaMat: a weighted minhash of a HOG - """ - #convert a tree profile to a weighted minhash - - - losses = [ taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in taxaIndex ] - dupl = [ taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in taxaIndex ] - presence = [ taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in taxaIndex ] - - - Ds = list( set(dupl).intersection(set(presence))) - Ps= list(set(presence).difference(set(dupl))) - Ls= list(set(losses)) - charar = np.chararray(len(taxaIndex) ) - #set to absent - - charar.fill( 'A') - charar[Ds] = 'D' - charar[Ls] = 'L' - charar[Ps] = 'P' - - return charar - -def row2hash(row , taxaIndex , treeweights , wmg): - """ - turn a dataframe row with an orthoxml file to hash and matrix row - :param row: lsh builder dataframe row - :param taxaIndex: dict mapping taxa to columnsfam - :param treeweights: a vector of weights for each tax levels - :param wmg: Datasketch weighted minhash generator - :return: hog_matrix: a vector of weights for each tax level - :return: weighted_hash: a weighted minhash of a HOG - """ - #convert a dataframe row to a weighted minhash - fam, treemap = row.tolist() - hog_matrix,weighted_hash = hash_tree(treemap , taxaIndex , treeweights , wmg) - return [weighted_hash,hog_matrix] - - -def fam2hash_hdf5(fam, hdf5, dataset = None, nsamples = 128 ): - #read the stored hash values and return a weighted minhash - """ - Read the stored hash values and return a weighted minhash - :param fam: hog id - :param hdf5: h5py object of the hashvalues - :param dataset: which dataset to use when constructing the hash - :return: minhash1: the weighted hash of your HOG - """ - if dataset is None: - dataset = list(hdf5.keys())[0] - hashvalues = np.asarray(hdf5[dataset][fam, :].reshape(nsamples,2 )) - hashvalues = hashvalues.astype('int64') - minhash1 = datasketch.WeightedMinHash( seed = 1, hashvalues=hashvalues) - return minhash1 - -def hogid2fam(hog_id): - """ - For use with OMA HOGs - Get fam given hog id - :param hog_id: hog id - :return: fam - - """ - - if not hog_id: - return hog_id - if type(hog_id) is int: - return hog_id - - if ':' in hog_id: - hog_id = hog_id.split(':')[1] - if '.' 
in hog_id: - hog_id = hog_id.split('.')[0] - hog_id = hog_id.replace("'",'') - fam = int(hog_id) - else: - fam = int(hog_id) - return fam - - -def fam2hogid(fam_id): - """ - For use with OMA HOGs - Get hog id given fam - :param fam_id: fam - :return: hog id - """ - hog_id = "HOG:" + (7-len(str(fam_id))) * '0' + str(fam_id) - return hog_id diff --git a/src/HogProf/build/lib/utils/preprocess_config.py b/src/HogProf/build/lib/utils/preprocess_config.py deleted file mode 100755 index 38427c1..0000000 --- a/src/HogProf/build/lib/utils/preprocess_config.py +++ /dev/null @@ -1,17 +0,0 @@ - -#turn the goDAG into a set of dictionaries -preprocessGO = True - -string_interactors = '/scratch/cluster/monthly/dmoi/stringdata/protein.links.detailed.v10.5.txt' - -preprocessSTRINGDB = False -uniprotmappings = '/scratch/cluster/monthly/dmoi/uniprotmapping/idmapping.dat' -startseq = 'Q7VBF3' - -preprocessUNIPROT = False -#empty redis before storing string info -clearRedis= False -#use GO information in OMA -#use mapping info from uniprot - -verbose = True diff --git a/src/HogProf/build/lib/utils/pyhamutils.py b/src/HogProf/build/lib/utils/pyhamutils.py deleted file mode 100755 index d5c4135..0000000 --- a/src/HogProf/build/lib/utils/pyhamutils.py +++ /dev/null @@ -1,86 +0,0 @@ -import pyham -import xml.etree.cElementTree as ET -import pickle -from utils import config_utils - -def get_orthoxml_oma(fam, db_obj): - orthoxml = db_obj.get_orthoxml(fam).decode() - return orthoxml - -def get_orthoxml_tar(fam, tar): - f = tar.extractfile(fam) - if f is not None: - return f.read() - else: - raise Exception( member + ' : not found in tarfile ') - return orthoxml - - -def get_species_from_orthoxml(orthoxml): - NCBI_taxid2name = {} - root = ET.fromstring(orthoxml) - for child in root: - if 'species' in child.tag: - NCBI_taxid2name[child.attrib['NCBITaxId']] = child.attrib['name'] - return NCBI_taxid2name - -def switch_name_ncbi_id(orthoxml , mapdict = None): - #swap ncbi taxid for species name to avoid ambiguity - #mapdict should be a mapping from species name to taxid if the info isnt in the orthoxmls - root = ET.fromstring(orthoxml) - for child in root: - if 'species' in child.tag: - child.attrib['name'] = child.attrib['NCBITaxId'] - elif mapdict: - child.attrib['name'] = mapdict[child.attrib['name']] - orthoxml = ET.tostring(root, encoding='unicode', method='xml') - return orthoxml - - - -def get_ham_treemap_from_row(row, tree , level = None): - fam, orthoxml = row - orthoxml = switch_name_ncbi_id(orthoxml) - try: - if level is None: - ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml", use_internal_name=True, orthoXML_as_string=True) - tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0]) - return tp.treemap - else: - ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml", use_internal_name=True, orthoXML_as_string=True) - #return subHOGs at level - slice = ham_obj.get_ancestral_genome_by_name(level) - treeprofiles = [ ham_obj.create_tree_profile(hog=h) for h in ham_obj.get_list_top_level_hogs()[0].get_at_level(slice) ] - - except TypeError as err: - print('Type error:', err) - return None - except AttributeError as err: - print('Attribute error:', err) - return None - - -def yield_families(h5file, start_fam): - """ - Given a h5file containing OMA server, returns an iterator over the families - (not sure if still in use) - :param h5file: omafile - :param start_fam: fam to start on - :return: fam number - """ - for row in h5file.root.OrthoXML.Index: - if row[0] > start_fam: - 
yield row[0] - - -def get_one_family(i, h5file): - ''' - get one family from database - Args: - i : family number - h5file : OMA server file - Return : - family - Not sure if still in use - ''' - return h5file.root.OrthoXML.Index[i][0] diff --git a/src/HogProf/lshbuilder.py b/src/HogProf/lshbuilder.py index 803cb55..10efcb2 100755 --- a/src/HogProf/lshbuilder.py +++ b/src/HogProf/lshbuilder.py @@ -1,48 +1,55 @@ -from tables import * -import functools import argparse -import sys +import functools +import gc +import logging import multiprocessing as mp -import glob -import pandas as pd -import time as t +import os import pickle -import xml.etree.cElementTree as ET - -from datasketch import MinHashLSHForest , WeightedMinHashGenerator +import random +import time +import time as t from datetime import datetime + +import ete3 import h5py -import time -import gc -from pyoma.browser import db -from HogProf.utils import pyhamutils, hashutils , files_utils import numpy as np +import pandas as pd import tqdm -import random -import tqdm -import os -import ete3 +from datasketch import MinHashLSHForest, WeightedMinHashGenerator +from pyoma.browser import db +from tables import * + +from HogProf.utils import pyhamutils, hashutils, files_utils + random.seed(0) np.random.seed(0) -class LSHBuilder: +logging.basicConfig(level=logging.DEBUG, + format='%(asctime)x [%(levelname)s] %(message)s', + handlers=[ + logging.FileHandler('debug.log'), + logging.StreamHandler()]) + +class LSHBuilder: """ - This class contains the stuff you need to make - a phylogenetic profiling + This class contains the stuff you need to make + a phylogenetic profiling database with input orthxml files and a taxonomic tree - You must either input an OMA hdf5 file or an ensembl tarfile + You must either input an OMA hdf5 file or an ensembl tarfile containing orthoxml file with orthologous groups. - You can provide a species tree or use the ncbi taxonomy + You can provide a species tree or use the ncbi taxonomy with a list of taxonomic codes for all the species in your db """ - def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving_name=None , numperm = 256, treeweights= None , taxfilter = None, taxmask= None , lossonly = False, duplonly = False, verbose = False , use_taxcodes = False , datetime = datetime.now()): - + def __init__(self, h5_oma=None, fileglob=None, taxa=None, masterTree=None, saving_name=None, numperm=256, + treeweights=None, taxfilter=None, taxmask=None, lossonly=False, duplonly=False, verbose=False, + use_taxcodes=False, datetime=datetime.now()): + """ Initializes the LSHBuilder class with the specified parameters and sets up the necessary objects. 
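
The module-level logging.basicConfig call added above sends messages both to debug.log and to the console. A self-contained version of that configuration, using the conventional %(asctime)s timestamp directive, is sketched below.

import logging

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    handlers=[logging.FileHandler('debug.log'),
                              logging.StreamHandler()])
logging.info('Initialising LSHBuilder')
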
- + Args: - tarfile_ortho (str): path to an ensembl tarfile containing orthoxml files - h5_oma (str): path to an OMA hdf5 file @@ -56,6 +63,10 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving - verbose (bool): whether to print verbose output (default: False) """ + logging.info('Initialising %s' % self.__class__.__name__) + self.groups = None + self.errorfile = None + if h5_oma: self.h5OMA = h5_oma self.db_obj = db.Database(h5_oma) @@ -64,7 +75,7 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.h5OMA = None self.db_obj = None self.oma_id_obj = None - + self.tax_filter = taxfilter self.tax_mask = taxmask self.verbose = verbose @@ -72,222 +83,237 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.fileglob = fileglob self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now()) - if saving_name: - self.saving_name= saving_name - if self.saving_name[-1]!= '/': - self.saving_name = self.saving_name+'/' + self.saving_name = saving_name + if self.saving_name[-1] != '/': + self.saving_name = self.saving_name + '/' self.saving_path = saving_name if not os.path.isdir(self.saving_path): os.mkdir(path=self.saving_path) else: - raise Exception( 'please specify an output location' ) - + raise Exception('please specify an output location') + if masterTree is None: if h5_oma: genomes = pd.DataFrame(h5_oma.root.Genome.read())["NCBITaxonId"].tolist() - genomes = [ str(g) for g in genomes] - taxa = genomes + [ 131567, 2759, 2157, 45596 ]+[ taxrel[0] for taxrel in list(h5_oma.root.Taxonomy[:]) ] + [ taxrel[1] for taxrel in list(h5_oma.root.Taxonomy[:]) ] - self.tree_string , self.tree_ete3 = files_utils.get_tree(taxa=taxa, genomes = genomes , outdir=self.saving_path ) - elif taxa: - with open(taxa, 'r') as taxin: - taxlist = [ int(line) for line in taxin ] - self.tree_string , self.tree_ete3 = files_utils.get_tree(taxa=taxlist , outdir=self.saving_path) + genomes = [str(g) for g in genomes] + taxa = genomes + [131567, 2759, 2157, 45596] + [taxrel[0] for taxrel in + list(h5_oma.root.Taxonomy[:])] + [taxrel[1] for taxrel + in list( + h5_oma.root.Taxonomy[:])] + self.tree_string, self.tree_ete3 = files_utils.get_tree(taxa=taxa, genomes=genomes, + outdir=self.saving_path) else: - raise Exception( 'please specify either a list of taxa or a tree' ) + raise Exception('please specify either a list of taxa or a tree') self.swap2taxcode = True - elif mastertree: + elif masterTree: self.tree_ete3 = ete3.Tree(masterTree, format=1) with open(masterTree) as treein: self.tree_string = treein.read() self.swap2taxcode = use_taxcodes - self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3 , self.tax_filter, self.tax_mask) - with open( self.saving_path + 'taxaIndex.pkl', 'wb') as taxout: - taxout.write( pickle.dumps(self.taxaIndex)) + self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3, self.tax_filter, self.tax_mask) + + with open(self.saving_path + 'taxaIndex.pkl', 'wb') as taxout: + taxout.write(pickle.dumps(self.taxaIndex)) + self.numperm = numperm + if treeweights is None: - #generate aconfig_utilsll ones - self.treeweights = hashutils.generate_treeweights(self.tree_ete3 , self.taxaIndex , taxfilter, taxmask) + # generate aconfig_utilsll ones + self.treeweights = hashutils.generate_treeweights(self.tree_ete3, self.taxaIndex, taxfilter, taxmask) else: - #load machine learning weights + # load machine learning weights self.treeweights = treeweights - 
print(self.treeweights) - wmg = WeightedMinHashGenerator(3*len(self.taxaIndex), sample_size = numperm , seed=1) - with open( self.saving_path + 'wmg.pkl', 'wb') as wmgout: - wmgout.write( pickle.dumps(wmg)) + + wmg = WeightedMinHashGenerator(3 * len(self.taxaIndex), sample_size=numperm, seed=1) + + with open(self.saving_path + 'wmg.pkl', 'wb') as wmgout: + wmgout.write(pickle.dumps(wmg)) self.wmg = wmg - print( 'configuring pyham functions') + + logging.info('Configuring pyham functions') + if self.h5OMA: - self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode ) + self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string, + swap_ids=self.swap2taxcode) else: - self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , orthoXML_as_string = False ) - self.HASH_PIPELINE = functools.partial( hashutils.row2hash , taxaIndex=self.taxaIndex, treeweights=self.treeweights, wmg=wmg , lossonly = lossonly, duplonly = duplonly) - if self.h5OMA: + self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string, + swap_ids=self.swap2taxcode, orthoXML_as_string=False) + self.HASH_PIPELINE = functools.partial(hashutils.row2hash, taxaIndex=self.taxaIndex, + treeweights=self.treeweights, wmg=wmg, lossonly=lossonly, + duplonly=duplonly) + + if self.h5OMA: self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma, db_obj=self.db_obj) - + if self.h5OMA: - self.n_groups = len(self.h5OMA.root.OrthoXML.Index) + self.n_groups = len(self.h5OMA.root.OrthoXML.Index) elif self.fileglob: self.n_groups = len(self.fileglob) else: - raise Exception( 'please specify an input file' ) - + raise Exception('please specify an input file') + self.hashes_path = self.saving_path + 'hashes.h5' self.lshpath = self.saving_path + 'newlsh.pkl' self.lshforestpath = self.saving_path + 'newlshforest.pkl' - self.mat_path = self.saving_path+ 'hogmat.h5' + self.mat_path = self.saving_path + 'hogmat.h5' self.columns = len(self.taxaIndex) self.verbose = verbose - print('done') + print('Initialised') def load_one(self, fam): - #test function to try out the pipeline on one orthoxml + # test function to try out the pipeline on one orthoxml ortho_fam = self.READ_ORTHO(fam) pyham_tree = self.HAM_PIPELINE([fam, ortho_fam]) - hog_matrix,weighted_hash = hashutils.hash_tree(pyham_tree , self.taxaIndex , self.treeweights , self.wmg) - return ortho_fam , pyham_tree, weighted_hash,hog_matrix + hog_matrix, weighted_hash = hashutils.hash_tree(pyham_tree, self.taxaIndex, self.treeweights, self.wmg) + return ortho_fam, pyham_tree, weighted_hash, hog_matrix - def generates_dataframes(self, size=100, minhog_size=10, maxhog_size=None ): + def generates_dataframes(self, size=100, minhog_size=10, maxhog_size=None): families = {} start = -1 if self.h5OMA: - self.groups = self.h5OMA.root.OrthoXML.Index + self.groups = self.h5OMA.root.OrthoXML.Index self.rows = len(self.groups) + for i, row in tqdm.tqdm(enumerate(self.groups)): if i > start: fam = row[0] ortho_fam = self.READ_ORTHO(fam) hog_size = ortho_fam.count(' size: pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index yield pd_dataframe families = {} + pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index + yield pd_dataframe - print('last dataframe sent') - families = {} + + logging.info('Last 
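
The WeightedMinHashGenerator set up in __init__ works over a 3 * len(taxaIndex)-dimensional space, one block each for presence, loss and duplication, which is the flattened matrix that hash_tree in hashutils.py feeds into wmg.minhash. A toy illustration with made-up dimensions; n_taxa and the event positions are invented for the example.

import numpy as np
from datasketch import WeightedMinHashGenerator

n_taxa = 5                                   # placeholder taxaIndex size
wmg = WeightedMinHashGenerator(3 * n_taxa, sample_size=256, seed=1)

profile = np.zeros(3 * n_taxa)               # presence / loss / duplication blocks
profile[[0, 7, 11]] = 1.0                    # toy events at a few (taxon, event) slots
wm = wmg.minhash(list(profile))
print(wm.hashvalues.shape)                   # (256, 2), the layout stored in hashes.h5
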
dataframe sent') elif self.fileglob: - for i,file in enumerate(tqdm.tqdm(self.fileglob)): + for i, file in enumerate(tqdm.tqdm(self.fileglob)): + with open(file) as ortho: - #oxml = ET.parse(ortho) - #ortho_fam = ET.tostring( next(oxml.iter()), encoding='utf8', method='xml' ).decode() orthostr = ortho.read() + hog_size = orthostr.count(' size: pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index yield pd_dataframe families = {} - if i%10000 == 0: + + if i % 10000 == 0: print(i) - #save the mapping of fam to orthoxml + # save the mapping of fam to orthoxml pd_dataframe = pd.DataFrame.from_dict(families, orient='index') pd_dataframe['Fam'] = pd_dataframe.index pd_dataframe.to_csv(self.saving_path + 'fam2orthoxml.csv') - - def universe_saver(self, i, q, retq, matq,univerq, l): - #only useful to save all prots within a taxonomic range as db is being compiled - allowed = set( [ n.name for n in self.tree_ete3.get_leaves() ] ) - with open(self.saving_path+'universe.txt') as universeout: - while True: - prots = univerq.get() - for row in df.iterrows(): - for ID in row.prots.tolist(): - universeout.write(ID) - else: - print('Universe saver done' + str(i)) - break - def worker(self, i, q, retq, matq, l): - if self.verbose == True: - print('worker init ' + str(i)) + if self.verbose: + logging.info('Initialising worker %s ' % str(i)) while True: df = q.get() - if df is not None : + if df is not None: df['tree'] = df[['Fam', 'ortho']].apply(self.HAM_PIPELINE, axis=1) - df[['hash','rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) + df[['hash', 'rows']] = df[['Fam', 'tree']].apply(self.HASH_PIPELINE, axis=1) retq.put(df[['Fam', 'hash']]) - #matq.put(df[['Fam', 'rows']]) else: - if self.verbose == True: - print('Worker done' + str(i)) + if self.verbose: + print('Worker done %s' % str(i)) break - def saver(self, i, q, retq, matq, l ): - print_start = t.time() + def saver(self, i, q, retq, matq, l): save_start = t.time() global_time = t.time() chunk_size = 100 count = 0 forest = MinHashLSHForest(num_perm=self.numperm) taxstr = '' + if self.tax_filter is None: taxstr = 'NoFilter' + if self.tax_mask is None: - taxstr+= 'NoMask' + taxstr += 'NoMask' else: taxstr = str(self.tax_filter) + self.errorfile = self.saving_path + 'errors.txt' with open(self.errorfile, 'w') as hashes_error_files: with h5py.File(self.hashes_path, 'w', libver='latest') as h5hashes: datasets = {} + if taxstr not in h5hashes.keys(): - if self.verbose == True: - print('creating dataset') - print('filtered at taxonomic level: '+taxstr) + if self.verbose: + logging.info('Creating dataset') + logging.info('Filtered at taxonomic level: ' + taxstr) h5hashes.create_dataset(taxstr, (chunk_size, 0), maxshape=(None, None), dtype='int32') - if self.verbose == True: - print(datasets) + + if self.verbose: + logging.info(datasets) h5flush = h5hashes.flush - print('saver init ' + str(i)) + + logging.info('Initialising saver %s ' % str(i)) + while True: this_dataframe = retq.get() if this_dataframe is not None: if not this_dataframe.empty: hashes = this_dataframe['hash'].to_dict() - #print(str(this_dataframe.Fam.max())+ 'fam num') - #print(str(count) + ' done') - hashes = {fam:hashes[fam] for fam in hashes if hashes[fam] } - [ forest.add(str(fam),hashes[fam]) for fam in hashes] + hashes = {fam: hashes[fam] for fam in hashes if hashes[fam]} + [forest.add(str(fam), hashes[fam]) for fam in hashes] + for fam in hashes: if len(h5hashes[taxstr]) < fam + 10: h5hashes[taxstr].resize((fam + 
chunk_size, len(hashes[fam].hashvalues.ravel()))) h5hashes[taxstr][fam, :] = hashes[fam].hashvalues.ravel() count += 1 + if t.time() - save_start > 200: - print( t.time() - global_time ) + logging.info(t.time() - global_time) forest.index() - print(forest.query( hashes[fam] , k = 10 ) ) + logging.info(forest.query(hashes[fam], k=10)) h5flush() save_start = t.time() - with open(self.lshforestpath , 'wb') as forestout: + + with open(self.lshforestpath, 'wb') as forestout: forestout.write(pickle.dumps(forest, -1)) - if self.verbose == True: - print('save done at' + str(t.time() - global_time)) + + if self.verbose: + logging.info('Saved to %s' % str(t.time() - global_time)) else: print(this_dataframe) else: - if self.verbose == True: - print('wrap it up') - with open(self.lshforestpath , 'wb') as forestout: + if self.verbose: + logging.info('Wrapping it up') + + with open(self.lshforestpath, 'wb') as forestout: forestout.write(pickle.dumps(forest, -1)) + h5flush() - if self.verbose == True: + + if self.verbose: print('DONE SAVER' + str(i)) break - def matrix_updater(self, iprocess , q, retq, matq, l): - print('hogmat saver init ' + str(iprocess)) + def matrix_updater(self, iprocess, q, retq, matq, l): + logging.info('Initialising hogmat saver ' + str(iprocess)) h5mat = None times1 = [] frames = [] @@ -299,205 +325,171 @@ def matrix_updater(self, iprocess , q, retq, matq, l): rows = rows.dropna() maxfam = rows.Fam.max() if h5mat is None: - h5hashes.create_dataset('matrows',(10,block.shape[1]), maxshape=(None, block.shape[1]),chunks=(1, block.shape[1]), dtype='i8') + h5hashes.create_dataset('matrows', (10, block.shape[1]), maxshape=(None, block.shape[1]), + chunks=(1, block.shape[1]), dtype='i8') h5mat = h5hashes['matrows'] if h5mat.shape[0] < maxfam: - h5mat.resize((maxfam+1,block.shape[1])) - i+=1 + h5mat.resize((maxfam + 1, block.shape[1])) + i += 1 frames.append(rows) assign = t.time() index = np.asarray(rows.Fam) block = np.vstack(rows.rows) - h5mat[index,:]= block + h5mat[index, :] = block - times1.append(t.time()-assign) - if len(times1)>10: + times1.append(t.time() - assign) + if len(times1) > 10: times1.pop(0) - print(np.mean(times1)) + logging.info('Mean time: %s' % np.mean(times1)) h5hashes.flush() else: h5hashes.flush() break - print('DONE MAT UPDATER' + str(i)) + logging.info('DONE MAT UPDATER %s' % str(i)) + + def run_pipeline(self, threads): + logging.info('Running with %s threads:' % threads) + functype_dict = {'worker': (self.worker, threads, True), 'updater': (self.saver, 1, False), + 'matrix_updater': (self.matrix_updater, 0, False)} - def run_pipeline(self , threads): - print( 'run w n threads:', threads) - functype_dict = {'worker': (self.worker, threads , True), 'updater': (self.saver, 1, False), - 'matrix_updater': (self.matrix_updater, 0, False) } def mp_with_timeout(functypes, data_generator): - work_processes = {} - update_processes = {} lock = mp.Lock() cores = mp.cpu_count() q = mp.Queue(maxsize=cores * 10) retq = mp.Queue(maxsize=cores * 10) matq = mp.Queue(maxsize=cores * 10) work_processes = {} - print('start workers') + logging.info('Starting workers...') + for key in functypes: worker_function, number_workers, joinval = functypes[key] work_processes[key] = [] for i in range(int(number_workers)): - t = mp.Process(target=worker_function, args=(i, q, retq, matq, lock )) + t = mp.Process(target=worker_function, args=(i, q, retq, matq, lock)) t.daemon = True work_processes[key].append(t) + for key in work_processes: for process in work_processes[key]: + 
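
Once saver has pickled the forest to newlshforest.pkl and flushed hashes.h5, the database can be queried offline without rerunning the pipeline. A small sketch combining the two outputs; the paths and query family id are placeholders, nsamples again has to match numperm, and the forest is re-indexed after loading since saver only calls forest.index() in its periodic branch.

import pickle
import h5py
from HogProf.utils import hashutils

with open('YourHogProfDirectory/newlshforest.pkl', 'rb') as f:
    forest = pickle.load(f)

with h5py.File('YourHogProfDirectory/hashes.h5', 'r') as h5hashes:
    query = hashutils.fam2hash_hdf5(1, h5hashes, nsamples=256)  # placeholder family id

forest.index()                        # make sure the prefix trees are built
neighbours = forest.query(query, k=10)
print(neighbours)                     # family ids (as strings) with similar profiles
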
logging.info('Starting process') process.start() + for data in data_generator: + logging.info('Putting data') q.put(data) - print('done spooling data') + + logging.info('Spooling data: OK') + for key in work_processes: for i in range(2): for _ in work_processes[key]: q.put(None) - print('joining processes') + logging.info('Joining processes') + for key in work_processes: - worker_function, number_workers , joinval = functypes[key] - if joinval == True: + worker_function, number_workers, joinval = functypes[key] + + if joinval: for process in work_processes[key]: process.join() + for key in work_processes: worker_function, number_workers, joinval = functypes[key] - if joinval == False: + + if not joinval: for _ in work_processes[key]: retq.put(None) matq.put(None) + for key in work_processes: - worker_function, number_workers , joinval = functypes[key] - if joinval == False: + worker_function, number_workers, joinval = functypes[key] + + if not joinval: for process in work_processes[key]: process.join() + gc.collect() print('DONE!') mp_with_timeout(functypes=functype_dict, data_generator=self.generates_dataframes(100)) - return self.hashes_path, self.lshforestpath , self.mat_path + return self.hashes_path, self.lshforestpath, self.mat_path +def arg_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--outpath', help='name of the db', type=str) + parser.add_argument('--dbtype', help='preconfigured taxonomic ranges', type=str) + parser.add_argument('--OMA', help='use oma data ', type=str) + parser.add_argument('--nthreads', help='nthreads for multiprocessing', type=int) + parser.add_argument('--outfolder', help='folder for storing hash, db and tree objects', type=str) + parser.add_argument('--verbose', help='print verbose output', type=bool) + args = parser.parse_args() -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--taxweights', help='load optimised weights from keras model',type = str) - parser.add_argument('--taxmask', help='consider only one branch',type = str) - parser.add_argument('--taxfilter', help='remove these taxa' , type = str) - parser.add_argument('--outpath', help='name of the db', type = str) - parser.add_argument('--dbtype', help='preconfigured taxonomic ranges' , type = str) - parser.add_argument('--OMA', help='use oma data ' , type = str) - parser.add_argument('--OrthoGlob', help='a glob expression for orthoxml files ' , type = str) - parser.add_argument('--tarfile', help='use tarfile with orthoxml data ' , type = str) - parser.add_argument('--nperm', help='number of hash functions to use when constructing profiles' , type = int) - parser.add_argument('--mastertree', help='master taxonomic tree. 
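
run_pipeline and mp_with_timeout above implement a plain producer/consumer layout: dataframes are pushed onto one queue, worker processes hash them and push results onto a second queue, a single saver drains that queue, and None acts as the shutdown sentinel. A stripped-down sketch of the same pattern with toy payloads instead of the HogProf objects; the real pipeline additionally bounds its queues (maxsize = cores * 10) and runs the saver in its own process.

import multiprocessing as mp

def worker(i, q, retq):
    # consume payloads until the None sentinel arrives
    while True:
        item = q.get()
        if item is None:
            break
        retq.put((i, item * 2))        # stand-in for the hashing step

def saver(retq, n_workers):
    done = 0
    while done < n_workers:
        item = retq.get()
        if item is None:
            done += 1
        else:
            print('saving', item)      # stand-in for writing hashes.h5 / the forest

if __name__ == '__main__':
    q, retq = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=worker, args=(i, q, retq)) for i in range(2)]
    for p in workers:
        p.start()
    for payload in range(5):           # stand-in for generates_dataframes()
        q.put(payload)
    for _ in workers:                  # one sentinel per worker
        q.put(None)
    for p in workers:
        p.join()
    for _ in workers:                  # tell the saver every worker has finished
        retq.put(None)
    saver(retq, len(workers))
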
should use ncbi taxonomic id numbers as leaf names' , type = str) - parser.add_argument('--nthreads', help='nthreads for multiprocessing' , type = int) - parser.add_argument('--outfolder', help='folder for storing hash, db and tree objects' , type = str) - parser.add_argument('--lossonly', help='only compile loss events' , type = bool) - parser.add_argument('--duplonly', help='only compile duplication events' , type = bool) - parser.add_argument('--taxcodes', help='use taxid info in HOGs' , type = bool) - parser.add_argument('--verbose', help='print verbose output' , type = bool) - dbdict = { - 'all': { 'taxfilter': None , 'taxmask': None }, - 'plants': { 'taxfilter': None , 'taxmask': 33090 }, - 'archaea':{ 'taxfilter': None , 'taxmask': 2157 }, - 'bacteria':{ 'taxfilter': None , 'taxmask': 2 }, - 'eukarya':{ 'taxfilter': None , 'taxmask': 2759 }, - 'protists':{ 'taxfilter': [2 , 2157 , 33090 , 4751, 33208] , 'taxmask':None }, - 'fungi':{ 'taxfilter': None , 'taxmask': 4751 }, - 'metazoa':{ 'taxfilter': None , 'taxmask': 33208 }, - 'vertebrates':{ 'taxfilter': None , 'taxmask': 7742 }, - } - taxfilter = None - taxmask = None - omafile = None + return args - args = vars(parser.parse_args(sys.argv[1:])) +def main(args=None): + if args is None: + args = arg_parser() + + dbdict = { + 'all': {'taxfilter': None, 'taxmask': None}, + 'plants': {'taxfilter': None, 'taxmask': 33090}, + 'archaea': {'taxfilter': None, 'taxmask': 2157}, + 'bacteria': {'taxfilter': None, 'taxmask': 2}, + 'eukarya': {'taxfilter': None, 'taxmask': 2759}, + 'protists': {'taxfilter': [2, 2157, 33090, 4751, 33208], 'taxmask': None}, + 'fungi': {'taxfilter': None, 'taxmask': 4751}, + 'metazoa': {'taxfilter': None, 'taxmask': 33208}, + 'vertebrates': {'taxfilter': None, 'taxmask': 7742}, + } - if 'OrthoGlob' in args: - if args['OrthoGlob']: - orthoglob = glob.glob(args['OrthoGlob']+ '*') - else: - orthoglob = None - if 'outpath' in args: - dbname = args['outpath'] + dbname = args.outpath else: raise Exception(' please give your profile an output path with the --outpath argument ') - if args['dbtype']: - taxfilter = dbdict[args['dbtype']]['taxfilter'] - taxmask = dbdict[args['dbtype']]['taxmask'] - if args['taxmask']: - taxfilter = args['taxfilter'] - if args['taxfilter']: - taxmask = args['taxmask'] - if args['nperm']: - nperm = int(args['nperm']) + + if args.dbtype: + taxfilter = dbdict[args.dbtype]['taxfilter'] + taxmask = dbdict[args.dbtype]['taxmask'] else: - nperm = 256 - if args['OMA']: - omafile = args['OMA'] - elif args['tarfile']: - omafile = args['tarfile'] - elif orthoglob: - fileglob = orthoglob + taxfilter=None + taxmask=None + + nperm = 256 + + if args.OMA: + omafile = args.OMA else: raise Exception(' please specify input data ') - - if args['lossonly']: - lossonly = args['lossonly'] - else: - lossonly = False - if args['duplonly']: - duplonly = args['duplonly'] - else: - duplonly = False - - if args['taxcodes']: - taxcodes = args['taxcodes'] - else: - taxcodes = False - if args['verbose']: - verbose = args['verbose'] - else: + if args.verbose: + verbose = args.verbose + else: verbose = False + threads = 4 + if args.nthreads: + threads = args.nthreads - threads = 4 - if args['nthreads']: - threads = args['nthreads'] - if args['taxweights']: - from keras.models import model_from_json - json_file = open( args['taxweights']+ '.json', 'r') - loaded_model_json = json_file.read() - json_file.close() - model = model_from_json(loaded_model_json) - # load weights into new model - model.load_weights( 
args['taxweights']+".h5") - print("Loaded model from disk") - weights = model.get_weights()[0] - weights += 10 ** -10 - else: - weights = None - if args['mastertree']: - mastertree = args['mastertree'] - else: - mastertree=None start = time.time() - if omafile: - with open_file( omafile , mode="r") as h5_oma: - lsh_builder = LSHBuilder(h5_oma = h5_oma, fileglob=orthoglob ,saving_name=dbname , numperm = nperm , - treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , verbose=verbose) - lsh_builder.run_pipeline(threads) - else: - lsh_builder = LSHBuilder(h5_oma = None, fileglob=orthoglob ,saving_name=dbname , numperm = nperm , - treeweights= weights , taxfilter = taxfilter, taxmask=taxmask , masterTree =mastertree , lossonly = lossonly , duplonly = duplonly , use_taxcodes = taxcodes , verbose=verbose) - lsh_builder.run_pipeline(threads) + with open_file(omafile, mode="r") as h5_oma: + logging.info('Starting LSH builder') + lsh_builder = LSHBuilder(h5_oma=h5_oma, + saving_name=dbname, + verbose=verbose, + numperm=nperm, + taxfilter=taxfilter, + taxmask=taxmask + ) + lsh_builder.run_pipeline(threads) print(time.time() - start) print('DONE') if __name__ == '__main__': - main() \ No newline at end of file + args = argparse.Namespace(outpath='out', dbtype='eukarya', OMA='data/OmaServer.h5', verbose=True, nthreads=8) + main(args) diff --git a/src/HogProf/orthoxml.py b/src/HogProf/orthoxml.py deleted file mode 100755 index d0c8687..0000000 --- a/src/HogProf/orthoxml.py +++ /dev/null @@ -1,1930 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# -# Generated Mon Jun 27 10:13:43 2011 by generateDS.py version 2.5b. -# - -import sys -import getopt -import re as re_ - -etree_ = None -Verbose_import_ = False -( XMLParser_import_none, XMLParser_import_lxml, - XMLParser_import_elementtree - ) = range(3) -XMLParser_import_library = None -try: - # lxml - from lxml import etree as etree_ - XMLParser_import_library = XMLParser_import_lxml - if Verbose_import_: - print("running with lxml.etree") -except ImportError: - try: - # cElementTree from Python 2.5+ - import xml.etree.cElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with cElementTree on Python 2.5+") - except ImportError: - try: - # ElementTree from Python 2.5+ - import xml.etree.ElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with ElementTree on Python 2.5+") - except ImportError: - try: - # normal cElementTree install - import cElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with cElementTree") - except ImportError: - try: - # normal ElementTree install - import elementtree.ElementTree as etree_ - XMLParser_import_library = XMLParser_import_elementtree - if Verbose_import_: - print("running with ElementTree") - except ImportError: - raise ImportError("Failed to import ElementTree from any known place") - -def parsexml_(*args, **kwargs): - if (XMLParser_import_library == XMLParser_import_lxml and - 'parser' not in kwargs): - # Use the lxml ElementTree compatible parser so that, e.g., - # we ignore comments. - kwargs['parser'] = etree_.ETCompatXMLParser() - doc = etree_.parse(*args, **kwargs) - return doc - -# -# User methods -# -# Calls to the methods in these classes are generated by generateDS.py. 
-# You can replace these methods by re-implementing the following class -# in a module named generatedssuper.py. - -try: - from generatedssuper import GeneratedsSuper -except ImportError, exp: - - class GeneratedsSuper(object): - def gds_format_string(self, input_data, input_name=''): - return input_data - def gds_validate_string(self, input_data, node, input_name=''): - return input_data - def gds_format_integer(self, input_data, input_name=''): - return '%d' % input_data - def gds_validate_integer(self, input_data, node, input_name=''): - return input_data - def gds_format_integer_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_integer_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - try: - fvalue = float(value) - except (TypeError, ValueError), exp: - raise_parse_error(node, 'Requires sequence of integers') - return input_data - def gds_format_float(self, input_data, input_name=''): - return '%f' % input_data - def gds_validate_float(self, input_data, node, input_name=''): - return input_data - def gds_format_float_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_float_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - try: - fvalue = float(value) - except (TypeError, ValueError), exp: - raise_parse_error(node, 'Requires sequence of floats') - return input_data - def gds_format_double(self, input_data, input_name=''): - return '%e' % input_data - def gds_validate_double(self, input_data, node, input_name=''): - return input_data - def gds_format_double_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_double_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - try: - fvalue = float(value) - except (TypeError, ValueError), exp: - raise_parse_error(node, 'Requires sequence of doubles') - return input_data - def gds_format_boolean(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_boolean(self, input_data, node, input_name=''): - return input_data - def gds_format_boolean_list(self, input_data, input_name=''): - return '%s' % input_data - def gds_validate_boolean_list(self, input_data, node, input_name=''): - values = input_data.split() - for value in values: - if value not in ('true', '1', 'false', '0', ): - raise_parse_error(node, 'Requires sequence of booleans ("true", "1", "false", "0")') - return input_data - def gds_str_lower(self, instring): - return instring.lower() - def get_path_(self, node): - path_list = [] - self.get_path_list_(node, path_list) - path_list.reverse() - path = '/'.join(path_list) - return path - Tag_strip_pattern_ = re_.compile(r'\{.*\}') - def get_path_list_(self, node, path_list): - if node is None: - return - tag = GeneratedsSuper.Tag_strip_pattern_.sub('', node.tag) - if tag: - path_list.append(tag) - self.get_path_list_(node.getparent(), path_list) - - -# -# If you have installed IPython you can uncomment and use the following. -# IPython is available from http://ipython.scipy.org/. 
-# - -## from IPython.Shell import IPShellEmbed -## args = '' -## ipshell = IPShellEmbed(args, -## banner = 'Dropping into IPython', -## exit_msg = 'Leaving Interpreter, back to program.') - -# Then use the following line where and when you want to drop into the -# IPython shell: -# ipshell(' -- Entering ipshell.\nHit Ctrl-D to exit') - -# -# Globals -# - -ExternalEncoding = 'utf-8' -Tag_pattern_ = re_.compile(r'({.*})?(.*)') -STRING_CLEANUP_PAT = re_.compile(r"[\n\r\s]+") - -# -# Support/utility functions. -# - -def showIndent(outfile, level): - for idx in range(level): - outfile.write(' ') - -def quote_xml(inStr): - if not inStr: - return '' - s1 = (isinstance(inStr, basestring) and inStr or - '%s' % inStr) - s1 = s1.replace('&', '&') - s1 = s1.replace('<', '<') - s1 = s1.replace('>', '>') - return s1 - -def quote_attrib(inStr): - s1 = (isinstance(inStr, basestring) and inStr or - '%s' % inStr) - s1 = s1.replace('&', '&') - s1 = s1.replace('<', '<') - s1 = s1.replace('>', '>') - if '"' in s1: - if "'" in s1: - s1 = '"%s"' % s1.replace('"', """) - else: - s1 = "'%s'" % s1 - else: - s1 = '"%s"' % s1 - return s1 - -def quote_python(inStr): - s1 = inStr - if s1.find("'") == -1: - if s1.find('\n') == -1: - return "'%s'" % s1 - else: - return "'''%s'''" % s1 - else: - if s1.find('"') != -1: - s1 = s1.replace('"', '\\"') - if s1.find('\n') == -1: - return '"%s"' % s1 - else: - return '"""%s"""' % s1 - -def get_all_text_(node): - if node.text is not None: - text = node.text - else: - text = '' - for child in node: - if child.tail is not None: - text += child.tail - return text - -def find_attr_value_(attr_name, node): - attrs = node.attrib - # First try with no namespace. - value = attrs.get(attr_name) - if value is None: - # Now try the other possible namespaces. - namespaces = node.nsmap.itervalues() - for namespace in namespaces: - value = attrs.get('{%s}%s' % (namespace, attr_name, )) - if value is not None: - break - return value - - -class GDSParseError(Exception): - pass - -def raise_parse_error(node, msg): - if XMLParser_import_library == XMLParser_import_lxml: - msg = '%s (element %s/line %d)' % (msg, node.tag, node.sourceline, ) - else: - msg = '%s (element %s)' % (msg, node.tag, ) - raise GDSParseError(msg) - - -class MixedContainer: - # Constants for category: - CategoryNone = 0 - CategoryText = 1 - CategorySimple = 2 - CategoryComplex = 3 - # Constants for content_type: - TypeNone = 0 - TypeText = 1 - TypeString = 2 - TypeInteger = 3 - TypeFloat = 4 - TypeDecimal = 5 - TypeDouble = 6 - TypeBoolean = 7 - def __init__(self, category, content_type, name, value): - self.category = category - self.content_type = content_type - self.name = name - self.value = value - def getCategory(self): - return self.category - def getContenttype(self, content_type): - return self.content_type - def getValue(self): - return self.value - def getName(self): - return self.name - def export(self, outfile, level, name, namespace): - if self.category == MixedContainer.CategoryText: - # Prevent exporting empty content as empty lines. 
- if self.value.strip(): - outfile.write(self.value) - elif self.category == MixedContainer.CategorySimple: - self.exportSimple(outfile, level, name) - else: # category == MixedContainer.CategoryComplex - self.value.export(outfile, level, namespace,name) - def exportSimple(self, outfile, level, name): - if self.content_type == MixedContainer.TypeString: - outfile.write('<%s>%s' % (self.name, self.value, self.name)) - elif self.content_type == MixedContainer.TypeInteger or \ - self.content_type == MixedContainer.TypeBoolean: - outfile.write('<%s>%d' % (self.name, self.value, self.name)) - elif self.content_type == MixedContainer.TypeFloat or \ - self.content_type == MixedContainer.TypeDecimal: - outfile.write('<%s>%f' % (self.name, self.value, self.name)) - elif self.content_type == MixedContainer.TypeDouble: - outfile.write('<%s>%g' % (self.name, self.value, self.name)) - def exportLiteral(self, outfile, level, name): - if self.category == MixedContainer.CategoryText: - showIndent(outfile, level) - outfile.write('model_.MixedContainer(%d, %d, "%s", "%s"),\n' % \ - (self.category, self.content_type, self.name, self.value)) - elif self.category == MixedContainer.CategorySimple: - showIndent(outfile, level) - outfile.write('model_.MixedContainer(%d, %d, "%s", "%s"),\n' % \ - (self.category, self.content_type, self.name, self.value)) - else: # category == MixedContainer.CategoryComplex - showIndent(outfile, level) - outfile.write('model_.MixedContainer(%d, %d, "%s",\n' % \ - (self.category, self.content_type, self.name,)) - self.value.exportLiteral(outfile, level + 1) - showIndent(outfile, level) - outfile.write(')\n') - - -class MemberSpec_(object): - def __init__(self, name='', data_type='', container=0): - self.name = name - self.data_type = data_type - self.container = container - def set_name(self, name): self.name = name - def get_name(self): return self.name - def set_data_type(self, data_type): self.data_type = data_type - def get_data_type_chain(self): return self.data_type - def get_data_type(self): - if isinstance(self.data_type, list): - if len(self.data_type) > 0: - return self.data_type[-1] - else: - return 'xs:string' - else: - return self.data_type - def set_container(self, container): self.container = container - def get_container(self): return self.container - -def _cast(typ, value): - if typ is None or value is None: - return value - return typ(value) - -# -# Data representation classes. -# - -class orthoXML(GeneratedsSuper): - """The OrthoXML root element. The source program/database of the file - for instance OMA or InParanoid. The version number of the file. 
- The version or release number of the source program/database at - time the file was generated.""" - subclass = None - superclass = None - def __init__(self, origin=None, version=None, originVersion=None, notes=None, species=None, scores=None, groups=None, valueOf_=None): - self.origin = _cast(None, origin) - self.version = _cast(float, version) - self.originVersion = _cast(None, originVersion) - self.notes = notes - if species is None: - self.species = [] - else: - self.species = species - self.scores = scores - self.groups = groups - def factory(*args_, **kwargs_): - if orthoXML.subclass: - return orthoXML.subclass(*args_, **kwargs_) - else: - return orthoXML(*args_, **kwargs_) - factory = staticmethod(factory) - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_species(self): return self.species - def set_species(self, species): self.species = species - def add_species(self, value): self.species.append(value) - def insert_species(self, index, value): self.species[index] = value - def get_scores(self): return self.scores - def set_scores(self, scores): self.scores = scores - def get_groups(self): return self.groups - def set_groups(self, groups): self.groups = groups - def get_origin(self): return self.origin - def set_origin(self, origin): self.origin = origin - def get_version(self): return self.version - def set_version(self, version): self.version = version - def get_originVersion(self): return self.originVersion - def set_originVersion(self, originVersion): self.originVersion = originVersion - def export(self, outfile, level, namespace_='ortho:', name_='orthoXML', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='orthoXML') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='orthoXML'): - if self.origin is not None and 'origin' not in already_processed: - already_processed.append('origin') - outfile.write(' origin=%s' % (self.gds_format_string(quote_attrib(self.origin).encode(ExternalEncoding), input_name='origin'), )) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - outfile.write(' version="%s"' % self.gds_format_float(self.version, input_name='version')) - if self.originVersion is not None and 'originVersion' not in already_processed: - already_processed.append('originVersion') - outfile.write(' originVersion=%s' % (self.gds_format_string(quote_attrib(self.originVersion).encode(ExternalEncoding), input_name='originVersion'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='orthoXML', fromsubclass_=False): - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - for species_ in self.species: - species_.export(outfile, level, namespace_, name_='species') - if self.scores: - self.scores.export(outfile, level, namespace_, name_='scores') - if self.groups: - self.groups.export(outfile, level, namespace_, name_='groups', ) - def hasContent_(self): - if ( - self.notes is not None or - self.species or - self.scores is not None or - self.groups is not None - ): - return True - else: - 
return False - def exportLiteral(self, outfile, level, name_='orthoXML'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.origin is not None and 'origin' not in already_processed: - already_processed.append('origin') - showIndent(outfile, level) - outfile.write('origin = "%s",\n' % (self.origin,)) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - showIndent(outfile, level) - outfile.write('version = %f,\n' % (self.version,)) - if self.originVersion is not None and 'originVersion' not in already_processed: - already_processed.append('originVersion') - showIndent(outfile, level) - outfile.write('originVersion = "%s",\n' % (self.originVersion,)) - def exportLiteralChildren(self, outfile, level, name_): - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - showIndent(outfile, level) - outfile.write('species=[\n') - level += 1 - for species_ in self.species: - showIndent(outfile, level) - outfile.write('model_.species(\n') - species_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - if self.scores is not None: - showIndent(outfile, level) - outfile.write('scores=model_.scores(\n') - self.scores.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - if self.groups is not None: - showIndent(outfile, level) - outfile.write('groups=model_.groups(\n') - self.groups.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('origin', node) - if value is not None and 'origin' not in already_processed: - already_processed.append('origin') - self.origin = value - value = find_attr_value_('version', node) - if value is not None and 'version' not in already_processed: - already_processed.append('version') - try: - self.version = float(value) - except ValueError, exp: - raise ValueError('Bad float/double attribute (version): %s' % exp) - value = find_attr_value_('originVersion', node) - if value is not None and 'originVersion' not in already_processed: - already_processed.append('originVersion') - self.originVersion = value - self.originVersion = ' '.join(self.originVersion.split()) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) - elif nodeName_ == 'species': - obj_ = species.factory() - obj_.build(child_) - self.species.append(obj_) - elif nodeName_ == 'scores': - obj_ = scores.factory() - obj_.build(child_) - self.set_scores(obj_) - elif nodeName_ == 'groups': - obj_ = groups.factory() - obj_.build(child_) - self.set_groups(obj_) -# end class orthoXML - - -class species(GeneratedsSuper): - """The species element contains all sequences of one species. The NCBI - Taxonomy identifier of the species to identify it unambiguously. 
- The name of the species.""" - subclass = None - superclass = None - def __init__(self, name=None, NCBITaxId=None, database=None, notes=None, valueOf_=None): - self.name = _cast(None, name) - self.NCBITaxId = _cast(int, NCBITaxId) - if database is None: - self.database = [] - else: - self.database = database - self.notes = notes - def factory(*args_, **kwargs_): - if species.subclass: - return species.subclass(*args_, **kwargs_) - else: - return species(*args_, **kwargs_) - factory = staticmethod(factory) - def get_database(self): return self.database - def set_database(self, database): self.database = database - def add_database(self, value): self.database.append(value) - def insert_database(self, index, value): self.database[index] = value - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_name(self): return self.name - def set_name(self, name): self.name = name - def get_NCBITaxId(self): return self.NCBITaxId - def set_NCBITaxId(self, NCBITaxId): self.NCBITaxId = NCBITaxId - def export(self, outfile, level, namespace_='ortho:', name_='species', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='species') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='species'): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - outfile.write(' name=%s' % (self.gds_format_string(quote_attrib(self.name).encode(ExternalEncoding), input_name='name'), )) - if self.NCBITaxId is not None and 'NCBITaxId' not in already_processed: - already_processed.append('NCBITaxId') - outfile.write(' NCBITaxId="%s"' % self.gds_format_integer(self.NCBITaxId, input_name='NCBITaxId')) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='species', fromsubclass_=False): - for database_ in self.database: - database_.export(outfile, level, namespace_, name_='database') - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - def hasContent_(self): - if ( - self.database or - self.notes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='species'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - showIndent(outfile, level) - outfile.write('name = "%s",\n' % (self.name,)) - if self.NCBITaxId is not None and 'NCBITaxId' not in already_processed: - already_processed.append('NCBITaxId') - showIndent(outfile, level) - outfile.write('NCBITaxId = %d,\n' % (self.NCBITaxId,)) - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('database=[\n') - level += 1 - for database_ in self.database: - showIndent(outfile, level) - outfile.write('model_.database(\n') - database_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, 
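
The species element documented above carries the two attributes HogProf cares about, name and NCBITaxId, which is exactly what get_species_from_orthoxml in pyhamutils.py reads. A toy, namespace-free orthoXML fragment parsed the same way illustrates that structure; real OMA orthoXML is namespaced (hence the 'species' in child.tag test in that helper) and contains genes and groups as well.

import xml.etree.ElementTree as ET

# Hand-written toy fragment; real orthoXML carries a namespace and many more elements.
toy = """<orthoXML origin="toy" version="0.3">
  <species name="HUMAN" NCBITaxId="9606"><database name="toy" version="1"><genes/></database></species>
  <species name="MOUSE" NCBITaxId="10090"><database name="toy" version="1"><genes/></database></species>
</orthoXML>"""

root = ET.fromstring(toy)
taxid2name = {child.attrib['NCBITaxId']: child.attrib['name']
              for child in root if 'species' in child.tag}
print(taxid2name)   # {'9606': 'HUMAN', '10090': 'MOUSE'}
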
level) - outfile.write('],\n') - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('name', node) - if value is not None and 'name' not in already_processed: - already_processed.append('name') - self.name = value - value = find_attr_value_('NCBITaxId', node) - if value is not None and 'NCBITaxId' not in already_processed: - already_processed.append('NCBITaxId') - try: - self.NCBITaxId = int(value) - except ValueError, exp: - raise_parse_error(node, 'Bad integer attribute: %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'database': - obj_ = database.factory() - obj_.build(child_) - self.database.append(obj_) - elif nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) -# end class species - - -class database(GeneratedsSuper): - """A database element contains all genes from a single database/source. - A Uniform Resource Identifier (URI) pointing to the gene. In the - simplest case one could imagine a URL which in concatenation - with the gene identifier links to the website of the gene in the - source database. However, how this is used depends on the source - of the orthoXML file. Name of the database. A Uniform Resource - Identifier (URI) pointing to the protein. A Uniform Resource - Identifier (URI) pointing to the transcript. Version number of - the database.""" - subclass = None - superclass = None - def __init__(self, transcriptLink=None, protLink=None, geneLink=None, name=None, version=None, genes=None, valueOf_=None): - self.transcriptLink = _cast(None, transcriptLink) - self.protLink = _cast(None, protLink) - self.geneLink = _cast(None, geneLink) - self.name = _cast(None, name) - self.version = _cast(None, version) - self.genes = genes - def factory(*args_, **kwargs_): - if database.subclass: - return database.subclass(*args_, **kwargs_) - else: - return database(*args_, **kwargs_) - factory = staticmethod(factory) - def get_genes(self): return self.genes - def set_genes(self, genes): self.genes = genes - def get_transcriptLink(self): return self.transcriptLink - def set_transcriptLink(self, transcriptLink): self.transcriptLink = transcriptLink - def get_protLink(self): return self.protLink - def set_protLink(self, protLink): self.protLink = protLink - def get_geneLink(self): return self.geneLink - def set_geneLink(self, geneLink): self.geneLink = geneLink - def get_name(self): return self.name - def set_name(self, name): self.name = name - def get_version(self): return self.version - def set_version(self, version): self.version = version - def export(self, outfile, level, namespace_='ortho:', name_='database', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='database') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - 
def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='database'): - if self.transcriptLink is not None and 'transcriptLink' not in already_processed: - already_processed.append('transcriptLink') - outfile.write(' transcriptLink=%s' % (self.gds_format_string(quote_attrib(self.transcriptLink).encode(ExternalEncoding), input_name='transcriptLink'), )) - if self.protLink is not None and 'protLink' not in already_processed: - already_processed.append('protLink') - outfile.write(' protLink=%s' % (self.gds_format_string(quote_attrib(self.protLink).encode(ExternalEncoding), input_name='protLink'), )) - if self.geneLink is not None and 'geneLink' not in already_processed: - already_processed.append('geneLink') - outfile.write(' geneLink=%s' % (self.gds_format_string(quote_attrib(self.geneLink).encode(ExternalEncoding), input_name='geneLink'), )) - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - outfile.write(' name=%s' % (self.gds_format_string(quote_attrib(self.name).encode(ExternalEncoding), input_name='name'), )) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - outfile.write(' version=%s' % (self.gds_format_string(quote_attrib(self.version).encode(ExternalEncoding), input_name='version'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='database', fromsubclass_=False): - if self.genes: - self.genes.export(outfile, level, namespace_, name_='genes', ) - def hasContent_(self): - if ( - self.genes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='database'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.transcriptLink is not None and 'transcriptLink' not in already_processed: - already_processed.append('transcriptLink') - showIndent(outfile, level) - outfile.write('transcriptLink = "%s",\n' % (self.transcriptLink,)) - if self.protLink is not None and 'protLink' not in already_processed: - already_processed.append('protLink') - showIndent(outfile, level) - outfile.write('protLink = "%s",\n' % (self.protLink,)) - if self.geneLink is not None and 'geneLink' not in already_processed: - already_processed.append('geneLink') - showIndent(outfile, level) - outfile.write('geneLink = "%s",\n' % (self.geneLink,)) - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - showIndent(outfile, level) - outfile.write('name = "%s",\n' % (self.name,)) - if self.version is not None and 'version' not in already_processed: - already_processed.append('version') - showIndent(outfile, level) - outfile.write('version = "%s",\n' % (self.version,)) - def exportLiteralChildren(self, outfile, level, name_): - if self.genes is not None: - showIndent(outfile, level) - outfile.write('genes=model_.genes(\n') - self.genes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('transcriptLink', node) - if value is not None and 'transcriptLink' not in already_processed: - 
already_processed.append('transcriptLink') - self.transcriptLink = value - value = find_attr_value_('protLink', node) - if value is not None and 'protLink' not in already_processed: - already_processed.append('protLink') - self.protLink = value - value = find_attr_value_('geneLink', node) - if value is not None and 'geneLink' not in already_processed: - already_processed.append('geneLink') - self.geneLink = value - value = find_attr_value_('name', node) - if value is not None and 'name' not in already_processed: - already_processed.append('name') - self.name = value - value = find_attr_value_('version', node) - if value is not None and 'version' not in already_processed: - already_processed.append('version') - self.version = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'genes': - obj_ = genes.factory() - obj_.build(child_) - self.set_genes(obj_) -# end class database - - -class genes(GeneratedsSuper): - """A gene element represents a list of genes.""" - subclass = None - superclass = None - def __init__(self, gene=None, valueOf_=None): - if gene is None: - self.gene = [] - else: - self.gene = gene - def factory(*args_, **kwargs_): - if genes.subclass: - return genes.subclass(*args_, **kwargs_) - else: - return genes(*args_, **kwargs_) - factory = staticmethod(factory) - def get_gene(self): return self.gene - def set_gene(self, gene): self.gene = gene - def add_gene(self, value): self.gene.append(value) - def insert_gene(self, index, value): self.gene[index] = value - def export(self, outfile, level, namespace_='ortho:', name_='genes', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='genes') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='genes'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='genes', fromsubclass_=False): - for gene_ in self.gene: - gene_.export(outfile, level, namespace_, name_='gene') - def hasContent_(self): - if ( - self.gene - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='genes'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('gene=[\n') - level += 1 - for gene_ in self.gene: - showIndent(outfile, level) - outfile.write('model_.gene(\n') - gene_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'gene': - obj_ = gene.factory() - obj_.build(child_) - 
self.gene.append(obj_) -# end class genes - - -class gene(GeneratedsSuper): - """The gene element represents a single gene, protein or transcript. It - is in fact a set of identifiers: one internal identifier that is - used to link from geneRef elements in ortholog clusters and gene - identifiers, transcript identifiers and protein identifiers to - identify the molecule. The proper term for this element would - therefore rather be molecule. However, as the general purpose of - orthoXML is to represent orthology data for genes the term gene - is used instead. Gene, protein and transcipt identifiers are - optional but at least one of the three should be given. The - source database of the gene is defined through the database - element in which the gene element lies and the identifiers - should stem from this source. Identifier of the gene in the - source database. Multiple splice forms are possible by having - the same geneId more than once. Internal identifier to link to - the gene via the geneRef elements. Identifier of the protein in - the source database. Identifier of the transcript in the source - database.""" - subclass = None - superclass = None - def __init__(self, protId=None, id=None, geneId=None, transcriptId=None, valueOf_=None): - self.protId = _cast(None, protId) - self.id = _cast(int, id) - self.geneId = _cast(None, geneId) - self.transcriptId = _cast(None, transcriptId) - pass - def factory(*args_, **kwargs_): - if gene.subclass: - return gene.subclass(*args_, **kwargs_) - else: - return gene(*args_, **kwargs_) - factory = staticmethod(factory) - def get_protId(self): return self.protId - def set_protId(self, protId): self.protId = protId - def get_id(self): return self.id - def set_id(self, id): self.id = id - def get_geneId(self): return self.geneId - def set_geneId(self, geneId): self.geneId = geneId - def get_transcriptId(self): return self.transcriptId - def set_transcriptId(self, transcriptId): self.transcriptId = transcriptId - def export(self, outfile, level, namespace_='ortho:', name_='gene', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='gene') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='gene'): - if self.protId is not None and 'protId' not in already_processed: - already_processed.append('protId') - outfile.write(' protId=%s' % (self.gds_format_string(quote_attrib(self.protId).encode(ExternalEncoding), input_name='protId'), )) - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id="%s"' % self.gds_format_integer(self.id, input_name='id')) - if self.geneId is not None and 'geneId' not in already_processed: - already_processed.append('geneId') - outfile.write(' geneId=%s' % (self.gds_format_string(quote_attrib(self.geneId).encode(ExternalEncoding), input_name='geneId'), )) - if self.transcriptId is not None and 'transcriptId' not in already_processed: - already_processed.append('transcriptId') - outfile.write(' transcriptId=%s' % (self.gds_format_string(quote_attrib(self.transcriptId).encode(ExternalEncoding), input_name='transcriptId'), )) - def exportChildren(self, outfile, level, 
namespace_='ortho:', name_='gene', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='gene'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.protId is not None and 'protId' not in already_processed: - already_processed.append('protId') - showIndent(outfile, level) - outfile.write('protId = "%s",\n' % (self.protId,)) - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = %d,\n' % (self.id,)) - if self.geneId is not None and 'geneId' not in already_processed: - already_processed.append('geneId') - showIndent(outfile, level) - outfile.write('geneId = "%s",\n' % (self.geneId,)) - if self.transcriptId is not None and 'transcriptId' not in already_processed: - already_processed.append('transcriptId') - showIndent(outfile, level) - outfile.write('transcriptId = "%s",\n' % (self.transcriptId,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('protId', node) - if value is not None and 'protId' not in already_processed: - already_processed.append('protId') - self.protId = value - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - try: - self.id = int(value) - except ValueError, exp: - raise_parse_error(node, 'Bad integer attribute: %s' % exp) - value = find_attr_value_('geneId', node) - if value is not None and 'geneId' not in already_processed: - already_processed.append('geneId') - self.geneId = value - value = find_attr_value_('transcriptId', node) - if value is not None and 'transcriptId' not in already_processed: - already_processed.append('transcriptId') - self.transcriptId = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class gene - - -class scores(GeneratedsSuper): - """A list of score definitions.""" - subclass = None - superclass = None - def __init__(self, scoreDef=None, valueOf_=None): - if scoreDef is None: - self.scoreDef = [] - else: - self.scoreDef = scoreDef - def factory(*args_, **kwargs_): - if scores.subclass: - return scores.subclass(*args_, **kwargs_) - else: - return scores(*args_, **kwargs_) - factory = staticmethod(factory) - def get_scoreDef(self): return self.scoreDef - def set_scoreDef(self, scoreDef): self.scoreDef = scoreDef - def add_scoreDef(self, value): self.scoreDef.append(value) - def insert_scoreDef(self, index, value): self.scoreDef[index] = value - def export(self, outfile, level, namespace_='ortho:', name_='scores', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='scores') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - 
outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='scores'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='scores', fromsubclass_=False): - for scoreDef_ in self.scoreDef: - scoreDef_.export(outfile, level, namespace_, name_='scoreDef') - def hasContent_(self): - if ( - self.scoreDef - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='scores'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('scoreDef=[\n') - level += 1 - for scoreDef_ in self.scoreDef: - showIndent(outfile, level) - outfile.write('model_.scoreDef(\n') - scoreDef_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'scoreDef': - obj_ = scoreDef.factory() - obj_.build(child_) - self.scoreDef.append(obj_) -# end class scores - - -class groups(GeneratedsSuper): - """Represents the list of ortholog groups. Note that the purpose of - OrthoXML is to store orthology assignment hence on the top level - only ortholog groups are allowed.""" - subclass = None - superclass = None - def __init__(self, orthologGroup=None, valueOf_=None): - if orthologGroup is None: - self.orthologGroup = [] - else: - self.orthologGroup = orthologGroup - def factory(*args_, **kwargs_): - if groups.subclass: - return groups.subclass(*args_, **kwargs_) - else: - return groups(*args_, **kwargs_) - factory = staticmethod(factory) - def get_orthologGroup(self): return self.orthologGroup - def set_orthologGroup(self, orthologGroup): self.orthologGroup = orthologGroup - def add_orthologGroup(self, value): self.orthologGroup.append(value) - def insert_orthologGroup(self, index, value): self.orthologGroup[index] = value - def export(self, outfile, level, namespace_='ortho:', name_='groups', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='groups') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='groups'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='groups', fromsubclass_=False): - for orthologGroup_ in self.orthologGroup: - orthologGroup_.export(outfile, level, namespace_, name_='orthologGroup') - def hasContent_(self): - if ( - self.orthologGroup - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='groups'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if 
self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('orthologGroup=[\n') - level += 1 - for orthologGroup_ in self.orthologGroup: - showIndent(outfile, level) - outfile.write('model_.group(\n') - orthologGroup_.exportLiteral(outfile, level, name_='group') - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'orthologGroup': - obj_ = group.factory() - obj_.build(child_) - self.orthologGroup.append(obj_) -# end class groups - - -class group(GeneratedsSuper): - """A group of genes or nested groups. In case of a orothologGroup - element, all genes in the group or in the nested groups are - orthologs to each other i.e. stem from the same gene in the last - common ancester of the species. In case of a paralogGroup the - genes are paralogs to each other. Subgroups within the group - allow the represention of phylogenetic trees. For more details - and examples see http://orthoxml.org/orthoxml_doc.html. A group - can may contain two or more of the three alternatives geneRef, - paralogGroup, and orthologGroup. By combining these, complex - phylogenies are possible. Identifier for the group in context of - the resource. This attribute is not required but if your - resource provides identifiers for the ortholog groups we - strongly recommend to use it at least for the top level groups.""" - subclass = None - superclass = None - def __init__(self, id=None, score=None, property=None, geneRef=None, paralogGroup=None, orthologGroup=None, notes=None, valueOf_=None): - self.id = _cast(None, id) - if score is None: - self.score = [] - else: - self.score = score - if property is None: - self.property = [] - else: - self.property = property - if geneRef is None: - self.geneRef = [] - else: - self.geneRef = geneRef - if paralogGroup is None: - self.paralogGroup = [] - else: - self.paralogGroup = paralogGroup - if orthologGroup is None: - self.orthologGroup = [] - else: - self.orthologGroup = orthologGroup - self.notes = notes - def factory(*args_, **kwargs_): - if group.subclass: - return group.subclass(*args_, **kwargs_) - else: - return group(*args_, **kwargs_) - factory = staticmethod(factory) - def get_score(self): return self.score - def set_score(self, score): self.score = score - def add_score(self, value): self.score.append(value) - def insert_score(self, index, value): self.score[index] = value - def get_property(self): return self.property - def set_property(self, property): self.property = property - def add_property(self, value): self.property.append(value) - def insert_property(self, index, value): self.property[index] = value - def get_geneRef(self): return self.geneRef - def set_geneRef(self, geneRef): self.geneRef = geneRef - def add_geneRef(self, value): self.geneRef.append(value) - def insert_geneRef(self, index, value): self.geneRef[index] = value - def get_paralogGroup(self): return self.paralogGroup - def set_paralogGroup(self, paralogGroup): self.paralogGroup = paralogGroup 
- def add_paralogGroup(self, value): self.paralogGroup.append(value) - def insert_paralogGroup(self, index, value): self.paralogGroup[index] = value - def get_orthologGroup(self): return self.orthologGroup - def set_orthologGroup(self, orthologGroup): self.orthologGroup = orthologGroup - def add_orthologGroup(self, value): self.orthologGroup.append(value) - def insert_orthologGroup(self, index, value): self.orthologGroup[index] = value - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_id(self): return self.id - def set_id(self, id): self.id = id - def export(self, outfile, level, namespace_='ortho:', name_='group', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='group') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='group'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id=%s' % (self.gds_format_string(quote_attrib(self.id).encode(ExternalEncoding), input_name='id'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='group', fromsubclass_=False): - for score_ in self.score: - score_.export(outfile, level, namespace_, name_='score') - for property_ in self.property: - property_.export(outfile, level, namespace_, name_='property') - for geneRef_ in self.geneRef: - geneRef_.export(outfile, level, namespace_, name_='geneRef') - for paralogGroup_ in self.paralogGroup: - paralogGroup_.export(outfile, level, namespace_, name_='paralogGroup') - for orthologGroup_ in self.orthologGroup: - orthologGroup_.export(outfile, level, namespace_, name_='orthologGroup') - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - def hasContent_(self): - if ( - self.score or - self.property or - self.geneRef or - self.paralogGroup or - self.orthologGroup or - self.notes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='group'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = "%s",\n' % (self.id,)) - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('score=[\n') - level += 1 - for score_ in self.score: - showIndent(outfile, level) - outfile.write('model_.score(\n') - score_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - outfile.write('property=[\n') - level += 1 - for property_ in self.property: - showIndent(outfile, level) - outfile.write('model_.property(\n') - property_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - 
outfile.write('geneRef=[\n') - level += 1 - for geneRef_ in self.geneRef: - showIndent(outfile, level) - outfile.write('model_.geneRef(\n') - geneRef_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - outfile.write('paralogGroup=[\n') - level += 1 - for paralogGroup_ in self.paralogGroup: - showIndent(outfile, level) - outfile.write('model_.group(\n') - paralogGroup_.exportLiteral(outfile, level, name_='group') - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - showIndent(outfile, level) - outfile.write('orthologGroup=[\n') - level += 1 - for orthologGroup_ in self.orthologGroup: - showIndent(outfile, level) - outfile.write('model_.group(\n') - orthologGroup_.exportLiteral(outfile, level, name_='group') - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - self.id = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'score': - obj_ = score.factory() - obj_.build(child_) - self.score.append(obj_) - elif nodeName_ == 'property': - obj_ = property.factory() - obj_.build(child_) - self.property.append(obj_) - elif nodeName_ == 'geneRef': - obj_ = geneRef.factory() - obj_.build(child_) - self.geneRef.append(obj_) - elif nodeName_ == 'paralogGroup': - obj_ = group.factory() - obj_.build(child_) - self.paralogGroup.append(obj_) - elif nodeName_ == 'orthologGroup': - obj_ = group.factory() - obj_.build(child_) - self.orthologGroup.append(obj_) - elif nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) -# end class group - - -class geneRef(GeneratedsSuper): - """The geneRef element is a link to the gene definition under the - species element. It defines the members of an ortholog or - paralog group. The same gene can be referenced muliple times. - The geneRef element can have multiple score elements and a notes - elements as children. The notes element can for instance be used - for special, ortholog-database-specific information (with - InParanoid, for example, we could use it to mark the seed - orthologs). 
Internal identifier for a gene element defined under - the species element.""" - subclass = None - superclass = None - def __init__(self, id=None, score=None, notes=None, valueOf_=None): - self.id = _cast(int, id) - if score is None: - self.score = [] - else: - self.score = score - self.notes = notes - def factory(*args_, **kwargs_): - if geneRef.subclass: - return geneRef.subclass(*args_, **kwargs_) - else: - return geneRef(*args_, **kwargs_) - factory = staticmethod(factory) - def get_score(self): return self.score - def set_score(self, score): self.score = score - def add_score(self, value): self.score.append(value) - def insert_score(self, index, value): self.score[index] = value - def get_notes(self): return self.notes - def set_notes(self, notes): self.notes = notes - def get_id(self): return self.id - def set_id(self, id): self.id = id - def export(self, outfile, level, namespace_='ortho:', name_='geneRef', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='geneRef') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - showIndent(outfile, level) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='geneRef'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id="%s"' % self.gds_format_integer(self.id, input_name='id')) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='geneRef', fromsubclass_=False): - for score_ in self.score: - score_.export(outfile, level, namespace_, name_='score') - if self.notes: - self.notes.export(outfile, level, namespace_, name_='notes') - def hasContent_(self): - if ( - self.score or - self.notes is not None - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='geneRef'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = %d,\n' % (self.id,)) - def exportLiteralChildren(self, outfile, level, name_): - showIndent(outfile, level) - outfile.write('score=[\n') - level += 1 - for score_ in self.score: - showIndent(outfile, level) - outfile.write('model_.score(\n') - score_.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - level -= 1 - showIndent(outfile, level) - outfile.write('],\n') - if self.notes is not None: - showIndent(outfile, level) - outfile.write('notes=model_.notes(\n') - self.notes.exportLiteral(outfile, level) - showIndent(outfile, level) - outfile.write('),\n') - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - try: - self.id = int(value) - except ValueError, exp: - raise_parse_error(node, 
'Bad integer attribute: %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if nodeName_ == 'score': - obj_ = score.factory() - obj_.build(child_) - self.score.append(obj_) - elif nodeName_ == 'notes': - obj_ = notes.factory() - obj_.build(child_) - self.set_notes(obj_) -# end class geneRef - - -class scoreDef(GeneratedsSuper): - """The scoreDef element defines a score. One of the concepts of - orthoXML is to be as flexible as possible but still uniformly - parsable. Part of this is to allow every ortholog resource to - give their own types of scores for groups or group members, - which is done using score elements. Score elements can be - defined to apply to either groups or geneRefs. It is possible to - define multiple scores. An internal identifier to link to the - scoreDef from a score element. Description of the score.""" - subclass = None - superclass = None - def __init__(self, id=None, desc=None, valueOf_=None): - self.id = _cast(None, id) - self.desc = _cast(None, desc) - pass - def factory(*args_, **kwargs_): - if scoreDef.subclass: - return scoreDef.subclass(*args_, **kwargs_) - else: - return scoreDef(*args_, **kwargs_) - factory = staticmethod(factory) - def get_id(self): return self.id - def set_id(self, id): self.id = id - def get_desc(self): return self.desc - def set_desc(self, desc): self.desc = desc - def export(self, outfile, level, namespace_='ortho:', name_='scoreDef', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='scoreDef') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='scoreDef'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id=%s' % (quote_attrib(self.id), )) - if self.desc is not None and 'desc' not in already_processed: - already_processed.append('desc') - outfile.write(' desc=%s' % (self.gds_format_string(quote_attrib(self.desc).encode(ExternalEncoding), input_name='desc'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='scoreDef', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='scoreDef'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = "%s",\n' % (self.id,)) - if self.desc is not None and 'desc' not in already_processed: - already_processed.append('desc') - showIndent(outfile, level) - outfile.write('desc = "%s",\n' % (self.desc,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if 
value is not None and 'id' not in already_processed: - already_processed.append('id') - self.id = value - value = find_attr_value_('desc', node) - if value is not None and 'desc' not in already_processed: - already_processed.append('desc') - self.desc = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class scoreDef - - -class score(GeneratedsSuper): - """The score element gives the value of a score and links it to the - scoreDef element, which defines the score. It can be child of a - group or a geneRef element to allow scoring on different levels. - An identifier linking to the scoreDef element, which defines the - score. The actual value of the score. For instance a confidence - score of a group member.""" - subclass = None - superclass = None - def __init__(self, id=None, value=None, valueOf_=None): - self.id = _cast(None, id) - self.value = _cast(float, value) - pass - def factory(*args_, **kwargs_): - if score.subclass: - return score.subclass(*args_, **kwargs_) - else: - return score(*args_, **kwargs_) - factory = staticmethod(factory) - def get_id(self): return self.id - def set_id(self, id): self.id = id - def get_value(self): return self.value - def set_value(self, value): self.value = value - def export(self, outfile, level, namespace_='ortho:', name_='score', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='score') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='score'): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - outfile.write(' id=%s' % (quote_attrib(self.id), )) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - outfile.write(' value="%s"' % self.gds_format_float(self.value, input_name='value')) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='score', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='score'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.id is not None and 'id' not in already_processed: - already_processed.append('id') - showIndent(outfile, level) - outfile.write('id = "%s",\n' % (self.id,)) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - showIndent(outfile, level) - outfile.write('value = %f,\n' % (self.value,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('id', node) - if value is not None and 'id' not in already_processed: - already_processed.append('id') - self.id = value - value = find_attr_value_('value', node) - if value is 
not None and 'value' not in already_processed: - already_processed.append('value') - try: - self.value = float(value) - except ValueError, exp: - raise ValueError('Bad float/double attribute (value): %s' % exp) - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class score - - -class property(GeneratedsSuper): - """Key-value pair for group annotations, for instance statistics about - the group members. The key of the key-value annotation pair. The - value of the key-value annotation pair. Optional to allow flag - like annotations.""" - subclass = None - superclass = None - def __init__(self, name=None, value=None, valueOf_=None): - self.name = _cast(None, name) - self.value = _cast(None, value) - pass - def factory(*args_, **kwargs_): - if property.subclass: - return property.subclass(*args_, **kwargs_) - else: - return property(*args_, **kwargs_) - factory = staticmethod(factory) - def get_name(self): return self.name - def set_name(self, name): self.name = name - def get_value(self): return self.value - def set_value(self, value): self.value = value - def export(self, outfile, level, namespace_='ortho:', name_='property', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='property') - if self.hasContent_(): - outfile.write('>\n') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - else: - outfile.write('/>\n') - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='property'): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - outfile.write(' name=%s' % (self.gds_format_string(quote_attrib(self.name).encode(ExternalEncoding), input_name='name'), )) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - outfile.write(' value=%s' % (self.gds_format_string(quote_attrib(self.value).encode(ExternalEncoding), input_name='value'), )) - def exportChildren(self, outfile, level, namespace_='ortho:', name_='property', fromsubclass_=False): - pass - def hasContent_(self): - if ( - - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='property'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - if self.name is not None and 'name' not in already_processed: - already_processed.append('name') - showIndent(outfile, level) - outfile.write('name = "%s",\n' % (self.name,)) - if self.value is not None and 'value' not in already_processed: - already_processed.append('value') - showIndent(outfile, level) - outfile.write('value = "%s",\n' % (self.value,)) - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - value = find_attr_value_('name', node) - if value is not None and 'name' not in already_processed: - already_processed.append('name') - self.name = value - value = find_attr_value_('value', node) - if value is not 
None and 'value' not in already_processed: - already_processed.append('value') - self.value = value - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - pass -# end class property - - -class notes(GeneratedsSuper): - """The notes element is a special element, which allows adding - information that is not general enough to be part of the - standard. I.e. something specific to a particular ortholog - database or algorithm. Notes elements will not be validated, so - any child elements are legal. Notes elements can be children of - the root element orthoXML, the species element, the - orthologGroup element, the paralogGroup element, or the geneRef - element.""" - subclass = None - superclass = None - def __init__(self, valueOf_=None, mixedclass_=None, content_=None): - self.valueOf_ = valueOf_ - if mixedclass_ is None: - self.mixedclass_ = MixedContainer - else: - self.mixedclass_ = mixedclass_ - if content_ is None: - self.content_ = [] - else: - self.content_ = content_ - self.valueOf_ = valueOf_ - def factory(*args_, **kwargs_): - if notes.subclass: - return notes.subclass(*args_, **kwargs_) - else: - return notes(*args_, **kwargs_) - factory = staticmethod(factory) - def get_valueOf_(self): return self.valueOf_ - def set_valueOf_(self, valueOf_): self.valueOf_ = valueOf_ - def export(self, outfile, level, namespace_='ortho:', name_='notes', namespacedef_=''): - showIndent(outfile, level) - outfile.write('<%s%s%s' % (namespace_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) - already_processed = [] - self.exportAttributes(outfile, level, already_processed, namespace_, name_='notes') - outfile.write('>') - self.exportChildren(outfile, level + 1, namespace_, name_) - outfile.write('\n' % (namespace_, name_)) - def exportAttributes(self, outfile, level, already_processed, namespace_='ortho:', name_='notes'): - pass - def exportChildren(self, outfile, level, namespace_='ortho:', name_='notes', fromsubclass_=False): - pass - def hasContent_(self): - if ( - self.valueOf_ - ): - return True - else: - return False - def exportLiteral(self, outfile, level, name_='notes'): - level += 1 - self.exportLiteralAttributes(outfile, level, [], name_) - if self.hasContent_(): - self.exportLiteralChildren(outfile, level, name_) - showIndent(outfile, level) - outfile.write('valueOf_ = """%s""",\n' % (self.valueOf_,)) - def exportLiteralAttributes(self, outfile, level, already_processed, name_): - pass - def exportLiteralChildren(self, outfile, level, name_): - pass - def build(self, node): - self.buildAttributes(node, node.attrib, []) - self.valueOf_ = get_all_text_(node) - if node.text is not None: - obj_ = self.mixedclass_(MixedContainer.CategoryText, - MixedContainer.TypeNone, '', node.text) - self.content_.append(obj_) - for child in node: - nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] - self.buildChildren(child, node, nodeName_) - def buildAttributes(self, node, attrs, already_processed): - pass - def buildChildren(self, child_, node, nodeName_, fromsubclass_=False): - if not fromsubclass_ and child_.tail is not None: - obj_ = self.mixedclass_(MixedContainer.CategoryText, - MixedContainer.TypeNone, '', child_.tail) - self.content_.append(obj_) - pass -# end class notes - - -USAGE_TEXT = """ -Usage: python .py [ -s ] -""" - -def usage(): - print USAGE_TEXT - sys.exit(1) - - -def get_root_tag(node): - tag = Tag_pattern_.match(node.tag).groups()[-1] - rootClass = globals().get(tag) - return tag, rootClass - - -def parse(inFileName): - doc = parsexml_(inFileName) - 
rootNode = doc.getroot() - rootTag, rootClass = get_root_tag(rootNode) - if rootClass is None: - rootTag = 'orthoXML' - rootClass = orthoXML - rootObj = rootClass.factory() - rootObj.build(rootNode) - # Enable Python to collect the space used by the DOM. - doc = None -## sys.stdout.write('\n') -## rootObj.export(sys.stdout, 0, name_=rootTag, -## namespacedef_='xmlns:ortho="http://orthoXML.org/2011/"') - return rootObj - - -def parseString(inString): - from StringIO import StringIO - doc = parsexml_(StringIO(inString)) - rootNode = doc.getroot() - rootTag, rootClass = get_root_tag(rootNode) - if rootClass is None: - rootTag = 'orthoXML' - rootClass = orthoXML - rootObj = rootClass.factory() - rootObj.build(rootNode) - # Enable Python to collect the space used by the DOM. - doc = None -## sys.stdout.write('\n') -## rootObj.export(sys.stdout, 0, name_="orthoXML", -## namespacedef_='xmlns:ortho="http://orthoXML.org/2011/"') - return rootObj - - -def parseLiteral(inFileName): - doc = parsexml_(inFileName) - rootNode = doc.getroot() - rootTag, rootClass = get_root_tag(rootNode) - if rootClass is None: - rootTag = 'orthoXML' - rootClass = orthoXML - rootObj = rootClass.factory() - rootObj.build(rootNode) - # Enable Python to collect the space used by the DOM. - doc = None -## sys.stdout.write('#from orthoxml import *\n\n') -## sys.stdout.write('import orthoxml as model_\n\n') -## sys.stdout.write('rootObj = model_.rootTag(\n') -## rootObj.exportLiteral(sys.stdout, 0, name_=rootTag) -## sys.stdout.write(')\n') - return rootObj - - -def main(): - args = sys.argv[1:] - if len(args) == 1: - parse(args[0]) - else: - usage() - - -if __name__ == '__main__': - #import pdb; pdb.set_trace() - main() - - -__all__ = [ - "database", - "gene", - "geneRef", - "genes", - "group", - "groups", - "notes", - "orthoXML", - "property", - "score", - "scoreDef", - "scores", - "species" - ] diff --git a/src/HogProf/profiler.py b/src/HogProf/profiler.py index 8b396f7..ce1065f 100755 --- a/src/HogProf/profiler.py +++ b/src/HogProf/profiler.py @@ -1,426 +1,429 @@ -from pyoma.browser import db +import functools +import gc +import logging +import multiprocessing as mp import pickle -import pandas as pd -import h5py import random -from tables import * -import numpy as np -import random -import ete3 -#from validation import validation_semantic_similarity -from HogProf.utils import hashutils , pyhamutils , files_utils +import time from time import time -import multiprocessing as mp -import functools + +import ete3 +import h5py import numpy as np -import time -import gc -import logging +import pandas as pd from pyoma.browser import db +from tables import * + +from HogProf.utils import hashutils, pyhamutils, files_utils + np.random.seed(0) random.seed(0) -class Profiler: - """ - A profiler object allows the user to query the LSH with HOGs and get a list of result HOGs back - - """ - def __init__(self,lshforestpath = None, hashes_h5=None, mat_path= None, oma = False , nsamples = 256 , mastertree = None ): - """ - The Profiler class initializes a profiler object for querying the LSH with HOGs and returning a list of result HOGs. - - Attributes: - lshobj (object): LSH object for querying. - hashes_h5 (h5py.File): H5 file containing HOGs. - nsamples (int): Number of samples to use. - tree (ete3.Tree): Master tree used for generating taxa index. - tree_string (str): String representation of the master tree. - taxaIndex (dict): Dictionary mapping taxa names to their indices in the master tree. 
- ReverseTaxaIndex (dict): Dictionary mapping indices in the master tree to their corresponding taxa names. - db_obj (db.Database): OMA database object. - treeweights (dict): Dictionary containing the tree weight for each taxon. - READ_ORTHO (callable): Function for reading orthoxml files from OMA. - HAM_PIPELINE (callable): Function for generating the Annotated tree from a row. - HASH_PIPELINE (callable): Function for generating the hash from a row. - - Parameters: - lshforestpath (str, optional): Path to the pickled LSH forest object. - hashes_h5 (str, optional): Path to the H5 file containing HOGs. - mat_path (str, optional): Path to the matrix file containing HOGs. - oma (str, optional): Path to the OMA database. - tar (str, optional): Path to the tar archive. - nsamples (int, optional): Number of samples to use. Defaults to 256. - mastertree (str, optional): Path to the master tree file. - """ - - print('loading lsh') - with open(lshforestpath, 'rb') as lshpickle: - self.lshobj = pickle.loads(lshpickle.read()) - print('indexing lsh') - self.lshobj.index() - - self.hashes_h5 = h5py.File(hashes_h5, mode='r') - print('h5' , self.hashes_h5 , self.hashes_h5.keys()) - self.nsamples = nsamples - if mastertree.split('.')[-1] == 'pkl': - with open( mastertree , 'rb') as pklin: - self.tree = pickle.loads(pklin.read()) - self.tree_string = self.tree.write(format=1) - elif mastertree.split('.')[-1] == 'nwk': - self.tree = ete3.Tree(mastertree,format=1) - self.tree_string = self.tree.write(format=1) - - else: - raise Exception( 'please provide a pickled ete3 tree or a newick file' ) - self.taxaIndex, self.ReverseTaxaIndex = files_utils.generate_taxa_index(self.tree) - - if oma: - h5_oma = open_file(oma, mode="r") - self.db_obj = db.Database(h5_oma) - self.treeweights = hashutils.generate_treeweights(self.tree , self.taxaIndex , None, None ) - self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma , db_obj=self.db_obj) - self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string ) - self.HASH_PIPELINE = functools.partial(hashutils.row2hash , taxaIndex=self.taxaIndex , treeweights=self.treeweights , wmg=None ) - - print('DONE') - - def hogid2fam(self, hog_entry): - if type(hog_entry )== int: - return hog_entry - else: - hog_entry = self.db_obj.entry_by_entry_nr(self.db_obj.id_resolver.resolve(hog_entry)) - famnr = int(self.db_obj.hog_family( entry=hog_entry ) ) - return famnr - - def return_profile_OTF(self, fam): - """ - Returns profiles as binary vectors for use with optimisation pipelines - """ - if type(fam) is str: - fam = self.hogid2fam(fam) - ortho_fam = self.READ_ORTHO(fam) - if ortho_fam: - tp = self.HAM_PIPELINE([fam, ortho_fam]) - - losses = [ self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex ] - dupl = [ self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex ] - presence = [ self.taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in self.taxaIndex ] - - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - hog_matrix_raw = np.zeros((1, 3*len(self.taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*len(self.taxaIndex) - hog_matrix_raw[:,hogindex] = 1 - return {fam:{ 'mat':hog_matrix_raw, 'tree':tp} } - else: - return{ fam: { 'mat':None , 'tree':None }} - - - def return_profile_complements(self, fam): - """ - Returns profiles for 
each loss to search for complementary hogs - """ - if type(fam) is str: - fam = self.hogid2fam(fam) - ortho_fam = self.READ_ORTHO(fam) - tp = self.HAM_PIPELINE([fam, ortho_fam]) - - losses = set([ n.name for n in tp.traverse() if n.lost and n.name in self.taxaIndex ]) - #these are the roots of the fams we are looking for - #we just assume no duplications or losses from this point - - ancestral_nodes = ([ n for n in profiler.tree.traverse() if n.name in losses]) - losses=[] - dupl=[] - complements={ n.name+'_loss' : [] } - - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - - hog_matrix_raw = np.zeros((1, 3*len(self.taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*len(self.taxaIndex) - hog_matrix_raw[:,hogindex] = 1 - - return {fam:{ 'mat':hog_matrix_raw, 'hash':tp} } - - def worker( self,i, inq, retq ): - """ - this worker function is for parallelization of generation of binary vector for use with optimisation pipelines - - """ - print('worker start'+str(i)) - while True: - input = inq.get() - if input is None: - break - else: - fam,ortho_fam = input - tp = self.HAM_PIPELINE([fam, ortho_fam]) - losses = [ self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex ] - dupl = [ self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex ] - presence = [ self.taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in self.taxaIndex ] - indices = dict(zip (['presence', 'loss', 'dup'],[presence,losses,dupl] ) ) - hog_matrix_raw = np.zeros((1, 3*len(self.taxaIndex))) - for i,event in enumerate(indices): - if len(indices[event])>0: - taxindex = np.asarray(indices[event]) - hogindex = np.asarray(indices[event])+i*len(self.taxaIndex) - hog_matrix_raw[:,hogindex] = 1 - retq.put({fam:{ 'mat':hog_matrix_raw, 'tree':tp} }) - - - def retmat_mp(self, traindf , nworkers = 25, chunksize=50 ): - """ - function used to create training matrix with pairs of hogs. 
calculate_x will return the intersetcion of - two binary vectors generated by pyham - """ - #fams = [ hashutils.hogid2fam(fam) for fam in fams ] - def calculate_x(row): - mat_x1 = row.mat_x - mat_x2 = row.mat_y - ret1 = np.zeros(mat_x1.shape) - ret2 = np.zeros(mat_x2.shape) - #diff = mat_x1 - mat_x2 - matsum = mat_x1 + mat_x2 - #ret1[np.where(diff != 0 ) ] = -1 - ret2[ np.where(matsum == 2 ) ] = 1 - return list(ret2) - retq= mp.Queue(-1) - inq= mp.Queue(-1) - processes = {} - mp.log_to_stderr() - logger = mp.get_logger() - logger.setLevel(logging.INFO) - - for i in range(nworkers): - processes[i] = {'time':time.time() , 'process': mp.Process( target = self.worker , args = (i,inq, retq ) ) } - #processes[i]['process'].daemon = True - processes[i]['process'].start() - - for batch in range(0, len(traindf) , chunksize ): - - slicedf = traindf.iloc[batch:batch+chunksize, :] - fams = list(set(list(slicedf.HogFamA.unique()) + list(slicedf.HogFamB.unique() ) ) ) - total= {} - - for fam in fams: - orthxml = self.READ_ORTHO(fam) - if orthxml is not None: - inq.put((fam,orthxml)) - done = [] - count = 0 - while len(fams)-1 > count: - try: - data =retq.get(False) - count+=1 - total.update(data) - except : - pass - time.sleep(.01) - - gc.collect() - retdf= pd.DataFrame.from_dict( total , orient= 'index') - slicedf = slicedf.merge( retdf , left_on = 'HogFamA' , right_index = True , how= 'left') - slicedf = slicedf.merge( retdf , left_on = 'HogFamB' , right_index = True , how= 'left') - slicedf = slicedf.dropna(subset=['mat_y', 'mat_x'] , how = 'any') - slicedf['xtrain'] = slicedf.apply( calculate_x , axis = 1) - X_train = np.vstack( slicedf['xtrain']) - y_train = slicedf.truth - print(slicedf) - - yield (X_train, y_train) - for i in processes: - inq.put(None) - for i in processes: - processes[i]['process'].terminate() - - def retmat_mp_profiles(self, fams , nworkers = 25, chunksize=50 , verbose = False ): - """ - function used to create dataframe containing binary profiles - and trees of fams - """ - - fams = [ f for f in fams if f] - retq= mp.Queue(-1) - inq= mp.Queue(-1) - processes = {} - mp.log_to_stderr() - logger = mp.get_logger() - logger.setLevel(logging.INFO) - total = {} - - for i in range(nworkers): - processes[i] = {'time':time.time() , 'process': mp.Process( target = self.worker , args = (i,inq, retq ) ) } - #processes[i]['process'].daemon = True - processes[i]['process'].start() - for fam in fams: - if verbose == True: - print(fam) - try: - orthxml = self.READ_ORTHO(fam) - except: - orthxml = None - if orthxml is not None: - inq.put((fam,orthxml)) - done = [] - count = 0 - - while len(fams)-1 > count : - try: - data =retq.get(False ) - count+=1 - total.update(data) - if count % 100 == 0 : - print(count) - except : - pass - time.sleep(.01) - - for i in range(nworkers): - processes[i]['process'].terminate() - retdf= pd.DataFrame.from_dict( total , orient= 'index') - return retdf - - def hog_query(self, hog_id=None, fam_id=None , k = 100 ): - """ - Given a hog_id or a fam_id as a query, returns a dictionary containing the results of the LSH. 
- :param hog_id: query hog id - :param fam_id: query fam id - :return: list containing the results of the LSH for the given query - """ - - if hog_id is not None: - fam_id = self.hogid2fam(hog_id) - query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5 , nsamples= self.nsamples ) - #print(query_hash.hashvalues) - results = self.lshobj.query(query_hash, k) - - - return results - - def hog_query_sorted(self, hog_id=None, fam_id=None , k = 100 ): - """ - Given a hog_id or a fam_id as a query, returns a dictionary containing the results of the LSH. - :param hog_id: query hog id - :param fam_id: query fam id - :return: list containing the results of the LSH for the given query - """ - - if hog_id is not None: - fam_id = self.hogid2fam(hog_id) - query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5 , nsamples= self.nsamples ) - results = self.lshobj.query(query_hash, k) - hogdict = self.pull_hashes(results) - - hogdict = { hog: hogdict[hog].jaccard(query_hash) for hog in hogdict } - sortedhogs = [(k, v) for k, v in hogdict.items()] - sortedhogs = sorted(student_tuples, key=lambda x: x[1]) - sortedhogs = [ h[0] for h in sortehogs.reverse() ] - return hogdict - - def pull_hashes(self , hoglist): - - """ - Given a list of hog_ids , returns a dictionary containing their hashes. - This uses the hdf5 file to get the hashvalues - :param hog_id: query hog id - :param fam_id: query fam id - :return: a dict containing the hash values of the hogs in hoglist - """ - - return { entry: hashutils.fam2hash_hdf5( self.hogid2fam(entry), self.hashes_h5 , nsamples= self.nsamples) for entry in hoglist} - - def pull_matrows(self,fams): - """ - given a list of fams return the submatrix containing their profiles - - :return:fams sorted, sparse mat - """ - return self.profile_matrix[np.asarray(fams),:] - - - @staticmethod - def sort_hashes(query_hash,hashes): - """ - Given a dict of hogs:hashes, returns a sorted array of hogs and jaccard distances relative to query hog. - :param query hash: weighted minhash of the query - :param hashes: a dict of hogs:hashes - :return: sortedhogs, jaccard - """ - #sort the hashes by their jaccard relative to query hash - jaccard=[ query_hash.jaccard(hashes[hog]) for hog in hashes] - index = np.argsort(jaccard) - sortedhogs = np.asarry(list(hashes.keys()))[index] - jaccard= jaccard[index] - return sortedhogs, jaccard - - @staticmethod - def allvall_hashes(hashes): - """ - Given a dict of hogs:hashes, returns generate an all v all jaccard distance matrix. - :param hashes: a dict of hogs:hashes - :return: hashmat - """ - #generate an all v all jaccard distance matrix - hashmat = np.zeros((len(hashes),len(hashes))) - for i , hog1 in enumerate(hashes): - for j, hog2 in enumerate(hashes): - if i < j : - hashmat[i,j]= hashes[hog1].jaccard(hashes[hog2]) - hashmat = hashmat+hashmat.T - np.fill_diagonal(hashmat, 1) - return hashmat - - def hog_v_hog(self, hogs): - """ - give two hogs returns jaccard distance. - :param hog1 , hog2: str hog id - :return: jaccard score - """ - hog1,hog2 = hogs - #generate an all v all jaccard distance matrix - hashes = self.pull_hashes([hog1,hog2]) - hashes = list(hashes.values()) - return hashes[0].jaccard(hashes[1]) - - def allvall_nx(G,hashes,thresh =None): - - """ - Given a dict of hogs:hashes, returns generate an all v all jaccard distance matrix. 
- :param hashes: a dict of hogs:hashes - :return: hashmat - """ - - #generate an all v all jaccard distance matrix - - hashmat = [[ hashes[hog1].jaccard(hashes[hog2]) if j>i else 0 for j,hog2 in enumerate(hashes[0:i] ) ] for i,hog1 in enumerate(hashes) ] - hashmat = np.asarray(hashmat) - hashmat+= hashmat.T - np.fill_diagonal(hashmat, 1) - - #hashmat = np.zeros((len(hashes),len(hashes))) - - #for i , hog1 in enumerate(hashes): - # for j, hog2 in enumerate(hashes): - # hashmat[i,j]= hashes[hog1].jaccard(hashes[hog2]) - return hashmat - - def iternetwork(seedHOG): - pass - - def rank_hashes(query_hash,hashes): - jaccard = [] - sorted = [] - scores = {} - hogsRanked = np.asarray(list(hashes.keys())) - for i, hog in enumerate(hashes): - score = query_hash.jaccard(hashes[hog]) - jaccard.append( score) - scores[hog] = score - hogsRanked = list( hogsRanked[ np.argsort(jaccard) ] ) - jaccard = np.sort(jaccard) - return hogsRanked, jaccard + +class Profiler: + """ + A profiler object allows the user to query the LSH with HOGs and get a list of result HOGs back + + """ + + def __init__(self, lshforestpath=None, hashes_h5=None, mat_path=None, oma=False, nsamples=256, mastertree=None): + """ + The Profiler class initializes a profiler object for querying the LSH with HOGs and returning a list of result HOGs. + + Attributes: + lshobj (object): LSH object for querying. + hashes_h5 (h5py.File): H5 file containing HOGs. + nsamples (int): Number of samples to use. + tree (ete3.Tree): Master tree used for generating taxa index. + tree_string (str): String representation of the master tree. + taxaIndex (dict): Dictionary mapping taxa names to their indices in the master tree. + ReverseTaxaIndex (dict): Dictionary mapping indices in the master tree to their corresponding taxa names. + db_obj (db.Database): OMA database object. + treeweights (dict): Dictionary containing the tree weight for each taxon. + READ_ORTHO (callable): Function for reading orthoxml files from OMA. + HAM_PIPELINE (callable): Function for generating the Annotated tree from a row. + HASH_PIPELINE (callable): Function for generating the hash from a row. + + Parameters: + lshforestpath (str, optional): Path to the pickled LSH forest object. + hashes_h5 (str, optional): Path to the H5 file containing HOGs. + mat_path (str, optional): Path to the matrix file containing HOGs. + oma (str, optional): Path to the OMA database. + tar (str, optional): Path to the tar archive. + nsamples (int, optional): Number of samples to use. Defaults to 256. + mastertree (str, optional): Path to the master tree file. 
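+
+        Example (an illustrative sketch only; the paths below are placeholders for the pickled
+        LSH forest, the hash HDF5 file and the master tree produced when the profiling database
+        was built, and the family number is arbitrary):
+
+            p = Profiler(lshforestpath='path/to/db/lshforest.pkl',
+                         hashes_h5='path/to/db/hashes.h5',
+                         mastertree='path/to/db/master_tree.nwk',
+                         nsamples=256)
+            results = p.hog_query(fam_id=1234, k=10)
+
+        nsamples should match the value used when the hashes were built. Querying with a HOG id
+        string instead of a family number additionally requires the oma argument, since the
+        identifier is resolved through the OMA database.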
+ """ + + print('loading lsh') + with open(lshforestpath, 'rb') as lshpickle: + self.lshobj = pickle.loads(lshpickle.read()) + print('indexing lsh') + self.lshobj.index() + + self.hashes_h5 = h5py.File(hashes_h5, mode='r') + print('h5', self.hashes_h5, self.hashes_h5.keys()) + self.nsamples = nsamples + if mastertree.split('.')[-1] == 'pkl': + with open(mastertree, 'rb') as pklin: + self.tree = pickle.loads(pklin.read()) + self.tree_string = self.tree.write(format=1) + elif mastertree.split('.')[-1] == 'nwk': + self.tree = ete3.Tree(mastertree, format=1) + self.tree_string = self.tree.write(format=1) + + else: + raise Exception('please provide a pickled ete3 tree or a newick file') + self.taxaIndex, self.ReverseTaxaIndex = files_utils.generate_taxa_index(self.tree) + + if oma: + h5_oma = open_file(oma, mode="r") + self.db_obj = db.Database(h5_oma) + self.treeweights = hashutils.generate_treeweights(self.tree, self.taxaIndex, None, None) + self.READ_ORTHO = functools.partial(pyhamutils.get_orthoxml_oma, db_obj=self.db_obj) + self.HAM_PIPELINE = functools.partial(pyhamutils.get_ham_treemap_from_row, tree=self.tree_string) + self.HASH_PIPELINE = functools.partial(hashutils.row2hash, taxaIndex=self.taxaIndex, + treeweights=self.treeweights, wmg=None) + + print('DONE') + + def hogid2fam(self, hog_entry): + if type(hog_entry) == int: + return hog_entry + else: + hog_entry = self.db_obj.entry_by_entry_nr(self.db_obj.id_resolver.resolve(hog_entry)) + famnr = int(self.db_obj.hog_family(entry=hog_entry)) + return famnr + + def return_profile_OTF(self, fam): + """ + Returns profiles as binary vectors for use with optimisation pipelines + """ + if type(fam) is str: + fam = self.hogid2fam(fam) + ortho_fam = self.READ_ORTHO(fam) + if ortho_fam: + tp = self.HAM_PIPELINE([fam, ortho_fam]) + + losses = [self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex] + dupl = [self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex] + presence = [self.taxaIndex[n.name] for n in tp.traverse() if n.nbr_genes > 0 and n.name in self.taxaIndex] + + indices = dict(zip(['presence', 'loss', 'dup'], [presence, losses, dupl])) + hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex))) + for i, event in enumerate(indices): + if len(indices[event]) > 0: + taxindex = np.asarray(indices[event]) + hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex) + hog_matrix_raw[:, hogindex] = 1 + return {fam: {'mat': hog_matrix_raw, 'tree': tp}} + else: + return {fam: {'mat': None, 'tree': None}} + + def return_profile_complements(self, fam): + """ + Returns profiles for each loss to search for complementary hogs + """ + if type(fam) is str: + fam = self.hogid2fam(fam) + ortho_fam = self.READ_ORTHO(fam) + tp = self.HAM_PIPELINE([fam, ortho_fam]) + + losses = set([n.name for n in tp.traverse() if n.lost and n.name in self.taxaIndex]) + # these are the roots of the fams we are looking for + # we just assume no duplications or losses from this point + + ancestral_nodes = ([n for n in profiler.tree.traverse() if n.name in losses]) + losses = [] + dupl = [] + complements = {n.name + '_loss': []} + + indices = dict(zip(['presence', 'loss', 'dup'], [presence, losses, dupl])) + + hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex))) + for i, event in enumerate(indices): + if len(indices[event]) > 0: + taxindex = np.asarray(indices[event]) + hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex) + hog_matrix_raw[:, hogindex] = 1 + + return {fam: {'mat': hog_matrix_raw, 
'hash': tp}} + + def worker(self, i, inq, retq): + """ + this worker function is for parallelization of generation of binary vector for use with optimisation pipelines + + """ + print('worker start' + str(i)) + while True: + input = inq.get() + if input is None: + break + else: + fam, ortho_fam = input + tp = self.HAM_PIPELINE([fam, ortho_fam]) + losses = [self.taxaIndex[n.name] for n in tp.traverse() if n.lost and n.name in self.taxaIndex] + dupl = [self.taxaIndex[n.name] for n in tp.traverse() if n.dupl and n.name in self.taxaIndex] + presence = [self.taxaIndex[n.name] for n in tp.traverse() if + n.nbr_genes > 0 and n.name in self.taxaIndex] + indices = dict(zip(['presence', 'loss', 'dup'], [presence, losses, dupl])) + hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex))) + for i, event in enumerate(indices): + if len(indices[event]) > 0: + taxindex = np.asarray(indices[event]) + hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex) + hog_matrix_raw[:, hogindex] = 1 + retq.put({fam: {'mat': hog_matrix_raw, 'tree': tp}}) + + def retmat_mp(self, traindf, nworkers=25, chunksize=50): + """ + function used to create training matrix with pairs of hogs. calculate_x will return the intersetcion of + two binary vectors generated by pyham + """ + + # fams = [ hashutils.hogid2fam(fam) for fam in fams ] + def calculate_x(row): + mat_x1 = row.mat_x + mat_x2 = row.mat_y + ret1 = np.zeros(mat_x1.shape) + ret2 = np.zeros(mat_x2.shape) + # diff = mat_x1 - mat_x2 + matsum = mat_x1 + mat_x2 + # ret1[np.where(diff != 0 ) ] = -1 + ret2[np.where(matsum == 2)] = 1 + return list(ret2) + + retq = mp.Queue(-1) + inq = mp.Queue(-1) + processes = {} + mp.log_to_stderr() + logger = mp.get_logger() + logger.setLevel(logging.INFO) + + for i in range(nworkers): + processes[i] = {'time': time.time(), 'process': mp.Process(target=self.worker, args=(i, inq, retq))} + # processes[i]['process'].daemon = True + processes[i]['process'].start() + + for batch in range(0, len(traindf), chunksize): + + slicedf = traindf.iloc[batch:batch + chunksize, :] + fams = list(set(list(slicedf.HogFamA.unique()) + list(slicedf.HogFamB.unique()))) + total = {} + + for fam in fams: + orthxml = self.READ_ORTHO(fam) + if orthxml is not None: + inq.put((fam, orthxml)) + done = [] + count = 0 + while len(fams) - 1 > count: + try: + data = retq.get(False) + count += 1 + total.update(data) + except: + pass + time.sleep(.01) + + gc.collect() + retdf = pd.DataFrame.from_dict(total, orient='index') + slicedf = slicedf.merge(retdf, left_on='HogFamA', right_index=True, how='left') + slicedf = slicedf.merge(retdf, left_on='HogFamB', right_index=True, how='left') + slicedf = slicedf.dropna(subset=['mat_y', 'mat_x'], how='any') + slicedf['xtrain'] = slicedf.apply(calculate_x, axis=1) + X_train = np.vstack(slicedf['xtrain']) + y_train = slicedf.truth + print(slicedf) + + yield (X_train, y_train) + for i in processes: + inq.put(None) + for i in processes: + processes[i]['process'].terminate() + + def retmat_mp_profiles(self, fams, nworkers=25, chunksize=50, verbose=False): + """ + function used to create dataframe containing binary profiles + and trees of fams + """ + + fams = [f for f in fams if f] + retq = mp.Queue(-1) + inq = mp.Queue(-1) + processes = {} + mp.log_to_stderr() + logger = mp.get_logger() + logger.setLevel(logging.INFO) + total = {} + + for i in range(nworkers): + processes[i] = {'time': time.time(), 'process': mp.Process(target=self.worker, args=(i, inq, retq))} + # processes[i]['process'].daemon = True + 
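            # each worker pulls a (fam, orthoxml) pair from inq, builds the annotated tree and the
+            # binary presence/loss/duplication vector, and puts {fam: {'mat': ..., 'tree': ...}} on
+            # retq; the loops below feed the input queue and collect the results.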
processes[i]['process'].start()
+
+        # count how many families are actually handed to the workers so the
+        # collection loop below knows how many results to wait for
+        submitted = 0
+        for fam in fams:
+            if verbose:
+                print(fam)
+            try:
+                orthxml = self.READ_ORTHO(fam)
+            except Exception:
+                orthxml = None
+            if orthxml is not None:
+                inq.put((fam, orthxml))
+                submitted += 1
+        count = 0
+
+        while count < submitted:
+            try:
+                data = retq.get(False)
+                count += 1
+                total.update(data)
+                if count % 100 == 0:
+                    print(count)
+            except Exception:
+                pass
+            time.sleep(.01)
+
+        for i in range(nworkers):
+            processes[i]['process'].terminate()
+        retdf = pd.DataFrame.from_dict(total, orient='index')
+        return retdf
+
+    def hog_query(self, hog_id=None, fam_id=None, k=100):
+        """
+        Given a hog_id or a fam_id as a query, returns the top k results from the LSH forest.
+        :param hog_id: query hog id
+        :param fam_id: query fam id
+        :param k: number of results to return
+        :return: list containing the results of the LSH for the given query
+        """
+        if hog_id is not None:
+            fam_id = self.hogid2fam(hog_id)
+        query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5, nsamples=self.nsamples)
+        results = self.lshobj.query(query_hash, k)
+        return results
+
+    def hog_query_sorted(self, hog_id=None, fam_id=None, k=100):
+        """
+        Given a hog_id or a fam_id as a query, returns the LSH results ranked by jaccard similarity to the query.
+        :param hog_id: query hog id
+        :param fam_id: query fam id
+        :param k: number of results to return
+        :return: list of (hog, jaccard) tuples sorted from most to least similar to the query
+        """
+        if hog_id is not None:
+            fam_id = self.hogid2fam(hog_id)
+        query_hash = hashutils.fam2hash_hdf5(fam_id, self.hashes_h5, nsamples=self.nsamples)
+        results = self.lshobj.query(query_hash, k)
+        hogdict = self.pull_hashes(results)
+        hogdict = {hog: hogdict[hog].jaccard(query_hash) for hog in hogdict}
+        # sort the candidates by their jaccard similarity to the query, best first
+        sortedhogs = sorted(hogdict.items(), key=lambda x: x[1], reverse=True)
+        return sortedhogs
+
+    def pull_hashes(self, hoglist):
+        """
+        Given a list of hog_ids, returns a dictionary containing their hashes.
+        This uses the hdf5 file to get the hashvalues
+        :param hoglist: an iterable of hog or fam ids
+        :return: a dict containing the hash values of the hogs in hoglist
+        """
+        return {entry: hashutils.fam2hash_hdf5(self.hogid2fam(entry), self.hashes_h5, nsamples=self.nsamples)
+                for entry in hoglist}
+
+    def pull_matrows(self, fams):
+        """
+        Given a list of fams, return the submatrix containing their profiles.
+        :return: fams sorted, sparse mat
+        """
+        return self.profile_matrix[np.asarray(fams), :]
+
+    @staticmethod
+    def sort_hashes(query_hash, hashes):
+        """
+        Given a dict of hogs:hashes, returns a sorted array of hogs and jaccard similarities relative to the query hog.
+        :param query_hash: weighted minhash of the query
+        :param hashes: a dict of hogs:hashes
+        :return: sortedhogs, jaccard
+        """
+        # sort the hashes by their jaccard similarity relative to the query hash
+        jaccard = np.asarray([query_hash.jaccard(hashes[hog]) for hog in hashes])
+        index = np.argsort(jaccard)
+        sortedhogs = np.asarray(list(hashes.keys()))[index]
+        jaccard = jaccard[index]
+        return sortedhogs, jaccard
+
+    @staticmethod
+    def allvall_hashes(hashes):
+        """
+        Given a dict of hogs:hashes, generate an all-vs-all jaccard similarity matrix.
+        :param hashes: a dict of hogs:hashes
+        :return: hashmat
+        """
+        hashmat = np.zeros((len(hashes), len(hashes)))
+        for i, hog1 in enumerate(hashes):
+            for j, hog2 in enumerate(hashes):
+                if i < j:
+                    hashmat[i, j] = hashes[hog1].jaccard(hashes[hog2])
+        hashmat = hashmat + hashmat.T
+        np.fill_diagonal(hashmat, 1)
+        return hashmat
+
+    def hog_v_hog(self, hogs):
+        """
+        Given two hogs, returns their jaccard score.
+        :param hogs: tuple of two hog ids (hog1, hog2)
+        :return: jaccard score
+        """
+        hog1, hog2 = hogs
+        hashes = self.pull_hashes([hog1, hog2])
+        hashes = list(hashes.values())
+        return hashes[0].jaccard(hashes[1])
+
+    def allvall_nx(self, hashes, thresh=None):
+        """
+        Given a dict of hogs:hashes, generate an all-vs-all jaccard similarity matrix.
+        :param hashes: a dict of hogs:hashes
+        :return: hashmat
+        """
+        keys = list(hashes.keys())
+        # fill the lower triangle, then mirror it to obtain a symmetric matrix
+        hashmat = np.asarray([[hashes[hog1].jaccard(hashes[hog2]) if j < i else 0
+                               for j, hog2 in enumerate(keys)]
+                              for i, hog1 in enumerate(keys)])
+        hashmat = hashmat + hashmat.T
+        np.fill_diagonal(hashmat, 1)
+        return hashmat
+
+    def iternetwork(seedHOG):
+        pass
+
+    @staticmethod
+    def rank_hashes(query_hash, hashes):
+        jaccard = []
+        scores = {}
+        hogsRanked = np.asarray(list(hashes.keys()))
+        for i, hog in enumerate(hashes):
+            score = query_hash.jaccard(hashes[hog])
+            jaccard.append(score)
+            scores[hog] = score
+        hogsRanked = list(hogsRanked[np.argsort(jaccard)])
+        jaccard = np.sort(jaccard)
+        return hogsRanked, jaccard
diff --git a/src/HogProf/utils/__pycache__/__init__.cpython-310.pyc b/src/HogProf/utils/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 8165b08..0000000 Binary files a/src/HogProf/utils/__pycache__/__init__.cpython-310.pyc and /dev/null differ
diff --git a/src/HogProf/utils/__pycache__/__init__.cpython-38.pyc b/src/HogProf/utils/__pycache__/__init__.cpython-38.pyc deleted file mode 100644 index 2464570..0000000 Binary files a/src/HogProf/utils/__pycache__/__init__.cpython-38.pyc and /dev/null differ
diff --git a/src/HogProf/utils/__pycache__/files_utils.cpython-310.pyc b/src/HogProf/utils/__pycache__/files_utils.cpython-310.pyc deleted file mode 100644 index ecf63ee..0000000 Binary files a/src/HogProf/utils/__pycache__/files_utils.cpython-310.pyc and /dev/null differ
diff --git a/src/HogProf/utils/__pycache__/files_utils.cpython-38.pyc b/src/HogProf/utils/__pycache__/files_utils.cpython-38.pyc deleted file mode 100644 index 2fccd01..0000000 Binary files a/src/HogProf/utils/__pycache__/files_utils.cpython-38.pyc and /dev/null differ
diff --git a/src/HogProf/utils/__pycache__/goatools_utils.cpython-310.pyc b/src/HogProf/utils/__pycache__/goatools_utils.cpython-310.pyc deleted file mode 100644 index 41e542f..0000000 Binary files a/src/HogProf/utils/__pycache__/goatools_utils.cpython-310.pyc and /dev/null differ
diff --git a/src/HogProf/utils/__pycache__/goautils.cpython-310.pyc b/src/HogProf/utils/__pycache__/goautils.cpython-310.pyc deleted file mode 100644 index e4b5e44..0000000 Binary files a/src/HogProf/utils/__pycache__/goautils.cpython-310.pyc and /dev/null differ
diff --git a/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc b/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc deleted file mode
100644 index 0763262..0000000 Binary files a/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc and /dev/null differ diff --git a/src/HogProf/utils/__pycache__/hashutils.cpython-38.pyc b/src/HogProf/utils/__pycache__/hashutils.cpython-38.pyc deleted file mode 100644 index 56ce8c6..0000000 Binary files a/src/HogProf/utils/__pycache__/hashutils.cpython-38.pyc and /dev/null differ diff --git a/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc b/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc deleted file mode 100644 index 16199a6..0000000 Binary files a/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc and /dev/null differ diff --git a/src/HogProf/utils/__pycache__/pyhamutils.cpython-38.pyc b/src/HogProf/utils/__pycache__/pyhamutils.cpython-38.pyc deleted file mode 100644 index d15a8d7..0000000 Binary files a/src/HogProf/utils/__pycache__/pyhamutils.cpython-38.pyc and /dev/null differ diff --git a/src/HogProf/utils/files_utils.py b/src/HogProf/utils/files_utils.py index 4b1ca40..9f1464d 100755 --- a/src/HogProf/utils/files_utils.py +++ b/src/HogProf/utils/files_utils.py @@ -1,16 +1,15 @@ -import ete3 -import pandas as pd -from Bio import Entrez import copy import pickle -import os +import ete3 +from Bio import Entrez -def get_tree(taxa , genomes , outdir = None): + +def get_tree(taxa, genomes, outdir=None): """ Generates a taxonomic tree using the ncbi taxonomy and :param oma: a pyoma db object - :param saveTree: Bool for whether or not to save a mastertree newick file + :param saveTree: Bool for whether to save a mastertree newick file :return: tree_string: a newick string tree: an ete3 object """ @@ -19,9 +18,8 @@ def get_tree(taxa , genomes , outdir = None): genomes = set(genomes) tax.remove(0) print(len(tax)) - tree = ete3.PhyloTree( name = '-1') - topo = ncbi.get_topology(genomes , collapse_subspecies=False) - tax = set([ str(taxid) for taxid in tax]) + tree = ete3.PhyloTree(name='-1') + topo = ncbi.get_topology(genomes, collapse_subspecies=False) tree.add_child(topo) orphans = list(genomes - set([x.name for x in tree.get_leaves()])) print('missing taxa:') @@ -29,30 +27,32 @@ def get_tree(taxa , genomes , outdir = None): orphans_info1 = {} orphans_info2 = {} + for x in orphans: + Entrez.email = 'leo.burgy@epfl.ch' search_handle = Entrez.efetch('taxonomy', id=str(x), retmode='xml') record = next(Entrez.parse(search_handle)) print(record) - orphans_info1[ record['ParentTaxId']] = x + orphans_info1[record['ParentTaxId']] = x orphans_info2[x] = [x['TaxId'] for x in record['LineageEx']] + for n in tree.traverse(): if n.name in orphans_info1: - n.add_sister(name = orphans_info1[n.name]) + n.add_sister(name=orphans_info1[n.name]) print(n) - orphans = set(genomes) - set([x.name for x in tree.get_leaves()]) + tree = add_orphans(orphans_info2, tree, genomes) - orphans = set(genomes) - set([x.name for x in tree.get_leaves()]) tree_string = tree.write(format=1) - - - with open( outdir +'master_tree.nwk' , 'w') as nwkout: + + with open(outdir + 'master_tree.nwk', 'w') as nwkout: nwkout.write(tree_string) - with open( outdir + '_master_tree.pkl' , 'wb') as pklout: + with open(outdir + '_master_tree.pkl', 'wb') as pklout: pklout.write(pickle.dumps(tree)) - + return tree_string, tree -def generate_taxa_index(tree , taxfilter= None, taxmask=None): + +def generate_taxa_index(tree, taxfilter=None, taxmask=None): """ Generates an index for the global taxonomic tree for all OMA :param tree: ete3 tree @@ -67,13 +67,13 @@ def generate_taxa_index(tree , taxfilter= None, 
taxmask=None): break if taxfilter: if n.name in taxfilter: - #set weight for descendants of n to 0 + # set weight for descendants of n to 0 n.delete() taxa_index = {} taxa_index_reverse = {} for i, n in enumerate(tree.traverse()): taxa_index_reverse[i] = n.name - taxa_index[n.name] = i-1 + taxa_index[n.name] = i - 1 return taxa_index, taxa_index_reverse @@ -89,7 +89,6 @@ def add_orphans(orphan_info, tree, genome_ids_list, verbose=False): """ first = True - newdict = {} leaves = set([leaf.name for leaf in tree.get_leaves()]) @@ -101,7 +100,7 @@ def add_orphans(orphan_info, tree, genome_ids_list, verbose=False): i = 0 print(i) - while first or ( len(orphans) > 0 and keys != oldkeys ) : + while first or (len(orphans) > 0 and keys != oldkeys): first = False oldkeys = keys leaves = set([leaf.name for leaf in tree.get_leaves()]) @@ -126,18 +125,17 @@ def add_orphans(orphan_info, tree, genome_ids_list, verbose=False): newdict = {} nodes = {} print(orphans) - #clean up duplicates + # clean up duplicates for n in tree.traverse(): if n.name not in nodes: - nodes[ n.name] =1 + nodes[n.name] = 1 else: - nodes[ n.name] +=1 + nodes[n.name] += 1 for n in tree.traverse(): - if nodes[ n.name] >1: - if n.is_leaf()== False: + if nodes[n.name] > 1: + if n.is_leaf() == False: n.delete() - nodes[ n.name]-= 1 - + nodes[n.name] -= 1 return tree diff --git a/src/HogProf/utils/goautils.py b/src/HogProf/utils/goautils.py index 5bd223f..780106c 100644 --- a/src/HogProf/utils/goautils.py +++ b/src/HogProf/utils/goautils.py @@ -1,4 +1,3 @@ - from __future__ import print_function from goatools import semantic @@ -11,93 +10,93 @@ import multiprocessing as mp from tables import * import time + + ##############enrichment############################################## -def return_enrichment_study_obj(gaf_taxfiltered, obo = None): +def return_enrichment_study_obj(gaf_taxfiltered, obo=None): ''' Generate go enrichment study object with a background dataset. 
''' if obo is None: - obodag = GODag(config_utils.datadir+"/GOData/go-basic.obo") + obodag = GODag(config_utils.datadir + "/GOData/go-basic.obo") else: obodag = GODag(obo) goeaobj = GOEnrichmentStudy( - gaf_taxfiltered.keys(), # - gaf_taxfiltered, # geneid/GO associations possible with tree used for DB - obodag, # Ontologies - propagate_counts = False, - alpha = 0.15, # default significance cut-off - methods = ['fdr_bh']) # defult multipletest correction method + gaf_taxfiltered.keys(), # + gaf_taxfiltered, # geneid/GO associations possible with tree used for DB + obodag, # Ontologies + propagate_counts=False, + alpha=0.15, # default significance cut-off + methods=['fdr_bh']) # defult multipletest correction method return goeaobj -def buildGAF(gaf_file , universe= None): - +def buildGAF(gaf_file, universe=None): gaf_filtered = {} with open(gaf_file, mode='r') as gafin: for line in gafin: words = line.split() if words[0] not in gaf_filtered: - gaf_filtered[words[0]]=set([words[1]]) + gaf_filtered[words[0]] = set([words[1]]) else: gaf_filtered[words[0]].add(words[1]) if universe: - gaf_filtered = { prot:gaf_filtered[prot] for prot in universe} - + gaf_filtered = {prot: gaf_filtered[prot] for prot in universe} return gaf_filtered -def return_hogs_timeout( result, retq): +def return_hogs_timeout(result, retq): print('started') - with open_file(config_utils.config['dir']['omadir']+'OmaServer.h5' , mode="r") as h5_oma: + with open_file(config_utils.config['dir']['omadir'] + 'OmaServer.h5', mode="r") as h5_oma: db_obj = db.Database(h5_oma) - res = [ ProteinEntry(db_obj, e).omaid for e in db_obj.member_of_fam(int(result)) ] + res = [ProteinEntry(db_obj, e).omaid for e in db_obj.member_of_fam(int(result))] retq.put(res) -def run_GOEA_onresults(results, db_obj, goeaobj, outname = None): + +def run_GOEA_onresults(results, db_obj, goeaobj, outname=None): ''' Perform enrichment analysis on returned results grabs all member protein of all hogs in result returns goe results and HOG composition ''' - hogids =[ "HOG:" + (7-len(str(fam_id))) * '0' + str(fam_id) for fam_id in results ] - HOGS={} + hogids = ["HOG:" + (7 - len(str(fam_id))) * '0' + str(fam_id) for fam_id in results] + HOGS = {} print('compiling hogs') prots = [] print('mod13') retq = mp.Queue() - for i,result in enumerate(results): - if i %10 ==0: + for i, result in enumerate(results): + if i % 10 == 0: print(i) print(result) - HOGS[result]=[] - p = mp.Process( target= return_hogs_timeout , args= (result, retq)) + HOGS[result] = [] + p = mp.Process(target=return_hogs_timeout, args=(result, retq)) p.start() t0 = time.time() timeout = False - while time.time()-t0 < 10 : + while time.time() - t0 < 10: time.sleep(.1) if p.is_alive() == False: print('done') break - if time.time()-t0 > 10: + if time.time() - t0 > 10: timeout = True print('Dead') p.terminate() del p - if retq.empty() == False: iterobj = retq.get(10) - #retq get - for k,member in enumerate(iterobj): + # retq get + for k, member in enumerate(iterobj): if k < 1: print(member) if k > 500: @@ -107,47 +106,46 @@ def run_GOEA_onresults(results, db_obj, goeaobj, outname = None): print('done') print('running GO enrichment study') - goea_results_all = goeaobj.run_study(prots ) + goea_results_all = goeaobj.run_study(prots) print('done') if outname: - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) + with open(config_utils.datadir + outname + 
'Hogs2Prots.pkl', 'wb') as save: + save.write(pickle.dumps(HOGS, 2)) + goeaobj.wr_txt(config_utils.datadir + str(outname) + "enrichment.txt", goea_results_all) print('DONE!') return goea_results_all, HOGS -def run_GOEA_onresults_tar(results, tar, goeaobj, outname = None): +def run_GOEA_onresults_tar(results, tar, goeaobj, outname=None): ''' Perform enrichment analysis on returned results grabs all member protein of all hogs in result returns goe results and HOG composition ''' ## TODO: finish this function with tar hog to list of prot IDS - #print(db_obj.member_of_hog_id(int(results[0]))) - #hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] - #print( db_obj.member_of_hog_id(hogids[0]) ) - + # print(db_obj.member_of_hog_id(int(results[0]))) + # hogids =[ "HOG:" + (7-len(fam_id)) * '0' + fam_id for fam_id in results ] + # print( db_obj.member_of_hog_id(hogids[0]) ) - HOGS={} + HOGS = {} print('compiling hogs') prots = [] - for i,result in enumerate(hogids): - if i %10 ==0: + for i, result in enumerate(hogids): + if i % 10 == 0: print(i) - HOGS[result]=[] + HOGS[result] = [] for member in db_obj.iter_members_of_hog_id(result): HOGS[result].append(member.omaid) prots.append(member.omaid) print('done') print('running GO enrichment study') - goea_results_all = goeaobj.run_study(prots ) + goea_results_all = goeaobj.run_study(prots) print('done') - with open( config_utils.datadir + outname + 'Hogs2Prots.pkl' , 'wb' ) as save: - save.write(pickle.dumps(HOGS,2)) + with open(config_utils.datadir + outname + 'Hogs2Prots.pkl', 'wb') as save: + save.write(pickle.dumps(HOGS, 2)) - goeaobj.wr_txt(config_utils.datadir+ str(outname)+"enrichment.txt", goea_results_all) + goeaobj.wr_txt(config_utils.datadir + str(outname) + "enrichment.txt", goea_results_all) print('DONE!') return goea_results_all, HOGS @@ -175,6 +173,7 @@ def deepest_common_ancestor_hdf5(go_ids, godag, hdf5): # Take the element at maximum depth. return max(common_parent_go_ids_hdf5(go_ids, hdf5), key=lambda t: godag[t].depth) + def common_parent_go_ids_hdf5(go_ids, hdf5_set): ''' Finds the common ancestors in the GO @@ -187,12 +186,13 @@ def common_parent_go_ids_hdf5(go_ids, hdf5_set): corrected_candidates = [id2goterm(c) for c in candidates] return corrected_candidates -def resnik_sim_pandas(tup, df , termcounts): + +def resnik_sim_pandas(tup, df, termcounts): ''' Computes Resnik's similarity measure. 
''' go_id1, go_id2 = tup - #print(df.head()) + # print(df.head()) if go_id1 == go_id2: return semantic.get_info_content(go_id1, termcounts) @@ -202,9 +202,9 @@ def resnik_sim_pandas(tup, df , termcounts): ancestors += df.loc[str(go_id1)].parents terms = df.loc[ancestors] ancestors_set = terms.parents.tolist() - intersection = set(ancestors_set[0]).intersection(* ancestors_set[1:]) + intersection = set(ancestors_set[0]).intersection(*ancestors_set[1:]) common_ancestors = df.loc[list(intersection)] - common_ancestors = common_ancestors.sort_values('depth', ascending= False) + common_ancestors = common_ancestors.sort_values('depth', ascending=False) msca_goid = common_ancestors.index.tolist()[0] return semantic.get_info_content(msca_goid, termcounts) @@ -212,18 +212,18 @@ def resnik_sim_pandas(tup, df , termcounts): return -1 -def get_go_terms_gaf(hog_id, pyoma_dbobj, gaf , genomes = None): +def get_go_terms_gaf(hog_id, pyoma_dbobj, gaf, genomes=None): ''' iterate over hog members and get the go information from a gaf in memory ''' fam = hashutils.hogid2fam(hog_id) - go_terms = { mr.omaid:gaf[mr.omaid] for mr in pyoma_dbobj.iter_members_of_hog_id(hog_id) if mr.omaid in gaf } + go_terms = {mr.omaid: gaf[mr.omaid] for mr in pyoma_dbobj.iter_members_of_hog_id(hog_id) if mr.omaid in gaf} return go_terms def goterm2id(go_term_to_modif): - return int(go_term_to_modif.split(':')[1]) + def id2goterm(go_term_to_modif): - return 'GO:{:07d}'.format(go_term_to_modif) \ No newline at end of file + return 'GO:{:07d}'.format(go_term_to_modif)
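
Usage note for the query methods added in src/HogProf/profiler.py: the sketch below is a minimal illustration rather than part of the patch. It assumes the surrounding class is exposed as HogProf.profiler.Profiler, that a database (pickled LSH forest, hashes HDF5 and master tree) has already been built with lshbuilder, and that the file paths, the nsamples value and the example HOG id are placeholders to be replaced with your own.

from HogProf import profiler

# placeholder paths pointing at a previously built database
p = profiler.Profiler(lshforestpath='YourHogProfDirectory/newlsh.pkl',
                      hashes_h5='YourHogProfDirectory/hashes.h5',
                      mastertree='YourHogProfDirectory/master_tree.pkl',
                      nsamples=256,
                      oma='YourOmaDirectory/OmaServer.h5')

# top 50 families whose phylogenetic profiles are similar to the query family
results = p.hog_query(hog_id='HOG:0000523', k=50)

# the same query, ranked by jaccard similarity of the weighted minhashes
for fam, jac in p.hog_query_sorted(hog_id='HOG:0000523', k=50)[:10]:
    print(fam, round(jac, 3))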
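
For readers of return_profile_OTF and worker: the binary profile is three concatenated blocks of length len(taxaIndex) (presence, loss, duplication), and an event at taxon t sets column t + block * len(taxaIndex). The toy example below only illustrates that layout; the taxa and events are invented.

import numpy as np

# toy taxon index: 4 taxa -> columns 0..3 of each block
taxa_index = {'Eukaryota': 0, 'Fungi': 1, 'Metazoa': 2, 'Vertebrata': 3}
n = len(taxa_index)

# invented events for one gene family
presence = [taxa_index['Eukaryota'], taxa_index['Metazoa'], taxa_index['Vertebrata']]
losses = [taxa_index['Fungi']]
dupl = [taxa_index['Vertebrata']]

profile = np.zeros((1, 3 * n))
for block, events in enumerate([presence, losses, dupl]):
    if events:
        profile[:, np.asarray(events) + block * n] = 1

# columns 0-3: presence, 4-7: loss, 8-11: duplication
print(profile)
# [[1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1.]]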
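
The goautils.py changes keep the existing enrichment workflow; as a reading aid, here is a rough sketch of how those helpers fit together. The file paths are placeholders, buildGAF expects a whitespace-separated file with a protein id in the first column and a GO term in the second (as its parser assumes), and run_GOEA_onresults additionally needs an open pyoma database object, so that call is left commented.

from HogProf.utils import goautils

# protein -> {GO terms} associations from a two-column, whitespace-separated file
gaf = goautils.buildGAF('YourOmaDirectory/hogprof_gaf.txt')

# GOEnrichmentStudy object with the GAF keys as the background population
goeaobj = goautils.return_enrichment_study_obj(gaf, obo='GOData/go-basic.obo')

# `results` would be a list of family ids returned by a profiler query;
# db_obj is a pyoma.browser.db.Database opened on OmaServer.h5
# goea_results, hogs = goautils.run_GOEA_onresults(results, db_obj, goeaobj, outname='myquery')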