Merge pull request #1 from EBI-Metagenomics/pypi

Add PyPi publishing to kegg-pathways-completeness
EBI-Metagenomics · Jun 18, 2024 · 84f031f · 84f031f
2 parents 11c2b5f + ca7252f
commit 84f031f
Show file tree

Hide file tree

Showing 1,025 changed files with 539 additions and 427 deletions.
diff --git a/.github/workflows/.check_api.yml b/.github/workflows/.check_api.yml
@@ -0,0 +1,60 @@
+name: Check KEGG API for new records
+
+on:
+  schedule:
+    - cron: '0 0 1 * *'  # Runs at midnight (UTC) on the 1st of every month
+
+# The job commits new_modules.txt to a new branch and opens a pull request,
+# so the token needs write access to contents and pull requests.
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  check-api:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+
+    - name: Check API for new records
+      run: python kegg_pathways_completeness/bin/update_pathways_data/get_modules_list.py
+
+    # The pull-request step only runs when new_modules.txt exists in the workspace.
+    - name: Check if new_modules.txt exists
+      id: check-file
+      run: |
+        if [[ -f "new_modules.txt" ]]; then
+          echo "file_exists=true" >> "$GITHUB_ENV"
+        else
+          echo "file_exists=false" >> "$GITHUB_ENV"
+        fi
+
+    - name: Get current date
+      id: date
+      run: echo "BRANCH_DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_ENV"
+
+    # create-pull-request commits the file, pushes the dated branch and opens
+    # the PR itself. Do not commit or push by hand beforehand: the action would
+    # then find no uncommitted changes to propose.
+    - name: Create Pull Request
+      if: env.file_exists == 'true'
+      uses: peter-evans/create-pull-request@v5
+      with:
+        token: ${{ secrets.GITHUB_TOKEN }}
+        add-paths: new_modules.txt
+        commit-message: "Add new_modules.txt with updates"
+        author: "github-actions[bot] <github-actions[bot]@users.noreply.github.com>"
+        branch: changes-${{ env.BRANCH_DATE }}
+        title: "Automated update of new_modules.txt"
+        body: "This PR was created automatically by the GitHub Action."
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,41 @@
+# Continuous-integration workflow: installs the package with its test extras
+# and runs pytest on each Python version in the matrix.
+name: Tests
+
+on:
+  push:
+    branches:
+      - master
+      - dev
+  pull_request:
+    branches:
+      - master
+      - dev
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so that 3.10 is not parsed as the float 3.1.
+        python-version:
+          - "3.10"
+          - "3.11"
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Installs Graphviz on the runner (presumably needed for graph plotting - confirm).
+      - name: Setup Graphviz
+        uses: ts-graphviz/setup-graphviz@v2
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: pip install .[test]
+
+      - name: Run pytest
+        run: pytest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,24 @@
+# pre-commit hooks: linting (ruff), import sorting (isort), formatting (black)
+# and basic file hygiene checks.
+repos:
+  # The ruff hook repository now lives under the astral-sh organisation.
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: 'v0.4.8'
+    hooks:
+      - id: ruff
+  # NOTE(review): isort and black can disagree on import formatting unless
+  # isort uses its black profile - confirm it is configured elsewhere.
+  - repo: https://github.com/PyCQA/isort
+    rev: '5.13.2'
+    hooks:
+      - id: isort
+  - repo: https://github.com/psf/black
+    rev: '24.4.2'
+    hooks:
+      - id: black
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: 'v4.6.0'
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
diff --git a/.travis.yml b/.travis.yml
diff --git a/README.md b/README.md
@@ -1,35 +1,43 @@
 # kegg-pathways-completeness tool
 
-The tool counts completeness of each KEGG pathway for protein sequence. 
+The tool calculates the completeness of each KEGG module pathway for a protein sequence.
 
 Please read **Theory** section with detailed explanation in the bottom of README. 
 
-Current version of pathways saved into **[pathways_data](pathways_data)** and graphs were [pre-built](graphs/README.md) and [saved](graphs/updates/pipeline-v5/graphs.pkl) into pkl format. 
+**Required files:**
+- list of KEGG modules in KOs notation (example, [all_pathways.txt](kegg_pathways_completeness%2Fpathways_data%2Fall_pathways.txt))
+- list of classes of KEGG modules (example, [all_pathways_class.txt](kegg_pathways_completeness%2Fpathways_data%2Fall_pathways_class.txt))
+- list of names of KEGG modules (example, [all_pathways_names.txt](kegg_pathways_completeness%2Fpathways_data%2Fall_pathways_names.txt))
+- graphs constructed from each module (example, [graphs.pkl](kegg_pathways_completeness%2Fgraphs%2Fgraphs.pkl))
 
-`*Pipeline-v5 data has 394 modules.*`
+This repository contains a pre-generated set of the required files. The current version of the data is saved in **[pathways_data](kegg_pathways_completeness/pathways_data)** and the graphs are [saved](kegg_pathways_completeness/graphs/updates/pipeline-v5/graphs.pkl) in pkl format.
 
-`Updated (from 07/03/2024) data in this repo has 481 modules.`
+_**About graphs:**_
+In order to generate the graphs, all pathways were parsed with the NetworkX library. Every graph is provided in .png format in [png](kegg_pathways_completeness/graphs/png) and in .dot format in [dots](kegg_pathways_completeness/graphs/dots). The pathway and the weight of each KO can easily be checked in the .png image.
+Instructions on how to build graphs.pkl are [provided](kegg_pathways_completeness/graphs/README.md).
 
-Previous updates:
-- 27/04/2023 has 475 modules.
+**Latest update:**
+- 07/03/2024 has 481 KEGG modules.
 
-If you need to update existing pathways data and graphs follow this [instruction](pathways_data/README.md).
+**Previous [updates](kegg_pathways_completeness/graphs/updates):**
+- 27/04/2023 has 475 modules.
+- MGnify [pipeline-v5](https://github.com/EBI-Metagenomics/pipeline-v5) uses 394 modules.
 
-These files are also available on EBI MGnify FTP and can be downloaded using [download.sh](download.sh)
+If you need to update the existing pathways data and graphs, follow these [instructions](kegg_pathways_completeness/pathways_data/README.md).
 
 ## Calculate pathways completeness
-This script requires [hmmsearch table](tests/test_data/test-input/test) run on KEGG profiles with annotated sequences (preferable) **OR** [file with list](tests/test_data/test-input/test_list.txt) of KOs.
-If you don't have this table follow [instructions](src/README.md) how to generate it first.
+This script requires an [hmmsearch table](tests/fixtures/give_pathways/test_pathway.txt) that was run on KEGG profiles with annotated sequences (preferable) **OR** a [file with a list](tests/fixtures/give_pathways/test_kos.txt) of KOs.
+If you don't have this table, follow the [instructions](src/README.md) on how to generate it.
 
 #### Run using conda 
 ```commandline
 conda create --name kegg-env
 conda activate kegg-env
 
-pip3 install requirements.txt
+pip3 install -r requirements.txt
 
-export INPUT='tests/test_data/test-input/test'  # path to hmm-result table
-export OUTPUT='test-out'  # prefix for output
+export INPUT="tests/fixtures/give_pathways/test_pathway.txt"  # path to hmm-result table
+export OUTPUT="test-out"  # prefix for output
 
 # hmmtable as input
 python3 bin/give_pathways.py \
@@ -38,12 +46,12 @@ python3 bin/give_pathways.py \
 
 # KOs list as input
 python3 bin/give_pathways.py \
-  -l 'tests/test_data/test-input/test_list.txt' \
+  -l 'tests/fixtures/give_pathways/test_kos.txt' \
   -o ${OUTPUT}
 ```
-Check example of output [here](tests/test_data/test-output). \
-`kegg_pathways.tsv` has pathways completeness calculated by all KOs in given input file \
-`kegg_contigs.tsv` has pathways completeness calculated per each contig (first column contains name of contig).
+Check example of output [here](tests/fixtures/give_pathways/output). \
+`*kegg_pathways.tsv` contains the pathway completeness calculated from all KOs in the given input file \
+`*kegg_contigs.tsv` contains the pathway completeness calculated per contig (the first column contains the contig name).
 
 
 #### Run using docker
@@ -74,9 +82,9 @@ python3 bin/plot_completeness_graphs.py -i output_with_pathways_completeness
 
 Example,
 
-![M00050.png](tests%2Ftest_data%2Ftest-output%2Fplots%2FM00050.png)
+![M00050.png](tests/fixtures/give_pathways/output/pathways_plots/M00050.png)
 
-more examples for test data [here](tests/test_data/test-output/plots)
+more examples for test data [here](tests/fixtures/give_pathways/output/pathways_plots)
 
 
 ## Theory: 
@@ -88,23 +96,18 @@ where A, B, C, D, E, F are KOs \
 **comma** means OR \
 **plus** means essential component \
 **minus** means optional component
-Each expression was recursively [converted](bin/make_graphs/make_graphs.py) into directed graph using NetworkX. First node has number 0 and the last number 1. Each edge corresponds to KO. 
+Each expression was recursively [converted](kegg_pathways_completeness/bin/make_graphs/make_graphs.py) into directed graph using NetworkX. First node has number 0 and the last number 1. Each edge corresponds to KO. 
 
 ![ex1.png](src%2Fimg%2Fex1.png)
 
 ### Completeness
 In order to count pathways completeness each graph was made weighted. Default weight of each edge is 0. \
 Let's imagine there is a set of KOs predicted by annotation. If KO is presented in pathway - corresponding edge receives weight = 1 (or 0 if edge is optional or another value if edge is connected by +). \
-After that [script](bin/give_pathways.py) searches the most weighted path from node 0 to node 1 (`graph_weight`). 
+After that [script](kegg_pathways_completeness/bin/give_pathways.py) searches the most weighted path from node 0 to node 1 (`graph_weight`). 
 `max_graph_weight` calculated in assumption all KOs are presented. \
 ``
 completeness = graph_weight/max_graph_weight * 100%
 ``
 
 ![ex2.png](src%2Fimg%2Fex2.png)
 
-
-## Create plots for all pathways
-There are [plots](graphs/png) for every pathway as graph representation.
-If you need to re-generate them follow [instruction](graphs/README.md).
-
diff --git a/download.sh b/download.sh
diff --git a/src/cwl/__init__.py → kegg_pathways_completeness/__init__.py b/src/cwl/__init__.py → kegg_pathways_completeness/__init__.py
diff --git a/kegg_pathways_completeness/bin/__init__.py b/kegg_pathways_completeness/bin/__init__.py
diff --git a/kegg_pathways_completeness/bin/generate_hmmtable/__init__.py b/kegg_pathways_completeness/bin/generate_hmmtable/__init__.py
diff --git a/bin/generate_hmmtable/hmmscan_tab.py → ...ness/bin/generate_hmmtable/hmmscan_tab.py b/bin/generate_hmmtable/hmmscan_tab.py → ...ness/bin/generate_hmmtable/hmmscan_tab.py
@@ -8,8 +8,7 @@
 import sys
 import argparse
 
-
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(description="Convert fastq to fasta")
     parser.add_argument("-i", "--input", dest="input", help="Input file", required=True)
     parser.add_argument("-o", "--output", dest="output", help="Output file", required=True)
@@ -22,4 +21,7 @@
                 continue
             line = list(filter(None, line.strip().split(' ')))
             modified_line = '\t'.join(line[:22] + [' '.join(line[22:])])
-            file_out.write(modified_line + '\n')
+            file_out.write(modified_line + '\n')
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/generate_hmmtable/parsing_hmmscan.py → .../bin/generate_hmmtable/parsing_hmmscan.py b/bin/generate_hmmtable/parsing_hmmscan.py → .../bin/generate_hmmtable/parsing_hmmscan.py
@@ -4,7 +4,6 @@
 import sys
 from Bio import SeqIO
 
-
 def get_dir_contigs(input_fasta):
     dict_contigs = {}
     seq_records = SeqIO.parse(input_fasta, "fasta")
@@ -38,8 +37,7 @@ def parsing(dict_contigs, input_file, outdir):
             if len(dict_contigs[key]) != 0:
                 file_out.write('\t'.join([key]+list(dict_contigs[key]))+'\n')
 
-
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(description="Generates file with KEGG orthologs for each contig")
     parser.add_argument("-i", "--input", dest="input_file", help="Tab deliminated file with hmmscan results",
                         required=True)
@@ -51,4 +49,8 @@ def parsing(dict_contigs, input_file, outdir):
         parser.print_help()
     else:
         args = parser.parse_args()
-        parsing(get_dir_contigs(args.fasta_file), args.input_file, args.outdir)
+        parsing(get_dir_contigs(args.fasta_file), args.input_file, args.outdir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/give_pathways.py → ...athways_completeness/bin/give_pathways.py b/bin/give_pathways.py → ...athways_completeness/bin/give_pathways.py
@@ -238,21 +238,23 @@ def sort_out_pathways(graphs, edges, pathway_names, pathway_classes,
     for percentage in sorted(list(dict_sort_by_percentage.keys()), reverse=True):
         #file_out_summary.write('**********************************************\nPercentage = ' + str(percentage) + '\n')
         for name_pathway in dict_sort_by_percentage[percentage]:
+            matching_list = sorted(dict_sort_by_percentage[percentage][name_pathway][1])
+            missing_list = sorted(dict_sort_by_percentage[percentage][name_pathway][2])
             if include_weights:
                 # matching
                 out_str = []
-                for KO in dict_sort_by_percentage[percentage][name_pathway][1]:
+                for KO in matching_list:
                     record = KO + '(' + str(weights_of_KOs[name_pathway][KO]) + ')'
                     out_str.append(record)
                 matching_current = ','.join(out_str)
                 # missing
                 out_str = []
-                for KO in dict_sort_by_percentage[percentage][name_pathway][2]:
+                for KO in missing_list:
                     out_str.append(KO + '(' + str(weights_of_KOs[name_pathway][KO]) + ')')
                 missing_current = ','.join(out_str)
             else:
-                matching_current = ','.join(dict_sort_by_percentage[percentage][name_pathway][1])
-                missing_current = ','.join(dict_sort_by_percentage[percentage][name_pathway][2])
+                matching_current = ','.join(matching_list)
+                missing_current = ','.join(missing_list)
 
             if contig_name != '':
                 out_name_pathway = '\t'.join([contig_name, name_pathway])
@@ -303,21 +305,19 @@ def get_weights_for_KOs(graphs):
     logging.info('weights done')
     return dict_graphKO
 
-
-if __name__ == "__main__":
-
+def main():
     parser = argparse.ArgumentParser(description="Script generates Graphs for each contig")
     parser.add_argument("-i", "--input", dest="input_file", help="Each line = pathway", required=False)
     parser.add_argument("-l", "--input-list", dest="input_list", help="File with KOs comma separated", required=False)
 
     parser.add_argument("-g", "--graphs", dest="graphs", help="graphs in pickle format", required=False,
-                        default="graphs/graphs.pkl")
+                        default="kegg_pathways_completeness/graphs/graphs.pkl")
     parser.add_argument("-a", "--pathways", dest="pathways", help="Pathways list", required=False,
-                        default="pathways_data/all_pathways.txt")
+                        default="kegg_pathways_completeness/pathways_data/all_pathways.txt")
     parser.add_argument("-n", "--names", dest="names", help="Pathway names", required=False,
-                        default="pathways_data/all_pathways_names.txt")
+                        default="kegg_pathways_completeness/pathways_data/all_pathways_names.txt")
     parser.add_argument("-c", "--classes", dest="classes", help="Pathway classes", required=False,
-                        default="pathways_data/all_pathways_class.txt")
+                        default="kegg_pathways_completeness/pathways_data/all_pathways_class.txt")
 
     parser.add_argument("-o", "--outname", dest="outname", help="first part of ouput name", default="summary.kegg")
     parser.add_argument("-w", "--include-weights", dest="include_weights", help="add weights for each KO in output", action='store_true')
@@ -370,3 +370,6 @@ def get_weights_for_KOs(graphs):
         file_out_summary.close()
         logging.info('...Done')
         logging.info('Bye!')
+
+if __name__ == "__main__":
+    main()
diff --git a/kegg_pathways_completeness/bin/make_graphs/__init__.py b/kegg_pathways_completeness/bin/make_graphs/__init__.py
diff --git a/bin/make_graphs/get_dot.py → ...s_completeness/bin/make_graphs/get_dot.py b/bin/make_graphs/get_dot.py → ...s_completeness/bin/make_graphs/get_dot.py
diff --git a/bin/make_graphs/make_graphs.py → ...mpleteness/bin/make_graphs/make_graphs.py b/bin/make_graphs/make_graphs.py → ...mpleteness/bin/make_graphs/make_graphs.py
diff --git a/bin/make_graphs/plot.py → ...ways_completeness/bin/make_graphs/plot.py b/bin/make_graphs/plot.py → ...ways_completeness/bin/make_graphs/plot.py