-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from obophenotype/test
Dockerfile and docker-compose file have been added.
- Loading branch information
Showing
8 changed files
with
330 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# GitHub Actions workflow: build this repository's Docker image and push it
# to the GitHub Container Registry (ghcr.io) on releases or manual dispatch.
name: Docker

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

on:
  workflow_dispatch:
  release:
    types: [created]

env:
  # Use docker.io for Docker Hub if empty
  REGISTRY: ghcr.io
  # github.repository as <account>/<repo>
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # Third-party actions below are pinned to full commit SHAs so their
      # code cannot change underneath this workflow.
      - name: Log in to the Container registry
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      # Expose the current branch name as $BRANCH for later steps.
      - name: Setup environment
        run: echo "BRANCH=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV

      # QEMU + Buildx enable the multi-platform (amd64/arm64) build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1

      - name: Build and push Docker image
        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
        with:
          context: .
          push: true
          platforms: linux/amd64,linux/arm64
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Image for the anndata2rdf pipeline: downloads CxG AnnData datasets and
# converts them to RDF graphs (entry point: src/process.py).
FROM python:3.9

# Use bash for RUN instructions (default is /bin/sh).
SHELL ["/bin/bash", "-c"]

# build-essential + graphviz headers are needed to compile native Python
# dependencies; clean apt caches in the same layer to keep the image small.
RUN apt-get update && apt-get install -y build-essential graphviz libgraphviz-dev pkg-config && apt-get clean && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install into a dedicated virtualenv and put it first on PATH so the
# plain `pip`/`python` commands below resolve to it.
ENV VENV="/opt/env"
ENV PATH="$VENV/bin:$PATH"

RUN python -m venv $VENV

# Copy requirements separately so the dependency layer is cached
# independently of source-code changes.
COPY requirements.txt ./
RUN pip install -r requirements.txt

COPY src/ ./src

CMD ["python", "src/process.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Compose service for the anndata2rdf pipeline. The bind mounts keep the
# pipeline's inputs (config, curated_data) and outputs (dataset, graph) on
# the host so they survive container restarts.
version: '3.8'

services:
  anndata2rdf:
    image: anndata2rdf:latest
    container_name: anndata_to_rdf
    volumes:
      - ./src/config:/app/src/config
      - ./src/curated_data:/app/src/curated_data
      - ./src/dataset:/app/src/dataset
      - ./src/graph:/app/src/graph
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Runtime dependencies for the anndata2rdf pipeline.
cellxgene-census==1.11.1
pandasaurus-cxg~=0.1.11
pandas~=2.2.1
PyYAML~=6.0.1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import logging | ||
import os | ||
|
||
import pandas as pd | ||
import yaml | ||
|
||
|
||
# Quiet third-party loggers by default; this module's own logger emits INFO.
logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
|
||
|
||
def generate_yaml_data(data):
    """Group curated rows by their CxG link and build YAML-ready entries.

    Each entry maps a dataset link ("CxG_link") to the whitespace-stripped
    list of values found in the "Author Category Cell Type Field Name"
    column for that link ("author_cell_type_list").
    """
    entries = []
    for cxg_link, rows in data.groupby("CxG link"):
        cell_type_fields = [
            field.strip()
            for field in rows["Author Category Cell Type Field Name"].tolist()
        ]
        entries.append(
            {"CxG_link": cxg_link, "author_cell_type_list": cell_type_fields}
        )
    return entries
|
||
|
||
def write_yaml_file(yaml_data, file_path):
    """Serialize *yaml_data* to *file_path* as YAML and log the write."""
    with open(file_path, "w") as out_handle:
        yaml.dump(yaml_data, out_handle)
    logger.info(f"{file_path} written")
|
||
|
||
def generate_author_cell_type_config(curated_data_folder: str = "curated_data"):
    """Build the author-cell-type config entries from all curated data files.

    Reads every .csv/.xlsx/.xls file in *curated_data_folder* (resolved
    relative to this module's directory) and concatenates the per-file
    entries produced by ``generate_yaml_data``.

    Args:
        curated_data_folder: Name of the folder next to this file that
            holds the curated spreadsheets. Defaults to "curated_data".

    Returns:
        list: YAML-ready entries from all supported files.
    """
    all_yaml_data = []
    data_folder = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), curated_data_folder
    )
    # Sort for a deterministic processing order; os.listdir order is
    # filesystem-dependent.
    for file_name in sorted(os.listdir(data_folder)):
        file_path = os.path.join(data_folder, file_name)

        if file_name.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif file_name.endswith((".xlsx", ".xls")):
            df = pd.read_excel(file_path)
        else:
            logger.info(f"Skipping file '{file_name}' with unsupported format.")
            continue

        all_yaml_data.extend(generate_yaml_data(df))
    return all_yaml_data
|
||
|
||
if __name__ == "__main__":
    # Regenerate the author cell type config and write it under config/
    # next to this module.
    config_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config")
    write_yaml_file(
        generate_author_cell_type_config(),
        os.path.join(config_dir, "cxg_author_cell_type.yaml"),
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import logging | ||
import os | ||
from typing import List | ||
import yaml | ||
|
||
from pandasaurus_cxg.enrichment_analysis import AnndataEnrichmentAnalyzer | ||
from pandasaurus_cxg.graph_generator.graph_generator import GraphGenerator | ||
|
||
# Quiet third-party loggers by default; this module's own logger emits INFO.
logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
|
||
|
||
def generate_rdf_graph(
    anndata_file_path: str, author_cell_type_list: List[str], output_rdf_path: str
):
    """Run the pandasaurus-cxg enrichment pipeline and save an RDF graph.

    Args:
        anndata_file_path: Path to the input AnnData (.h5ad) file.
        author_cell_type_list: obs fields holding author cell type labels;
            also used as the label-adding priority order.
        output_rdf_path: Destination path for the serialized RDF graph.
    """
    logger.info(f"Generating RDF graph using {anndata_file_path}...")
    analyzer = AnndataEnrichmentAnalyzer(anndata_file_path, author_cell_type_list)
    analyzer.analyzer_manager.co_annotation_report()
    graph_generator = GraphGenerator(analyzer)
    graph_generator.generate_rdf_graph()
    graph_generator.set_label_adding_priority(author_cell_type_list)
    graph_generator.add_label_to_terms()
    graph_generator.save_rdf_graph(file_name=output_rdf_path)
    logger.info(f"RDF graph has been generated for {anndata_file_path}...")
|
||
|
||
if __name__ == "__main__":
    # Standalone mode: read config/rdf_config.yaml and build one RDF graph
    # per configured dataset.
    dirname = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(dirname, "config", "rdf_config.yaml")
    with open(config_path, "r") as file:
        config_data = yaml.safe_load(file)

    for config in config_data:
        anndata_path = str(config["anndata_file_path"])
        # Fall back to the dataset file's stem when no explicit output
        # path is configured.
        default_name = config["anndata_file_path"].split("/")[-1].split(".")[0]
        generate_rdf_graph(
            os.path.join(dirname, anndata_path),
            config["author_cell_type_list"],
            os.path.join("graph", config.get("output_rdf_path", default_name)),
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import logging | ||
import os | ||
import sys | ||
|
||
from csv_parser import generate_author_cell_type_config, write_yaml_file | ||
from pull_anndata import download_dataset_with_id, get_dataset_dict, delete_file | ||
from generate_rdf import generate_rdf_graph | ||
|
||
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)
# Mirror INFO-and-above records to stdout with timestamps so container logs
# (docker logs) show pipeline progress.
stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stdout_handler.setFormatter(formatter)
logger.addHandler(stdout_handler)

# Directory names, resolved relative to this file below.
CONFIG_DIRECTORY = "config"
CURATED_DATA_DIRECTORY = "curated_data"
DATASET_DIRECTORY = "dataset"
GRAPH_DIRECTORY = "graph"

CXG_AUTHOR_CELL_TYPE_CONFIG = "cxg_author_cell_type.yaml"
GENERATE_RDF_CONFIG = "generate_rdf_config.yaml"  # NOTE(review): not used in this script — confirm intent

# Step 1: regenerate the author cell type config from the curated
# spreadsheets and persist it under config/.
cxg_author_cell_type_yaml = generate_author_cell_type_config()
output_file_path = os.path.join(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), CONFIG_DIRECTORY),
    CXG_AUTHOR_CELL_TYPE_CONFIG,
)
write_yaml_file(cxg_author_cell_type_yaml, output_file_path)

# Step 2: for each dataset referenced in the config, download its AnnData
# file, generate the RDF graph under graph/, then delete the download to
# bound disk usage.
datasets = get_dataset_dict(cxg_author_cell_type_yaml)
for dataset, author_cell_types in datasets.items():
    dataset_path = download_dataset_with_id(dataset)
    generate_rdf_graph(
        dataset_path,
        author_cell_types,
        os.path.join(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), GRAPH_DIRECTORY),
            dataset,
        ),
    )
    delete_file(dataset_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import logging | ||
import os | ||
from typing import Dict, List, Optional, Union | ||
import yaml | ||
|
||
import cellxgene_census | ||
|
||
|
||
# Quiet third-party loggers by default; this module's own logger emits INFO.
logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
|
||
|
||
def download_dataset_with_id(dataset_id: str, file_path: Optional[str] = None) -> str:
    """
    Download an AnnData dataset with the specified ID.

    Args:
        dataset_id (str): The ID of the dataset to download.
        file_path (Optional[str], optional): The file path to save the downloaded AnnData. If not provided,
            the dataset_id will be used as the file name. Defaults to None.

    Returns:
        str: The path to the downloaded file
    """
    anndata_file_path = f"{dataset_id}.h5ad" if file_path is None else file_path
    # Resolve under this module's dataset/ folder so the location does not
    # depend on the process working directory.
    anndata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        os.path.join("dataset", anndata_file_path),
    )
    if os.path.exists(anndata_file_path):
        logger.info(f"File '{anndata_file_path}' already exists. Skipping download.")
    else:
        # Fixed misplaced quote: close the ID quote before "to".
        logger.info(f"Downloading dataset with ID '{dataset_id}' to '{anndata_file_path}'...")
        cellxgene_census.download_source_h5ad(dataset_id, to_path=anndata_file_path)
        logger.info(f"Download complete. File saved at '{anndata_file_path}'.")
    return anndata_file_path
|
||
|
||
def delete_file(file_name):
    """Delete *file_name*, logging the outcome.

    Failures (missing file, permissions) are logged and swallowed so a
    cleanup failure never aborts the pipeline.
    """
    try:
        os.remove(file_name)
        logger.info(f"File '{file_name}' deleted successfully.")
    except OSError as e:
        # Log at ERROR, not INFO: a failed cleanup should be visible.
        logger.error(f"Error deleting file '{file_name}': {e}")
|
||
|
||
def get_dataset_dict(input_source: List[Dict]):
    """Map dataset IDs to their author cell type lists.

    The dataset ID is the last non-empty path segment of each entry's
    "CxG_link", stripped of any file extension.
    """
    dataset_map = {}
    for entry in input_source:
        link = entry["CxG_link"]
        # Inlined link parsing: take the last non-empty path segment.
        raw_id = link.split("/")[-2] if link.endswith("/") else link.split("/")[-1]
        dataset_map[raw_id.split(".")[0]] = entry["author_cell_type_list"]
    return dataset_map
|
||
|
||
def get_dataset_id_from_link(cxg_link: str) -> str:
    """Return the last non-empty path segment of a CxG explorer link."""
    segments = cxg_link.split("/")
    return segments[-2] if cxg_link.endswith("/") else segments[-1]
|
||
|
||
def read_yaml_config(config_file: str):
    """Parse *config_file* as YAML and return the loaded object."""
    with open(config_file, "r") as handle:
        return yaml.safe_load(handle)
|
||
|
||
if __name__ == "__main__":
    # Standalone mode: download every dataset referenced in the generated
    # author cell type config.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    config_list = read_yaml_config(
        os.path.join(module_dir, "config", "cxg_author_cell_type.yaml")
    )
    for dataset_id in get_dataset_dict(config_list):
        download_dataset_with_id(dataset_id)