Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Script to fetch unique conda environments #6659

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ test_output/
tests/data/
work/
.github/CODEOWNERS-tmp
conda_envs/
78 changes: 78 additions & 0 deletions get_conda_envs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import hashlib
import os
from pathlib import Path

import yaml
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn

# Shared rich console: used for the progress display and the final summary panel.
console = Console()


def find_and_normalize_yaml():
    """Collect the unique conda environments of all nf-core modules.

    Walks ``modules/nf-core`` for files named ``environment.yml``, hands each
    one to ``normalize_and_save_yaml`` (which writes previously unseen
    environments into ``conda_envs/``) and prints a summary panel with the
    number of files found, written and skipped as duplicates.
    """
    base_dir = Path("modules/nf-core")
    output_dir = Path("conda_envs")
    output_dir.mkdir(exist_ok=True)

    # Discover the files up front so the progress bar has a real total, and
    # sort them so that which copy of a duplicated environment is reported as
    # "saved" does not depend on filesystem traversal order.
    env_files = sorted(
        Path(root) / name
        for root, _dirs, names in os.walk(base_dir)
        for name in names
        if name == "environment.yml"
    )
    total_files = len(env_files)

    processed_hashes = set()
    written_files = 0
    skipped_files = 0

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        console=console,
    ) as progress:
        task = progress.add_task("[green]Processing files...", total=total_files)

        for file_path in env_files:
            was_written = normalize_and_save_yaml(file_path, output_dir, processed_hashes, progress, task)
            if was_written:
                written_files += 1
            else:
                skipped_files += 1

    console.print(
        Panel.fit(
            f"[bold green]✨ Summary ✨[/]\n\n"
            f"📁 Total environment.yml files found: [cyan]{total_files}[/]\n"
            f"📝 Unique files written: [cyan]{written_files}[/]\n"
            f"🔄 Duplicate files skipped: [cyan]{skipped_files}[/]",
            title="Conda Environment Processing Results",
            border_style="bold blue",
        )
    )


def normalize_and_save_yaml(file_path, output_dir, processed_hashes, progress, task):
    """Write the normalized form of one environment file unless it was already seen.

    The file is parsed with ``yaml.safe_load`` and re-dumped (key order kept),
    which removes comments and formatting differences. The MD5 of that dump
    identifies the environment and is used as the output file name.

    Args:
        file_path: Path of the ``environment.yml`` to read.
        output_dir: Directory (a ``Path``) that receives ``<hash>.yml`` files.
        processed_hashes: Set of hashes seen so far; updated in place.
        progress: Rich ``Progress`` instance used for printing and advancing.
        task: Id of the progress task to advance by one.

    Returns:
        True if a new file was written, False if the environment was a duplicate.
    """
    with open(file_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    normalized_yaml = yaml.dump(data, sort_keys=False)
    yaml_hash = hashlib.md5(normalized_yaml.encode()).hexdigest()

    is_new = yaml_hash not in processed_hashes
    if is_new:
        output_file = output_dir / f"{yaml_hash}.yml"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(normalized_yaml)
        processed_hashes.add(yaml_hash)
        progress.console.print(f"[green]✅ Saved:[/] {file_path} [dim](location: {output_file})[/]")
    else:
        progress.console.print(f"[yellow]⏭️ Skipped:[/] {file_path}")

    # Advance before returning so the bar counts every file, written or skipped.
    progress.update(task, advance=1)
    return is_new


if __name__ == "__main__":
    # Script entry point: print a banner, then run the collection.
    banner = "[bold magenta]Starting Conda Environment Processing...[/]\n"
    console.print(banner)
    find_and_normalize_yaml()
115 changes: 115 additions & 0 deletions get_wave_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import logging
import subprocess
from pathlib import Path

import yaml
from rich.console import Console
from rich.logging import RichHandler

# Configure rich console (shared by the log handler and the start/finish rules)
console = Console()

# Configure logging with rich
logging.basicConfig(
    level="DEBUG",  # Root logger level: DEBUG records are created and passed on to the handlers
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        # The handler has its own level of INFO, so DEBUG records are dropped
        # here and only INFO and above are rendered on the console.
        RichHandler(console=console, rich_tracebacks=True, show_time=False, level="INFO"),
    ],
)
# Logger used by every function in this script.
logger = logging.getLogger("rich")


def get_wave_image(env_file):
    """Request a container image for a conda environment file via the Wave CLI.

    Runs ``wave --conda-file <env_file> --await`` and returns whatever the
    command printed on stdout, with surrounding whitespace removed.

    Args:
        env_file: Path of the conda ``environment.yml``, as a string.

    Returns:
        The stripped stdout of the command, or None if the command exited
        with a non-zero status.
    """
    logger.debug(f"Requesting Wave image for {env_file}")
    try:
        result = subprocess.run(
            ["wave", "--conda-file", env_file, "--await"], capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as e:
        logger.error(f"Error running Wave CLI for {env_file}: {e}")
        # stderr is captured rather than shown, so report it explicitly;
        # otherwise the actual reason for the failure is lost.
        stderr = (e.stderr or "").strip()
        if stderr:
            logger.error(f"Wave CLI stderr for {env_file}: {stderr}")
        return None
    else:
        logger.debug(f"Successfully obtained Wave image for {env_file}")
        return result.stdout.strip()


def update_meta_yml(meta_file, docker_image):
    """Add a ``docker_x86`` entry to the ``containers`` section of a meta.yml.

    The file is edited as text instead of being re-serialized, so existing
    comments and formatting are kept. If there is no ``containers`` key, a new
    section is appended at the end of the file; otherwise the entry is
    inserted directly below the existing ``containers:`` line.

    Args:
        meta_file: Path of the ``meta.yml`` to modify in place.
        docker_image: Image reference to store under ``docker_x86``.

    Raises:
        ValueError: If the parsed YAML has a ``containers`` key but no
            block-style ``containers:`` line can be found in the text
            (for example when it is written inline as ``containers: {}``).
    """
    logger.debug(f"Updating {meta_file} with Docker image")

    # Read the existing content
    with open(meta_file) as f:
        content = f.read()
    meta_data = yaml.safe_load(content) or {}

    # Check if 'containers' key exists
    if "containers" not in meta_data:
        # If not, append a new section at the end of the file
        content += f"\ncontainers:\n docker_x86: {docker_image}\n"
    else:
        lines = content.splitlines()

        def indent_of(line):
            return len(line) - len(line.lstrip())

        # Accept a trailing comment on the header line.
        candidates = [
            i for i, line in enumerate(lines) if line.split(" #", 1)[0].strip() == "containers:"
        ]
        if not candidates:
            raise ValueError(f"Could not locate a block-style 'containers:' line in {meta_file}")
        # The top-level key is the least indented match; nested keys that
        # happen to share the name are indented further.
        containers_index = min(candidates, key=lambda i: indent_of(lines[i]))
        header_indent = indent_of(lines[containers_index])

        # Reuse the indentation of the first existing entry. Blank and
        # comment-only lines are skipped; if the section is empty (next real
        # line is a sibling key, or the file ends) indent one step past the header.
        child_indent = header_indent + 1
        for line in lines[containers_index + 1 :]:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            if indent_of(line) > header_indent:
                child_indent = indent_of(line)
            break

        lines.insert(containers_index + 1, " " * child_indent + f"docker_x86: {docker_image}")
        content = "\n".join(lines) + "\n"

    # Write the updated content back to the file
    with open(meta_file, "w") as f:
        f.write(content)

    logger.debug(f"Successfully updated {meta_file}")


def process_module_directory(directory, modname):
    """Ensure the module in ``directory`` has a ``docker_x86`` container entry.

    Nothing is done when ``environment.yml`` or ``meta.yml`` is missing, or
    when ``meta.yml`` already lists ``docker_x86`` under ``containers``.
    Otherwise an image is requested with ``get_wave_image`` and, if one is
    returned, written into ``meta.yml`` with ``update_meta_yml``.

    Args:
        directory: ``Path`` of the module directory.
        modname: Module name used in the "Processing" log line.
    """
    env_file = directory / "environment.yml"
    meta_file = directory / "meta.yml"

    if not env_file.exists():
        logger.debug(f"Skipping {directory.name}: environment.yml not found")
        return
    if not meta_file.exists():
        logger.debug(f"Skipping {directory.name}: meta.yml not found")
        return

    logger.debug(f"Checking {meta_file}")
    with open(meta_file) as f:
        meta_data = yaml.safe_load(f) or {}

    if "containers" in meta_data:
        containers = meta_data["containers"]
        # An empty ``containers:`` key parses as None and a scalar as str;
        # neither supports a meaningful key lookup, so skip instead of crashing.
        if not isinstance(containers, dict):
            logger.error(f"Skipped {meta_file}: 'containers' is not a mapping")
            return
        if "docker_x86" in containers:
            logger.debug(f"Skipped {meta_file} (already contains docker_x86)")
            return

    logger.debug(f"docker_x86 not found in {meta_file}, proceeding with Wave CLI")
    logger.info(f"Processing: {modname}")
    docker_image = get_wave_image(str(env_file))
    if docker_image:
        update_meta_yml(meta_file, docker_image)
        logger.debug(f"Got image: {docker_image}")
    else:
        logger.error(f"Failed to update {meta_file}: No Docker image obtained")


def process_modules():
    """Visit every module under ``modules/nf-core`` and process it.

    A first-level directory that holds both ``environment.yml`` and
    ``meta.yml`` is processed as a module itself; for any other first-level
    directory, each of its sub-directories is processed as ``<tool>/<subtool>``.
    """
    modules_dir = Path("modules/nf-core")
    logger.debug(f"Processing modules in {modules_dir}")

    required = ("environment.yml", "meta.yml")

    for tool_dir in modules_dir.iterdir():
        if not tool_dir.is_dir():
            continue

        # The directory is a module in its own right when both files are present.
        if all((tool_dir / filename).exists() for filename in required):
            process_module_directory(tool_dir, tool_dir.name)
            continue

        # Otherwise look one level further down.
        for child in tool_dir.iterdir():
            if not child.is_dir():
                continue
            logger.debug(f"Processing subdirectory: {child.name}")
            process_module_directory(child, f"{tool_dir.name}/{child.name}")


if __name__ == "__main__":
    # Script entry point: frame the run with a start and a finish rule.
    start_title = "[bold blue]Starting module processing[/bold blue]"
    end_title = "[bold blue]Finished processing all modules[/bold blue]"

    console.rule(start_title)
    process_modules()
    console.rule(end_title)
Loading