Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changes to maintenance script / ensembl push #1646

Merged
merged 13 commits into from
Jan 17, 2025
143 changes: 132 additions & 11 deletions maintenance/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ def test_name(self):
def test_common_name(self):
assert self.species.common_name == "$common_name"

def test_assembly_source(self):
assert self.genome.assembly_source == "$assembly_source"

def test_assembly_build_version(self):
assert self.genome.assembly_build_version == "$assembly_build_version"

# QC Tests. These tests are performed by another contributor
# independently referring to the citations provided in the
# species definition, filling in the appropriate values
Expand Down Expand Up @@ -177,6 +183,13 @@ def black_format(code):


def ensembl_stdpopsim_id(ensembl_id):
# Here is commented-out example code for dealing with name changes
# in Ensembl: when they changed the Ensembl ID of 'dog' from
# 'canis_familiaris' to 'canis_lupus_familiaris", we inserted these two lines;
# used this script to update `CanFam/genome_data.py` (which also put in
# the new Ensembl ID); and then removed the lines. The code:
# if ensembl_id == "canis_lupus_familiaris":
# ensembl_id = "canis_familiaris"
tmp = ensembl_id.split("_")[:2]
sps_id = "".join([x[0:3].capitalize() for x in tmp])
if len(sps_id) != 6:
Expand Down Expand Up @@ -296,15 +309,73 @@ def write_genome_data(self, ensembl_id):
raise ValueError(
f"Directory {id} corresponding to {ensembl_id} does" + "not exist"
)
logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
path = path / "genome_data.py"

genome_data_path = path / "genome_data.py"
existing_data = None

# Try to read existing genome data file once
if genome_data_path.exists():
try:
namespace = {}
with open(genome_data_path) as f:
exec(f.read(), namespace)
existing_data = namespace["data"]

# Check for non-Ensembl assembly source
existing_assembly_source = existing_data.get(
"assembly_source", "ensembl"
)
if existing_assembly_source != "ensembl":
logger.info(
f"Skipping {sps_id} ({ensembl_id}): "
f"existing genome_data.py has data "
f"not from Ensembl. (Re)move {genome_data_path}, "
f"re-run, and look"
f"at a diff to compare to current Ensembl data."
)
return ("manual", None)
except Exception as e:
logger.warning(
f"Error reading genome data for {sps_id}: {e}. "
"Proceeding with update."
)

# Get new data from Ensembl
data = self.ensembl_client.get_genome_data(ensembl_id)

# Preserve existing assembly source or default to "ensembl"
data["assembly_source"] = (
existing_data.get("assembly_source", "ensembl")
if existing_data
else "ensembl"
)

# Add Ensembl version number if assembly source is Ensembl
data["assembly_build_version"] = (
str(self.ensembl_client.get_release())
if data["assembly_source"] == "ensembl"
else None
)

# Check for chromosome name mismatches if we have existing data
if existing_data:
existing_chroms = set(existing_data["chromosomes"].keys())
new_chroms = set(data["chromosomes"].keys())

if existing_chroms != new_chroms:
logger.warning(
f"Skipping {sps_id} ({ensembl_id}): chromosome names in existing "
"genome_data.py do not match those in current Ensembl release. "
)
return ("chrom_mismatch", (existing_chroms, new_chroms))

logger.info(f"Writing genome data for {sps_id} {ensembl_id}")
code = f"data = {data}"

# Format the code with Black so we don't get noisy diffs
with self.write(path) as f:
with self.write(genome_data_path) as f:
f.write(black_format(code))
return data
return ("updated", None)
petrelharp marked this conversation as resolved.
Show resolved Hide resolved

def write_genome_data_ncbi(self, ncbi_id, sps_id):
path = catalog_path(sps_id)
Expand Down Expand Up @@ -371,15 +442,66 @@ def update_genome_data(species):
will update the genome data for humans. Multiple species can
be specified. By default all species are updated.
"""
# TODO make this work for NCBI as well
# Track warnings and errors
skipped_species = []

# Original species processing logic
if len(species) == 0:
embl_ids = [s.ensembl_id for s in stdpopsim.all_species()]
species_list = list(stdpopsim.all_species())
logger.info(f"Found {len(species_list)} species in catalog")
embl_ids = []
for s in species_list:
logger.info(f"Processing {s.id}: ensembl_id={s.ensembl_id}")
embl_ids.append((s.id, s.ensembl_id))
else:
embl_ids = [s.lower() for s in species]
embl_ids = [(s, s.lower()) for s in species]

# Process each species, maintaining existing logging
writer = DataWriter()
for eid in embl_ids:
writer.write_genome_data(eid)
writer.write_ensembl_release()
for species_id, eid in embl_ids:
try:
result = writer.write_genome_data(eid)
status, details = result
if status == "manual":
skipped_species.append(
(species_id, eid, "Manually created genome data file")
)
elif status == "chrom_mismatch":
existing_chroms, new_chroms = details
skipped_species.append(
(
species_id,
eid,
(
f"Chromosome names mismatch.\n"
f"Existing: {sorted(existing_chroms)}\n"
f"New: {sorted(new_chroms)}"
),
)
)
except ValueError as e:
logger.error(f"Error processing {eid}: {e}")
skipped_species.append((species_id, eid, str(e)))
except Exception as e:
logger.error(f"Unexpected error processing {eid}: {e}")
skipped_species.append((species_id, eid, f"Unexpected error: {str(e)}"))

# Add summary report at the end
if skipped_species:
logger.warning("\n=== Species Update Summary ===")
logger.warning("The following species were not updated:")
for species_id, eid, reason in skipped_species:
if "Chromosome names mismatch" in reason:
# Split the chromosome mismatch message into multiple lines
logger.warning(f" - {species_id} (Ensembl ID: {eid}):")
logger.warning(" Chromosome names mismatch.")
# Parse the chromosome lists from the new format
existing = reason.split("Existing: ")[1].split("\n")[0]
new = reason.split("New: ")[1]
logger.warning(f" Existing chromosomes: {existing}")
logger.warning(f" New chromosomes: {new}")
else:
logger.warning(f" - {species_id} (Ensembl ID: {eid}): {reason}")


@cli.command()
Expand All @@ -391,7 +513,6 @@ def add_species(ensembl_id, force):
"""
writer = DataWriter()
writer.add_species(ensembl_id.lower(), force=force)
writer.write_ensembl_release()


# TODO refactor this so that it's an option to add-species. By default
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AedAeg/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@
"3": {"length": 409777670, "synonyms": []},
"MT": {"length": 16790, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnaPla/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,6 @@
"39": {"length": 2018729, "synonyms": []},
"40": {"length": 1354177, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoCar/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@
"LGh": {"length": 248369, "synonyms": []},
"MT": {"length": 17223, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "103",
}
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AnoGam/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@
"X": {"length": 24393108, "synonyms": []},
"Mt": {"length": 15363, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
36 changes: 19 additions & 17 deletions stdpopsim/catalog/ApiMel/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,24 @@
"assembly_accession": "GCA_003254395.2",
"assembly_name": "Amel_HAv3.1",
"chromosomes": {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this is why Codecov is complaining (no more synonyms, so this bit of code in ApiMel/species.py is not needed)
Screenshot from 2025-01-13 21-54-18

"CM009931.2": {"length": 27754200, "synonyms": ["NC_037638.1"]},
"CM009932.2": {"length": 16089512, "synonyms": ["NC_037639.1"]},
"CM009933.2": {"length": 13619445, "synonyms": ["NC_037640.1"]},
"CM009934.2": {"length": 13404451, "synonyms": ["NC_037641.1"]},
"CM009935.2": {"length": 13896941, "synonyms": ["NC_037642.1"]},
"CM009936.2": {"length": 17789102, "synonyms": ["NC_037643.1"]},
"CM009937.2": {"length": 14198698, "synonyms": ["NC_037644.1"]},
"CM009938.2": {"length": 12717210, "synonyms": ["NC_037645.1"]},
"CM009939.2": {"length": 12354651, "synonyms": ["NC_037646.1"]},
"CM009940.2": {"length": 12360052, "synonyms": ["NC_037647.1"]},
"CM009941.2": {"length": 16352600, "synonyms": ["NC_037648.1"]},
"CM009942.2": {"length": 11514234, "synonyms": ["NC_037649.1"]},
"CM009943.2": {"length": 11279722, "synonyms": ["NC_037650.1"]},
"CM009944.2": {"length": 10670842, "synonyms": ["NC_037651.1"]},
"CM009945.2": {"length": 9534514, "synonyms": ["NC_037652.1"]},
"CM009946.2": {"length": 7238532, "synonyms": ["NC_037653.1"]},
"CM009947.2": {"length": 16343, "synonyms": ["NC_001566.1", "MT"]},
petrelharp marked this conversation as resolved.
Show resolved Hide resolved
"CM009931.2": {"length": 27754200, "synonyms": []},
"CM009932.2": {"length": 16089512, "synonyms": []},
"CM009933.2": {"length": 13619445, "synonyms": []},
"CM009934.2": {"length": 13404451, "synonyms": []},
"CM009935.2": {"length": 13896941, "synonyms": []},
"CM009936.2": {"length": 17789102, "synonyms": []},
"CM009937.2": {"length": 14198698, "synonyms": []},
"CM009938.2": {"length": 12717210, "synonyms": []},
"CM009939.2": {"length": 12354651, "synonyms": []},
"CM009940.2": {"length": 12360052, "synonyms": []},
"CM009941.2": {"length": 16352600, "synonyms": []},
"CM009942.2": {"length": 11514234, "synonyms": []},
"CM009943.2": {"length": 11279722, "synonyms": []},
"CM009944.2": {"length": 10670842, "synonyms": []},
"CM009945.2": {"length": 9534514, "synonyms": []},
"CM009946.2": {"length": 7238532, "synonyms": []},
"CM009947.2": {"length": 16343, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
10 changes: 2 additions & 8 deletions stdpopsim/catalog/ApiMel/species.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,9 @@
"CM009947.2": ["NC_001566.1", "MT"],
}


def add_if_unique(chrom, synonyms):
for syn in synonyms:
if syn not in chrom.synonyms:
chrom.synonyms.append(syn)


for chrom in _genome.chromosomes:
add_if_unique(chrom, chr_synonyms_dict[chrom.id])
syns = list(set(chrom.synonyms + chr_synonyms_dict[chrom.id]))
chrom.synonyms = syns

_NelsonEtAl = stdpopsim.Citation(
doi="https://doi.org/10.1111/mec.14122",
Expand Down
2 changes: 2 additions & 0 deletions stdpopsim/catalog/AraTha/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@
"Mt": {"length": 366924, "synonyms": []},
"Pt": {"length": 154478, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
66 changes: 34 additions & 32 deletions stdpopsim/catalog/BosTau/genome_data.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,40 @@
# File autogenerated from Ensembl REST API. Do not edit.
data = {
"assembly_accession": "GCA_002263795.2",
"assembly_name": "ARS-UCD1.2",
"assembly_accession": "GCA_002263795.3",
"assembly_name": "ARS-UCD1.3",
"chromosomes": {
"1": {"length": 158534110, "synonyms": []},
"2": {"length": 136231102, "synonyms": []},
"3": {"length": 121005158, "synonyms": []},
"4": {"length": 120000601, "synonyms": []},
"5": {"length": 120089316, "synonyms": []},
"6": {"length": 117806340, "synonyms": []},
"7": {"length": 110682743, "synonyms": []},
"8": {"length": 113319770, "synonyms": []},
"9": {"length": 105454467, "synonyms": []},
"10": {"length": 103308737, "synonyms": []},
"11": {"length": 106982474, "synonyms": []},
"12": {"length": 87216183, "synonyms": []},
"13": {"length": 83472345, "synonyms": []},
"14": {"length": 82403003, "synonyms": []},
"15": {"length": 85007780, "synonyms": []},
"16": {"length": 81013979, "synonyms": []},
"17": {"length": 73167244, "synonyms": []},
"18": {"length": 65820629, "synonyms": []},
"19": {"length": 63449741, "synonyms": []},
"20": {"length": 71974595, "synonyms": []},
"21": {"length": 69862954, "synonyms": []},
"22": {"length": 60773035, "synonyms": []},
"23": {"length": 52498615, "synonyms": []},
"24": {"length": 62317253, "synonyms": []},
"25": {"length": 42350435, "synonyms": []},
"26": {"length": 51992305, "synonyms": []},
"27": {"length": 45612108, "synonyms": []},
"28": {"length": 45940150, "synonyms": []},
"29": {"length": 51098607, "synonyms": []},
"X": {"length": 139009144, "synonyms": []},
"1": {"length": 158534110, "synonyms": ["chr1"]},
"2": {"length": 136231102, "synonyms": ["chr2"]},
"3": {"length": 121005158, "synonyms": ["chr3"]},
"4": {"length": 120000601, "synonyms": ["chr4"]},
"5": {"length": 120089316, "synonyms": ["chr5"]},
"6": {"length": 117806340, "synonyms": ["chr6"]},
"7": {"length": 110682743, "synonyms": ["chr7"]},
"8": {"length": 113319770, "synonyms": ["chr8"]},
"9": {"length": 105454467, "synonyms": ["chr9"]},
"10": {"length": 103308737, "synonyms": ["chr10"]},
"11": {"length": 106982474, "synonyms": ["chr11"]},
"12": {"length": 87216183, "synonyms": ["chr12"]},
"13": {"length": 83472345, "synonyms": ["chr13"]},
"14": {"length": 82403003, "synonyms": ["chr14"]},
"15": {"length": 85007780, "synonyms": ["chr15"]},
"16": {"length": 81013979, "synonyms": ["chr16"]},
"17": {"length": 73167244, "synonyms": ["chr17"]},
"18": {"length": 65820629, "synonyms": ["chr18"]},
"19": {"length": 63449741, "synonyms": ["chr19"]},
"20": {"length": 71974595, "synonyms": ["chr20"]},
"21": {"length": 69862954, "synonyms": ["chr21"]},
"22": {"length": 60773035, "synonyms": ["chr22"]},
"23": {"length": 52498615, "synonyms": ["chr23"]},
"24": {"length": 62317253, "synonyms": ["chr24"]},
"25": {"length": 42350435, "synonyms": ["chr25"]},
"26": {"length": 51992305, "synonyms": ["chr26"]},
"27": {"length": 45612108, "synonyms": ["chr27"]},
"28": {"length": 45940150, "synonyms": ["chr28"]},
"29": {"length": 51098607, "synonyms": ["chr29"]},
"X": {"length": 139009144, "synonyms": ["chrX"]},
"MT": {"length": 16338, "synonyms": []},
},
"assembly_source": "ensembl",
"assembly_build_version": "113",
}
Loading
Loading