diff --git a/src/vfb_connect.egg-info/PKG-INFO b/src/vfb_connect.egg-info/PKG-INFO index 653779b7..334d064f 100644 --- a/src/vfb_connect.egg-info/PKG-INFO +++ b/src/vfb_connect.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: vfb_connect -Version: 2.2.1.dev8+68834e4.dirty +Version: 2.2.5.dev6+8aacf1a Summary: Wrapper for querying VirtualFlyBrain servers. Home-page: https://github.com/VirtualFlyBrain/VFB_connect Author: David Osumi-Sutherland diff --git a/src/vfb_connect/cross_server_tools.py b/src/vfb_connect/cross_server_tools.py index 42b85f16..6fbbaed7 100644 --- a/src/vfb_connect/cross_server_tools.py +++ b/src/vfb_connect/cross_server_tools.py @@ -96,6 +96,8 @@ def __init__(self, neo_endpoint=get_default_servers()['neo_endpoint'], self.neo_query_wrapper = QueryWrapper(**connections['neo']) self.cache_file = self.get_cache_file_path() self.lookup = self.nc.get_lookup(cache=self.cache_file) + self.normalized_lookup = self.preprocess_lookup() + self.reverse_lookup = {v: k for k, v in self.lookup.items()} self.oc = OWLeryConnect(endpoint=owlery_endpoint, lookup=self.lookup) self.vfb_base = "https://v2.virtualflybrain.org/org.geppetto.frontend/geppetto?id=" @@ -107,7 +109,7 @@ def __init__(self, neo_endpoint=get_default_servers()['neo_endpoint'], self.queries = json.loads(saxutils.unescape(f.read())) self._term_cache = [] - self._use_cache = True + self._use_cache = False self._load_limit = False self._dbs = None self._gene_function_filters = None @@ -148,19 +150,41 @@ def reload_lookup_cache(self, verbose=False): self.lookup = self.nc.get_lookup(cache=self.cache_file, verbose=verbose) def lookup_name(self, ids): - """Lookup the name for a given ID using the internal lookup table. + """ + Lookup the name for a given ID using the internal lookup table. :param ids: A single ID or list of IDs to look up. - :return: The name associated with the ID. - :rtype: str + :return: The name associated with the ID or a list of names if input is a list. + :rtype: str or list of str """ if isinstance(ids, list): return [self.lookup_name(id) for id in ids] - if not ids in self.lookup.values(): - return ids # If not found, return the input - return {v: k for k, v in self.lookup.items()}[ids] - def lookup_id(self, key, return_curie=False, allow_subsitutions=True, subsitution_stages=['adult', 'larval', 'pupal'], verbose=False): + if ids not in self.reverse_lookup: + return ids # If not found, return the input + + return self.reverse_lookup[ids] + + def preprocess_lookup(self): + """Preprocesses the lookup table to create a normalized lookup for faster access.""" + normalized_lookup = {} + for k, v in self.lookup.items(): + norm_key = self.normalize_key(k) + if norm_key not in normalized_lookup: + normalized_lookup[norm_key] = v + return normalized_lookup + + def normalize_key(self, key): + """ + Normalize the key for comparison by making it lowercase and removing special characters. + + :param key: The key to normalize. + :return: A normalized string. + """ + return key.lower().replace('_', '').replace('-', '').replace(' ', '').replace(':', '').replace(';', '') + + + def lookup_id(self, key, return_curie=False, allow_substitutions=True, substitution_stages=['adult', 'larval', 'pupal'], verbose=False): """Lookup the ID for a given key (label or symbol) using the internal lookup table. :param key: The label symbol, synonym, or potential ID to look up. @@ -173,85 +197,75 @@ def lookup_id(self, key, return_curie=False, allow_subsitutions=True, subsitutio if not key: print("\033[31mError:\033[0m No key provided.") return '' - # Check if the key is a VFBTerm object + if isinstance(key, VFBTerm): return key.id - + if isinstance(key, VFBTerms): return key.get_ids() - + if isinstance(key, list): - return [self.lookup_id(k, return_curie=return_curie, allow_subsitutions=allow_subsitutions, subsitution_stages=subsitution_stages) for k in key] - + return [self.lookup_id(k, return_curie=return_curie, allow_substitutions=allow_substitutions, substitution_stages=substitution_stages) for k in key] + if isinstance(key, str): dbs = self.get_dbs() if ":" in key and any(key.startswith(db) for db in dbs): split_key = key.rsplit(':', 1) - print(f"Split xref: {split_key}") if verbose else None + if verbose: + print(f"Split xref: {split_key}") if len(split_key) == 2: id = self.xref_2_vfb_id(acc=split_key[1], db=split_key[0], return_just_ids=True) if id and len(id) == 1: return id[0] - - # Direct lookup: Check if the key is already a valid ID if key in self.lookup.values(): return key if not return_curie else key.replace('_', ':') - - # CARO lookup: Check if the key is a CARO/BFO/UBERON/FBbt(obsolete) term; though not in the lookup they need to be handled if explicitly called + prefixes = ('CARO_', 'BFO_', 'UBERON_', 'GENO_', 'CL_', 'FB', 'VFB_', 'GO_', 'SO_', 'RO_', 'PATO_', 'CHEBI_', 'PR_', 'NCBITaxon_', 'ENVO_', 'OBI_', 'IAO_', 'OBI_') - if isinstance(key,str) and key.startswith(prefixes) and not key in self.lookup.keys(): + if key.startswith(prefixes) and key not in self.lookup.keys(): return key if not return_curie else key.replace('_', ':') - - # Direct lookup in the dictionary - if key in self.lookup.keys(): + + if key in self.lookup: out = self.lookup[key] return out if not return_curie else out.replace('_', ':') - else: - print(f"No direct match found for {key}") - - if allow_subsitutions: - matched_key = '' - out = '' - # Case-insensitive and character-insensitive lookup - normalized_key = key.lower().replace('_', '').replace('-', '').replace(' ', '').replace(':','') - print(f"Normalized key: {normalized_key}") if verbose else None - matches = {k: v for k, v in self.lookup.items() if k.lower().replace('_', '').replace('-', '').replace(' ', '').replace(':','').replace(';','') == normalized_key} - - if isinstance(subsitution_stages, str): - subsitution_stages = [subsitution_stages] + + if allow_substitutions: + normalized_key = self.normalize_key(key) + if verbose: + print(f"Normalized key: {normalized_key}") + + matches = {k: v for k, v in self.lookup.items() if self.normalize_key(k) == normalized_key} + + if isinstance(substitution_stages, str): + substitution_stages = [substitution_stages] + if not matches: - for stage in subsitution_stages: - stage_normalized_key = stage + normalized_key - matches = {k: v for k, v in self.lookup.items() if k.lower().replace('_', '').replace('-', '').replace(' ', '').replace(':','').replace(';','') == stage_normalized_key} - if matches: - break + for stage in substitution_stages: + stage_normalized_key = self.normalize_key(stage + key) + matches = {k: v for k, v in self.lookup.items() if self.normalize_key(k) == stage_normalized_key} + if matches: + break if matches: - for k, v in matches.items(): - print(f"Matched: {k} -> {v}") if verbose else None - if not matched_key: - matched_key = k - - # Warn if a case substitution or normalization was performed - if matched_key: - if len(matches.keys()) < 2: - print(f"\033[33mWarning:\033[0m Substitution made. '\033[33m{key}\033[0m' was matched to '\033[32m{matched_key}\033[0m'.") - out = matches[matched_key] - else: - all_matches = ", ".join([f"'{k}': '{v}'" for k, v in matches.items()]) - print(f"\033[33mWarning:\033[0m Ambiguous match for '\033[33m{key}\033[0m'. Using '{matched_key}' -> '\033[32m{out}\033[0m'. Other possible matches: {all_matches}") + matched_key = min(matches.keys(), key=len) + if verbose: + for k, v in matches.items(): + print(f"Matched: {k} -> {v}") + + if len(matches) == 1: + print(f"\033[33mWarning:\033[0m Substitution made. '\033[33m{key}\033[0m' was matched to '\033[32m{matched_key}\033[0m'.") + return matches[matched_key] if not return_curie else matches[matched_key].replace('_', ':') - return out if not return_curie else out.replace('_', ':') + all_matches = ", ".join([f"'{k}': '{v}'" for k, v in matches.items()]) + print(f"\033[33mWarning:\033[0m Ambiguous match for '\033[33m{key}\033[0m'. Using '{matched_key}' -> '\033[32m{matches[matched_key]}\033[0m'. Other possible matches: {all_matches}") + return matches[matched_key] if not return_curie else matches[matched_key].replace('_', ':') - # Check for partial matches: starts with starts_with_matches = {k: v for k, v in self.lookup.items() if k.lower().startswith(key.lower())} if starts_with_matches: all_matches = ", ".join([f"'\033[36m{k}\033[0m': '{v}'" for k, v in starts_with_matches.items()]) print(f"Notice: No exact match found, but potential matches starting with '\033[31m{key}\033[0m': {all_matches}") return '' - # Check for partial matches: contains contains_matches = {k: v for k, v in self.lookup.items() if key.lower() in k.lower()} if contains_matches: all_matches = ", ".join([f"'\033[36m{k}\033[0m': '{v}'" for k, v in contains_matches.items()]) diff --git a/src/vfb_connect/schema/vfb_term.py b/src/vfb_connect/schema/vfb_term.py index 837480ac..6080f705 100644 --- a/src/vfb_connect/schema/vfb_term.py +++ b/src/vfb_connect/schema/vfb_term.py @@ -717,7 +717,7 @@ def get_terms(self): :return: A VFBTerms object containing all the related terms. """ - return VFBTerms([rel.object for rel in self.relations]) + return VFBTerms([rel.object for rel in self.relations], query_by_label=False) def get_relations(self): """ @@ -1378,7 +1378,7 @@ def get_terms(self): :return: A VFBTerms object containing all the related terms. """ - return VFBTerms([exp.term for exp in self.expressions]) + return VFBTerms([exp.term for exp in self.expressions], query_by_label=False) def get_summary(self, return_dataframe=True): """ @@ -1780,7 +1780,7 @@ def parents(self): """ if self._parents is None: print("Loading parents for the first time...") if self.debug else None - self._parents = VFBTerms(self._parents_ids) if self._parents_ids else None + self._parents = VFBTerms(self._parents_ids, query_by_label=False) if self._parents_ids else None return self._parents def add_anatomy_type_properties(self): @@ -1854,7 +1854,7 @@ def add_template_properties(self): def regions(self): if self._regions is None: print("Loading regions for the first time...") if self.debug else None - self._regions = VFBTerms(self._regions_ids) if self._regions_ids else None + self._regions = VFBTerms(self._regions_ids, query_by_label=False) if self._regions_ids else None return self._regions # Dynamically add the property to the instance @@ -1929,15 +1929,18 @@ def instances(self, return_type=None): self._instances_ids = [r['id'] for r in results] if self._instances_ids and len(self._instances_ids) > 0: self.has_image = True + print(f"Got {len(self._instances_ids)} instances...") if self.debug else None if return_type == 'id': return self._instances_ids - if not self._instances_names: - self._instances_names = self.vfb.lookup_name(self._instances_ids) if return_type == 'name': + if not self._instances_names: + self._instances_names = self.vfb.lookup_name(self._instances_ids) + print(f"Got {len(self._instances_names)} instance names...") if self.debug else None return self._instances_names if self._instances is None: print("Creating instances for the first time...") - self._instances = VFBTerms(self._instances_ids, verbose=self.debug) + self._instances = VFBTerms(self._instances_ids, verbose=self.debug, query_by_label=False) + print(f"Got {len(self._instances)} instances...") if self.debug else None return self._instances @property @@ -1957,7 +1960,7 @@ def datasets(self): """ if self._datasets is None: print("Loading datasets for the first time...") if self.debug else None - self._datasets = VFBTerms(self._dataset_ids) if self._dataset_ids else None + self._datasets = VFBTerms(self._dataset_ids, query_by_label=False) if self._dataset_ids else None return self._datasets # Dynamically add the property to the instance @@ -1971,7 +1974,7 @@ def subtypes(self): """ if self._subtypes is None: print("Loading subtypes for the first time...") if self.debug else None - self._subtypes = VFBTerms(self.vfb.oc.get_subclasses(query=f"'{self.id}'", )) + self._subtypes = VFBTerms(self.vfb.oc.get_subclasses(query=f"'{self.id}'", ), query_by_label=False) return self._subtypes @property @@ -1981,7 +1984,7 @@ def subparts(self): """ if self._subparts is None: print("Loading subparts for the first time...") if self.debug else None - self._subparts = VFBTerms(self.vfb.oc.get_subclasses(query=f"'is part of' some '{self.id}'")) + self._subparts = VFBTerms(self.vfb.oc.get_subclasses(query=f"'is part of' some '{self.id}'"), query_by_label=False) return self._subparts @property @@ -2395,13 +2398,13 @@ def __sub__(self, other, verbose=False): if isinstance(other, VFBTerms): other_ids = other.get_ids() print("Removing ", other_ids) if verbose else None - remaining_terms = VFBTerms([term for term in [self.term] if term.id not in other_ids]) + remaining_terms = VFBTerms([term for term in [self.term] if term.id not in other_ids], query_by_label=False) print ("Remaining ", remaining_terms.get_ids()) if verbose else None return remaining_terms if isinstance(other, VFBTerm): other_ids = [other.id] print("Removing ", other.id) if verbose else None - remaining_terms = VFBTerms([term for term in [self.term] if term.id != other.id]) + remaining_terms = VFBTerms([term for term in [self.term] if term.id != other.id], query_by_label=False) return remaining_terms raise TypeError("Unsupported operand type(s) for -: 'VFBTerms' and '{}'".format(type(other).__name__)) @@ -2754,7 +2757,7 @@ def plot3d(self, template=None, verbose=False, query_by_label=True, force_reload if self._skeleton: print(f"Skeleton found for {self.name}") if verbose else None if include_template: - combined = VFBTerms([selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form]) + self + combined = VFBTerms([selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form], query_by_label=False) + self combined.plot3d(template=selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form, **kwargs) return return self._skeleton.plot3d(**kwargs) @@ -2765,7 +2768,7 @@ def plot3d(self, template=None, verbose=False, query_by_label=True, force_reload if self._mesh: print(f"Mesh found for {self.name}") if verbose else None if include_template: - combined = VFBTerms([selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form]) + self.term + combined = VFBTerms([selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form], query_by_label=False) + self.term combined.plot3d(template=selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form, **kwargs) return return self._mesh.plot3d(**kwargs) @@ -2776,7 +2779,7 @@ def plot3d(self, template=None, verbose=False, query_by_label=True, force_reload if self._volume: print(f"Volume found for {self.name}") if verbose else None if include_template: - combined = VFBTerms([selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form]) + self.term + combined = VFBTerms([selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form], query_by_label=False) + self.term combined.plot3d(template=selected_template if selected_template else self.channel_images[0].image.template_anatomy.short_form, **kwargs) return return self._volume.plot3d(**kwargs) @@ -2789,7 +2792,7 @@ def plot3d(self, template=None, verbose=False, query_by_label=True, force_reload if self.instances and len(self._instances) > 0: print(f"Loading instances for {self.name}") if verbose else None if include_template: - combined = VFBTerms([selected_template if selected_template else self.instances[0].channel_images[0].image.template_anatomy.short_form]) + self.instances + combined = VFBTerms([selected_template if selected_template else self.instances[0].channel_images[0].image.template_anatomy.short_form], query_by_label=False) + self.instances combined.plot3d(template=selected_template if selected_template else self.instances[0].channel_images[0].image.template_anatomy.short_form, **kwargs) self._return_type = temp return @@ -2989,7 +2992,7 @@ class VFBTerms: verbose : bool Whether to print out information about the loading process. """ - def __init__(self, terms: Union[List[VFBTerm], List[str], pandas.core.frame.DataFrame, List[dict]], verbose=False): + def __init__(self, terms: Union[List[VFBTerm], List[str], pandas.core.frame.DataFrame, List[dict]], verbose=False, query_by_label=True): from vfb_connect import vfb self.vfb = vfb self._summary = None @@ -3010,17 +3013,18 @@ def __init__(self, terms: Union[List[VFBTerm], List[str], pandas.core.frame.Data # Check if terms is a list of strings (IDs) if isinstance(terms, list) and all(isinstance(term, str) for term in terms): self.terms = [] + print(f"Changing {len(terms)} term names to ids") if verbose else None terms = [self.vfb.lookup_id(term) for term in terms if term] if self.vfb._load_limit and len(terms) > self.vfb._load_limit: print(f"More thann the load limit of {self.vfb._load_limit} requested. Loading first {self.vfb._load_limit} terms out of {len(terms)}") terms = terms[:self.vfb._load_limit] print(f"Pulling {len(terms)} terms from VFB...") - json_list = self.vfb.get_TermInfo(terms, summary=False, verbose=verbose) + json_list = self.vfb.get_TermInfo(terms, summary=False, verbose=verbose, query_by_label=query_by_label) if len(json_list) < len(terms): print("Some terms not found in cache. Falling back to slower Neo4j queries.") loaded_ids = [j['term']['core']['short_form'] for j in json_list] missing_ids = [term for term in terms if term not in loaded_ids] - missing_json = self.vfb.get_TermInfo(missing_ids, summary=False, cache=False, verbose=verbose) + missing_json = self.vfb.get_TermInfo(missing_ids, summary=False, cache=False, verbose=verbose, query_by_label=query_by_label) json_list = json_list + missing_json if len(json_list) < len(terms): loaded_ids = [j['term']['core']['short_form'] for j in json_list] @@ -3226,13 +3230,13 @@ def __sub__(self, other, verbose=False): if isinstance(other, VFBTerms): other_ids = other.get_ids() print("Removing ", other_ids) if verbose else None - remaining_terms = VFBTerms([term for term in self.terms if term.id not in other_ids]) + remaining_terms = VFBTerms([term for term in self.terms if term.id not in other_ids], query_by_label=False) print ("Remaining ", remaining_terms.get_ids()) if verbose else None return remaining_terms if isinstance(other, VFBTerm): other_ids = [other.id] print("Removing ", other.id) if verbose else None - remaining_terms = VFBTerms([term for term in self.terms if term.id != other.id]) + remaining_terms = VFBTerms([term for term in self.terms if term.id != other.id], query_by_label=False) return remaining_terms raise TypeError("Unsupported operand type(s) for -: 'VFBTerms' and '{}'".format(type(other).__name__)) @@ -3473,13 +3477,13 @@ def AND(self, other, verbose=False): if isinstance(other, VFBTerms): other_ids = other.get_ids() print("ANDing with ", other_ids) if verbose else None - remaining_terms = VFBTerms([term for term in self.terms if term.id in other_ids]) + remaining_terms = VFBTerms([term for term in self.terms if term.id in other_ids], query_by_label=False) print ("Remaining ", remaining_terms.get_ids()) if verbose else None return remaining_terms if isinstance(other, VFBTerm): other_ids = [other.id] print("ANDing with ", other.id) if verbose else None - remaining_terms = VFBTerms([term for term in self.terms if term.id == other.id]) + remaining_terms = VFBTerms([term for term in self.terms if term.id == other.id], query_by_label=False) return remaining_terms raise TypeError("Unsupported operand type(s) for AND: 'VFBTerms' and '{}'".format(type(other).__name__)) @@ -3604,13 +3608,13 @@ def NOT(self, other, verbose=False): if isinstance(other, VFBTerms): other_ids = other.get_ids() print("NOTing with ", other_ids) if verbose else None - remaining_terms = VFBTerms([term for term in self.terms if term.id not in other_ids]) + remaining_terms = VFBTerms([term for term in self.terms if term.id not in other_ids], query_by_label=False) print ("Remaining ", remaining_terms.get_ids()) if verbose else None return remaining_terms if isinstance(other, VFBTerm): other_ids = [other.id] print("NOTing with ", other.id) if verbose else None - remaining_terms = VFBTerms([term for term in self.terms if term.id != other.id]) + remaining_terms = VFBTerms([term for term in self.terms if term.id != other.id], query_by_label=False) return remaining_terms raise TypeError("Unsupported operand type(s) for NOT: 'VFBTerms' and '{}'".format(type(other).__name__)) @@ -4028,7 +4032,7 @@ def create_vfbterm_list_from_json(json_data, verbose=False): if isinstance(json_data, list): data = json_data - return VFBTerms([create_vfbterm_from_json(term, verbose=verbose) for term in VFBTerms.tqdm_with_threshold(VFBTerms, data, threshold=10, desc="Loading Terms")]) + return VFBTerms([create_vfbterm_from_json(term, verbose=verbose) for term in VFBTerms.tqdm_with_threshold(VFBTerms, data, threshold=10, desc="Loading Terms")], query_by_label=False) def create_vfbterm_from_json(json_data, verbose=False): """