diff --git a/examples/train_clip.py b/examples/train_clip.py
new file mode 100644
index 00000000..1e29640b
--- /dev/null
+++ b/examples/train_clip.py
@@ -0,0 +1,93 @@
+import argparse
+import json, os
+from ontolearn.concept_learner import NCES, NCES2, ROCES, CLIP
+from transformers import set_seed
+import time
+from ontolearn.knowledge_base import KnowledgeBase
+from ontolearn.lp_generator import LPGen
+from ontolearn.refinement_operators import ExpressRefinement
+
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    elif v.lower() in ['t', 'true', 'y', 'yes', '1']:
+        return True
+    elif v.lower() in ['f', 'false', 'n', 'no', '0']:
+        return False
+    else:
+        raise ValueError('Invalid boolean value.')
+
+
+def start(args):
+    assert (args.kb is not None), "Argument 'kb' is required."
+    training_data = None
+    if args.path_train_data is not None:
+        try:
+            if os.path.isdir(args.path_train_data):
+                with open(args.path_train_data + "/LPs.json") as file:
+                    training_data = json.load(file)
+                    if isinstance(training_data, dict):
+                        training_data = list(training_data.items())
+                    else:
+                        assert isinstance(training_data,
+                                          list), "The training data must either be stored as a dictionary ({'expr': {'positive examples': [], 'negative examples': []}, ...}) or as a list of items"
+            else:
+                with open(args.path_train_data) as file:
+                    training_data = json.load(file)
+                    if isinstance(training_data, dict):
+                        training_data = list(training_data.items())
+                    else:
+                        assert isinstance(training_data,
+                                          list), "The training data must either be stored as a dictionary ({'expr': {'positive examples': [], 'negative examples': []}, ...}) or as a list of items"
+        except FileNotFoundError:
+            print(
+                "Couldn't find training data in the specified path. Defaulting to generating training data.")
+
+    # use CLIP here
+    knowledge_base_path = args.kb
+    path_of_embeddings = args.path_of_embeddings
+    if os.path.exists(knowledge_base_path) and os.path.exists(path_of_embeddings):
+        KB = KnowledgeBase(path=knowledge_base_path)
+        op = ExpressRefinement(knowledge_base=KB, use_inverse=False,
+                               use_numeric_datatypes=False)
+        clip = CLIP(knowledge_base=KB, path_of_embeddings=path_of_embeddings,
+                    refinement_operator=op, predictor_name=args.predictor_name,
+                    load_pretrained=args.load_pretrained,
+                    pretrained_predictor_name=args.pretrained_predictor_name,
+                    max_runtime=args.max_runtime, num_workers=args.num_workers)
+        clip.train(training_data, epochs=args.epochs, learning_rate=args.lr,
+                   storage_path=args.storage_path)
+    else:
+        print("Knowledge base or embeddings not found. Please check the paths.")
+
+
+if __name__ == '__main__':
+    set_seed(42)
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--kb', type=str, default=None, help='Path to a knowledge base (OWL file)')
+    parser.add_argument('--path_train_data', type=str, default=None, help='Path to training data')
+    parser.add_argument('--path_of_embeddings', type=str, default=None,
+                        help='Path to CLIP-compatible embeddings')
+    parser.add_argument('--epochs', type=int, default=5, help='Number of training epochs')
+    parser.add_argument('--lr', type=float, default=0.001,
+                        help='Learning rate for training. The optimizer is Adam.')
+    parser.add_argument('--predictor_name', type=str, default=None,
+                        help='Name of the length predictor to be used. The options are: SetTransformer, GRU, LSTM')
+    parser.add_argument('--load_pretrained', type=str2bool, default=True,
+                        help='Whether to load the pretrained model')
+    parser.add_argument('--pretrained_predictor_name', type=str, default='SetTransformer',
+                        help='Name of the pretrained length predictor to be used. The options are: SetTransformer, GRU, LSTM, CNN')
+    parser.add_argument('--storage_path', type=str, default=None,
+                        help='Path to save the trained models')
+    parser.add_argument('--max_runtime', type=int, default=60,
+                        help='Maximum runtime in seconds for CLIP')
+    parser.add_argument('--num_workers', type=int, default=0,
+                        help='Number of workers for data loading')
+
+    args = parser.parse_args()
+    # make training data
+    # data_generator = LPGen(kb_path=args.kb, storage_path=args.storage_path)
+    # data_generator.generate()
+    args.tmax = min(args.max_runtime, args.epochs)
+    start(args)
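Note (not part of this diff): the new example script drives the CLIP API roughly as sketched below; the OWL file, embedding CSV and LPs.json paths are illustrative placeholders, not files shipped with this change.

    import json
    from ontolearn.knowledge_base import KnowledgeBase
    from ontolearn.refinement_operators import ExpressRefinement
    from ontolearn.concept_learner import CLIP

    # Illustrative inputs: an ontology, CLIP-compatible entity embeddings and LPs.json training data.
    kb = KnowledgeBase(path="KGs/Family/family.owl")
    op = ExpressRefinement(knowledge_base=kb, use_inverse=False, use_numeric_datatypes=False)
    clip = CLIP(knowledge_base=kb, path_of_embeddings="embeddings/family_entity_embeddings.csv",
                refinement_operator=op, load_pretrained=False, max_runtime=60)
    # Training data in the LPs.json format: {expr: {"positive examples": [...], "negative examples": [...]}, ...}
    with open("LPs.json") as f:
        training_data = list(json.load(f).items())
    clip.train(training_data, epochs=5, learning_rate=0.001, storage_path="./clip_models")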
diff --git a/ontolearn/base_nces.py b/ontolearn/base_nces.py
index ce262753..7b49788a 100644
--- a/ontolearn/base_nces.py
+++ b/ontolearn/base_nces.py
@@ -38,10 +38,10 @@ class BaseNCES:
 
-    def __init__(self, knowledge_base_path, nces2_or_roces, quality_func, num_predictions, auto_train=True, proj_dim=128, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, ln=False, learning_rate=1e-4, tmax=20, eta_min=1e-5, clip_value=5.0,
+    def __init__(self, knowledge_base, nces2_or_roces, quality_func, num_predictions, auto_train=True, proj_dim=128, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, ln=False, learning_rate=1e-4, tmax=20, eta_min=1e-5, clip_value=5.0,
                  batch_size=256, num_workers=4, max_length=48, load_pretrained=True, verbose: int = 0):
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        kb = KnowledgeBase(path=knowledge_base_path)
+        kb = knowledge_base
         self.kb_namespace = list(kb.ontology.classes_in_signature())[0].iri.get_namespace()
         self.dl_parser = DLSyntaxParser(self.kb_namespace)
         self.renderer = DLSyntaxObjectRenderer()
@@ -55,7 +55,7 @@ def __init__(self, knowledge_base_path, nces2_or_roces, quality_func, num_predic
         vocab.extend(concrete_role_names)
         vocab.extend(['⁻', '≤', '≥', 'True', 'False', 'true', 'false', '{', '}', ':', '[', ']', 'double', 'integer', 'date', 'xsd'])
         vocab = sorted(set(vocab)) + ['PAD']
-        self.knowledge_base_path = knowledge_base_path
+        # self.knowledge_base_path = knowledge_base_path
         self.kb = kb
         self.all_individuals = set([ind.str.split("/")[-1] for ind in kb.individuals()])
         self.inv_vocab = np.array(vocab, dtype='object')
diff --git a/ontolearn/concept_learner.py b/ontolearn/concept_learner.py
index a16f7916..2c6ceb5b 100644
--- a/ontolearn/concept_learner.py
+++ b/ontolearn/concept_learner.py
@@ -554,12 +554,11 @@ class CLIP(CELOE):
     __slots__ = 'best_descriptions', 'max_he', 'min_he', 'best_only', 'calculate_min_max', 'heuristic_queue', \
                 'search_tree', '_learning_problem', '_max_runtime', '_seen_norm_concepts', 'predictor_name', \
                 'pretrained_predictor_name', 'load_pretrained', 'output_size', 'num_examples', 'path_of_embeddings', \
-                'instance_embeddings', 'input_size', 'device', 'length_predictor', 'num_workers', 'knowledge_base_path'
+                'instance_embeddings', 'input_size', 'device', 'length_predictor', 'num_workers', 'knowledge_base'
 
     name = 'CLIP'
 
     def __init__(self,
                  knowledge_base: AbstractKnowledgeBase,
-                 knowledge_base_path='',
                  reasoner: Optional[AbstractOWLReasoner] = None,
                  refinement_operator: Optional[BaseRefinement[OENode]] = ExpressRefinement,
                  quality_func: Optional[AbstractScorer] = None,
@@ -593,16 +592,18 @@ def __init__(self,
                          calculate_min_max)
         self.predictor_name = predictor_name
         self.pretrained_predictor_name = pretrained_predictor_name
-        self.knowledge_base_path = knowledge_base_path
+        self.knowledge_base = knowledge_base
         self.load_pretrained = load_pretrained
         self.num_workers = num_workers
         self.output_size = output_size
         self.num_examples = num_examples
         self.path_of_embeddings = path_of_embeddings
+        if self.path_of_embeddings:
             assert os.path.isfile(self.path_of_embeddings), '!!! Wrong path for CLIP embeddings'
             self.instance_embeddings = pd.read_csv(path_of_embeddings, index_col=0)
             self.input_size = self.instance_embeddings.shape[1]
+
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.length_predictor = self.get_length_predictor()
@@ -783,6 +784,7 @@ def train(self, data: Iterable[List[Tuple]], epochs=300, batch_size=256, learnin
                                       shuffle_examples=shuffle_examples, example_sizes=example_sizes)
         train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=self.num_workers,
                                       collate_fn=self.collate_batch, shuffle=True)
+        # TODO: remove dependency on knowledge_base_path
         if storage_path is None:
             storage_path = self.knowledge_base_path[:self.knowledge_base_path.rfind("/")]
         elif not os.path.exists(storage_path) and (record_runtime or save_model):
@@ -797,15 +799,15 @@ class NCES(BaseNCES):
 
     name = "NCES"
 
-    def __init__(self, knowledge_base_path, nces2_or_roces=False,
+    def __init__(self, knowledge_base, nces2_or_roces=False,
                  quality_func: Optional[AbstractScorer] = None, num_predictions=5,
                  learner_names=["SetTransformer", "LSTM", "GRU"], path_of_embeddings=None,
                  path_temp_embeddings=None, path_of_trained_models=None, auto_train=True, proj_dim=128,
                  rnn_n_layers=2, drop_prob=0.1, num_heads=4, num_seeds=1, m=32, ln=False, dicee_model="DeCaL",
                  dicee_epochs=5, dicee_lr=0.01, dicee_emb_dim=128, learning_rate=1e-4, tmax=20, eta_min=1e-5,
                  clip_value=5.0, batch_size=256, num_workers=4, max_length=48, load_pretrained=True,
                  sorted_examples=False, verbose: int = 0):
-
-        super().__init__(knowledge_base_path=knowledge_base_path, nces2_or_roces=nces2_or_roces,
+        self.knowledge_base = knowledge_base
+        super().__init__(knowledge_base=knowledge_base, nces2_or_roces=nces2_or_roces,
                          quality_func=quality_func, num_predictions=num_predictions, auto_train=auto_train,
                          proj_dim=proj_dim, drop_prob=drop_prob, num_heads=num_heads, num_seeds=num_seeds, m=m,
                          ln=ln, learning_rate=learning_rate, tmax=tmax, eta_min=eta_min, clip_value=clip_value,
@@ -834,8 +836,6 @@ def _set_prerequisites(self):
         if self.path_of_embeddings is None or (os.path.isdir(self.path_of_embeddings) and not glob.glob(
                 self.path_of_embeddings + '*_entity_embeddings.csv')) or not os.path.exists(
                 self.path_of_embeddings) or not self.path_of_embeddings.endswith('.csv'):
-            if not os.path.exists(self.knowledge_base_path):
-                raise ValueError(f"{self.knowledge_base_path} not found")
             try:
                 import dicee
                 print('\nCheck packages... OK: dicee is installed.')
@@ -851,6 +851,16 @@ def _set_prerequisites(self):
                       "See the example script in `examples/train_nces.py` for this. "
                       "Use `examples/train_nces.py -h` to view options.\x1b[0m"+"\n")
             try:
+                path_temp_triples = os.path.join(os.path.dirname(__file__),
+                                                 "temp_embeddings4learners/abox.nt")
+                if os.path.exists(path_temp_triples):
+                    os.remove(path_temp_triples)
+
+                with open(path_temp_triples, "a") as f:
+                    for s, p, o in self.knowledge_base.abox():
+                        f.write(f"<{s.str}> <{p.str}> <{o.str}> .\n")
+
+                self.knowledge_base_path = path_temp_triples
                 path_temp_embeddings = self.path_temp_embeddings if self.path_temp_embeddings and isinstance(
                     self.path_temp_embeddings, str) else "temp_embeddings"
                 subprocess.run(f"dicee --path_single_kg {self.knowledge_base_path} "
@@ -861,7 +871,7 @@ def _set_prerequisites(self):
                                f"--model {self.dicee_model} "
                                f"--embedding_dim {self.dicee_emb_dim} "
                                f"--eval_mode test",
-                               shell=True, executable="/bin/bash")
+                               shell=True)
                 assert os.path.exists(f"{path_temp_embeddings}/{self.dicee_model}_entity_embeddings.csv"), \
                     (f"It seems that embeddings were not stored at the expected directory "
                      f"({path_temp_embeddings}/{self.dicee_model}_entity_embeddings.csv)")
@@ -877,7 +887,7 @@ def _set_prerequisites(self):
             print(f"\nUsing embeddings at: {self.path_of_embeddings} with {self.input_size} dimensions.\n")
             if self.auto_train:
                 # Train NCES for 5 epochs
-                self.train(epochs=5)
+                self.train(epochs=5, num_workers=self.num_workers)
                 self.refresh(self.path_of_trained_models)
         else:
             self.instance_embeddings = read_csv(self.path_of_embeddings)
@@ -923,13 +933,13 @@ def get_synthesizer(self, path=None):
             raise FileNotFoundError(f"{self.path_of_trained_models} does not contain at least one of `vocab.json, "
                                     f"inv_vocab.npy or embedding_config.json`")
 
-        m1 = SetTransformer(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length,
+        m1 = SetTransformer(self.vocab, self.inv_vocab, self.max_length,
                             self.input_size, self.proj_dim, self.num_heads, self.num_seeds,
                             self.m, self.ln)
 
-        m2 = GRU(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length, self.input_size,
+        m2 = GRU(self.vocab, self.inv_vocab, self.max_length, self.input_size,
                  self.proj_dim, self.rnn_n_layers, self.drop_prob)
 
-        m3 = LSTM(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length, self.input_size,
+        m3 = LSTM(self.vocab, self.inv_vocab, self.max_length, self.input_size,
                   self.proj_dim, self.rnn_n_layers, self.drop_prob)
 
         Models = {"SetTransformer": {"emb_model": None, "model": m1},
                   "GRU": {"emb_model": None, "model": m2},
@@ -1156,7 +1166,7 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, max_
         if batch_size is None:
             batch_size = self.batch_size
         if data is None:
-            data = generate_training_data(self.knowledge_base_path, max_num_lps=max_num_lps,
+            data = generate_training_data(kb_path=None, kb=self.knowledge_base, max_num_lps=max_num_lps,
                                           refinement_expressivity=refinement_expressivity, beyond_alc=False,
                                           refs_sample_size=refs_sample_size, storage_path=storage_path)
         example_ind = data[0][-1]["positive examples"][0]
@@ -1173,18 +1183,28 @@ class NCES2(BaseNCES):
     """Neural Class Expression Synthesis in ALCHIQ(D)."""
     name = "NCES2"
 
-    def __init__(self, knowledge_base_path, nces2_or_roces=True,
+    def __init__(self, knowledge_base, nces2_or_roces=True,
                  quality_func: Optional[AbstractScorer] = None, num_predictions=5,
                  path_of_trained_models=None, auto_train=True, proj_dim=128, drop_prob=0.1,
                  num_heads=4, num_seeds=1, m=[32, 64, 128], ln=False, embedding_dim=128,
                  sampling_strategy="nces2", input_dropout=0.0, feature_map_dropout=0.1, kernel_size=4,
                 num_of_output_channels=32, learning_rate=1e-4, tmax=20, eta_min=1e-5, clip_value=5.0,
                 batch_size=256, num_workers=4, max_length=48, load_pretrained=True, verbose: int = 0, data=[]):
-        super().__init__(knowledge_base_path, nces2_or_roces, quality_func, num_predictions, auto_train, proj_dim,
+        self.knowledge_base = knowledge_base
+        super().__init__(knowledge_base, nces2_or_roces, quality_func, num_predictions, auto_train, proj_dim,
                          drop_prob, num_heads, num_seeds, m, ln, learning_rate, tmax, eta_min, clip_value,
                          batch_size, num_workers, max_length, load_pretrained, verbose)
-        self.triples_data = TriplesData(knowledge_base_path)
+        path_temp_triples = os.path.join(os.path.dirname(__file__),
+                                         "temp_embeddings4learners/abox.nt")
+        if os.path.exists(path_temp_triples):
+            os.remove(path_temp_triples)
+        with open(path_temp_triples, "a") as f:
+            for s, p, o in self.knowledge_base.abox():
+                f.write(f"<{s.str}> <{p.str}> <{o.str}> .\n")
+
+        self.knowledge_base_path = path_temp_triples
+        self.triples_data = TriplesData(self.knowledge_base_path)
         self.num_entities = len(self.triples_data.entity2idx)
         self.num_relations = len(self.triples_data.relation2idx)
         self.path_of_trained_models = path_of_trained_models
@@ -1194,25 +1214,19 @@ def __init__(self, knowledge_base_path, nces2_or_roces=True,
         self.feature_map_dropout = feature_map_dropout
         self.kernel_size = kernel_size
         self.num_of_output_channels = num_of_output_channels
+        self.num_workers = num_workers
         self._set_prerequisites()
 
     def _set_prerequisites(self):
         if isinstance(self.m, int):
             self.m = [self.m]
-        Models = {str(m): {"emb_model": ConEx(self.embedding_dim, self.num_entities, self.num_relations,
-                                              self.input_dropout, self.feature_map_dropout, self.kernel_size,
-                                              self.num_of_output_channels),
-                           "model": SetTransformer(self.knowledge_base_path, self.vocab, self.inv_vocab,
-                                                   self.max_length, self.embedding_dim, self.proj_dim, self.num_heads,
-                                                   self.num_seeds, m, self.ln)} for m in self.m}
-
         if self.load_pretrained and self.path_of_trained_models is None and self.auto_train:
             print(f"\n\x1b[0;30;43mPath to pretrained models is None and load_pretrained is True "
                   f"and auto_train is True. Will quickly train neural synthesizers. "
                   f"However, it is advisable that you properly train {self.name} using the "
                   f"example script in `examples/train_nces.py`.\x1b[0m\n")
-            self.train(epochs=5)
+            self.train(epochs=5, num_workers=self.num_workers)
             self.refresh(self.path_of_trained_models)
         else:
             self.model = self.get_synthesizer(self.path_of_trained_models)
@@ -1266,7 +1280,7 @@ def get_synthesizer(self, path=None, verbose=True):
         Models = {str(m): {"emb_model": ConEx(self.embedding_dim, self.num_entities, self.num_relations,
                                               self.input_dropout, self.feature_map_dropout, self.kernel_size,
                                               self.num_of_output_channels),
-                           "model": SetTransformer(self.knowledge_base_path, self.vocab, self.inv_vocab,
+                           "model": SetTransformer(self.vocab, self.inv_vocab,
                                                    self.max_length, self.embedding_dim, self.proj_dim, self.num_heads,
                                                    self.num_seeds, m, self.ln)} for m in self.m}
@@ -1505,7 +1519,7 @@ def train(self, data: Iterable[List[Tuple]] = None, epochs=50, batch_size=64, ma
         if batch_size is None:
             batch_size = self.batch_size
         if data is None:
-            data = generate_training_data(self.knowledge_base_path, max_num_lps=max_num_lps,
+            data = generate_training_data(kb_path=None, kb=self.knowledge_base, max_num_lps=max_num_lps,
                                           refinement_expressivity=refinement_expressivity, beyond_alc=True,
                                           refs_sample_size=refs_sample_size, storage_path=storage_path)
         vocab_size_before = len(self.vocab)
@@ -1513,6 +1527,7 @@ def train(self, data: Iterable[List[Tuple]] = None, epochs=50, batch_size=64, ma
             self.path_of_trained_models = storage_path+"/trained_models"
         if len(self.vocab) > vocab_size_before:
             self.model = self.get_synthesizer(verbose=False)
+        print(num_workers)
         trainer = NCESTrainer(self, epochs=epochs, batch_size=batch_size, learning_rate=learning_rate,
                               tmax=tmax, eta_min=eta_min, clip_value=clip_value, num_workers=num_workers,
                               storage_path=storage_path)
@@ -1523,7 +1538,7 @@ class ROCES(NCES2):
     """Robust Class Expression Synthesis in Description Logics via Iterative Sampling."""
     name = "ROCES"
 
-    def __init__(self, knowledge_base_path, nces2_or_roces=True,
+    def __init__(self, knowledge_base, nces2_or_roces=True,
                  quality_func: Optional[AbstractScorer] = None, num_predictions=5, k=5,
                  path_of_trained_models=None, auto_train=True, proj_dim=128, rnn_n_layers=2, drop_prob=0.1,
                  num_heads=4, num_seeds=1, m=[32, 64, 128], ln=False, embedding_dim=128, sampling_strategy="p",
@@ -1532,7 +1547,7 @@ def __init__(self, knowledge_base_path, nces2_or_roces=True,
                  max_length=48, load_pretrained=True, verbose: int = 0, data=[]):
         self.k = k
-        super().__init__(knowledge_base_path, nces2_or_roces,
+        super().__init__(knowledge_base, nces2_or_roces,
                          quality_func, num_predictions, path_of_trained_models, auto_train, proj_dim,
                          drop_prob, num_heads, num_seeds, m, ln, embedding_dim, sampling_strategy, input_dropout,
                          feature_map_dropout, kernel_size, num_of_output_channels, learning_rate, tmax, eta_min,
diff --git a/ontolearn/lp_generator/generate_data.py b/ontolearn/lp_generator/generate_data.py
index 7aa5fed8..02dbee81 100644
--- a/ontolearn/lp_generator/generate_data.py
+++ b/ontolearn/lp_generator/generate_data.py
@@ -29,7 +29,7 @@ class LPGen:
 
-    def __init__(self, kb_path, storage_path=None, max_num_lps=1000, beyond_alc=False, depth=3, max_child_length=20, refinement_expressivity=0.2,
+    def __init__(self, kb_path=None, kb=None, storage_path=None, max_num_lps=1000, beyond_alc=False, depth=3, max_child_length=20, refinement_expressivity=0.2,
                  downsample_refinements=True, sample_fillers_count=10, num_sub_roots=50, min_num_pos_examples=1):
         """
         Args
@@ -40,7 +40,7 @@ def __init__(self, kb_path, storage_path=None, max_num_lps=1000, beyond_alc=Fals
         - depth, max_child_length, refinement_expressivity, sample_fillers_count, num_sub_roots all refer to the size of the data (learning problems) to be generated
         - downsample_refinements: whether to downsample refinements in ExpressRefinement. If refinement_expressivity<1, this must be set to True
         """
-        self.lp_gen = KB2Data(path=kb_path, storage_path=storage_path, max_num_lps=max_num_lps, beyond_alc=beyond_alc, depth=depth,
+        self.lp_gen = KB2Data(path=kb_path, knowledge_base=kb, storage_path=storage_path, max_num_lps=max_num_lps, beyond_alc=beyond_alc, depth=depth,
                               max_child_length=max_child_length, refinement_expressivity=refinement_expressivity, downsample_refinements=downsample_refinements,
                               sample_fillers_count=sample_fillers_count, num_sub_roots=num_sub_roots, min_num_pos_examples=min_num_pos_examples)
 
     def generate(self):
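Note (not part of this diff): with the new kb keyword, learning problems can now be generated from an already-loaded knowledge base instead of a file path; a minimal sketch, where the OWL path is an illustrative placeholder:

    from ontolearn.knowledge_base import KnowledgeBase
    from ontolearn.lp_generator import LPGen

    kb = KnowledgeBase(path="KGs/Family/family.owl")  # illustrative path
    lp_gen = LPGen(kb=kb, storage_path="./Training_Data", max_num_lps=100)
    lp_gen.generate()  # expected to write ./Training_Data/LPs.json via KB2Data.save_data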
diff --git a/ontolearn/lp_generator/helper_classes.py b/ontolearn/lp_generator/helper_classes.py
index 4168d721..d7963ac0 100644
--- a/ontolearn/lp_generator/helper_classes.py
+++ b/ontolearn/lp_generator/helper_classes.py
@@ -38,7 +38,8 @@ class ConceptDescriptionGenerator:
     Learning problem generator.
     """
 
-    def __init__(self, knowledge_base, refinement_operator, depth=2, max_length=10, num_sub_roots=150):
+    def __init__(self, knowledge_base, refinement_operator, depth=2, max_length=10,
+                 num_sub_roots=150):
         self.kb = knowledge_base
         self.rho = refinement_operator
         self.depth = depth
@@ -68,8 +69,10 @@ class KB2Data:
     a json file.
     """
 
-    def __init__(self, path, storage_path=None, max_num_lps=1000, beyond_alc=False, depth=3, max_child_length=20, refinement_expressivity=0.2,
-                 downsample_refinements=True, sample_fillers_count=10, num_sub_roots=50, min_num_pos_examples=1):
+    def __init__(self, path=None, storage_path=None, max_num_lps=1000, beyond_alc=False, depth=3,
+                 max_child_length=20, refinement_expressivity=0.2,
+                 downsample_refinements=True, sample_fillers_count=10, num_sub_roots=50,
+                 min_num_pos_examples=1, knowledge_base=None):
         """
         Args
         - kb_path: path to the owl file representing the knowledge base/ontology
@@ -88,41 +91,56 @@ def __init__(self, path, storage_path=None, max_num_lps=1000, beyond_alc=False,
         self.max_num_lps = max_num_lps
         self.beyond_alc = beyond_alc
         self.dl_syntax_renderer = DLSyntaxObjectRenderer()
-        self.kb = KnowledgeBase(path=path)
+        self.knowledge_base = knowledge_base
+        if self.knowledge_base is None:
+            self.kb = KnowledgeBase(path=path)
+        else:
+            self.kb = self.knowledge_base
         self.num_examples = self.find_optimal_number_of_examples()
         self.min_num_pos_examples = min_num_pos_examples
         atomic_concepts = frozenset(self.kb.ontology.classes_in_signature())
-        self.atomic_concept_names = frozenset([self.dl_syntax_renderer.render(a) for a in atomic_concepts])
+        self.atomic_concept_names = frozenset(
+            [self.dl_syntax_renderer.render(a) for a in atomic_concepts])
         if self.beyond_alc:
-            rho = ExpressRefinement(knowledge_base=self.kb, max_child_length=max_child_length, sample_fillers_count=sample_fillers_count,
-                                    downsample=downsample_refinements, use_inverse=True, use_card_restrictions=True,
-                                    use_numeric_datatypes=True, use_time_datatypes=True, use_boolean_datatype=True,
-                                    expressivity=refinement_expressivity)
+            rho = ExpressRefinement(knowledge_base=self.kb, max_child_length=max_child_length,
+                                    sample_fillers_count=sample_fillers_count,
+                                    downsample=downsample_refinements, use_inverse=True,
+                                    use_card_restrictions=True,
+                                    use_numeric_datatypes=True, use_time_datatypes=True,
+                                    use_boolean_datatype=True,
+                                    expressivity=refinement_expressivity)
         else:
-            rho = ExpressRefinement(knowledge_base=self.kb, max_child_length=max_child_length, sample_fillers_count=sample_fillers_count,
-                                    downsample=downsample_refinements, use_inverse=False, use_card_restrictions=False,
-                                    use_numeric_datatypes=False, use_time_datatypes=False, use_boolean_datatype=False,
+            rho = ExpressRefinement(knowledge_base=self.kb, max_child_length=max_child_length,
+                                    sample_fillers_count=sample_fillers_count,
+                                    downsample=downsample_refinements, use_inverse=False,
+                                    use_card_restrictions=False,
+                                    use_numeric_datatypes=False, use_time_datatypes=False,
+                                    use_boolean_datatype=False,
                                     expressivity=refinement_expressivity)
-        self.lp_gen = ConceptDescriptionGenerator(knowledge_base=self.kb, refinement_operator=rho, depth=depth,
+        self.lp_gen = ConceptDescriptionGenerator(knowledge_base=self.kb, refinement_operator=rho,
+                                                  depth=depth,
                                                   num_sub_roots=num_sub_roots)
 
     def find_optimal_number_of_examples(self):
         if self.kb.individuals_count() >= 600:
-            return min(self.kb.individuals_count()//2, 1000)
+            return min(self.kb.individuals_count() // 2, 1000)
         return self.kb.individuals_count()
 
     def generate_descriptions(self):
         print()
-        print("#"*60)
-        print("Started generating data on the "+self.path.split("/")[-1].split(".")[0]+" knowledge base")
-        print("#"*60)
+        print("#" * 60)
+        if self.path:
+            print("Started generating data on the "
+                  + self.path.split("/")[-1].split(".")[0] + " knowledge base")
+        print("#" * 60)
         print()
         All_individuals = set(self.kb.individuals())
         print("Number of individuals in the knowledge base: {} \n".format(len(All_individuals)))
         Concepts = self.lp_gen.generate()
         non_redundancy_hash_map = dict()
         show_some_length = True
-        for concept in tqdm(sorted(Concepts, key=lambda c: concept_len(c)), desc="Filtering process..."):
+        for concept in tqdm(sorted(Concepts, key=lambda c: concept_len(c)),
+                            desc="Filtering process..."):
             if not self.kb.individuals_set(concept) in non_redundancy_hash_map and \
                     self.min_num_pos_examples <= self.kb.individuals_count(concept):
                 non_redundancy_hash_map[self.kb.individuals_set(concept)] = concept
@@ -140,19 +158,19 @@ def generate_descriptions(self):
         return self
 
     def sample_examples(self, pos, neg):
-        if min(len(pos), len(neg)) >= self.num_examples//2:
+        if min(len(pos), len(neg)) >= self.num_examples // 2:
             if len(pos) > len(neg):
-                num_neg_ex = self.num_examples//2
-                num_pos_ex = self.num_examples-num_neg_ex
+                num_neg_ex = self.num_examples // 2
+                num_pos_ex = self.num_examples - num_neg_ex
             else:
-                num_pos_ex = self.num_examples//2
-                num_neg_ex = self.num_examples-num_pos_ex
+                num_pos_ex = self.num_examples // 2
+                num_neg_ex = self.num_examples - num_pos_ex
         elif len(pos) > len(neg):
             num_neg_ex = len(neg)
-            num_pos_ex = self.num_examples-num_neg_ex
+            num_pos_ex = self.num_examples - num_neg_ex
         elif len(pos) < len(neg):
             num_pos_ex = len(pos)
-            num_neg_ex = self.num_examples-num_pos_ex
+            num_neg_ex = self.num_examples - num_pos_ex
         positive = random.sample(pos, min(num_pos_ex, len(pos)))
         negative = random.sample(neg, min(num_neg_ex, len(neg)))
         return positive, negative
@@ -161,16 +179,17 @@ def save_data(self):
         data = dict()
         for concept in tqdm(self.train_concepts, desc="Sample examples and save data..."):
             pos = set(self.kb.individuals(concept))
-            neg = set(self.kb.individuals())-pos
+            neg = set(self.kb.individuals()) - pos
             if len(neg) == 0:
                 continue
             pos = [ind.str.split("/")[-1] for ind in pos]
             neg = [ind.str.split("/")[-1] for ind in neg]
             positive, negative = self.sample_examples(pos, neg)
             concept_name = self.dl_syntax_renderer.render(concept.get_nnf())
-            data[concept_name] = {'positive examples': positive, 'negative examples': negative}
+            data[concept_name] = {'positive examples': positive, 'negative examples': negative,
+                                  'length': concept_len(concept.get_nnf())}
         data = list(data.items())
         os.makedirs(self.storage_path, exist_ok=True)
-        with open(f'{self.storage_path}/LPs.json', 'w') as file_train:
+        with open(f'{self.storage_path}/LPs.json', 'w', encoding="utf-8") as file_train:
             json.dump(data, file_train, indent=3, ensure_ascii=False)
         print(f'Data saved at {self.storage_path}')
diff --git a/ontolearn/nces_architectures.py b/ontolearn/nces_architectures.py
index 690ce210..a8957955 100644
--- a/ontolearn/nces_architectures.py
+++ b/ontolearn/nces_architectures.py
@@ -28,7 +28,7 @@ class LSTM(nn.Module):
     """LSTM module."""
 
-    def __init__(self, knowledge_base_path, vocab, inv_vocab, max_length, input_size, proj_dim, rnn_n_layers,
+    def __init__(self, vocab, inv_vocab, max_length, input_size, proj_dim, rnn_n_layers,
                  drop_prob):
         super().__init__()
         self.name = 'LSTM'
@@ -60,7 +60,7 @@ def forward(self, x1, x2, target_scores=None):
 class GRU(nn.Module):
     """GRU module."""
 
-    def __init__(self, knowledge_base_path, vocab, inv_vocab, max_length, input_size, proj_dim, rnn_n_layers,
+    def __init__(self, vocab, inv_vocab, max_length, input_size, proj_dim, rnn_n_layers,
                  drop_prob):
         super().__init__()
         self.name = 'GRU'
@@ -92,7 +92,7 @@ def forward(self, x1, x2, target_scores=None):
 class SetTransformer(nn.Module):
     """SetTransformer module."""
 
-    def __init__(self, knowledge_base_path, vocab, inv_vocab, max_length, input_size, proj_dim, num_heads, num_seeds,
+    def __init__(self, vocab, inv_vocab, max_length, input_size, proj_dim, num_heads, num_seeds,
                  m, ln):
         super(SetTransformer, self).__init__()
         self.name = 'SetTransformer'
diff --git a/ontolearn/nces_utils.py b/ontolearn/nces_utils.py
index ddd7cb67..326e5f52 100644
--- a/ontolearn/nces_utils.py
+++ b/ontolearn/nces_utils.py
@@ -141,18 +141,25 @@ def try_get_embs(pos, neg, embeddings, num_examples):
     return pos, neg
 
 
-def generate_training_data(kb_path, max_num_lps=1000, refinement_expressivity=0.2, refs_sample_size=50,
+def generate_training_data(kb_path, kb, max_num_lps=1000, refinement_expressivity=0.2, refs_sample_size=50,
                            beyond_alc=True, storage_path=None):
     if storage_path is None:
         storage_path = "./Training_Data"
-    lp_gen = LPGen(kb_path=kb_path, max_num_lps=max_num_lps, refinement_expressivity=refinement_expressivity,
+    lp_gen = LPGen(kb_path=kb_path, kb=kb, max_num_lps=max_num_lps, refinement_expressivity=refinement_expressivity,
                    num_sub_roots=refs_sample_size, beyond_alc=beyond_alc, storage_path=storage_path)
     lp_gen.generate()
     print("Loading generated data...")
-    with open(f"{storage_path}/LPs.json") as file:
-        lps = json.load(file)
-    if isinstance(lps, dict):
-        lps = list(lps.items())
-    print("Number of learning problems:", len(lps))
+    try:
+        with open(f"{storage_path}/LPs.json") as file:
+            lps = json.load(file)
+        if isinstance(lps, dict):
+            lps = list(lps.items())
+        print("Number of learning problems:", len(lps))
+    except UnicodeDecodeError:
+        with open(f"{storage_path}/LPs.json", encoding='utf-8') as file:
+            lps = json.load(file)
+        if isinstance(lps, dict):
+            lps = list(lps.items())
+        print("Number of learning problems:", len(lps))
     return lps
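Note (not part of this diff): after this change NCES, NCES2 and ROCES are constructed from a KnowledgeBase object rather than a knowledge_base_path string; a minimal sketch, where the OWL file and embedding CSV are illustrative placeholders:

    from ontolearn.knowledge_base import KnowledgeBase
    from ontolearn.concept_learner import NCES

    kb = KnowledgeBase(path="KGs/Family/family.owl")  # illustrative path
    nces = NCES(knowledge_base=kb,  # previously: knowledge_base_path="KGs/Family/family.owl"
                path_of_embeddings="embeddings/family_entity_embeddings.csv",  # illustrative CSV
                load_pretrained=False, auto_train=False, num_workers=2)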