Inference over unseen entities - code #309

Open · wants to merge 1 commit into base: develop
58 changes: 58 additions & 0 deletions dicee/BytE_UMLS/configuration.json
@@ -0,0 +1,58 @@
{
"dataset_dir": "KGs/Countries-S1",
"save_embeddings_as_csv": false,
"storage_path": "Experiments",
"path_to_store_single_run": "BytE_UMLS",
"path_single_kg": null,
"sparql_endpoint": null,
"model": "DistMult",
"optim": "Adam",
"embedding_dim": 64,
"num_epochs": 10,
"batch_size": 1024,
"lr": 0.1,
"add_noise_rate": null,
"gpus": null,
"callbacks": {},
"backend": "pandas",
"trainer": "torchCPUTrainer",
"scoring_technique": "KvsAll",
"neg_ratio": 0,
"weight_decay": 0.0,
"normalization": null,
"init_param": null,
"gradient_accumulation_steps": 0,
"num_folds_for_cv": 0,
"eval_model": "train_val_test",
"save_model_at_every_epoch": null,
"label_smoothing_rate": 0.0,
"num_core": 0,
"random_seed": 0,
"sample_triples_ratio": null,
"read_only_few": null,
"pykeen_model_kwargs": {},
"kernel_size": 3,
"num_of_output_channels": 32,
"p": 0,
"q": 1,
"input_dropout_rate": 0.0,
"hidden_dropout_rate": 0.0,
"feature_map_dropout_rate": 0.0,
"byte_pair_encoding": true,
"adaptive_swa": false,
"swa": false,
"block_size": null,
"continual_learning": null,
"use_custom_tokenizer": true,
"tokenizer_path": "C:/Users/Harshit Purohit/Tokenizer/tokenizer.json",
"use_transformer": true,
"padding": false,
"max_epochs": 10,
"min_epochs": 10,
"learning_rate": 0.1,
"deterministic": true,
"check_val_every_n_epoch": 1000000,
"logger": false,
"apply_reciprical_or_noise": true,
"full_storage_path": "BytE_UMLS"
}
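
For reference, a minimal sketch of how the BytE/tokenizer settings recorded in this run could be inspected programmatically; the relative path is an assumption based on the folder added in this PR.

import json

# Minimal sketch: print the BytE-related settings of this run.
# Path assumes the dicee/BytE_UMLS folder introduced in this PR.
with open("dicee/BytE_UMLS/configuration.json", "r", encoding="utf-8") as f:
    config = json.load(f)

for key in ("model", "embedding_dim", "byte_pair_encoding",
            "use_custom_tokenizer", "tokenizer_path", "use_transformer"):
    print(f"{key}: {config[key]}")
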
Binary file added dicee/BytE_UMLS/ee_vocab.p
Binary file not shown.
11 changes: 11 additions & 0 deletions dicee/BytE_UMLS/epoch_losses.csv
@@ -0,0 +1,11 @@
,EpochLoss
0,0.69423907995224
1,12.179567337036133
2,247.7322540283203
3,24.623878479003906
4,8.132453918457031
5,11.207512855529785
6,10.999801635742188
7,8.120755195617676
8,4.748770713806152
9,5.247491359710693
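
The loss spikes sharply at epoch 2 (about 247.7 with lr=0.1) before settling; a quick way to eyeball the curve, assuming pandas and matplotlib are available and using the CSV path from this PR.

import pandas as pd
import matplotlib.pyplot as plt

# Minimal sketch: plot the per-epoch training loss recorded above.
# The unnamed first column of the CSV is the epoch index.
losses = pd.read_csv("dicee/BytE_UMLS/epoch_losses.csv", index_col=0)
losses["EpochLoss"].plot(marker="o", logy=True)
plt.xlabel("Epoch")
plt.ylabel("Epoch loss (log scale)")
plt.show()
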
Binary file added dicee/BytE_UMLS/er_vocab.p
Binary file not shown.
20 changes: 20 additions & 0 deletions dicee/BytE_UMLS/eval_report.json
@@ -0,0 +1,20 @@
{
"Train": {
"H@1": 0.01485148514851485,
"H@3": 0.028352835283528353,
"H@10": 0.056255625562556255,
"MRR": 0.036501769520952534
},
"Val": {
"H@1": 0.020833333333333332,
"H@3": 0.08333333333333333,
"H@10": 0.08333333333333333,
"MRR": 0.059327546659818764
},
"Test": {
"H@1": 0.041666666666666664,
"H@3": 0.125,
"H@10": 0.125,
"MRR": 0.08945205313350602
}
}
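
For context, H@k and MRR are the usual rank-based link-prediction metrics; the sketch below is only illustrative of how they are computed from per-triple ranks (it is not dicee's evaluation code, and the ranks are hypothetical).

# Illustrative sketch, not dicee's evaluator: H@k and MRR from per-triple ranks.
def hits_at_k(ranks, k):
    return sum(1 for r in ranks if r <= k) / len(ranks)

def mean_reciprocal_rank(ranks):
    return sum(1.0 / r for r in ranks) / len(ranks)

ranks = [1, 4, 12, 2, 30]  # hypothetical ranks of the correct entity
print(hits_at_k(ranks, 10), mean_reciprocal_rank(ranks))
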
Binary file added dicee/BytE_UMLS/model.pt
Binary file not shown.
Binary file added dicee/BytE_UMLS/ordered_bpe_entities.p
Binary file not shown.
Binary file added dicee/BytE_UMLS/ordered_bpe_relations.p
Binary file not shown.
Binary file added dicee/BytE_UMLS/re_vocab.p
Binary file not shown.
11 changes: 11 additions & 0 deletions dicee/BytE_UMLS/report.json
@@ -0,0 +1,11 @@
{
"num_train_triples": 624,
"num_entities": null,
"num_relations": null,
"max_length_subword_tokens": 5,
"runtime_kg_loading": 0.13901090621948242,
"EstimatedSizeMB": 0.1407470703125,
"NumParam": 147584,
"path_experiment_folder": "BytE_UMLS",
"Runtime": 9.463574886322021
}
Binary file added dicee/BytE_UMLS/train_set.npy
Binary file not shown.
86 changes: 86 additions & 0 deletions dicee/Tokenizer/Tokenizer.py
@@ -0,0 +1,86 @@
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import PreTrainedTokenizerFast

def load_dataset_content(dataset_paths):
"""
Load content from datasets for tokenizer training.

Args:
dataset_paths: List of paths to datasets

Returns:
List of text lines for training
"""
all_lines = []

for dataset_path in dataset_paths:
for filename in ["train.txt", "valid.txt", "test.txt"]:
file_path = os.path.join(dataset_path, filename)
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
first_line = f.readline().strip()
if not first_line.isdigit():
all_lines.append(first_line)

for line in f:
line = line.strip()
if line:
all_lines.append(line)

return all_lines

def main():
umls_path = "KGs/UMLS"
countries_s1_path = "KGs/Countries-S1"
countries_s2_path = "KGs/Countries-S2"
countries_s3_path = "KGs/Countries-S3"
kinship_path = "KGs/KINSHIP"
nell_h100 = "KGs/NELL-995-h100"
nell_h75 = "KGs/NELL-995-h75"
nell_h25 = "KGs/NELL-995-h25"
fb_15k_237 = "KGs/FB15k-237"


output_dir = "C:\\Users\\Harshit Purohit\\Tokenizer"

corpus_lines = load_dataset_content([countries_s1_path])
# corpus_lines = load_dataset_content([countries_s1_path])
# corpus_lines = load_dataset_content([countries_s2_path])
# corpus_lines = load_dataset_content([countries_s3_path])

tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
vocab_size=1000000,
min_frequency=2,
max_token_length=100
)

tokenizer.pre_tokenizer = WhitespaceSplit()


tokenizer.train_from_iterator(corpus_lines, trainer)

os.makedirs(output_dir, exist_ok=True)
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

pretrained_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
new_tokens = [" "]
added_count = pretrained_tokenizer.add_tokens(new_tokens)
print(f"Added {added_count} new token(s): {new_tokens}")

pretrained_tokenizer.save_pretrained(output_dir)

print(f"Tokenizer training completed and saved to {output_dir}")

test_text = "ent_1 rel_1 ent_2"
encoded = pretrained_tokenizer.encode(test_text)
print(f"Example encoding: {encoded}")

if __name__ == "__main__":
main()
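
A short usage sketch showing how the saved tokenizer.json can be reloaded later, matching the tokenizer_path and use_custom_tokenizer entries in configuration.json; the Windows path mirrors the one in this PR and would need adjusting on another machine.

from transformers import PreTrainedTokenizerFast

# Minimal sketch: reload the tokenizer saved by the script above and encode a triple.
# The path mirrors "tokenizer_path" in dicee/BytE_UMLS/configuration.json.
tok = PreTrainedTokenizerFast(tokenizer_file="C:/Users/Harshit Purohit/Tokenizer/tokenizer.json")
ids = tok.encode("ent_1 rel_1 ent_2")
print(ids)
print(tok.convert_ids_to_tokens(ids))
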
85 changes: 85 additions & 0 deletions dicee/Tokenizer/Tokenizer_Path/Tokenizer.py
@@ -0,0 +1,85 @@
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import PreTrainedTokenizerFast

def load_dataset_content(dataset_paths):
"""
Load content from datasets for tokenizer training.

Args:
dataset_paths: List of paths to datasets

Returns:
List of text lines for training
"""
all_lines = []

for dataset_path in dataset_paths:
for filename in ["train.txt", "valid.txt", "test.txt"]:
file_path = os.path.join(dataset_path, filename)
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
# Skip header line if it's just a count (common in knowledge graph datasets)
first_line = f.readline().strip()
if not first_line.isdigit():
all_lines.append(first_line)

# Process rest of the file
for line in f:
line = line.strip()
if line: # Skip empty lines
all_lines.append(line)

return all_lines

def main():
# Define paths to datasets
umls_path = "C:\\Users\\Harshit Purohit\\KGDatasets\\UMLS" # Replace with actual path
kinship_path = "C:\\Users\\Harshit Purohit\\KGDatasets\\Kinship" # Replace with actual path
output_dir = "C:\\Users\\Harshit Purohit\\Tokenizer"

# Load content from both datasets
corpus_lines = load_dataset_content([umls_path, kinship_path])

# Initialize tokenizer with BPE model
tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

# Configure the trainer
trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
vocab_size=10000, # Adjust as needed
min_frequency=2 # Minimum frequency for a token to be included
)

# Set pre-tokenizer to split on whitespace
tokenizer.pre_tokenizer = WhitespaceSplit()

# Train the tokenizer using the iterator of lines
# This completes the missing part in the original code snippet
tokenizer.train_from_iterator(corpus_lines, trainer)

# Save the raw tokenizer
os.makedirs(output_dir, exist_ok=True)
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

# Convert to PreTrainedTokenizerFast as in the original code
pretrained_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
new_tokens = [" "]
added_count = pretrained_tokenizer.add_tokens(new_tokens)
print(f"Added {added_count} new token(s): {new_tokens}")

pretrained_tokenizer.save_pretrained(output_dir)

print(f"Tokenizer training completed and saved to {output_dir}")

# Example usage
test_text = "ent_1 rel_1 ent_2"
encoded = pretrained_tokenizer.encode(test_text)
print(f"Example encoding: {encoded}")
print(f"Decoded tokens: {pretrained_tokenizer.convert_ids_to_tokens(encoded.ids)}")

if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions dicee/Tokenizer/Tokenizer_Path/special_tokens_map.json
@@ -0,0 +1 @@
{}