Inference over unseen entities - code #309

Open · wants to merge 1 commit into base: develop
58 changes: 58 additions & 0 deletions dicee/BytE_UMLS/configuration.json
@@ -0,0 +1,58 @@
{
"dataset_dir": "KGs/Countries-S1",
"save_embeddings_as_csv": false,
"storage_path": "Experiments",
"path_to_store_single_run": "BytE_UMLS",
"path_single_kg": null,
"sparql_endpoint": null,
"model": "DistMult",
"optim": "Adam",
"embedding_dim": 64,
"num_epochs": 10,
"batch_size": 1024,
"lr": 0.1,
"add_noise_rate": null,
"gpus": null,
"callbacks": {},
"backend": "pandas",
"trainer": "torchCPUTrainer",
"scoring_technique": "KvsAll",
"neg_ratio": 0,
"weight_decay": 0.0,
"normalization": null,
"init_param": null,
"gradient_accumulation_steps": 0,
"num_folds_for_cv": 0,
"eval_model": "train_val_test",
"save_model_at_every_epoch": null,
"label_smoothing_rate": 0.0,
"num_core": 0,
"random_seed": 0,
"sample_triples_ratio": null,
"read_only_few": null,
"pykeen_model_kwargs": {},
"kernel_size": 3,
"num_of_output_channels": 32,
"p": 0,
"q": 1,
"input_dropout_rate": 0.0,
"hidden_dropout_rate": 0.0,
"feature_map_dropout_rate": 0.0,
"byte_pair_encoding": true,
"adaptive_swa": false,
"swa": false,
"block_size": null,
"continual_learning": null,
"use_custom_tokenizer": true,
"tokenizer_path": "C:/Users/Harshit Purohit/Tokenizer/tokenizer.json",
"use_transformer": true,
"padding": false,
"max_epochs": 10,
"min_epochs": 10,
"learning_rate": 0.1,
"deterministic": true,
"check_val_every_n_epoch": 1000000,
"logger": false,
"apply_reciprical_or_noise": true,
"full_storage_path": "BytE_UMLS"
}
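
For reference, a minimal sketch of how the BytE/tokenizer settings recorded in this run could be inspected programmatically; the relative path is an assumption based on the folder added in this PR.

import json

# Minimal sketch: print the BytE-related settings of this run.
# Path assumes the dicee/BytE_UMLS folder introduced in this PR.
with open("dicee/BytE_UMLS/configuration.json", "r", encoding="utf-8") as f:
    config = json.load(f)

for key in ("model", "embedding_dim", "byte_pair_encoding",
            "use_custom_tokenizer", "tokenizer_path", "use_transformer"):
    print(f"{key}: {config[key]}")
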
Binary file added dicee/BytE_UMLS/ee_vocab.p
Binary file not shown.
11 changes: 11 additions & 0 deletions dicee/BytE_UMLS/epoch_losses.csv
@@ -0,0 +1,11 @@
,EpochLoss
0,0.69423907995224
1,12.179567337036133
2,247.7322540283203
3,24.623878479003906
4,8.132453918457031
5,11.207512855529785
6,10.999801635742188
7,8.120755195617676
8,4.748770713806152
9,5.247491359710693
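
The loss spikes sharply at epoch 2 (about 247.7 with lr=0.1) before settling; a quick way to eyeball the curve, assuming pandas and matplotlib are available and using the CSV path from this PR.

import pandas as pd
import matplotlib.pyplot as plt

# Minimal sketch: plot the per-epoch training loss recorded above.
# The unnamed first column of the CSV is the epoch index.
losses = pd.read_csv("dicee/BytE_UMLS/epoch_losses.csv", index_col=0)
losses["EpochLoss"].plot(marker="o", logy=True)
plt.xlabel("Epoch")
plt.ylabel("Epoch loss (log scale)")
plt.show()
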
Binary file added dicee/BytE_UMLS/er_vocab.p
Binary file not shown.
20 changes: 20 additions & 0 deletions dicee/BytE_UMLS/eval_report.json
@@ -0,0 +1,20 @@
{
"Train": {
"H@1": 0.01485148514851485,
"H@3": 0.028352835283528353,
"H@10": 0.056255625562556255,
"MRR": 0.036501769520952534
},
"Val": {
"H@1": 0.020833333333333332,
"H@3": 0.08333333333333333,
"H@10": 0.08333333333333333,
"MRR": 0.059327546659818764
},
"Test": {
"H@1": 0.041666666666666664,
"H@3": 0.125,
"H@10": 0.125,
"MRR": 0.08945205313350602
}
}
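
For context, H@k and MRR are the usual rank-based link-prediction metrics; the sketch below is only illustrative of how they are computed from per-triple ranks (it is not dicee's evaluation code, and the ranks are hypothetical).

# Illustrative sketch, not dicee's evaluator: H@k and MRR from per-triple ranks.
def hits_at_k(ranks, k):
    return sum(1 for r in ranks if r <= k) / len(ranks)

def mean_reciprocal_rank(ranks):
    return sum(1.0 / r for r in ranks) / len(ranks)

ranks = [1, 4, 12, 2, 30]  # hypothetical ranks of the correct entity
print(hits_at_k(ranks, 10), mean_reciprocal_rank(ranks))
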
Binary file added dicee/BytE_UMLS/model.pt
Binary file not shown.
Binary file added dicee/BytE_UMLS/ordered_bpe_entities.p
Binary file not shown.
Binary file added dicee/BytE_UMLS/ordered_bpe_relations.p
Binary file not shown.
Binary file added dicee/BytE_UMLS/re_vocab.p
Binary file not shown.
11 changes: 11 additions & 0 deletions dicee/BytE_UMLS/report.json
@@ -0,0 +1,11 @@
{
"num_train_triples": 624,
"num_entities": null,
"num_relations": null,
"max_length_subword_tokens": 5,
"runtime_kg_loading": 0.13901090621948242,
"EstimatedSizeMB": 0.1407470703125,
"NumParam": 147584,
"path_experiment_folder": "BytE_UMLS",
"Runtime": 9.463574886322021
}
Binary file added dicee/BytE_UMLS/train_set.npy
Binary file not shown.
86 changes: 86 additions & 0 deletions dicee/Tokenizer/Tokenizer.py
@@ -0,0 +1,86 @@
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import PreTrainedTokenizerFast

def load_dataset_content(dataset_paths):
"""
Load content from datasets for tokenizer training.

Args:
dataset_paths: List of paths to datasets

Returns:
List of text lines for training
"""
all_lines = []

for dataset_path in dataset_paths:
for filename in ["train.txt", "valid.txt", "test.txt"]:
file_path = os.path.join(dataset_path, filename)
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
first_line = f.readline().strip()
if not first_line.isdigit():
all_lines.append(first_line)

for line in f:
line = line.strip()
if line:
all_lines.append(line)

return all_lines

def main():
umls_path = "KGs/UMLS"
countries_s1_path = "KGs/Countries-S1"
countries_s2_path = "KGs/Countries-S2"
countries_s3_path = "KGs/Countries-S3"
kinship_path = "KGs/KINSHIP"
nell_h100 = "KGs/NELL-995-h100"
nell_h75 = "KGs/NELL-995-h75"
nell_h25 = "KGs/NELL-995-h25"
fb_15k_237 = "KGs/FB15k-237"


output_dir = "C:\\Users\\Harshit Purohit\\Tokenizer"

corpus_lines = load_dataset_content([countries_s1_path])
# corpus_lines = load_dataset_content([countries_s1_path])
# corpus_lines = load_dataset_content([countries_s2_path])
# corpus_lines = load_dataset_content([countries_s3_path])

tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
vocab_size=1000000,
min_frequency=2,
max_token_length=100
)

tokenizer.pre_tokenizer = WhitespaceSplit()


tokenizer.train_from_iterator(corpus_lines, trainer)

os.makedirs(output_dir, exist_ok=True)
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

pretrained_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
new_tokens = [" "]
added_count = pretrained_tokenizer.add_tokens(new_tokens)
print(f"Added {added_count} new token(s): {new_tokens}")

pretrained_tokenizer.save_pretrained(output_dir)

print(f"Tokenizer training completed and saved to {output_dir}")

test_text = "ent_1 rel_1 ent_2"
encoded = pretrained_tokenizer.encode(test_text)
print(f"Example encoding: {encoded}")

if __name__ == "__main__":
main()
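
A short usage sketch showing how the saved tokenizer.json can be reloaded later, matching the tokenizer_path and use_custom_tokenizer entries in configuration.json; the Windows path mirrors the one in this PR and would need adjusting on another machine.

from transformers import PreTrainedTokenizerFast

# Minimal sketch: reload the tokenizer saved by the script above and encode a triple.
# The path mirrors "tokenizer_path" in dicee/BytE_UMLS/configuration.json.
tok = PreTrainedTokenizerFast(tokenizer_file="C:/Users/Harshit Purohit/Tokenizer/tokenizer.json")
ids = tok.encode("ent_1 rel_1 ent_2")
print(ids)
print(tok.convert_ids_to_tokens(ids))
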
85 changes: 85 additions & 0 deletions dicee/Tokenizer/Tokenizer_Path/Tokenizer.py
@@ -0,0 +1,85 @@
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from transformers import PreTrainedTokenizerFast

def load_dataset_content(dataset_paths):
"""
Load content from datasets for tokenizer training.

Args:
dataset_paths: List of paths to datasets

Returns:
List of text lines for training
"""
all_lines = []

for dataset_path in dataset_paths:
for filename in ["train.txt", "valid.txt", "test.txt"]:
file_path = os.path.join(dataset_path, filename)
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
# Skip header line if it's just a count (common in knowledge graph datasets)
first_line = f.readline().strip()
if not first_line.isdigit():
all_lines.append(first_line)

# Process rest of the file
for line in f:
line = line.strip()
if line: # Skip empty lines
all_lines.append(line)

return all_lines

def main():
# Define paths to datasets
umls_path = "C:\\Users\\Harshit Purohit\\KGDatasets\\UMLS" # Replace with actual path
kinship_path = "C:\\Users\\Harshit Purohit\\KGDatasets\\Kinship" # Replace with actual path
output_dir = "C:\\Users\\Harshit Purohit\\Tokenizer"

# Load content from both datasets
corpus_lines = load_dataset_content([umls_path, kinship_path])

# Initialize tokenizer with BPE model
tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

# Configure the trainer
trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
vocab_size=10000, # Adjust as needed
min_frequency=2 # Minimum frequency for a token to be included
)

# Set pre-tokenizer to split on whitespace
tokenizer.pre_tokenizer = WhitespaceSplit()

# Train the tokenizer using the iterator of lines
# This completes the missing part in the original code snippet
tokenizer.train_from_iterator(corpus_lines, trainer)

# Save the raw tokenizer
os.makedirs(output_dir, exist_ok=True)
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))

# Convert to PreTrainedTokenizerFast as in the original code
pretrained_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
new_tokens = [" "]
added_count = pretrained_tokenizer.add_tokens(new_tokens)
print(f"Added {added_count} new token(s): {new_tokens}")

pretrained_tokenizer.save_pretrained(output_dir)

print(f"Tokenizer training completed and saved to {output_dir}")

# Example usage
test_text = "ent_1 rel_1 ent_2"
encoded = pretrained_tokenizer.encode(test_text)
print(f"Example encoding: {encoded}")
print(f"Decoded tokens: {pretrained_tokenizer.convert_ids_to_tokens(encoded.ids)}")

if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions dicee/Tokenizer/Tokenizer_Path/special_tokens_map.json
@@ -0,0 +1 @@
{}