Skip to content

Commit

Permalink
LoreNexus wrapper class for models. Proper responsibility handling. U…
Browse files Browse the repository at this point in the history
…pdates #9. Fixes #11
  • Loading branch information
geru-scotland committed Nov 9, 2024
1 parent eaed1f3 commit ab55c37
Show file tree
Hide file tree
Showing 23 changed files with 117,391 additions and 87,484 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,6 @@ Desktop.ini

# Temp files
tmp/
temp/
temp/

__pycache__/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<h1>LoreNexus</h1>
</div>
<div align="center">
<img src="images/LoreNexus.png" width="400" alt="PyAtlas">
<img src="images/LoreNexus.png" width="600" alt="LoreNexus">
</div>

## Descripción
Expand Down
Empty file added __init__.py
Empty file.
14 changes: 13 additions & 1 deletion dataset/preprocessing/apis/mythology/mythdata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
import os
"""
*****************************************************
* Universidad del País Vasco (UPV/EHU)
* Facultad de Informática - Donostia-San Sebastián
* Asignatura: Procesamiento de Lenguaje Natural
* Proyecto: Lore Nexus
*
* File: mythdata.py
* Author: geru-scotland
* GitHub: https://github.com/geru-scotland
* Description:
*****************************************************
"""
import re
import unicodedata
from enum import Enum
Expand Down
66 changes: 63 additions & 3 deletions dataset/preprocessing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,24 @@
*****************************************************
"""
import json
import random
from collections import defaultdict
from enum import Enum

from textattack.augmentation import Augmenter
from textattack.transformations.word_swaps.word_swap_neighboring_character_swap import WordSwapNeighboringCharacterSwap

class MythLabels(Enum):
    """
    Shared labels for the preprocessing pipeline.

    TODO: move these enums into a shared-defines style module.
    """

    # Umbrella label attached to every mythology-sourced name.
    MAIN_LABEL = "Mythology"

    def __str__(self):
        # Render as the bare label text so the enum can be interpolated
        # directly into dataset lines (e.g. f"__label__{MythLabels.MAIN_LABEL}").
        return f"{self.value}"

class DataProcessor:
def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
self.config_path = config_path
self.output_path = output_path
self.allowed_labels = self._load_labels(config_path)
Expand Down Expand Up @@ -82,8 +94,12 @@ def augment(self):
name = name.strip().lower() # OJO: Estoy pasando todo a minusculas para reducir el espacio OJO EN TEST!!
name_parts = name.split()

# Multiples palabras por nombre, este tiene que ir, si o si.
augmented_data.append(line.strip().lower())
# La original, este tiene que ir, si o si.
augmented_data.append(f"{label} {name}")

# No quiero aumentar mythology, por ahora se come a todas si no
if label == f"__label__{MythLabels.MAIN_LABEL}":
continue

# Y ahora, si el nombre tiene más de una palabra, la agrego
# como instancia también, CREO que puede ayudar.
Expand Down Expand Up @@ -111,6 +127,7 @@ def run_pipeline(self):
# no creo que sea bueno...
#unique_data = list(set(augmented_data))
self._save_data(augmented_data)
self._stratify_data(augmented_data)

def augment(self):
augmenter = self.DataAugmentator(self)
Expand All @@ -121,6 +138,49 @@ def _save_data(self, data):
for line in data:
file.write(line + '\n')

def _stratify_data(self, data):
random.seed(42)

data_by_label = defaultdict(list)
with open('output/dataset.txt', 'r', encoding='utf-8') as file:
lines = file.readlines()
# Organizo por label, para dividir equitativamente después, si no lo que me ha
# ocurrido es que había alguna label en dev y test que no estaba en train
for line in lines:
label = line.split()[0]
data_by_label[label].append(line)

# 80% train, 10% dev/val, 10% test
train_lines, dev_lines, test_lines = [], [], []
for label, items in data_by_label.items():
random.shuffle(items)

total_items = len(items)
train_size = int(0.88 * total_items)
dev_size = int(0.12 * total_items)
# test_size = total_items - train_size - dev_size

train_lines.extend(items[:train_size])
dev_lines.extend(items[train_size:train_size + dev_size])
# test_lines.extend(items[train_size + dev_size:])

random.shuffle(train_lines)
random.shuffle(dev_lines)
# random.shuffle(test_lines)

try:
with open('output/train.txt', 'w', encoding='utf-8') as file:
file.writelines(train_lines)

with open('output/dev.txt', 'w', encoding='utf-8') as file:
file.writelines(dev_lines)

with open('output/test.txt', 'w', encoding='utf-8') as file:
file.writelines(test_lines)
except FileNotFoundError:
print("Error saving stratified data.")



# Script entry point: build the processor and run the full pipeline
# (load -> augment -> save -> stratify) immediately.
# NOTE(review): this executes at import time as well; consider wrapping in
# `if __name__ == "__main__":` so importing this module does not trigger
# the whole pipeline — confirm no caller relies on the import-time run.
dataprocessor = DataProcessor()
dataprocessor.run_pipeline()
Loading

0 comments on commit ab55c37

Please sign in to comment.