Skip to content

Commit

Permalink
LoreNexus wrapper class for models. Proper responsibility handling. U…
Browse files Browse the repository at this point in the history
…pdates #9. Fixes #11
  • Loading branch information
geru-scotland committed Nov 9, 2024
1 parent eaed1f3 commit ab55c37
Show file tree
Hide file tree
Showing 23 changed files with 117,391 additions and 87,484 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,6 @@ Desktop.ini

# Temp files
tmp/
temp/
temp/

__pycache__/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<h1>LoreNexus</h1>
</div>
<div align="center">
<img src="images/LoreNexus.png" width="400" alt="PyAtlas">
<img src="images/LoreNexus.png" width="600" alt="LoreNexus">
</div>

## Descripción
Expand Down
Empty file added __init__.py
Empty file.
14 changes: 13 additions & 1 deletion dataset/preprocessing/apis/mythology/mythdata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
import os
"""
*****************************************************
* Universidad del País Vasco (UPV/EHU)
* Facultad de Informática - Donostia-San Sebastián
* Asignatura: Procesamiento de Lenguaje Natural
* Proyecto: Lore Nexus
*
* File: mythdata.py
* Author: geru-scotland
* GitHub: https://github.com/geru-scotland
* Description:
*****************************************************
"""
import re
import unicodedata
from enum import Enum
Expand Down
66 changes: 63 additions & 3 deletions dataset/preprocessing/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,24 @@
*****************************************************
"""
import json
import random
from collections import defaultdict
from enum import Enum

from textattack.augmentation import Augmenter
from textattack.transformations.word_swaps.word_swap_neighboring_character_swap import WordSwapNeighboringCharacterSwap

class MythLabels(Enum):
    """
    Shared labels for the preprocessing pipeline.

    TODO: move these enums into a shared-defines style module.
    """

    # Umbrella label attached to every mythology-sourced name.
    MAIN_LABEL = "Mythology"

    def __str__(self):
        # Render as the bare label text so the enum can be interpolated
        # directly into dataset lines (e.g. f"__label__{MythLabels.MAIN_LABEL}").
        return f"{self.value}"

class DataProcessor:
def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
self.config_path = config_path
self.output_path = output_path
self.allowed_labels = self._load_labels(config_path)
Expand Down Expand Up @@ -82,8 +94,12 @@ def augment(self):
name = name.strip().lower() # OJO: Estoy pasando todo a minusculas para reducir el espacio OJO EN TEST!!
name_parts = name.split()

# Multiples palabras por nombre, este tiene que ir, si o si.
augmented_data.append(line.strip().lower())
# La original, este tiene que ir, si o si.
augmented_data.append(f"{label} {name}")

# No quiero aumentar mythology, por ahora se come a todas si no
if label == f"__label__{MythLabels.MAIN_LABEL}":
continue

# Y ahora, si el nombre tiene más de una palabra, la agrego
# como instancia también, CREO que puede ayudar.
Expand Down Expand Up @@ -111,6 +127,7 @@ def run_pipeline(self):
# no creo que sea bueno...
#unique_data = list(set(augmented_data))
self._save_data(augmented_data)
self._stratify_data(augmented_data)

def augment(self):
augmenter = self.DataAugmentator(self)
Expand All @@ -121,6 +138,49 @@ def _save_data(self, data):
for line in data:
file.write(line + '\n')

def _stratify_data(self, data):
random.seed(42)

data_by_label = defaultdict(list)
with open('output/dataset.txt', 'r', encoding='utf-8') as file:
lines = file.readlines()
# Organizo por label, para dividir equitativamente después, si no lo que me ha
# ocurrido es que había alguna label en dev y test que no estaba en train
for line in lines:
label = line.split()[0]
data_by_label[label].append(line)

# 80% train, 10% dev/val, 10% test
train_lines, dev_lines, test_lines = [], [], []
for label, items in data_by_label.items():
random.shuffle(items)

total_items = len(items)
train_size = int(0.88 * total_items)
dev_size = int(0.12 * total_items)
# test_size = total_items - train_size - dev_size

train_lines.extend(items[:train_size])
dev_lines.extend(items[train_size:train_size + dev_size])
# test_lines.extend(items[train_size + dev_size:])

random.shuffle(train_lines)
random.shuffle(dev_lines)
# random.shuffle(test_lines)

try:
with open('output/train.txt', 'w', encoding='utf-8') as file:
file.writelines(train_lines)

with open('output/dev.txt', 'w', encoding='utf-8') as file:
file.writelines(dev_lines)

with open('output/test.txt', 'w', encoding='utf-8') as file:
file.writelines(test_lines)
except FileNotFoundError:
print("Error saving stratified data.")



# Script entry point: build the processor and run the full pipeline
# (load -> augment -> save -> stratify) immediately.
# NOTE(review): this executes at import time as well; consider wrapping in
# `if __name__ == "__main__":` so importing this module does not trigger
# the whole pipeline — confirm no caller relies on the import-time run.
dataprocessor = DataProcessor()
dataprocessor.run_pipeline()
Loading

0 comments on commit ab55c37

Please sign in to comment.