
Commit

Implement a centralized data processing pipeline. Closes #7. Updates #8. Updates #9
geru-scotland committed Nov 9, 2024
1 parent 149a34c commit 899834e
Showing 10 changed files with 149 additions and 47 deletions.
42 changes: 42 additions & 0 deletions config.json
@@ -0,0 +1,42 @@
{
  "datasets": [
    {
      "name": "Wikidata",
      "path": "dataset/preprocessing/apis/wikidata",
      "input_folder": "raw_data",
      "output_folder": "processed_data",
      "dataset_file": "wikidata-universes.csv",
      "processed_file": "wikidata_dataset_FastText.txt"
    },
    {
      "name": "Mythdata",
      "path": "dataset/preprocessing/apis/mythology",
      "input_folder": "raw_data",
      "output_folder": "processed_data",
      "dataset_file": "myth_dataset.csv",
      "output_file": "myth_dataset.txt",
      "processed_file": "myth_dataset.txt"
    }
  ],

  "data_processor": {
    "path": "dataset/preprocessing",
    "output_folder": "output",
    "output_file": "dataset.txt",
    "train_file": "train.txt",
    "dev_file": "dev.txt",
    "test_file": "test.txt",
    "labels": [
      "HarryPotter",
      "StarWars",
      "Tolkien",
      "Warcraft",
      "DragonBall",
      "Naruto",
      "ForgottenRealms",
      "FinalFantasy",
      "GameofThrones",
      "Mythology"
    ]
  }
}
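
Each dataset entry above is declarative: the processors no longer hard-code paths, they derive them from these fields. A minimal sketch of how the fields combine into concrete locations (mirroring what pipeline.py and the new DataProcessor do; the variable names here are only illustrative):

import json
import os

with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

wikidata = config["datasets"][0]

# Raw input consumed by the per-source processor:
raw_path = os.path.join(wikidata["path"], wikidata["input_folder"], wikidata["dataset_file"])
# -> dataset/preprocessing/apis/wikidata/raw_data/wikidata-universes.csv

# Processed output later merged by DataProcessor:
processed_path = os.path.join(wikidata["path"], wikidata["output_folder"], wikidata["processed_file"])
# -> dataset/preprocessing/apis/wikidata/processed_data/wikidata_dataset_FastText.txt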
Empty file added dataset/__init__.py
Empty file.
Empty file.
Empty file.
Empty file.
6 changes: 3 additions & 3 deletions dataset/preprocessing/apis/mythology/mythdata.py
@@ -27,7 +27,7 @@ def __str__(self):

# TODO: Create a base class for all the "processors" to inherit from, like this one, the WikiData one, etc.
class MythdataProcessor:
def __init__(self, input_file, output_file='processed_data/myth_dataset.txt'):
def __init__(self, input_file, output_file):
"""
"""
self.input_file = input_file
@@ -71,5 +71,5 @@ def process_data(self):
self.save_processed_data()
print("Data processing completed.")

processor = MythdataProcessor(input_file='raw_data/myth_dataset.csv')
processor.process_data()
# processor = MythdataProcessor(input_file='raw_data/myth_dataset.csv')
# processor.process_data()
Empty file.
26 changes: 11 additions & 15 deletions dataset/preprocessing/apis/wikidata/wikidata.py
@@ -27,12 +27,12 @@ def __str__(self):
return self.value

class WikidataProcessor:
def __init__(self, input_file, output_file='processed_data/wikidata_dataset.csv', labels_file='labels/labels.txt'):
def __init__(self, input_file, output_folder, labels_file):
"""
"""

self.input_file = input_file
self.output_file = output_file
self.output_folder = output_folder
self.labels_file = labels_file
self.df = pd.read_csv(self.input_file)

@@ -73,6 +73,8 @@ def adjust_and_homogenize_labels(text):

def generate_label_list(self):
"""
Simply generates a file with all the unique labels.
Out of curiosity; it is not part of the pipeline process.
"""

self.process_labels()
@@ -128,33 +130,27 @@ def write_to_file(file_path, data, conll_format=False, fast_text_format=False):
for item in data:
file.write(f"{item}\n")

output_file_base, extension = os.path.splitext(self.output_file)

if dataset_format == DatasetFormats.CoNLL:
output_file = os.path.join(self.output_folder, "wikidata_dataset_CoNLL.txt")
names = self.df['itemLabel']
labels = self.df['universeLabel']
output_file = f"{output_file_base}_CoNLL.txt"
write_to_file(output_file, data=[names, labels], conll_format=True)

elif dataset_format == DatasetFormats.SEPARATED_DATA_LABELS:
names_file = f"{output_file_base}_names.txt"
labels_file = f"{output_file_base}_labels.txt"
names_file = os.path.join(self.output_folder, "wikidata_dataset_names.txt")
labels_file = os.path.join(self.output_folder, "wikidata_dataset_labels.txt")
names = self.df['itemLabel']
labels = self.df['universeLabel']
write_to_file(names_file, data=names)
write_to_file(labels_file, data=labels)

elif dataset_format == DatasetFormats.FAST_TEXT:
output_file = os.path.join(self.output_folder, "wikidata_dataset_FastText.txt")
names = self.df['itemLabel']
labels = self.df['universeLabel']
output_file = f"{output_file_base}_FastText.txt"
write_to_file(output_file, data=[names, labels], fast_text_format=True)
else:
self.df.to_csv(self.output_file, index=False)

print(f"Processed data saved to {self.output_file}")

output_file = os.path.join(self.output_folder, "wikidata_dataset.csv")
self.df.to_csv(output_file, index=False)

# Process all the data I obtained from Wikidata
processor = WikidataProcessor(input_file='raw_data/wikidata-universes.csv')
processor.process_data(dataset_format=DatasetFormats.FAST_TEXT)
print(f"Processed data saved to {self.output_folder}")
47 changes: 18 additions & 29 deletions dataset/preprocessing/data_processor.py
@@ -11,6 +11,7 @@
* Description:
*****************************************************
"""
import os
import json
import random
from collections import defaultdict
@@ -30,41 +31,34 @@ def __str__(self):
return self.value

class DataProcessor:
def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
self.config_path = config_path
self.output_path = output_path
self.allowed_labels = self._load_labels(config_path)
self.datasets = self._load_datasets()
def __init__(self, datasets, labels, output_folder, output_file, train_file, dev_file, test_file):
self.datasets = datasets
self.allowed_labels = self._load_labels(labels)
self.data = self._load_data()

def _load_labels(self, config_path):
# Output paths
self.output_path = os.path.join(output_folder, output_file)
self.train_path = os.path.join(output_folder, train_file)
self.dev_path = os.path.join(output_folder, dev_file)
self.test_path = os.path.join(output_folder, test_file)

def _load_labels(self, labels):
"""
"""
allowed_labels = []
with open(config_path, 'r', encoding='utf-8') as config_file:
config = json.load(config_file)

raw_labels = config.get("labels", [])

for label in raw_labels:
allowed_labels.append(f"__label__{label}")
for label in labels:
allowed_labels.append(f"__label__{label}")

return allowed_labels

def _load_datasets(self):
"""
"""
with open(self.config_path, 'r', encoding='utf-8') as config_file:
config = json.load(config_file)
return config.get("datasets", [])

def _load_data(self):
"""
"""
filtered_data = []
for dataset in self.datasets:
dataset_path = os.path.join(dataset["path"], dataset["output_folder"], dataset["processed_file"])
try:
dataset_path = dataset["path"]
with open(dataset_path, 'r', encoding='utf-8') as file:
for line in file:
if any(line.startswith(label) for label in self.allowed_labels):
@@ -153,7 +147,7 @@ def _stratify_data(self, data):
random.seed(42)

data_by_label = defaultdict(list)
with open('output/dataset.txt', 'r', encoding='utf-8') as file:
with open(self.output_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Organize by label so the split is even afterwards; otherwise what
# happened was that some label appeared in dev and test but not in train
@@ -180,18 +174,13 @@
# random.shuffle(test_lines)

try:
with open('output/train.txt', 'w', encoding='utf-8') as file:
with open(self.train_path, 'w', encoding='utf-8') as file:
file.writelines(train_lines)

with open('output/dev.txt', 'w', encoding='utf-8') as file:
with open(self.dev_path, 'w', encoding='utf-8') as file:
file.writelines(dev_lines)

with open('output/test.txt', 'w', encoding='utf-8') as file:
with open(self.test_path, 'w', encoding='utf-8') as file:
file.writelines(test_lines)
except FileNotFoundError:
print("Error saving stratified data.")



dataprocessor = DataProcessor()
dataprocessor.run_pipeline()
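
The per-label grouping above (rather than a single global shuffle) is what guarantees every label shows up in train, dev and test alike. A standalone sketch of that stratification idea, assuming an 80/10/10 split since the actual ratios fall outside the lines shown in this diff:

import random
from collections import defaultdict

def stratified_split(lines, train_ratio=0.8, dev_ratio=0.1, seed=42):
    """Group fastText lines by their __label__ prefix, then split each group."""
    random.seed(seed)
    by_label = defaultdict(list)
    for line in lines:
        by_label[line.split()[0]].append(line)  # first token is "__label__X"

    train_lines, dev_lines, test_lines = [], [], []
    for group in by_label.values():
        random.shuffle(group)
        n_train = int(len(group) * train_ratio)
        n_dev = int(len(group) * dev_ratio)
        train_lines += group[:n_train]
        dev_lines += group[n_train:n_train + n_dev]
        test_lines += group[n_train + n_dev:]
    return train_lines, dev_lines, test_lines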
75 changes: 75 additions & 0 deletions pipeline.py
@@ -0,0 +1,75 @@
"""
*****************************************************
* Universidad del País Vasco (UPV/EHU)
* Facultad de Informática - Donostia-San Sebastián
* Course: Natural Language Processing
* Proyecto: Lore Nexus
*
* File: pipeline.py
* Author: geru-scotland
* GitHub: https://github.com/geru-scotland
* Description:
*****************************************************
"""

import json

from dataset.preprocessing.apis.mythology.mythdata import MythdataProcessor
from dataset.preprocessing.apis.wikidata.wikidata import WikidataProcessor, DatasetFormats
from dataset.preprocessing.data_processor import DataProcessor


class Config:
    def __init__(self, config_path):
        with open(config_path, 'r') as file:
            self.config = json.load(file)

    def get_dataset(self, name):
        """
        """
        for dataset in self.config.get("datasets", []):
            if dataset["name"] == name:
                return dataset
        return None

    def get_data_processor_config(self):
        """
        """
        return self.config.get("data_processor", {})

config = Config('config.json')

wikidata_config = config.get_dataset("Wikidata")

if wikidata_config:
    wikidata_processor = WikidataProcessor(
        input_file=f"{wikidata_config['path']}/{wikidata_config['input_folder']}/{wikidata_config['dataset_file']}",
        output_folder=f"{wikidata_config['path']}/{wikidata_config['output_folder']}",
        labels_file=f"{wikidata_config['path']}/labels/labels.txt"
    )
    wikidata_processor.process_data(DatasetFormats.FAST_TEXT)


mythdata_config = config.get_dataset("Mythdata")

if mythdata_config:
    mythdata_processor = MythdataProcessor(
        input_file=f"{mythdata_config['path']}/{mythdata_config['input_folder']}/{mythdata_config['dataset_file']}",
        output_file=f"{mythdata_config['path']}/{mythdata_config['output_folder']}/{mythdata_config['output_file']}"
    )
    mythdata_processor.process_data()


data_processor_config = config.get_data_processor_config()
base_path = data_processor_config["path"]

data_processor = DataProcessor(
    datasets=config.config["datasets"],
    labels=data_processor_config["labels"],
    output_folder=f"{base_path}/{data_processor_config['output_folder']}",
    output_file=data_processor_config["output_file"],
    train_file=data_processor_config["train_file"],
    dev_file=data_processor_config["dev_file"],
    test_file=data_processor_config["test_file"]
)
data_processor.run_pipeline()
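
With config.json at the repository root (added in this same commit), the whole chain runs by executing pipeline.py from that root, so the relative paths in the config resolve: each per-source processor writes into its processed_data folder, and DataProcessor then merges, filters by the configured labels, and writes dataset.txt, train.txt, dev.txt and test.txt under dataset/preprocessing/output.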
