From 899834edf9db5290bde32d86bfb78ca14a454681 Mon Sep 17 00:00:00 2001
From: geru-scotland
Date: Sat, 9 Nov 2024 16:16:56 +0100
Subject: [PATCH] Implement a centralized data processing pipeline.

Closes #7. Updates #8. Updates #9
---
 config.json                                       | 42 +++++++++++
 dataset/__init__.py                               |  0
 dataset/preprocessing/__init__.py                 |  0
 dataset/preprocessing/apis/__init__.py            |  0
 .../preprocessing/apis/mythology/__init__.py      |  0
 .../preprocessing/apis/mythology/mythdata.py      |  6 +-
 .../preprocessing/apis/wikidata/__init__.py       |  0
 .../preprocessing/apis/wikidata/wikidata.py       | 26 +++----
 dataset/preprocessing/data_processor.py           | 47 +++++-------
 pipeline.py                                       | 75 +++++++++++++++++++
 10 files changed, 149 insertions(+), 47 deletions(-)
 create mode 100644 config.json
 create mode 100644 dataset/__init__.py
 create mode 100644 dataset/preprocessing/__init__.py
 create mode 100644 dataset/preprocessing/apis/__init__.py
 create mode 100644 dataset/preprocessing/apis/mythology/__init__.py
 create mode 100644 dataset/preprocessing/apis/wikidata/__init__.py
 create mode 100644 pipeline.py

diff --git a/config.json b/config.json
new file mode 100644
index 0000000..bf5cac3
--- /dev/null
+++ b/config.json
@@ -0,0 +1,42 @@
+{
+  "datasets": [
+    {
+      "name": "Wikidata",
+      "path": "dataset/preprocessing/apis/wikidata",
+      "input_folder": "raw_data",
+      "output_folder": "processed_data",
+      "dataset_file": "wikidata-universes.csv",
+      "processed_file": "wikidata_dataset_FastText.txt"
+    },
+    {
+      "name": "Mythdata",
+      "path": "dataset/preprocessing/apis/mythology",
+      "input_folder": "raw_data",
+      "output_folder": "processed_data",
+      "dataset_file": "myth_dataset.csv",
+      "output_file": "myth_dataset.txt",
+      "processed_file": "myth_dataset.txt"
+    }
+  ],
+
+  "data_processor": {
+    "path": "dataset/preprocessing",
+    "output_folder": "output",
+    "output_file": "dataset.txt",
+    "train_file": "train.txt",
+    "dev_file": "dev.txt",
+    "test_file": "test.txt",
+    "labels": [
+      "HarryPotter",
+      "StarWars",
+      "Tolkien",
+      "Warcraft",
+      "DragonBall",
+      "Naruto",
+      "ForgottenRealms",
+      "FinalFantasy",
+      "GameofThrones",
+      "Mythology"
+    ]
+  }
+}
diff --git a/dataset/__init__.py b/dataset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/preprocessing/__init__.py b/dataset/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/preprocessing/apis/__init__.py b/dataset/preprocessing/apis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/preprocessing/apis/mythology/__init__.py b/dataset/preprocessing/apis/mythology/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/preprocessing/apis/mythology/mythdata.py b/dataset/preprocessing/apis/mythology/mythdata.py
index 8374bee..14eff3c 100644
--- a/dataset/preprocessing/apis/mythology/mythdata.py
+++ b/dataset/preprocessing/apis/mythology/mythdata.py
@@ -27,7 +27,7 @@ def __str__(self):
 
 # TODO: Crear clase base de la que hereden todos los "processors", como este, el de WikiData, etc.
 class MythdataProcessor:
-    def __init__(self, input_file, output_file='processed_data/myth_dataset.txt'):
+    def __init__(self, input_file, output_file):
         """
         """
         self.input_file = input_file
@@ -71,5 +71,5 @@ def process_data(self):
         self.save_processed_data()
         print("Data processing completed.")
 
-processor = MythdataProcessor(input_file='raw_data/myth_dataset.csv')
-processor.process_data()
+# processor = MythdataProcessor(input_file='raw_data/myth_dataset.csv')
+# processor.process_data()
diff --git a/dataset/preprocessing/apis/wikidata/__init__.py b/dataset/preprocessing/apis/wikidata/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dataset/preprocessing/apis/wikidata/wikidata.py b/dataset/preprocessing/apis/wikidata/wikidata.py
index 3bbc97b..1a1b58a 100644
--- a/dataset/preprocessing/apis/wikidata/wikidata.py
+++ b/dataset/preprocessing/apis/wikidata/wikidata.py
@@ -27,12 +27,12 @@ def __str__(self):
         return self.value
 
 class WikidataProcessor:
-    def __init__(self, input_file, output_file='processed_data/wikidata_dataset.csv', labels_file='labels/labels.txt'):
+    def __init__(self, input_file, output_folder, labels_file):
         """
 
         """
         self.input_file = input_file
-        self.output_file = output_file
+        self.output_folder = output_folder
         self.labels_file = labels_file
 
         self.df = pd.read_csv(self.input_file)
@@ -73,6 +73,8 @@ def adjust_and_homogenize_labels(text):
 
     def generate_label_list(self):
         """
+        Simply generates a file with all the unique labels.
+        Out of curiosity; it is not part of the pipeline process.
         """
 
         self.process_labels()
@@ -128,33 +130,27 @@ def write_to_file(file_path, data, conll_format=False, fast_text_format=False):
                 for item in data:
                     file.write(f"{item}\n")
 
-        output_file_base, extension = os.path.splitext(self.output_file)
-
         if dataset_format == DatasetFormats.CoNLL:
+            output_file = os.path.join(self.output_folder, "wikidata_dataset_CoNLL.txt")
             names = self.df['itemLabel']
             labels = self.df['universeLabel']
-            output_file = f"{output_file_base}_CoNLL.txt"
             write_to_file(output_file, data=[names, labels], conll_format=True)
 
         elif dataset_format == DatasetFormats.SEPARATED_DATA_LABELS:
-            names_file = f"{output_file_base}_names.txt"
-            labels_file = f"{output_file_base}_labels.txt"
+            names_file = os.path.join(self.output_folder, "wikidata_dataset_names.txt")
+            labels_file = os.path.join(self.output_folder, "wikidata_dataset_labels.txt")
             names = self.df['itemLabel']
             labels = self.df['universeLabel']
             write_to_file(names_file, data=names)
             write_to_file(labels_file, data=labels)
 
         elif dataset_format == DatasetFormats.FAST_TEXT:
+            output_file = os.path.join(self.output_folder, "wikidata_dataset_FastText.txt")
             names = self.df['itemLabel']
             labels = self.df['universeLabel']
-            output_file = f"{output_file_base}_FastText.txt"
             write_to_file(output_file, data=[names, labels], fast_text_format=True)
         else:
-            self.df.to_csv(self.output_file, index=False)
-
-        print(f"Processed data saved to {self.output_file}")
-
+            output_file = os.path.join(self.output_folder, "wikidata_dataset.csv")
+            self.df.to_csv(output_file, index=False)
 
-# Proceso todos los datos que he obtenido de Wikidata
-processor = WikidataProcessor(input_file='raw_data/wikidata-universes.csv')
-processor.process_data(dataset_format=DatasetFormats.FAST_TEXT)
\ No newline at end of file
+        print(f"Processed data saved to {self.output_folder}")
diff --git a/dataset/preprocessing/data_processor.py b/dataset/preprocessing/data_processor.py
index 8a2c388..8a2ebec 100644
--- a/dataset/preprocessing/data_processor.py
+++ b/dataset/preprocessing/data_processor.py
@@ -11,6 +11,7 @@ * Description:
 *****************************************************
 """
 
+import os
 import json
 import random
 from collections import defaultdict
@@ -30,41 +31,34 @@ def __str__(self):
         return self.value
 
 class DataProcessor:
-    def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
-        self.config_path = config_path
-        self.output_path = output_path
-        self.allowed_labels = self._load_labels(config_path)
-        self.datasets = self._load_datasets()
+    def __init__(self, datasets, labels, output_folder, output_file, train_file, dev_file, test_file):
+        self.datasets = datasets
+        self.allowed_labels = self._load_labels(labels)
         self.data = self._load_data()
 
-    def _load_labels(self, config_path):
+        # Output paths
+        self.output_path = os.path.join(output_folder, output_file)
+        self.train_path = os.path.join(output_folder, train_file)
+        self.dev_path = os.path.join(output_folder, dev_file)
+        self.test_path = os.path.join(output_folder, test_file)
+
+    def _load_labels(self, labels):
         """
         """
         allowed_labels = []
-        with open(config_path, 'r', encoding='utf-8') as config_file:
-            config = json.load(config_file)
-
-            raw_labels = config.get("labels", [])
-            for label in raw_labels:
-                allowed_labels.append(f"__label__{label}")
+        for label in labels:
+            allowed_labels.append(f"__label__{label}")
 
         return allowed_labels
 
-    def _load_datasets(self):
-        """
-        """
-        with open(self.config_path, 'r', encoding='utf-8') as config_file:
-            config = json.load(config_file)
-            return config.get("datasets", [])
-
     def _load_data(self):
         """
         """
         filtered_data = []
 
         for dataset in self.datasets:
+            dataset_path = os.path.join(dataset["path"], dataset["output_folder"], dataset["processed_file"])
             try:
-                dataset_path = dataset["path"]
                 with open(dataset_path, 'r', encoding='utf-8') as file:
                     for line in file:
                         if any(line.startswith(label) for label in self.allowed_labels):
@@ -153,7 +147,7 @@ def _stratify_data(self, data):
         random.seed(42)
         data_by_label = defaultdict(list)
 
-        with open('output/dataset.txt', 'r', encoding='utf-8') as file:
+        with open(self.output_path, 'r', encoding='utf-8') as file:
             lines = file.readlines()
             # Organizo por label, para dividir equitativamente después, si no lo que me ha
             # ocurrido es que había alguna label en dev y test que no estaba en train
@@ -180,18 +174,13 @@ def _stratify_data(self, data):
         # random.shuffle(test_lines)
 
         try:
-            with open('output/train.txt', 'w', encoding='utf-8') as file:
+            with open(self.train_path, 'w', encoding='utf-8') as file:
                 file.writelines(train_lines)
 
-            with open('output/dev.txt', 'w', encoding='utf-8') as file:
+            with open(self.dev_path, 'w', encoding='utf-8') as file:
                 file.writelines(dev_lines)
 
-            with open('output/test.txt', 'w', encoding='utf-8') as file:
+            with open(self.test_path, 'w', encoding='utf-8') as file:
                 file.writelines(test_lines)
         except FileNotFoundError:
             print("Error saving stratified data.")
-
-
-
-dataprocessor = DataProcessor()
-dataprocessor.run_pipeline()
diff --git a/pipeline.py b/pipeline.py
new file mode 100644
index 0000000..92966c4
--- /dev/null
+++ b/pipeline.py
@@ -0,0 +1,75 @@
+"""
+*****************************************************
+ * Universidad del País Vasco (UPV/EHU)
+ * Facultad de Informática - Donostia-San Sebastián
+ * Course: Procesamiento de Lenguaje Natural
+ * Project: Lore Nexus
+ *
+ * File: pipeline.py
+ * Author: geru-scotland
+ * GitHub: https://github.com/geru-scotland
+ * Description:
+*****************************************************
+"""
+
+import json
+
+from dataset.preprocessing.apis.mythology.mythdata import MythdataProcessor
+from dataset.preprocessing.apis.wikidata.wikidata import WikidataProcessor, DatasetFormats
+from dataset.preprocessing.data_processor import DataProcessor
+
+
+class Config:
+    def __init__(self, config_path):
+        with open(config_path, 'r') as file:
+            self.config = json.load(file)
+
+    def get_dataset(self, name):
+        """
+        """
+        for dataset in self.config.get("datasets", []):
+            if dataset["name"] == name:
+                return dataset
+        return None
+
+    def get_data_processor_config(self):
+        """
+        """
+        return self.config.get("data_processor", {})
+
+config = Config('config.json')
+
+wikidata_config = config.get_dataset("Wikidata")
+
+if wikidata_config:
+    wikidata_processor = WikidataProcessor(
+        input_file=f"{wikidata_config['path']}/{wikidata_config['input_folder']}/{wikidata_config['dataset_file']}",
+        output_folder=f"{wikidata_config['path']}/{wikidata_config['output_folder']}",
+        labels_file=f"{wikidata_config['path']}/labels/labels.txt"
+    )
+    wikidata_processor.process_data(DatasetFormats.FAST_TEXT)
+
+
+mythdata_config = config.get_dataset("Mythdata")
+
+if mythdata_config:
+    mythdata_processor = MythdataProcessor(
+        input_file=f"{mythdata_config['path']}/{mythdata_config['input_folder']}/{mythdata_config['dataset_file']}",
+        output_file=f"{mythdata_config['path']}/{mythdata_config['output_folder']}/{mythdata_config['output_file']}"
+    )
+    mythdata_processor.process_data()
+
+
+data_processor_config = config.get_data_processor_config()
+base_path = data_processor_config["path"]
+
+data_processor = DataProcessor(
+    datasets=config.config["datasets"],
+    labels=data_processor_config["labels"],
+    output_folder=f"{base_path}/{data_processor_config['output_folder']}",
+    output_file=data_processor_config["output_file"],
+    train_file=data_processor_config["train_file"],
+    dev_file=data_processor_config["dev_file"],
+    test_file=data_processor_config["test_file"]
+)
+data_processor.run_pipeline()
\ No newline at end of file
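
The mythdata.py hunk keeps a TODO about extracting a common base class that all processors (WikidataProcessor, MythdataProcessor) would inherit from. A minimal sketch of one possible shape, assuming each processor only needs a raw input file, an output location and a process_data() entry point; the BaseProcessor name and its attributes are hypothetical and not part of this patch:

    from abc import ABC, abstractmethod


    class BaseProcessor(ABC):
        """Hypothetical shared interface for the dataset processors."""

        def __init__(self, input_file, output_folder):
            # Raw source file and destination folder, mirroring the
            # arguments that pipeline.py already builds from config.json.
            self.input_file = input_file
            self.output_folder = output_folder

        @abstractmethod
        def process_data(self):
            """Read the raw input, normalize it and write the processed output."""
            raise NotImplementedError

With a base class along these lines, pipeline.py could iterate over the "datasets" entries in config.json and drive every processor through the same interface instead of special-casing each one.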