
Commit

Implement a centralized data processing pipeline. Closes #7. Updates #8. Updates #9
geru-scotland committed Nov 9, 2024
1 parent 149a34c commit 899834e
Showing 10 changed files with 149 additions and 47 deletions.
42 changes: 42 additions & 0 deletions config.json
@@ -0,0 +1,42 @@
{
  "datasets": [
    {
      "name": "Wikidata",
      "path": "dataset/preprocessing/apis/wikidata",
      "input_folder": "raw_data",
      "output_folder": "processed_data",
      "dataset_file": "wikidata-universes.csv",
      "processed_file": "wikidata_dataset_FastText.txt"
    },
    {
      "name": "Mythdata",
      "path": "dataset/preprocessing/apis/mythology",
      "input_folder": "raw_data",
      "output_folder": "processed_data",
      "dataset_file": "myth_dataset.csv",
      "output_file": "myth_dataset.txt",
      "processed_file": "myth_dataset.txt"
    }
  ],

  "data_processor": {
    "path": "dataset/preprocessing",
    "output_folder": "output",
    "output_file": "dataset.txt",
    "train_file": "train.txt",
    "dev_file": "dev.txt",
    "test_file": "test.txt",
    "labels": [
      "HarryPotter",
      "StarWars",
      "Tolkien",
      "Warcraft",
      "DragonBall",
      "Naruto",
      "ForgottenRealms",
      "FinalFantasy",
      "GameofThrones",
      "Mythology"
    ]
  }
}
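
Each dataset entry above is declarative: the processors no longer hard-code paths, they derive them from these fields. A minimal sketch of how the fields combine into concrete locations (mirroring what pipeline.py and the new DataProcessor do; the variable names here are only illustrative):

import json
import os

with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

wikidata = config["datasets"][0]

# Raw input consumed by the per-source processor:
raw_path = os.path.join(wikidata["path"], wikidata["input_folder"], wikidata["dataset_file"])
# -> dataset/preprocessing/apis/wikidata/raw_data/wikidata-universes.csv

# Processed output later merged by DataProcessor:
processed_path = os.path.join(wikidata["path"], wikidata["output_folder"], wikidata["processed_file"])
# -> dataset/preprocessing/apis/wikidata/processed_data/wikidata_dataset_FastText.txt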
Empty file added dataset/__init__.py
Empty file.
Empty file.
Empty file.
Empty file.
6 changes: 3 additions & 3 deletions dataset/preprocessing/apis/mythology/mythdata.py
@@ -27,7 +27,7 @@ def __str__(self):

# TODO: Create a base class for all the "processors" to inherit from, like this one, the WikiData one, etc.
class MythdataProcessor:
def __init__(self, input_file, output_file='processed_data/myth_dataset.txt'):
def __init__(self, input_file, output_file):
"""
"""
self.input_file = input_file
@@ -71,5 +71,5 @@ def process_data(self):
self.save_processed_data()
print("Data processing completed.")

processor = MythdataProcessor(input_file='raw_data/myth_dataset.csv')
processor.process_data()
# processor = MythdataProcessor(input_file='raw_data/myth_dataset.csv')
# processor.process_data()
Empty file.
26 changes: 11 additions & 15 deletions dataset/preprocessing/apis/wikidata/wikidata.py
@@ -27,12 +27,12 @@ def __str__(self):
return self.value

class WikidataProcessor:
def __init__(self, input_file, output_file='processed_data/wikidata_dataset.csv', labels_file='labels/labels.txt'):
def __init__(self, input_file, output_folder, labels_file):
"""
"""

self.input_file = input_file
self.output_file = output_file
self.output_folder = output_folder
self.labels_file = labels_file
self.df = pd.read_csv(self.input_file)

@@ -73,6 +73,8 @@ def adjust_and_homogenize_labels(text):

def generate_label_list(self):
"""
Simply generates a file with all the unique labels.
Out of curiosity; it is not part of the pipeline process.
"""

self.process_labels()
@@ -128,33 +130,27 @@ def write_to_file(file_path, data, conll_format=False, fast_text_format=False):
for item in data:
file.write(f"{item}\n")

output_file_base, extension = os.path.splitext(self.output_file)

if dataset_format == DatasetFormats.CoNLL:
output_file = os.path.join(self.output_folder, "wikidata_dataset_CoNLL.txt")
names = self.df['itemLabel']
labels = self.df['universeLabel']
output_file = f"{output_file_base}_CoNLL.txt"
write_to_file(output_file, data=[names, labels], conll_format=True)

elif dataset_format == DatasetFormats.SEPARATED_DATA_LABELS:
names_file = f"{output_file_base}_names.txt"
labels_file = f"{output_file_base}_labels.txt"
names_file = os.path.join(self.output_folder, "wikidata_dataset_names.txt")
labels_file = os.path.join(self.output_folder, "wikidata_dataset_labels.txt")
names = self.df['itemLabel']
labels = self.df['universeLabel']
write_to_file(names_file, data=names)
write_to_file(labels_file, data=labels)

elif dataset_format == DatasetFormats.FAST_TEXT:
output_file = os.path.join(self.output_folder, "wikidata_dataset_FastText.txt")
names = self.df['itemLabel']
labels = self.df['universeLabel']
output_file = f"{output_file_base}_FastText.txt"
write_to_file(output_file, data=[names, labels], fast_text_format=True)
else:
self.df.to_csv(self.output_file, index=False)

print(f"Processed data saved to {self.output_file}")

output_file = os.path.join(self.output_folder, "wikidata_dataset.csv")
self.df.to_csv(output_file, index=False)

# Process all the data I obtained from Wikidata
processor = WikidataProcessor(input_file='raw_data/wikidata-universes.csv')
processor.process_data(dataset_format=DatasetFormats.FAST_TEXT)
print(f"Processed data saved to {self.output_folder}")
47 changes: 18 additions & 29 deletions dataset/preprocessing/data_processor.py
@@ -11,6 +11,7 @@
* Description:
*****************************************************
"""
import os
import json
import random
from collections import defaultdict
@@ -30,41 +31,34 @@ def __str__(self):
return self.value

class DataProcessor:
def __init__(self, config_path="config.json", output_path="output/dataset.txt"):
self.config_path = config_path
self.output_path = output_path
self.allowed_labels = self._load_labels(config_path)
self.datasets = self._load_datasets()
def __init__(self, datasets, labels, output_folder, output_file, train_file, dev_file, test_file):
self.datasets = datasets
self.allowed_labels = self._load_labels(labels)
self.data = self._load_data()

def _load_labels(self, config_path):
# Output paths
self.output_path = os.path.join(output_folder, output_file)
self.train_path = os.path.join(output_folder, train_file)
self.dev_path = os.path.join(output_folder, dev_file)
self.test_path = os.path.join(output_folder, test_file)

def _load_labels(self, labels):
"""
"""
allowed_labels = []
with open(config_path, 'r', encoding='utf-8') as config_file:
config = json.load(config_file)

raw_labels = config.get("labels", [])

for label in raw_labels:
allowed_labels.append(f"__label__{label}")
for label in labels:
allowed_labels.append(f"__label__{label}")

return allowed_labels

def _load_datasets(self):
"""
"""
with open(self.config_path, 'r', encoding='utf-8') as config_file:
config = json.load(config_file)
return config.get("datasets", [])

def _load_data(self):
"""
"""
filtered_data = []
for dataset in self.datasets:
dataset_path = os.path.join(dataset["path"], dataset["output_folder"], dataset["processed_file"])
try:
dataset_path = dataset["path"]
with open(dataset_path, 'r', encoding='utf-8') as file:
for line in file:
if any(line.startswith(label) for label in self.allowed_labels):
@@ -153,7 +147,7 @@ def _stratify_data(self, data):
random.seed(42)

data_by_label = defaultdict(list)
with open('output/dataset.txt', 'r', encoding='utf-8') as file:
with open(self.output_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Organize by label so the split is even afterwards; otherwise what
# happened was that some label appeared in dev and test but not in train
@@ -180,18 +174,13 @@
# random.shuffle(test_lines)

try:
with open('output/train.txt', 'w', encoding='utf-8') as file:
with open(self.train_path, 'w', encoding='utf-8') as file:
file.writelines(train_lines)

with open('output/dev.txt', 'w', encoding='utf-8') as file:
with open(self.dev_path, 'w', encoding='utf-8') as file:
file.writelines(dev_lines)

with open('output/test.txt', 'w', encoding='utf-8') as file:
with open(self.test_path, 'w', encoding='utf-8') as file:
file.writelines(test_lines)
except FileNotFoundError:
print("Error saving stratified data.")



dataprocessor = DataProcessor()
dataprocessor.run_pipeline()
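
The per-label grouping above (rather than a single global shuffle) is what guarantees every label shows up in train, dev and test alike. A standalone sketch of that stratification idea, assuming an 80/10/10 split since the actual ratios fall outside the lines shown in this diff:

import random
from collections import defaultdict

def stratified_split(lines, train_ratio=0.8, dev_ratio=0.1, seed=42):
    """Group fastText lines by their __label__ prefix, then split each group."""
    random.seed(seed)
    by_label = defaultdict(list)
    for line in lines:
        by_label[line.split()[0]].append(line)  # first token is "__label__X"

    train_lines, dev_lines, test_lines = [], [], []
    for group in by_label.values():
        random.shuffle(group)
        n_train = int(len(group) * train_ratio)
        n_dev = int(len(group) * dev_ratio)
        train_lines += group[:n_train]
        dev_lines += group[n_train:n_train + n_dev]
        test_lines += group[n_train + n_dev:]
    return train_lines, dev_lines, test_lines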
75 changes: 75 additions & 0 deletions pipeline.py
@@ -0,0 +1,75 @@
"""
*****************************************************
* Universidad del País Vasco (UPV/EHU)
* Facultad de Informática - Donostia-San Sebastián
* Course: Natural Language Processing
* Proyecto: Lore Nexus
*
* File: pipeline.py
* Author: geru-scotland
* GitHub: https://github.com/geru-scotland
* Description:
*****************************************************
"""

import json

from dataset.preprocessing.apis.mythology.mythdata import MythdataProcessor
from dataset.preprocessing.apis.wikidata.wikidata import WikidataProcessor, DatasetFormats
from dataset.preprocessing.data_processor import DataProcessor


class Config:
    def __init__(self, config_path):
        with open(config_path, 'r') as file:
            self.config = json.load(file)

    def get_dataset(self, name):
        """
        """
        for dataset in self.config.get("datasets", []):
            if dataset["name"] == name:
                return dataset
        return None

    def get_data_processor_config(self):
        """
        """
        return self.config.get("data_processor", {})

config = Config('config.json')

wikidata_config = config.get_dataset("Wikidata")

if wikidata_config:
    wikidata_processor = WikidataProcessor(
        input_file=f"{wikidata_config['path']}/{wikidata_config['input_folder']}/{wikidata_config['dataset_file']}",
        output_folder=f"{wikidata_config['path']}/{wikidata_config['output_folder']}",
        labels_file=f"{wikidata_config['path']}/labels/labels.txt"
    )
    wikidata_processor.process_data(DatasetFormats.FAST_TEXT)


mythdata_config = config.get_dataset("Mythdata")

if mythdata_config:
    mythdata_processor = MythdataProcessor(
        input_file=f"{mythdata_config['path']}/{mythdata_config['input_folder']}/{mythdata_config['dataset_file']}",
        output_file=f"{mythdata_config['path']}/{mythdata_config['output_folder']}/{mythdata_config['output_file']}"
    )
    mythdata_processor.process_data()


data_processor_config = config.get_data_processor_config()
base_path = data_processor_config["path"]

data_processor = DataProcessor(
    datasets=config.config["datasets"],
    labels=data_processor_config["labels"],
    output_folder=f"{base_path}/{data_processor_config['output_folder']}",
    output_file=data_processor_config["output_file"],
    train_file=data_processor_config["train_file"],
    dev_file=data_processor_config["dev_file"],
    test_file=data_processor_config["test_file"]
)
data_processor.run_pipeline()
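
With config.json at the repository root (added in this same commit), the whole chain runs by executing pipeline.py from that root, so the relative paths in the config resolve: each per-source processor writes into its processed_data folder, and DataProcessor then merges, filters by the configured labels, and writes dataset.txt, train.txt, dev.txt and test.txt under dataset/preprocessing/output.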
