-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
…. Updates #9
- Loading branch information
1 parent
149a34c
commit 899834e
Showing
10 changed files
with
149 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
{
    "datasets": [
        {
            "name": "Wikidata",
            "path": "dataset/preprocessing/apis/wikidata",
            "input_folder": "raw_data",
            "output_folder": "processed_data",
            "dataset_file": "wikidata-universes.csv",
            "processed_file": "wikidata_dataset_FastText.txt"
        },
        {
            "name": "Mythdata",
            "path": "dataset/preprocessing/apis/mythology",
            "input_folder": "raw_data",
            "output_folder": "processed_data",
            "dataset_file": "myth_dataset.csv",
            "output_file": "myth_dataset.txt",
            "processed_file": "myth_dataset.txt"
        }
    ],
    "data_processor": {
        "path": "dataset/preprocessing",
        "output_folder": "output",
        "output_file": "dataset.txt",
        "train_file": "train.txt",
        "dev_file": "dev.txt",
        "test_file": "test.txt",
        "labels": [
            "HarryPotter",
            "StarWars",
            "Tolkien",
            "Warcraft",
            "DragonBall",
            "Naruto",
            "ForgottenRealms",
            "FinalFantasy",
            "GameofThrones",
            "Mythology"
        ]
    }
}
Empty file.
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
""" | ||
***************************************************** | ||
* Universidad del País Vasco (UPV/EHU) | ||
* Facultad de Informática - Donostia-San Sebastián | ||
* Asignatura: Procesamiento de Lenguaje Natural | ||
* Proyecto: Lore Nexus | ||
* | ||
* File: pipeline.py | ||
* Author: geru-scotland | ||
* GitHub: https://github.com/geru-scotland | ||
* Description: | ||
***************************************************** | ||
""" | ||
|
||
import json | ||
|
||
from dataset.preprocessing.apis.mythology.mythdata import MythdataProcessor | ||
from dataset.preprocessing.apis.wikidata.wikidata import WikidataProcessor, DatasetFormats | ||
from dataset.preprocessing.data_processor import DataProcessor | ||
|
||
|
||
class Config:
    """Thin read-only wrapper around the pipeline's JSON configuration file."""

    def __init__(self, config_path):
        # Encoding pinned so the config parses identically on every platform
        # (the platform default is locale-dependent otherwise).
        with open(config_path, 'r', encoding='utf-8') as file:
            self.config = json.load(file)

    def get_dataset(self, name):
        """
        Return the configuration dict of the dataset entry whose "name"
        equals *name*, or None when no entry under "datasets" matches.
        """
        for dataset in self.config.get("datasets", []):
            if dataset["name"] == name:
                return dataset
        return None

    def get_data_processor_config(self):
        """
        Return the "data_processor" section of the configuration,
        or an empty dict when the section is absent.
        """
        return self.config.get("data_processor", {})
|
||
# --- Pipeline driver: load config.json and run every processor in sequence ---
config = Config('config.json')

wikidata_config = config.get_dataset("Wikidata")

if wikidata_config:
    wiki_root = wikidata_config['path']
    wikidata_processor = WikidataProcessor(
        input_file=f"{wiki_root}/{wikidata_config['input_folder']}/{wikidata_config['dataset_file']}",
        output_folder=f"{wiki_root}/{wikidata_config['output_folder']}",
        labels_file=f"{wiki_root}/labels/labels.txt"
    )
    wikidata_processor.process_data(DatasetFormats.FAST_TEXT)


mythdata_config = config.get_dataset("Mythdata")

if mythdata_config:
    myth_root = mythdata_config['path']
    mythdata_processor = MythdataProcessor(
        input_file=f"{myth_root}/{mythdata_config['input_folder']}/{mythdata_config['dataset_file']}",
        output_file=f"{myth_root}/{mythdata_config['output_folder']}/{mythdata_config['output_file']}"
    )
    mythdata_processor.process_data()


data_processor_config = config.get_data_processor_config()
base_path = data_processor_config["path"]

# The four file-name settings are forwarded verbatim from the
# "data_processor" section of the configuration.
data_processor = DataProcessor(
    datasets=config.config["datasets"],
    labels=data_processor_config["labels"],
    output_folder=f"{base_path}/{data_processor_config['output_folder']}",
    **{key: data_processor_config[key]
       for key in ("output_file", "train_file", "dev_file", "test_file")}
)
data_processor.run_pipeline()