diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 36fb433e..8eb34acb 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -16,13 +16,13 @@ TabularDataset ArrowDataset -------------------- -.. autoclass:: podium.arrow.ArrowDataset +.. autoclass:: podium.datasets.ArrowDataset :members: :no-undoc-members: HuggingFaceDatasetConverter --------------------------- -.. autoclass:: podium.dataload.hf.HFDatasetConverter +.. autoclass:: podium.datasets.hf.HFDatasetConverter :members: :no-undoc-members: @@ -88,31 +88,3 @@ Catacx Datasets .. autoclass:: podium.datasets.impl.CatacxDataset :members: :no-undoc-members: - - -Various helpers for data loading --------------------------------- - -podium.dataload.cornell\_movie\_dialogs module ------------------------------------------------ - -.. automodule:: podium.dataload.cornell_movie_dialogs - :members: - :no-undoc-members: - :show-inheritance: - -podium.dataload.eurovoc module -------------------------------- - -.. automodule:: podium.dataload.eurovoc - :members: - :no-undoc-members: - :show-inheritance: - -podium.dataload.ner\_croatian module -------------------------------------- - -.. automodule:: podium.dataload.ner_croatian - :members: - :no-undoc-members: - :show-inheritance: diff --git a/docs/source/index.rst b/docs/source/index.rst index 854dc3c6..808e9ebd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -53,7 +53,6 @@ The documentation is organized in four parts: under_development model_implementations - metrics Indices and tables ================== diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst deleted file mode 100644 index f5a10552..00000000 --- a/docs/source/metrics.rst +++ /dev/null @@ -1,7 +0,0 @@ - -Metrics -======== -.. automodule:: podium.metrics.metrics - :members: - :no-undoc-members: - :show-inheritance: diff --git a/docs/source/model_implementations.rst b/docs/source/model_implementations.rst index f5d8e8a8..791aae16 100644 --- a/docs/source/model_implementations.rst +++ b/docs/source/model_implementations.rst @@ -2,12 +2,12 @@ Concrete implementations of models ================================== -.. automodule:: podium.models.impl +.. automodule:: podium.experimental.models.impl :members: :no-undoc-members: :show-inheritance: -.. automodule:: podium.models.impl.pytorch +.. automodule:: podium.experimental.models.impl.pytorch :members: :no-undoc-members: :show-inheritance: diff --git a/docs/source/under_development.rst b/docs/source/under_development.rst index 0dca60b4..0dc7ba17 100644 --- a/docs/source/under_development.rst +++ b/docs/source/under_development.rst @@ -3,28 +3,28 @@ Abstractions for model training with Podium Models ------ -.. automodule:: podium.models +.. automodule:: podium.experimental.models :members: :no-undoc-members: :show-inheritance: Pipeline -------- -.. automodule:: podium.pipeline +.. automodule:: podium.experimental.pipeline :members: :no-undoc-members: :show-inheritance: Model selection ---------------- -.. automodule:: podium.model_selection.model_selection +.. automodule:: podium.experimental.model_selection.model_selection :members: :no-undoc-members: :show-inheritance: Model validation ----------------- -.. automodule:: podium.validation +.. 
automodule:: podium.experimental.validation :members: :no-undoc-members: :show-inheritance: diff --git a/docs/source/vectorizers.rst b/docs/source/vectorizers.rst index 79dbbde9..6318b763 100644 --- a/docs/source/vectorizers.rst +++ b/docs/source/vectorizers.rst @@ -2,18 +2,18 @@ Data vectorizers ================ -podium.storage.vectorizers.tfidf module +Tf-Idf vectorizer ---------------------------------------- -.. automodule:: podium.storage.vectorizers.tfidf +.. automodule:: podium.vectorizers.tfidf :members: :no-undoc-members: :show-inheritance: -podium.storage.vectorizers.vectorizer module +Loader classes for dense word vectorizers --------------------------------------------- -.. automodule:: podium.storage.vectorizers.vectorizer +.. automodule:: podium.vectorizers.vectorizer :members: :no-undoc-members: :show-inheritance: @@ -22,7 +22,7 @@ podium.storage.vectorizers.vectorizer module Module contents --------------- -.. automodule:: podium.storage.vectorizers +.. automodule:: podium.vectorizers :members: :no-undoc-members: :show-inheritance: diff --git a/docs/source/vocab_and_fields.rst b/docs/source/vocab_and_fields.rst index 1a23fa4f..f7039429 100644 --- a/docs/source/vocab_and_fields.rst +++ b/docs/source/vocab_and_fields.rst @@ -3,30 +3,30 @@ Vocab and Fields Vocab ----------------- -.. autoclass:: podium.storage.Vocab +.. autoclass:: podium.Vocab :members: :no-undoc-members: Field ------ -.. autoclass:: podium.storage.Field +.. autoclass:: podium.Field :members: :no-undoc-members: MultioutputField ----------------- -.. autoclass:: podium.storage.MultioutputField +.. autoclass:: podium.MultioutputField :members: :no-undoc-members: LabelField ----------- -.. autoclass:: podium.storage.LabelField +.. autoclass:: podium.LabelField :members: :no-undoc-members: MultilabelField ---------------- -.. autoclass:: podium.storage.MultilabelField +.. autoclass:: podium.MultilabelField :members: :no-undoc-members: diff --git a/podium/__init__.py b/podium/__init__.py index bfde6adb..f32e5fa7 100644 --- a/podium/__init__.py +++ b/podium/__init__.py @@ -9,13 +9,15 @@ from .datasets import ( BucketIterator, Dataset, + Example, HierarchicalDataset, HierarchicalDatasetIterator, Iterator, SingleBatchIterator, TabularDataset, ) -from .storage import Field, LabelField, MultilabelField, MultioutputField, Vocab +from .field import Field, LabelField, MultilabelField, MultioutputField +from .vocab import Vocab __name__ = "podium" diff --git a/podium/arrow/__init__.py b/podium/arrow/__init__.py deleted file mode 100644 index a40f6d74..00000000 --- a/podium/arrow/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Package contains PyArrow integrations for Podium. -""" - -from .arrow_tabular_dataset import ArrowDataset diff --git a/podium/dataload/__init__.py b/podium/dataload/__init__.py deleted file mode 100644 index 6aeaf423..00000000 --- a/podium/dataload/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Package contains loaders and converters for datasets in specific formats. -""" diff --git a/podium/dataload/cornell_movie_dialogs.py b/podium/dataload/cornell_movie_dialogs.py deleted file mode 100644 index bcd99127..00000000 --- a/podium/dataload/cornell_movie_dialogs.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -Dataloader for Cornell Movie-Dialogs Corpus, available at -http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html. 
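(For orientation only, not part of the patch: the `podium/__init__.py` and docs hunks above change where Podium's core classes are imported from. A minimal sketch of the old and new import paths, using only the re-exports visible in this diff.)

```python
# Old layout (before this patch):
#   from podium.storage import Field, LabelField, MultilabelField, MultioutputField, Vocab
#   from podium.arrow import ArrowDataset
#   from podium.dataload.hf import HFDatasetConverter

# New layout (after this patch): the Field/Vocab variants and Example are re-exported
# from the package root, and the Arrow/HuggingFace integrations move under podium.datasets.
from podium import Example, Field, LabelField, MultilabelField, MultioutputField, Vocab
from podium.datasets import ArrowDataset
from podium.datasets.hf import HFDatasetConverter
```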
-""" -import os -import re -from collections import namedtuple - -from podium.storage import LargeResource - - -try: - import pandas as pd -except ImportError: - print( - "Problem occured while trying to import pandas. If the library is not " - "installed visit https://pandas.pydata.org/ for more details." - ) - raise - - -CornellMovieDialogsNamedTuple = namedtuple( - "CornellMovieDialogsNamedTuple", - ["titles", "conversations", "lines", "characters", "url"], -) - - -class CornellMovieDialogsLoader: - """ - Class for downloading and parsing the Cornell Movie-Dialogs dataset. - - This class is used for downloading the dataset (if it's not already - downloaded) and parsing the files in the dataset. If it's not already - present LargeResource.BASE_RESOURCE_DIR, the dataset is automatically - downloaded when an instance of the loader is created. The downloaded - resources can be parsed using the load_dataset method. - """ - - URL = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip" - ARCHIVE_TYPE = "zip" - NAME = "cornell_movie_dialogs_corpus" - DATA_FOLDER_NAME = "cornell movie-dialogs corpus" - DELIMITER = " +++$+++ " - ENCODING = "iso-8859-1" - - TITLE_FIELDS = ["movieID", "title", "year", "rating", "votes", "genres"] - TITLE_FILENAME = "movie_titles_metadata.txt" - - CHARACTERS_FIELDS = [ - "characterID", - "character", - "movieID", - "title", - "gender", - "position", - ] - CHARACTERS_FILENAME = "movie_characters_metadata.txt" - - LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"] - LINES_FILENAME = "movie_lines.txt" - - CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"] - CONVERSATIONS_FILENAME = "movie_conversations.txt" - - URL_FIELDS = ["movieID", "title", "url"] - URL_FILENAME = "raw_script_urls.txt" - - def __init__(self): - """ - The constructor will check if the dataset is already been downloaded in - the LargeResource.BASE_RESOURCE_DIR. - - If the dataset is not present, it will atempt to download it. - """ - LargeResource( - **{ - LargeResource.RESOURCE_NAME: CornellMovieDialogsLoader.NAME, - LargeResource.ARCHIVE: CornellMovieDialogsLoader.ARCHIVE_TYPE, - LargeResource.URI: CornellMovieDialogsLoader.URL, - } - ) - - def load_dataset(self): - """ - Loads and parses all the necessary files from the dataset folder. - - Returns - ------- - data : CornellMovieDialogsNamedTuple - tuple that contains dictionaries for 5 types of Cornell movie dialogs data: - titles, conversations, lines, characters and script urls. - Fields for every type are defined in class constants. - """ - titles = self.load_titles() - conversations = self.load_conversations() - lines = self.load_lines() - characters = self.load_characters() - url = self.load_urls() - - return CornellMovieDialogsNamedTuple( - titles=titles, - conversations=conversations, - lines=lines, - characters=characters, - url=url, - ) - - @staticmethod - def _load_file(file_name, fields, columns_hooks=None): - """ - Method loads file from Cornell movie dialogs dataset defined with file - name and fields that are used in the file. 
- - Parameters - ---------- - file_name : str - string containing file path - fields : list(str) - list containing field names - columns_hooks : dict(str, callable) - functions that will be called on columns - variable represents dictionary that maps column name to a function - """ - data_frame = pd.read_csv( - filepath_or_buffer=os.path.join( - LargeResource.BASE_RESOURCE_DIR, - CornellMovieDialogsLoader.NAME, - CornellMovieDialogsLoader.DATA_FOLDER_NAME, - file_name, - ), - sep=re.escape(CornellMovieDialogsLoader.DELIMITER), - encoding=CornellMovieDialogsLoader.ENCODING, - header=None, - names=fields, - engine="python", - ) - if columns_hooks is not None: - for column_name in columns_hooks: - data_frame[column_name] = data_frame[column_name].apply( - columns_hooks[column_name] - ) - return data_frame.to_dict(orient="list") - - def load_titles(self): - """ - Method loads file containing movie titles. - """ - column_hooks = {} - column_hooks["genres"] = lambda s: s.strip("[]''").split("', '") - return self._load_file( - file_name=CornellMovieDialogsLoader.TITLE_FILENAME, - fields=CornellMovieDialogsLoader.TITLE_FIELDS, - columns_hooks=column_hooks, - ) - - def load_conversations(self): - """ - Method loads file containing movie conversations. - """ - column_hooks = {} - column_hooks["utteranceIDs"] = lambda s: s.strip("[]''").split("', '") - return self._load_file( - file_name=CornellMovieDialogsLoader.CONVERSATIONS_FILENAME, - fields=CornellMovieDialogsLoader.CONVERSATIONS_FIELDS, - columns_hooks=column_hooks, - ) - - def load_lines(self): - """ - Method loads file containing movie lines. - """ - return self._load_file( - file_name=CornellMovieDialogsLoader.LINES_FILENAME, - fields=CornellMovieDialogsLoader.LINES_FIELDS, - ) - - def load_characters(self): - """ - Method loads file containing movie characters. - """ - return self._load_file( - file_name=CornellMovieDialogsLoader.CHARACTERS_FILENAME, - fields=CornellMovieDialogsLoader.CHARACTERS_FIELDS, - ) - - def load_urls(self): - """ - Method loads file containing movie script urls. - """ - return self._load_file( - file_name=CornellMovieDialogsLoader.URL_FILENAME, - fields=CornellMovieDialogsLoader.URL_FIELDS, - ) diff --git a/podium/dataload/eurovoc.py b/podium/dataload/eurovoc.py deleted file mode 100644 index 47639cc1..00000000 --- a/podium/dataload/eurovoc.py +++ /dev/null @@ -1,617 +0,0 @@ -""" -Module for loading raw eurovoc dataset. -""" -import glob -import os -import warnings -import xml.etree.ElementTree as ET -from collections import namedtuple -from dataclasses import dataclass -from enum import Enum -from typing import List, Optional, Set - -import dill - -from podium.datasets.impl.eurovoc_dataset import EuroVocDataset -from podium.storage.resources.large_resource import ( - LargeResource, - init_scp_large_resource_from_kwargs, -) - - -try: - import xlrd -except ImportError: - print( - "Problem occured while trying to import xlrd. If the " - "library is not installed visit http://www.python-excel.org/ " - "for more details." - ) - raise - - -Id = int -Document = namedtuple("Document", ["filename", "title", "text"]) - - -class LabelRank(Enum): - """ - Levels of labels in EuroVoc. - """ - - THESAURUS = 3 - MICRO_THESAURUS = 2 - TERM = 1 - - -@dataclass -class Label: - """ - Label in EuroVoc dataset. - - Labels are assigned to documents. One document has multiple labels. - Labels have a hierarchy in which one label can have one or more parents (broader - terms). 
All labels apart from thesaurus rank labels have at least one parent. - Apart from parents, labels can also have similar labels which describe related - areas, but aren't connected by the label hierarchy. - - Attributes - ---------- - name : str - name of the label - id : int - numerical id of the label - direct_parents : list(int) - list of ids of direct parents - similar_terms : list(int) - list of ids of similar terms - rank : LabelRank - rank of the label - thesaurus : int - id of the thesaurus of the label (if the label represents a - thesaurus, it has its own id listed in this field) - micro_thesaurus : int - id of the microthesaurus of the label (if the label represents - a microthesaurus, it has its own id listed in this field) - all_ancestors : set(int) - set of ids of all ancestors of the label in the label hierarchy - """ - - name: str - id: Id - direct_parents: List[Id] - similar_terms: List[Id] - rank: LabelRank - thesaurus: Optional[Id] = None - micro_thesaurus: Optional[Id] = None - all_ancestors: Optional[Set[Id]] = None - - -class EuroVocLoader: - """ - Class for downloading and parsing the EuroVoc dataset. - - This class is used for downloading the EuroVoc dataset (if it's not already - downloaded) and parsing the files in the dataset. If it's not already - present LargeResource.BASE_RESOURCE_DIR, the dataset is automatically - downloaded when an instance of EuroVocLoader is created. The downloaded - resources can be parsed using the load_dataset method. - """ - - URL = "/proj/sci/uisusd/data/eurovoc_data/eurovoc.zip" - EUROVOC_LABELS_FILENAME = "EUROVOC.xml" - CROVOC_LABELS_FILENAME = "CROVOC.xml" - MAPPING_FILENAME = "mapping.xls" - DATASET_DIR = "Data" - DOCUMENT_PATHS = "*.xml" - SCP_HOST = "djurdja.takelab.fer.hr" - ARCHIVE_TYPE = "zip" - NAME = "EuroVocDataset" - - def __init__(self, **kwargs): - """ - Constructor of the EuroVocLoader class. - - The constructor will check if the dataset is already been downloaded in the - LargeResource.BASE_RESOURCE_DIR. If the dataset is not present, it will atempt to - download it. - - kwargs: - SCPLargeResource.SCP_USER_KEY: - Username on the host machine from which the dataset is downloaded. Not - required if the username on the local machine matches the username on the - host. - SCPLargeResource.SCP_PRIVATE_KEY: - Path to the ssh private key eligible to access the host. Not required on - Unix if the private key is stored in the default location. - SCPLargeResource.SCP_PASS_KEY: - Password for the ssh private key (optional). Can be omitted - if the private key is not encrypted. - """ - init_scp_large_resource_from_kwargs( - resource=EuroVocLoader.NAME, - uri=EuroVocLoader.URL, - user_dict=kwargs, - archive=EuroVocLoader.ARCHIVE_TYPE, - scp_host=EuroVocLoader.SCP_HOST, - ) - - def load_dataset(self): - """ - Loads and parses all the necessary files from the dataset folder. 
- - Returns - ------- - tuple: - (EuroVoc label hierarchy, CroVoc label hierarchy, document mapping, - documents) - - EuroVoc label hierarchy : dict(label_id : Label) - CroVoc label hierarchy : dict(label_id : Label) - document mapping : dict(document_id : list of label ids) - documents : list(Document) - """ - eurovoc_label_hierarchy_path = os.path.join( - LargeResource.BASE_RESOURCE_DIR, - EuroVocLoader.NAME, - EuroVocLoader.EUROVOC_LABELS_FILENAME, - ) - eurovoc_labels = EuroVocLoader._parse_label_hierarchy( - eurovoc_label_hierarchy_path - ) - - crovoc_label_hierarchy_path = os.path.join( - LargeResource.BASE_RESOURCE_DIR, - EuroVocLoader.NAME, - EuroVocLoader.CROVOC_LABELS_FILENAME, - ) - crovoc_labels = EuroVocLoader._parse_label_hierarchy(crovoc_label_hierarchy_path) - - document_mapping_path = os.path.join( - LargeResource.BASE_RESOURCE_DIR, - EuroVocLoader.NAME, - EuroVocLoader.MAPPING_FILENAME, - ) - mapping = EuroVocLoader._parse_mappings(document_mapping_path) - - dataset_path = os.path.join( - LargeResource.BASE_RESOURCE_DIR, - EuroVocLoader.NAME, - EuroVocLoader.DATASET_DIR, - EuroVocLoader.DOCUMENT_PATHS, - ) - documents = EuroVocLoader._parse_documents(dataset_path, mapping) - - return eurovoc_labels, crovoc_labels, mapping, documents - - @staticmethod - def _parse_labels_by_name(label_hierarchy_path): - """ - Does the first pass through the label file that maps label names to - label ids. - - The label hierarchy is parsed from an xml file and returned as a dictionary where - keys are label names and values are instances of Label class. This is done - because in the original xml file labels are connected to other labels (e.g. as - parents or similar terms) using their names. We wish to connect them using unique - label ids instead. - - Parameters - ---------- - label_hierarchy_path : path to xml file containing label hierarchy - - Returns - ------- - tuple: - (terms_by_name, microthesaurus_by_name, thesaurus_by_name, labels_by_id) - - terms_by_name : dict(term_name : term_id) - microthesaurus_by_name : dict(microthesaurus_name : microthesaurus_id) - thesaurus_by_name : dict(thesaurus_name : thesaurus_id) - labels_by_id : dict(label_id : Label) - """ - xml_document = label_hierarchy_path - tree = ET.parse(xml_document) - root = tree.getroot() - - # These dictionaries are used in the second pass for replacing string names with - # label ids. Keys are string names and their values are ids. - # Sometimes a term and a thesaurus or a microthesaurus may share the same name, - # that's the reason for separate dictionaries for every label category. - terms_by_name = {} - microthesaurus_by_name = {} - thesaurus_by_name = {} - - # This is the final label list that will eventually be used in the dataset. - labels_by_id = {} - - for child in root: - # If tag 'Podrucje' does not exist, it means this record is a thesaurus. - if child.find("Podrucje") is None and child.find("Potpojmovnik") is None: - rank = LabelRank.THESAURUS - thesaurus = int(_get_text(child, "ID")) - micro_thesaurus = None - - elif ( - child.find("Podrucje") is not None and child.find("Potpojmovnik") is None - ): - # If tag 'Podrucje' exists, but there is not 'Potpojmovnik' tag, it means - # this record is a microthesaurus. 
- rank = LabelRank.MICRO_THESAURUS - thesaurus = _get_text(child, "Podrucje").split(";")[1] - micro_thesaurus = int(_get_text(child, "ID")) - - elif ( - child.find("Podrucje") is not None - and child.find("Potpojmovnik") is not None - ): - # If both 'Podrucje' and 'Potpojmovnik' tags exist, it means this record - # is a term. - rank = LabelRank.TERM - thesaurus = _get_text(child, "Podrucje").split(";")[1] - micro_thesaurus = _get_text(child, "Potpojmovnik") - - else: - raise ValueError( - "Invalid label record. The record contains tag" - " but lacks the tag." - ) - - name = child.find("Odrednica").text.lower().strip() - label_id = int(_get_text(child, "ID")) - - parents = [ - broader_term.text.lower().strip() - for broader_term in child.findall("SiriPojam") - ] - - similar_terms = [ - similar_term.text.lower().strip() - for similar_term in child.findall("SrodniPojam") - ] - - # Here parents, similar terms, thesaurus and micro-thesaurus are all stored - # using string names. In the second pass, these fields will be replaces by - # matching ids. - label = Label( - name=name, - id=label_id, - direct_parents=parents, - similar_terms=similar_terms, - rank=rank, - thesaurus=thesaurus, - micro_thesaurus=micro_thesaurus, - ) - labels_by_id[label_id] = label - - if rank == LabelRank.THESAURUS: - thesaurus_by_name[name] = label - - elif rank == LabelRank.MICRO_THESAURUS: - microthesaurus_by_name[name] = label - - elif rank == LabelRank.TERM: - terms_by_name[name] = label - - return terms_by_name, microthesaurus_by_name, thesaurus_by_name, labels_by_id - - @staticmethod - def _parse_label_hierarchy(label_hierarchy_path): - """ - Parses the label hierarchy. - - The label hierarchy is parsed from an xml file and returned as a dictionary where - keys are label ids and values are instances of Label class. - - Parameters - ---------- - label_hierarchy_path : path to xml file containing label hierarchy - - Returns - ------- - dict: - Dictionary of (key, value) = (label_id, Label) - """ - - ( - terms_by_name, - microthesaurus_by_name, - thesaurus_by_name, - labels_by_id, - ) = EuroVocLoader._parse_labels_by_name(label_hierarchy_path) - - for label_id in labels_by_id: - label = labels_by_id[label_id] - - # Names of the parents are replaced by their ids. - # Parents can only be terms here, never thesaurus of microthesaurus. - label.direct_parents = [ - terms_by_name[parent].id for parent in label.direct_parents - ] - - # Names of the similar terms are replaced by their ids. - # Similar terms can only be terms, never thesaurus of microthesaurus. 
- label.similar_terms = [ - terms_by_name[similar_term].id for similar_term in label.similar_terms - ] - - # If label is not thesaurus, replace its thesaurus name by thesaurus id - if label.rank != LabelRank.THESAURUS: - if label.thesaurus not in thesaurus_by_name: - # Error: thesaurus name does not exist (this shouldn't happen) - warnings.warn( - f"Label {label.id} has a non-existing thesaurus name " - f"assigned: {label.thesaurus}", - RuntimeWarning, - ) - label.thesaurus = None - else: - label.thesaurus = thesaurus_by_name[label.thesaurus].id - - # If label is microthesaurus, then its thesaurus is listed as its parent - if label.rank == LabelRank.MICRO_THESAURUS: - if label.thesaurus: - label.direct_parents.append(label.thesaurus) - - # If label is term, replace its microthesaurus name by its id - if label.rank == LabelRank.TERM: - if label.micro_thesaurus not in microthesaurus_by_name: - # Error: microthesaurus name does not exist (this shouldn't happen) - warnings.warn( - f"Label {label.id} has a non-existing microthesaurus name " - f"assigned: {label.micro_thesaurus}", - RuntimeWarning, - ) - label.micro_thesaurus = None - else: - label.micro_thesaurus = microthesaurus_by_name[ - label.micro_thesaurus - ].id - # if term has no parent term then its microthesaurus is listed as - # its parent - if not label.direct_parents and label.micro_thesaurus: - label.direct_parents.append(label.micro_thesaurus) - - labels_by_id = EuroVocLoader._collect_all_ancestors(labels_by_id) - return labels_by_id - - @staticmethod - def _collect_all_ancestors(label_hierarchy): - """ - Finds and stores the ancestors of all the labels in the label hierarchy. - - Parameters - ---------- - label_hierarchy : dict(int, Label) - Dictionary that maps label_id to Label. - - Returns - ------- - dict: - Dictionary of (key, value) = (label_id, Label) - """ - - new_label_hierarchy = {} - - for label_id in label_hierarchy: - ancestors = EuroVocLoader._get_all_ancestors(label_id, label_hierarchy) - label = label_hierarchy[label_id] - new_label_hierarchy[label_id] = Label( - name=label.name, - id=label.id, - rank=label.rank, - direct_parents=label.direct_parents, - similar_terms=label.similar_terms, - thesaurus=label.thesaurus, - micro_thesaurus=label.micro_thesaurus, - all_ancestors=ancestors, - ) - return new_label_hierarchy - - @staticmethod - def _get_all_ancestors(label_id, label_hierarchy): - """ - Finds and returns the ancestors of the label with the given label_id. - - Parameters - ---------- - label_id : int - - label_hierarchy : dict(int, Label) - Dictionary that maps label_id to Label. - """ - direct_parents = label_hierarchy[label_id].direct_parents - parents = set(direct_parents) - # while the iterations of the loop find new, untraversed parents - while direct_parents: - new_parents = set() - # for each parent, add all its parents to next iteration parents - for label in direct_parents: - label_direct_parents = label_hierarchy[label].direct_parents - new_parents.update( - parent for parent in label_direct_parents if parent not in parents - ) - parents.update(new_parents) - direct_parents = new_parents - return parents - - @staticmethod - def _parse_mappings(mappings_path): - """ - Parses the mappings of documents to labels from a xls file. 
- - Parameters - ---------- - mappings_path : path to mappings in xls format - - Returns - ------- - dict - Dictionary of (key, value) = (document_id, list of label ids) - """ - wb = xlrd.open_workbook(mappings_path) - sheet_0 = wb.sheet_by_index(0) - - mappings = {} - # row zero is table header, data starts from row 1 - row = 1 - while row < sheet_0.nrows: - # cokumn 0 contains document id - document_id = int(sheet_0.cell_value(row, 0)) - label_ids = [] - - while True: - # column 2 contains label id and label name split by semicolon character - # (sometimes this field can be an empty string, in that case we simply - # skip the row) - label = sheet_0.cell_value(row, 2) - if label: - label_id = label.split(";")[0] - label_id = label_id - label_ids.append(int(label_id)) - row += 1 - # If a row has an empty first column, then the row contains another label - # for the previously seen document. When the first column is not empty, - # we have read all the labels for the previous document and need to - # switch to a new document. - if row >= sheet_0.nrows or sheet_0.cell_value(row, 0): - break - - mappings[document_id] = label_ids - return mappings - - @staticmethod - def _parse_documents(path, document_mapping): - """ - Parses xml documents from the given path. - - If the document_id is not present in the given document_mapping dictionary, the - document is not parsed. - - Parameters - ---------- - path : path that specifies all documents to be parsed - document_mapping : dictionary of (key, value) = (document_id, list of label ids) - - Returns - ------- - list - List of parsed documents as Document type objects. - """ - xml_documents = glob.glob(path) - parsed_documents = [] - for doc in xml_documents: - # if there is no mapping to labels for the document, the document can't be - # used in the dataset and it's therefore not parsed - # this happens often because certain categoried of documents are not maped to - # labels - filename = os.path.basename(doc) - document_id = int(os.path.splitext(filename)[0].replace("NN", "")) - if document_id not in document_mapping: - warnings.warn( - f"{document_id} document id not found in document mappings.", - RuntimeWarning, - ) - continue - parsed_doc = EuroVocLoader._parse_document(doc) - # parsed_doc is None if there's been an error on document text extraction - if parsed_doc: - parsed_documents.append(parsed_doc) - return parsed_documents - - @staticmethod - def _parse_document(doc): - """ - Parses the given document from xml. - - Parameters - ---------- - doc : path to document in xml format - - Returns - ------- - Document - Parsed document as instance of Document named tuple. - """ - tree = ET.parse(doc) - root = tree.getroot() - root_children = list(root) - head = root_children[0] - body = root_children[1] - - filename = os.path.basename(doc) - title_text = " ".join([t.text for t in head.iter() if t.text]).strip() - # Proper header should begin with a digit have the following format: - # "document_number date title" - # This is true for 99% of the documents and excpetions are ignored - if title_text and title_text[0].isdigit(): - title_text = " ".join(title_text.split(" ")[2:]) - else: - warnings.warn( - f"{filename} file contains invalid document title: {title_text}", - RuntimeWarning, - ) - title_text = title_text.lower().replace("\r", "").replace("\n", "") - - body_text = [] - for b in body.iter(): - if b.text: - body_text.append(b.text) - # everything after the
tag will end up in tail - if b.tail and b.tail.strip(): - body_text.append(b.tail) - body_text = "\n".join(body_text).lower() - - # If a document is stored as pdf in the database, the extraction process - # generates an xml contaning the following string. Text for these documents is - # not available and they are therefore ignored. - if "postupak ekstrakcije teksta" in body_text: - warnings.warn( - f"{filename} XML file does not contain a valid text", RuntimeWarning - ) - return - - return Document(title=title_text, text=body_text, filename=filename) - - -def _get_text(child, filed_name): - """ - Extracts and returns lowercase striped text from field with the given name. - - Parameters - ---------- - child : Element - Element contaning label record from XML file. - - field_name : str - Name of the field to be extracted. - - Returns - ------- - str : Lowercase striped contents of the field. - """ - return child.find(filed_name).text.lower().strip() - - -def dill_dataset(output_path): - """ - Downloads the EuroVoc dataset (if not already present) and stores the - dataset in a dill file. - - Parameters - ---------- - output_path : str - Path to the file where the dataset instance will be stored. - """ - loader = EuroVocLoader() - eurovoc_labels, crovoc_labels, mapping, documents = loader.load_dataset() - dataset = EuroVocDataset( - documents=documents, - mappings=mapping, - eurovoc_labels=eurovoc_labels, - crovoc_labels=crovoc_labels, - ) - dataset.finalize_fields() - - with open(output_path, "wb") as output_file: - dill.dump(dataset, output_file) diff --git a/podium/dataload/ner_croatian.py b/podium/dataload/ner_croatian.py deleted file mode 100644 index 43cfb3a2..00000000 --- a/podium/dataload/ner_croatian.py +++ /dev/null @@ -1,291 +0,0 @@ -""" -Simple NERCroatian dataset module. -""" -import glob -import os -import xml.etree.ElementTree as ET - -from podium.preproc.tokenizers import get_tokenizer -from podium.storage.resources.large_resource import init_scp_large_resource_from_kwargs - - -class NERCroatianXMLLoader: - """ - Simple croatian NER class. - """ - - URL = "/storage/takepod_data/datasets/CroatianNERDataset.zip" - NAME = "CroatianNERDataset" - SCP_HOST = "djurdja.takelab.fer.hr" - ARCHIVE_TYPE = "zip" - - SENTENCE_DELIMITER_TOKEN = (None, None) - - def __init__( - self, path="downloaded_datasets/", tokenizer="split", tag_schema="IOB", **kwargs - ): - """ - Constructor for Croatian NER dataset. Downloads and extracts the - dataset. - - Parameters - ---------- - path: str - Path to the folder where the dataset should be downloaded or loaded - from if it is already downloaded - tokenizer: str - Word-level tokenizer used to tokenize the input text - tag_schema: str - Tag schema used for constructing the token labels - - supported tag schemas: - - - 'IOB': the label of the beginning token of the entity is - prefixed with 'B-', the remaining tokens that belong to the - same entity are prefixed with 'I-'. The tokens that don't - belong to any named entity are labeled 'O' - - kwargs: - SCPLargeResource.SCP_USER_KEY: - User on the host machine. Not required if the user on the - local machine matches the user on the host machine. - SCPLargeResource.SCP_PRIVATE_KEY: - Path to the ssh private key eligible to access the host - machine. Not required on Unix if the private is in the default - location. - SCPLargeResource.SCP_PASS_KEY: - Password for the ssh private key (optional). Can be omitted - if the private key is not encrypted. 
- """ - self._data_dir = path - self._tokenizer = get_tokenizer(tokenizer) - self._label_resolver = self._get_label_resolver(tag_schema) - init_scp_large_resource_from_kwargs( - resource=NERCroatianXMLLoader.NAME, - uri=NERCroatianXMLLoader.URL, - archive=NERCroatianXMLLoader.ARCHIVE_TYPE, - user_dict=kwargs, - scp_host=NERCroatianXMLLoader.SCP_HOST, - ) - - def load_dataset(self): - """ - Method loads the dataset and returns tokenized NER documents. - - Returns - ------- - tokenized_documents: list of lists of tuples - List of tokenized documents. Each document is represented - as a list of tuples (token, label). The sentences in document are - delimited by tuple (None, None) - """ - source_dir_location = os.path.join(self._data_dir, NERCroatianXMLLoader.NAME) - - tokenized_documents = [] - - for xml_file_path in sorted(glob.glob(source_dir_location + "/*.xml")): - word_label_pairs = self._xml_to_token_label_pairs(xml_file_path) - tokenized_documents.append(word_label_pairs) - - return tokenized_documents - - def _xml_to_token_label_pairs(self, xml_file_path): - """ - Converts the xml file located at the given path to the list of tuples - (token, label) - - Parameters - ---------- - xml_file_path: str - Path to the XML file - - Returns - ------- - token_label_pairs: list of tuples - List of tuples (token, label) - """ - root = ET.parse(xml_file_path).getroot() - - token_label_pairs = [] - - for sentence in root.iter(tag="s"): - for sub_element in sentence.iter(): - - if sub_element.text is not None: - token_label_pairs_subelement = self._tokenize( - sub_element.text.strip(), sub_element - ) - token_label_pairs.extend(token_label_pairs_subelement) - - if sub_element.tail is not None: - token_label_pairs_outside = self._tokenize(sub_element.tail.strip()) - token_label_pairs.extend(token_label_pairs_outside) - - token_label_pairs.append(self.SENTENCE_DELIMITER_TOKEN) - - return token_label_pairs - - def _tokenize(self, text, element=None): - """ - Method tokenizes the text and assigns the labels to the tokens according - to the element's 'type' attribute. - - Parameters - ---------- - text: str - Input text - element: ET.Element - Element with which the text is associated. - - Returns - ------- - token_label_pairs: list of tuples - List of tuples (token, label) - """ - if not text: - return [] - - tokenized_text = self._tokenizer(text) - - token_label_pairs = [] - for index, token in enumerate(tokenized_text): - if element is not None: - label_unprefixed = element.attrib.get("type", None) - label = self._label_resolver(index, label_unprefixed) - else: - label = "O" - token_label_pairs.append((token, label)) - - return token_label_pairs - - def _get_label_resolver(self, tag_schema): - """ - Gets the label resolver associated with the given tag schema. - - Parameters - ---------- - tag_schema: str - Tag schema for label prefixes - - Returns - ------- - label_resolver: callable - Label resolver associated with the given tag schema - """ - if tag_schema == "IOB": - return self._iob_label_resolver - - raise ValueError(f"No label resolver for tag schema {tag_schema} exists.") - - @staticmethod - def _iob_label_resolver(index, label): - """ - A resolver that prefixes the label according to the IOB tag schema. 
- - Parameters - ---------- - index: int - Index of the token/label in the named entity (starts from 0) - label: str - Label of the named entity - - Returns - ------- - Label prefixed with the appropriate prefix according to the IOB - tag schema - """ - if label is None: - return "O" - elif index == 0: - return "B-" + label - return "I-" + label - - -def convert_sequence_to_entities(sequence, text, delimiter="-"): - """ - Converts sequences of the BIO tagging schema to entities. - - Parameters - ---------- - sequence: list(string) - Sequence of tags consisting that start with either B, I, or O. - label: list(string) - Tokenized text that correponds to the tag sequence - - Returns - ------- - entities: list(dict) - List of entities. Each entity is a dict that has four attributes: - name, type, start, and end. Name is a list of tokens from text - that belong to that entity, start denotes the index which starts - the entity, and end is the end index of the entity. - - ```text[entity['start'] : entity['end']]``` retrieves the entity text - - This means that the entity has the following form: - { \ - 'name': list(str), \ - 'type': str, \ - 'start': int, \ - 'end': int \ - } - - - Raises - ------ - ValueError - If the given sequence and text are not of the same length. - """ - entities = [] - state = "start" - current_tag = "N/A" - - if len(text) != len(sequence): - raise ValueError("Sequence and text must be of same length") - - for index, (tag, word) in enumerate(zip(sequence, text)): - # must be either B, I, O - if delimiter in tag: - tag_type, tag_description = tag.split(delimiter) - else: - tag_type = tag[0] - tag_description = "" - - if tag_type == "B" and state == "start": - state = "named_entity" - current_tag = tag_description - # create new entity - entity = {"name": [word], "type": tag_description, "start": index, "end": -1} - entities.append(entity) - - elif tag_type == "B" and state == "named_entity": - state = "named_entity" - # save previous - entities[-1]["end"] = index - # create new one - entity = {"name": [word], "type": tag_description, "start": index, "end": -1} - entities.append(entity) - - elif tag_type == "I" and state == "named_entity": - # I tag has to be after a B tag of the same type - # B-Org I-Org is good, B-Org I-Time is not - # I-Time part of the entity is skipped - if tag_description == current_tag and entities: - entities[-1]["name"].append(word) - - # if it does not match, just close the started entity - elif tag_description != current_tag and entities: - entities[-1]["end"] = index - state = "start" - - elif tag_type == "O" and state == "named_entity": - state = "start" - if entities: - entities[-1]["end"] = index - - elif tag_type == "O": - state = "start" - - if entities and entities[-1]["end"] == -1: - entities[-1]["end"] = len(sequence) - - return entities diff --git a/podium/datasets/__init__.py b/podium/datasets/__init__.py index b4337bab..e25fef0c 100644 --- a/podium/datasets/__init__.py +++ b/podium/datasets/__init__.py @@ -2,8 +2,9 @@ Package contains datasets. 
""" -from .dataset import Dataset, rationed_split, stratified_split -from .dataset_abc import DatasetABC +from .arrow_tabular_dataset import ArrowDataset +from .dataset import Dataset, DatasetBase, rationed_split, stratified_split +from .example_factory import Example, ExampleFactory, ExampleFormat from .hierarhical_dataset import HierarchicalDataset from .impl.catacx_dataset import CatacxDataset from .impl.conllu_dataset import CoNLLUDataset diff --git a/podium/arrow/arrow_tabular_dataset.py b/podium/datasets/arrow_tabular_dataset.py similarity index 99% rename from podium/arrow/arrow_tabular_dataset.py rename to podium/datasets/arrow_tabular_dataset.py index e7a353c7..c20248d3 100644 --- a/podium/arrow/arrow_tabular_dataset.py +++ b/podium/datasets/arrow_tabular_dataset.py @@ -8,9 +8,10 @@ from collections import defaultdict from typing import Any, Dict, Iterable, Iterator, List, Tuple, Union -from podium.datasets import Dataset, DatasetABC -from podium.storage import ExampleFactory, Field, unpack_fields -from podium.storage.example_factory import Example +from podium.field import Field, unpack_fields + +from .dataset import Dataset, DatasetBase +from .example_factory import Example, ExampleFactory try: @@ -35,7 +36,7 @@ def _chunkify(iterable, n): yield chunk -class ArrowDataset(DatasetABC): +class ArrowDataset(DatasetBase): """ Podium dataset implementation which uses PyArrow as its data storage backend. diff --git a/podium/datasets/dataset.py b/podium/datasets/dataset.py index 7fa4d0f3..30aa2d3b 100644 --- a/podium/datasets/dataset.py +++ b/podium/datasets/dataset.py @@ -4,13 +4,287 @@ import copy import itertools import random -from typing import Callable, Iterable, List, Union +from abc import ABC, abstractmethod +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + NamedTuple, + Optional, + Tuple, + Union, + overload, +) -from podium.datasets.dataset_abc import DatasetABC -from podium.storage.example_factory import Example +import numpy as np +from podium.field import Field, unpack_fields -class Dataset(DatasetABC): +from .example_factory import Example + + +FieldType = Optional[Union[Field, List[Field]]] + + +class DatasetBase(ABC): + """ + Abstract base class for all datasets in Podium. + """ + + def __init__(self, fields: Union[Dict[str, FieldType], List[FieldType]]): + self._fields = tuple(unpack_fields(fields)) + + @property + def fields(self) -> Tuple[Field]: + """ + List containing all fields of this dataset. + """ + return self._fields + + @property + def field_dict(self) -> Dict[str, Field]: + """ + Dictionary containing all field names mapping to their respective + Fields. + """ + return {f.name: f for f in self.fields} + + @property + def examples(self) -> List[Example]: + """ + List containing all Examples. + """ + return self._get_examples() + + def __iter__(self) -> Iterator[Example]: + """ + Iterates over all examples in the dataset in order. + + Yields + ------ + Example + Yields examples in the dataset. + """ + for i in range(len(self)): + yield self[i] + + def __getattr__(self, field: Union[str, Field]) -> Iterator[Tuple[Any, Any]]: + """ + Returns an Iterator iterating over values of the field with the given + name for every example in the dataset. + + Parameters + ---------- + field_name : str + The name of the field whose values are to be returned. + + Returns + ------ + an Iterator iterating over values of the field with the given name + for every example in the dataset. 
+ + Raises + ------ + AttributeError + If there is no Field with the given name. + """ + field_name = field.name if isinstance(field, Field) else field + + if field_name in self.field_dict: + + def attr_generator(_dataset, _field_name): + for x in _dataset: + yield x[field_name] + + return attr_generator(self, field_name) + + else: + raise AttributeError(f"Dataset has no field {field_name}.") + + def finalize_fields(self, *datasets: "DatasetBase") -> None: + """ + Builds vocabularies of all the non-eager fields in the dataset, from the + Dataset objects given as \\*args and then finalizes all the fields. + + Parameters + ---------- + \\*datasets + A variable number of DatasetBase objects from which to build the + vocabularies for non-eager fields. If none provided, the + vocabularies are built from this Dataset (self). + """ + + # if there are non-eager fields, we need to build their vocabularies + fields_to_build = [f for f in self.fields if not f.eager and f.use_vocab] + if fields_to_build: + # there can be multiple datasets we want to iterate over + data_sources = [ds for ds in datasets if isinstance(ds, DatasetBase)] + + # use self as a data source if no other given + if not data_sources: + data_sources.append(self) + + # for each example in each dataset, + # update _all_ non-eager fields + for dataset in data_sources: + for example in dataset: + for field in fields_to_build: + _, tokenized = example[field.name] + field.update_vocab(tokenized) + + for field in self.fields: + field.finalize() + + def batch(self) -> Tuple[NamedTuple, NamedTuple]: + """ + Creates an input and target batch containing the whole dataset. The + format of the batch is the same as the batches returned by the. + + Returns + ------- + input_batch, target_batch + Two objects containing the input and target batches over + the whole dataset. + """ + # Imported here because of circular import + from podium.datasets import SingleBatchIterator + + return next(iter(SingleBatchIterator(self, shuffle=False))) + + def sorted(self, key: Callable[[Example], Any], reverse=False) -> "DatasetBase": + """ + Creates a new DatasetBase instance in which all Examples are sorted + according to the value returned by `key`. + + Parameters + ---------- + key: callable + specifies a function of one argument that is used to extract a comparison key + from each Example. + + reverse: bool + If set to True, then the list elements are sorted as if each comparison were + reversed. + + Returns + ------- + DatasetBase + A new DatasetBase instance with sorted Examples. + """ + + def index_key(i): + return key(self[i]) + + indices = list(range(len(self))) + indices.sort(key=index_key, reverse=reverse) + return self[indices] + + def filtered(self, predicate: Callable[[Example], bool]) -> "DatasetBase": + """ + Filters examples with given predicate and returns a new DatasetBase + instance containing those examples. + + Parameters + ---------- + predicate : callable + predicate should be a callable that accepts example as input and returns + true if the example shouldn't be filtered, otherwise returns false + + Returns + ------- + DatasetBase + A new DatasetBase instance containing only the Examples for which `predicate` + returned True. + """ + indices = [i for i, example in enumerate(self) if predicate(example)] + return self[indices] + + def shuffled(self) -> "DatasetBase": + """ + Creates a new DatasetBase instance containing all Examples, but in + shuffled order. 
+ + Returns + ------- + DatasetBase + A new DatasetBase instance containing all Examples, but in shuffled + order. + """ + shuffled_indices = np.random.permutation(len(self)) + return self[shuffled_indices] + + def __repr__(self): + return f"{type(self).__name__}[Size: {len(self)}, Fields: {self.fields}]" + + @abstractmethod + def __len__(self) -> int: + """ + Returns the number of examples in the dataset. + + Returns + ------- + int + The number of examples in the dataset. + """ + pass + + @overload + def __getitem__(self, i: int) -> Example: + ... + + @overload + def __getitem__(self, i: Iterable[int]) -> "DatasetBase": + ... + + @abstractmethod + def __getitem__(self, i: slice) -> "DatasetBase": + """ + Returns an example or a new dataset containing the indexed examples. + + If indexed with an int, only the example at that position will be returned. + If Indexed with a slice or iterable, all examples indexed by the object + will be collected and a new dataset containing only those examples will be + returned. The new dataset will contain copies of the old dataset's fields and + will be identical to the original dataset, with the exception of the example + number and ordering. See wiki for detailed examples. + + Examples in the returned Dataset are the same ones present in the + original dataset. If a complete deep-copy of the dataset, or its slice, + is needed please refer to the `get` method. + + Usage example: + + example = dataset[1] # Indexing by single integer returns a single example + + new_dataset = dataset[1:10] # Multi-indexing returns a new dataset containing + # the indexed examples. + + Parameters + ---------- + i : int or slice or iterable of ints + Index used to index examples. + + Returns + ------- + single example or Dataset + If i is an int, a single example will be returned. + If i is a slice or iterable, a copy of this dataset containing + only the indexed examples will be returned. + """ + pass + + @abstractmethod + def _get_examples(self) -> List[Example]: + """ + Returns a list containing all examples of this dataset. + """ + pass + + +class Dataset(DatasetBase): """ A general purpose container for datasets. A dataset is a shallow wrapper for a list of `Example` classes which store the instance data as well as the @@ -46,7 +320,7 @@ def __init__(self, examples, fields, sort_key=None): def __getitem__( self, i: Union[int, Iterable[int], slice] - ) -> Union["DatasetABC", Example]: + ) -> Union["DatasetBase", Example]: """ Returns an example or a new dataset containing the indexed examples. @@ -186,7 +460,7 @@ def filter(self, predicate, inplace=False): examples=filtered_examples, fields=self.fields, sort_key=self.sort_key ) - def filtered(self, predicate: Callable[[Example], bool]) -> "DatasetABC": + def filtered(self, predicate: Callable[[Example], bool]) -> "DatasetBase": return self.filter(predicate, inplace=False) def split( diff --git a/podium/datasets/dataset_abc.py b/podium/datasets/dataset_abc.py deleted file mode 100644 index 981ce543..00000000 --- a/podium/datasets/dataset_abc.py +++ /dev/null @@ -1,276 +0,0 @@ -from abc import ABC, abstractmethod -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - List, - NamedTuple, - Optional, - Tuple, - Union, - overload, -) - -import numpy as np - -from podium.storage import Example, Field, unpack_fields - - -FieldType = Optional[Union[Field, List[Field]]] - - -class DatasetABC(ABC): - """ - Abstract base class for all datasets in Podium. 
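(A hedged usage sketch of the `DatasetBase` helpers introduced above, not part of the patch. It assumes an already-built dataset is passed in; the field name `"text"` is an assumption for illustration, and the `(raw, tokenized)` shape of `example[field_name]` follows the `finalize_fields` hunk above.)

```python
from podium.datasets import DatasetBase


def inspect(dataset: DatasetBase) -> None:
    # Build vocabularies for any non-eager fields of this dataset, then finalize all fields.
    dataset.finalize_fields()

    example = dataset[0]     # indexing with an int returns a single Example
    subset = dataset[10:20]  # a slice or iterable of ints returns a new dataset

    # example[field_name] is a (raw, tokenized) pair, so [1] selects the tokenized value.
    # "text" is an assumed field name for this sketch.
    by_length = dataset.sorted(key=lambda ex: len(ex["text"][1]))
    non_empty = dataset.filtered(lambda ex: len(ex["text"][1]) > 0)
    reshuffled = dataset.shuffled()

    # One input/target batch covering the whole dataset.
    input_batch, target_batch = dataset.batch()

    # Attribute access iterates the values of the named field over all examples.
    for raw, tokenized in dataset.text:
        print(raw, tokenized)
```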
- """ - - def __init__(self, fields: Union[Dict[str, FieldType], List[FieldType]]): - self._fields = tuple(unpack_fields(fields)) - - @property - def fields(self) -> Tuple[Field]: - """ - List containing all fields of this dataset. - """ - return self._fields - - @property - def field_dict(self) -> Dict[str, Field]: - """ - Dictionary containing all field names mapping to their respective - Fields. - """ - return {f.name: f for f in self.fields} - - @property - def examples(self) -> List[Example]: - """ - List containing all Examples. - """ - return self._get_examples() - - def __iter__(self) -> Iterator[Example]: - """ - Iterates over all examples in the dataset in order. - - Yields - ------ - Example - Yields examples in the dataset. - """ - for i in range(len(self)): - yield self[i] - - def __getattr__(self, field: Union[str, Field]) -> Iterator[Tuple[Any, Any]]: - """ - Returns an Iterator iterating over values of the field with the given - name for every example in the dataset. - - Parameters - ---------- - field_name : str - The name of the field whose values are to be returned. - - Returns - ------ - an Iterator iterating over values of the field with the given name - for every example in the dataset. - - Raises - ------ - AttributeError - If there is no Field with the given name. - """ - field_name = field.name if isinstance(field, Field) else field - - if field_name in self.field_dict: - - def attr_generator(_dataset, _field_name): - for x in _dataset: - yield x[field_name] - - return attr_generator(self, field_name) - - else: - raise AttributeError(f"Dataset has no field {field_name}.") - - def finalize_fields(self, *datasets: "DatasetABC") -> None: - """ - Builds vocabularies of all the non-eager fields in the dataset, from the - Dataset objects given as \\*args and then finalizes all the fields. - - Parameters - ---------- - \\*datasets - A variable number of DatasetABC objects from which to build the - vocabularies for non-eager fields. If none provided, the - vocabularies are built from this Dataset (self). - """ - - # if there are non-eager fields, we need to build their vocabularies - fields_to_build = [f for f in self.fields if not f.eager and f.use_vocab] - if fields_to_build: - # there can be multiple datasets we want to iterate over - data_sources = [ds for ds in datasets if isinstance(ds, DatasetABC)] - - # use self as a data source if no other given - if not data_sources: - data_sources.append(self) - - # for each example in each dataset, - # update _all_ non-eager fields - for dataset in data_sources: - for example in dataset: - for field in fields_to_build: - _, tokenized = example[field.name] - field.update_vocab(tokenized) - - for field in self.fields: - field.finalize() - - def batch(self) -> Tuple[NamedTuple, NamedTuple]: - """ - Creates an input and target batch containing the whole dataset. The - format of the batch is the same as the batches returned by the. - - Returns - ------- - input_batch, target_batch - Two objects containing the input and target batches over - the whole dataset. - """ - # Imported here because of circular import - from podium.datasets import SingleBatchIterator - - return next(iter(SingleBatchIterator(self, shuffle=False))) - - def sorted(self, key: Callable[[Example], Any], reverse=False) -> "DatasetABC": - """ - Creates a new DatasetABC instance in which all Examples are sorted - according to the value returned by `key`. 
- - Parameters - ---------- - key: callable - specifies a function of one argument that is used to extract a comparison key - from each Example. - - reverse: bool - If set to True, then the list elements are sorted as if each comparison were - reversed. - - Returns - ------- - DatasetABC - A new DatasetABC instance with sorted Examples. - """ - - def index_key(i): - return key(self[i]) - - indices = list(range(len(self))) - indices.sort(key=index_key, reverse=reverse) - return self[indices] - - def filtered(self, predicate: Callable[[Example], bool]) -> "DatasetABC": - """ - Filters examples with given predicate and returns a new DatasetABC - instance containing those examples. - - Parameters - ---------- - predicate : callable - predicate should be a callable that accepts example as input and returns - true if the example shouldn't be filtered, otherwise returns false - - Returns - ------- - DatasetABC - A new DatasetABC instance containing only the Examples for which `predicate` - returned True. - """ - indices = [i for i, example in enumerate(self) if predicate(example)] - return self[indices] - - def shuffled(self) -> "DatasetABC": - """ - Creates a new DatasetABC instance containing all Examples, but in - shuffled order. - - Returns - ------- - DatasetABC - A new DatasetABC instance containing all Examples, but in shuffled - order. - """ - shuffled_indices = np.random.permutation(len(self)) - return self[shuffled_indices] - - def __repr__(self): - return f"{type(self).__name__}[Size: {len(self)}, Fields: {self.fields}]" - - @abstractmethod - def __len__(self) -> int: - """ - Returns the number of examples in the dataset. - - Returns - ------- - int - The number of examples in the dataset. - """ - pass - - @overload - def __getitem__(self, i: int) -> Example: - ... - - @overload - def __getitem__(self, i: Iterable[int]) -> "DatasetABC": - ... - - @abstractmethod - def __getitem__(self, i: slice) -> "DatasetABC": - """ - Returns an example or a new dataset containing the indexed examples. - - If indexed with an int, only the example at that position will be returned. - If Indexed with a slice or iterable, all examples indexed by the object - will be collected and a new dataset containing only those examples will be - returned. The new dataset will contain copies of the old dataset's fields and - will be identical to the original dataset, with the exception of the example - number and ordering. See wiki for detailed examples. - - Examples in the returned Dataset are the same ones present in the - original dataset. If a complete deep-copy of the dataset, or its slice, - is needed please refer to the `get` method. - - Usage example: - - example = dataset[1] # Indexing by single integer returns a single example - - new_dataset = dataset[1:10] # Multi-indexing returns a new dataset containing - # the indexed examples. - - Parameters - ---------- - i : int or slice or iterable of ints - Index used to index examples. - - Returns - ------- - single example or Dataset - If i is an int, a single example will be returned. - If i is a slice or iterable, a copy of this dataset containing - only the indexed examples will be returned. - """ - pass - - @abstractmethod - def _get_examples(self) -> List[Example]: - """ - Returns a list containing all examples of this dataset. 
- """ - pass diff --git a/podium/storage/example_factory.py b/podium/datasets/example_factory.py similarity index 100% rename from podium/storage/example_factory.py rename to podium/datasets/example_factory.py diff --git a/podium/dataload/hf.py b/podium/datasets/hf.py similarity index 98% rename from podium/dataload/hf.py rename to podium/datasets/hf.py index 898dcec9..6ad5602d 100644 --- a/podium/dataload/hf.py +++ b/podium/datasets/hf.py @@ -4,7 +4,10 @@ from typing import Dict, Iterator, Optional from podium.datasets import Dataset -from podium.storage import Example, ExampleFactory, Field, LabelField, Vocab +from podium.field import Field, LabelField +from podium.vocab import Vocab + +from .example_factory import Example, ExampleFactory try: diff --git a/podium/datasets/hierarhical_dataset.py b/podium/datasets/hierarhical_dataset.py index 93e248a8..d220177f 100644 --- a/podium/datasets/hierarhical_dataset.py +++ b/podium/datasets/hierarhical_dataset.py @@ -3,8 +3,9 @@ from typing import Optional, Tuple from podium.datasets.dataset import Dataset -from podium.storage import Example, ExampleFactory -from podium.storage.field import unpack_fields +from podium.field import unpack_fields + +from .example_factory import Example, ExampleFactory @dataclass diff --git a/podium/datasets/impl/catacx_comments_dataset.py b/podium/datasets/impl/catacx_comments_dataset.py index 1992153f..0e43d6b0 100644 --- a/podium/datasets/impl/catacx_comments_dataset.py +++ b/podium/datasets/impl/catacx_comments_dataset.py @@ -5,8 +5,8 @@ import os from podium.datasets.dataset import Dataset -from podium.storage import ExampleFactory -from podium.storage.field import Field +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field from podium.storage.resources.large_resource import LargeResource diff --git a/podium/datasets/impl/catacx_dataset.py b/podium/datasets/impl/catacx_dataset.py index d484d8b8..4cd31d36 100644 --- a/podium/datasets/impl/catacx_dataset.py +++ b/podium/datasets/impl/catacx_dataset.py @@ -1,7 +1,9 @@ import os +from podium.datasets.example_factory import ExampleFactory from podium.datasets.hierarhical_dataset import HierarchicalDataset -from podium.storage import ExampleFactory, Field, MultilabelField, Vocab +from podium.field import Field, MultilabelField +from podium.vocab import Vocab class CatacxDataset(HierarchicalDataset): diff --git a/podium/datasets/impl/conllu_dataset.py b/podium/datasets/impl/conllu_dataset.py index f0110207..6a377e28 100644 --- a/podium/datasets/impl/conllu_dataset.py +++ b/podium/datasets/impl/conllu_dataset.py @@ -4,7 +4,9 @@ import collections from podium.datasets import Dataset -from podium.storage import ExampleFactory, Field, Vocab +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field +from podium.vocab import Vocab class CoNLLUDataset(Dataset): diff --git a/podium/datasets/impl/cornell_movie_dialogs_dataset.py b/podium/datasets/impl/cornell_movie_dialogs_dataset.py index 49917cd8..389fe310 100644 --- a/podium/datasets/impl/cornell_movie_dialogs_dataset.py +++ b/podium/datasets/impl/cornell_movie_dialogs_dataset.py @@ -1,11 +1,32 @@ """ -Module contains Cornell Movie Dialogs datasets. +Module contains Cornell Movie-Dialogs Corpus, available at +http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html. 
""" -from podium.dataload.cornell_movie_dialogs import CornellMovieDialogsNamedTuple +import os +import re +from collections import namedtuple + from podium.datasets.dataset import Dataset -from podium.storage.example_factory import ExampleFactory -from podium.storage.field import Field -from podium.storage.vocab import Vocab +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field +from podium.storage import LargeResource +from podium.vocab import Vocab + + +try: + import pandas as pd +except ImportError: + print( + "Problem occured while trying to import pandas. If the library is not " + "installed visit https://pandas.pydata.org/ for more details." + ) + raise + + +CornellMovieDialogsNamedTuple = namedtuple( + "CornellMovieDialogsNamedTuple", + ["titles", "conversations", "lines", "characters", "url"], +) class CornellMovieDialogsConversationalDataset(Dataset): @@ -112,3 +133,171 @@ def get_default_fields(): ) fields = {"statement": statement, "reply": reply} return fields + + +class CornellMovieDialogsLoader: + """ + Class for downloading and parsing the Cornell Movie-Dialogs dataset. + + This class is used for downloading the dataset (if it's not already + downloaded) and parsing the files in the dataset. If it's not already + present LargeResource.BASE_RESOURCE_DIR, the dataset is automatically + downloaded when an instance of the loader is created. The downloaded + resources can be parsed using the load_dataset method. + """ + + URL = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip" + ARCHIVE_TYPE = "zip" + NAME = "cornell_movie_dialogs_corpus" + DATA_FOLDER_NAME = "cornell movie-dialogs corpus" + DELIMITER = " +++$+++ " + ENCODING = "iso-8859-1" + + TITLE_FIELDS = ["movieID", "title", "year", "rating", "votes", "genres"] + TITLE_FILENAME = "movie_titles_metadata.txt" + + CHARACTERS_FIELDS = [ + "characterID", + "character", + "movieID", + "title", + "gender", + "position", + ] + CHARACTERS_FILENAME = "movie_characters_metadata.txt" + + LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"] + LINES_FILENAME = "movie_lines.txt" + + CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"] + CONVERSATIONS_FILENAME = "movie_conversations.txt" + + URL_FIELDS = ["movieID", "title", "url"] + URL_FILENAME = "raw_script_urls.txt" + + def __init__(self): + """ + The constructor will check if the dataset is already been downloaded in + the LargeResource.BASE_RESOURCE_DIR. + + If the dataset is not present, it will atempt to download it. + """ + LargeResource( + **{ + LargeResource.RESOURCE_NAME: CornellMovieDialogsLoader.NAME, + LargeResource.ARCHIVE: CornellMovieDialogsLoader.ARCHIVE_TYPE, + LargeResource.URI: CornellMovieDialogsLoader.URL, + } + ) + + def load_dataset(self): + """ + Loads and parses all the necessary files from the dataset folder. + + Returns + ------- + data : CornellMovieDialogsNamedTuple + tuple that contains dictionaries for 5 types of Cornell movie dialogs data: + titles, conversations, lines, characters and script urls. + Fields for every type are defined in class constants. 
+ """ + titles = self.load_titles() + conversations = self.load_conversations() + lines = self.load_lines() + characters = self.load_characters() + url = self.load_urls() + + return CornellMovieDialogsNamedTuple( + titles=titles, + conversations=conversations, + lines=lines, + characters=characters, + url=url, + ) + + @staticmethod + def _load_file(file_name, fields, columns_hooks=None): + """ + Method loads file from Cornell movie dialogs dataset defined with file + name and fields that are used in the file. + + Parameters + ---------- + file_name : str + string containing file path + fields : list(str) + list containing field names + columns_hooks : dict(str, callable) + functions that will be called on columns + variable represents dictionary that maps column name to a function + """ + data_frame = pd.read_csv( + filepath_or_buffer=os.path.join( + LargeResource.BASE_RESOURCE_DIR, + CornellMovieDialogsLoader.NAME, + CornellMovieDialogsLoader.DATA_FOLDER_NAME, + file_name, + ), + sep=re.escape(CornellMovieDialogsLoader.DELIMITER), + encoding=CornellMovieDialogsLoader.ENCODING, + header=None, + names=fields, + engine="python", + ) + if columns_hooks is not None: + for column_name in columns_hooks: + data_frame[column_name] = data_frame[column_name].apply( + columns_hooks[column_name] + ) + return data_frame.to_dict(orient="list") + + def load_titles(self): + """ + Method loads file containing movie titles. + """ + column_hooks = {} + column_hooks["genres"] = lambda s: s.strip("[]''").split("', '") + return self._load_file( + file_name=CornellMovieDialogsLoader.TITLE_FILENAME, + fields=CornellMovieDialogsLoader.TITLE_FIELDS, + columns_hooks=column_hooks, + ) + + def load_conversations(self): + """ + Method loads file containing movie conversations. + """ + column_hooks = {} + column_hooks["utteranceIDs"] = lambda s: s.strip("[]''").split("', '") + return self._load_file( + file_name=CornellMovieDialogsLoader.CONVERSATIONS_FILENAME, + fields=CornellMovieDialogsLoader.CONVERSATIONS_FIELDS, + columns_hooks=column_hooks, + ) + + def load_lines(self): + """ + Method loads file containing movie lines. + """ + return self._load_file( + file_name=CornellMovieDialogsLoader.LINES_FILENAME, + fields=CornellMovieDialogsLoader.LINES_FIELDS, + ) + + def load_characters(self): + """ + Method loads file containing movie characters. + """ + return self._load_file( + file_name=CornellMovieDialogsLoader.CHARACTERS_FILENAME, + fields=CornellMovieDialogsLoader.CHARACTERS_FIELDS, + ) + + def load_urls(self): + """ + Method loads file containing movie script urls. + """ + return self._load_file( + file_name=CornellMovieDialogsLoader.URL_FILENAME, + fields=CornellMovieDialogsLoader.URL_FIELDS, + ) diff --git a/podium/datasets/impl/croatian_ner_dataset.py b/podium/datasets/impl/croatian_ner_dataset.py index 1d0daef9..63ebf2da 100644 --- a/podium/datasets/impl/croatian_ner_dataset.py +++ b/podium/datasets/impl/croatian_ner_dataset.py @@ -1,11 +1,19 @@ """ Module contains Croatian NER dataset. 
""" -from podium.dataload.ner_croatian import NERCroatianXMLLoader +import glob +import os +import xml.etree.ElementTree as ET + from podium.datasets.dataset import Dataset -from podium.storage import ExampleFactory, Field -from podium.storage.resources.large_resource import LargeResource -from podium.storage.vocab import Vocab +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field +from podium.preproc.tokenizers import get_tokenizer +from podium.storage.resources.large_resource import ( + LargeResource, + init_scp_large_resource_from_kwargs, +) +from podium.vocab import Vocab class CroatianNERDataset(Dataset): @@ -140,3 +148,285 @@ def _is_delimiter_line(line): True if the line is delimiter line. """ return not any(line) + + +class NERCroatianXMLLoader: + """ + Simple croatian NER class. + """ + + URL = "/storage/takepod_data/datasets/CroatianNERDataset.zip" + NAME = "CroatianNERDataset" + SCP_HOST = "djurdja.takelab.fer.hr" + ARCHIVE_TYPE = "zip" + + SENTENCE_DELIMITER_TOKEN = (None, None) + + def __init__( + self, path="downloaded_datasets/", tokenizer="split", tag_schema="IOB", **kwargs + ): + """ + Constructor for Croatian NER dataset. Downloads and extracts the + dataset. + + Parameters + ---------- + path: str + Path to the folder where the dataset should be downloaded or loaded + from if it is already downloaded + tokenizer: str + Word-level tokenizer used to tokenize the input text + tag_schema: str + Tag schema used for constructing the token labels + - supported tag schemas: + + - 'IOB': the label of the beginning token of the entity is + prefixed with 'B-', the remaining tokens that belong to the + same entity are prefixed with 'I-'. The tokens that don't + belong to any named entity are labeled 'O' + + kwargs: + SCPLargeResource.SCP_USER_KEY: + User on the host machine. Not required if the user on the + local machine matches the user on the host machine. + SCPLargeResource.SCP_PRIVATE_KEY: + Path to the ssh private key eligible to access the host + machine. Not required on Unix if the private is in the default + location. + SCPLargeResource.SCP_PASS_KEY: + Password for the ssh private key (optional). Can be omitted + if the private key is not encrypted. + """ + self._data_dir = path + self._tokenizer = get_tokenizer(tokenizer) + self._label_resolver = self._get_label_resolver(tag_schema) + init_scp_large_resource_from_kwargs( + resource=NERCroatianXMLLoader.NAME, + uri=NERCroatianXMLLoader.URL, + archive=NERCroatianXMLLoader.ARCHIVE_TYPE, + user_dict=kwargs, + scp_host=NERCroatianXMLLoader.SCP_HOST, + ) + + def load_dataset(self): + """ + Method loads the dataset and returns tokenized NER documents. + + Returns + ------- + tokenized_documents: list of lists of tuples + List of tokenized documents. Each document is represented + as a list of tuples (token, label). 
The sentences in document are + delimited by tuple (None, None) + """ + source_dir_location = os.path.join(self._data_dir, NERCroatianXMLLoader.NAME) + + tokenized_documents = [] + + for xml_file_path in sorted(glob.glob(source_dir_location + "/*.xml")): + word_label_pairs = self._xml_to_token_label_pairs(xml_file_path) + tokenized_documents.append(word_label_pairs) + + return tokenized_documents + + def _xml_to_token_label_pairs(self, xml_file_path): + """ + Converts the xml file located at the given path to the list of tuples + (token, label) + + Parameters + ---------- + xml_file_path: str + Path to the XML file + + Returns + ------- + token_label_pairs: list of tuples + List of tuples (token, label) + """ + root = ET.parse(xml_file_path).getroot() + + token_label_pairs = [] + + for sentence in root.iter(tag="s"): + for sub_element in sentence.iter(): + + if sub_element.text is not None: + token_label_pairs_subelement = self._tokenize( + sub_element.text.strip(), sub_element + ) + token_label_pairs.extend(token_label_pairs_subelement) + + if sub_element.tail is not None: + token_label_pairs_outside = self._tokenize(sub_element.tail.strip()) + token_label_pairs.extend(token_label_pairs_outside) + + token_label_pairs.append(self.SENTENCE_DELIMITER_TOKEN) + + return token_label_pairs + + def _tokenize(self, text, element=None): + """ + Method tokenizes the text and assigns the labels to the tokens according + to the element's 'type' attribute. + + Parameters + ---------- + text: str + Input text + element: ET.Element + Element with which the text is associated. + + Returns + ------- + token_label_pairs: list of tuples + List of tuples (token, label) + """ + if not text: + return [] + + tokenized_text = self._tokenizer(text) + + token_label_pairs = [] + for index, token in enumerate(tokenized_text): + if element is not None: + label_unprefixed = element.attrib.get("type", None) + label = self._label_resolver(index, label_unprefixed) + else: + label = "O" + token_label_pairs.append((token, label)) + + return token_label_pairs + + def _get_label_resolver(self, tag_schema): + """ + Gets the label resolver associated with the given tag schema. + + Parameters + ---------- + tag_schema: str + Tag schema for label prefixes + + Returns + ------- + label_resolver: callable + Label resolver associated with the given tag schema + """ + if tag_schema == "IOB": + return self._iob_label_resolver + + raise ValueError(f"No label resolver for tag schema {tag_schema} exists.") + + @staticmethod + def _iob_label_resolver(index, label): + """ + A resolver that prefixes the label according to the IOB tag schema. + + Parameters + ---------- + index: int + Index of the token/label in the named entity (starts from 0) + label: str + Label of the named entity + + Returns + ------- + Label prefixed with the appropriate prefix according to the IOB + tag schema + """ + if label is None: + return "O" + elif index == 0: + return "B-" + label + return "I-" + label + + +def convert_sequence_to_entities(sequence, text, delimiter="-"): + """ + Converts sequences of the BIO tagging schema to entities. + + Parameters + ---------- + sequence: list(string) + Sequence of tags consisting that start with either B, I, or O. + label: list(string) + Tokenized text that correponds to the tag sequence + + Returns + ------- + entities: list(dict) + List of entities. Each entity is a dict that has four attributes: + name, type, start, and end. 
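To make the return format concrete, a small invented example of the conversion (tokens and tag names are made up):

    tags = ["B-Per", "I-Per", "O", "B-Org"]
    text = ["Ivan", "Horvat", "visits", "FER"]

    entities = convert_sequence_to_entities(tags, text)
    # [{'name': ['Ivan', 'Horvat'], 'type': 'Per', 'start': 0, 'end': 2},
    #  {'name': ['FER'], 'type': 'Org', 'start': 3, 'end': 4}]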
Name is a list of tokens from text + that belong to that entity, start denotes the index which starts + the entity, and end is the end index of the entity. + + ```text[entity['start'] : entity['end']]``` retrieves the entity text + + This means that the entity has the following form: + { \ + 'name': list(str), \ + 'type': str, \ + 'start': int, \ + 'end': int \ + } + + + Raises + ------ + ValueError + If the given sequence and text are not of the same length. + """ + entities = [] + state = "start" + current_tag = "N/A" + + if len(text) != len(sequence): + raise ValueError("Sequence and text must be of same length") + + for index, (tag, word) in enumerate(zip(sequence, text)): + # must be either B, I, O + if delimiter in tag: + tag_type, tag_description = tag.split(delimiter) + else: + tag_type = tag[0] + tag_description = "" + + if tag_type == "B" and state == "start": + state = "named_entity" + current_tag = tag_description + # create new entity + entity = {"name": [word], "type": tag_description, "start": index, "end": -1} + entities.append(entity) + + elif tag_type == "B" and state == "named_entity": + state = "named_entity" + # save previous + entities[-1]["end"] = index + # create new one + entity = {"name": [word], "type": tag_description, "start": index, "end": -1} + entities.append(entity) + + elif tag_type == "I" and state == "named_entity": + # I tag has to be after a B tag of the same type + # B-Org I-Org is good, B-Org I-Time is not + # I-Time part of the entity is skipped + if tag_description == current_tag and entities: + entities[-1]["name"].append(word) + + # if it does not match, just close the started entity + elif tag_description != current_tag and entities: + entities[-1]["end"] = index + state = "start" + + elif tag_type == "O" and state == "named_entity": + state = "start" + if entities: + entities[-1]["end"] = index + + elif tag_type == "O": + state = "start" + + if entities and entities[-1]["end"] == -1: + entities[-1]["end"] = len(sequence) + + return entities diff --git a/podium/datasets/impl/eurovoc_dataset.py b/podium/datasets/impl/eurovoc_dataset.py index 3f114b19..89141f90 100644 --- a/podium/datasets/impl/eurovoc_dataset.py +++ b/podium/datasets/impl/eurovoc_dataset.py @@ -2,15 +2,39 @@ Module contains EuroVoc dataset. """ import functools +import glob import os import re import warnings +import xml.etree.ElementTree as ET +from collections import namedtuple +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional, Set + +import dill from podium.datasets.dataset import Dataset +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field, MultilabelField from podium.preproc.lemmatizer.croatian_lemmatizer import get_croatian_lemmatizer_hook from podium.preproc.stop_words import CROATIAN_EXTENDED -from podium.storage import Field, MultilabelField, Vocab -from podium.storage.example_factory import ExampleFactory +from podium.storage.resources.large_resource import ( + LargeResource, + init_scp_large_resource_from_kwargs, +) +from podium.vocab import Vocab + + +try: + import xlrd +except ImportError: + print( + "Problem occured while trying to import xlrd. If the " + "library is not installed visit http://www.python-excel.org/ " + "for more details." 
+ ) + raise class EuroVocDataset(Dataset): @@ -294,3 +318,590 @@ def remove_nonalpha_and_stopwords(raw, tokenized, stop_words): if len(token) > 1 and token not in stop_words: tokens.append(token) return (raw, tokens) + + +Id = int +Document = namedtuple("Document", ["filename", "title", "text"]) + + +class LabelRank(Enum): + """ + Levels of labels in EuroVoc. + """ + + THESAURUS = 3 + MICRO_THESAURUS = 2 + TERM = 1 + + +@dataclass +class Label: + """ + Label in EuroVoc dataset. + + Labels are assigned to documents. One document has multiple labels. + Labels have a hierarchy in which one label can have one or more parents (broader + terms). All labels apart from thesaurus rank labels have at least one parent. + Apart from parents, labels can also have similar labels which describe related + areas, but aren't connected by the label hierarchy. + + Attributes + ---------- + name : str + name of the label + id : int + numerical id of the label + direct_parents : list(int) + list of ids of direct parents + similar_terms : list(int) + list of ids of similar terms + rank : LabelRank + rank of the label + thesaurus : int + id of the thesaurus of the label (if the label represents a + thesaurus, it has its own id listed in this field) + micro_thesaurus : int + id of the microthesaurus of the label (if the label represents + a microthesaurus, it has its own id listed in this field) + all_ancestors : set(int) + set of ids of all ancestors of the label in the label hierarchy + """ + + name: str + id: Id + direct_parents: List[Id] + similar_terms: List[Id] + rank: LabelRank + thesaurus: Optional[Id] = None + micro_thesaurus: Optional[Id] = None + all_ancestors: Optional[Set[Id]] = None + + +class EuroVocLoader: + """ + Class for downloading and parsing the EuroVoc dataset. + + This class is used for downloading the EuroVoc dataset (if it's not already + downloaded) and parsing the files in the dataset. If it's not already + present LargeResource.BASE_RESOURCE_DIR, the dataset is automatically + downloaded when an instance of EuroVocLoader is created. The downloaded + resources can be parsed using the load_dataset method. + """ + + URL = "/proj/sci/uisusd/data/eurovoc_data/eurovoc.zip" + EUROVOC_LABELS_FILENAME = "EUROVOC.xml" + CROVOC_LABELS_FILENAME = "CROVOC.xml" + MAPPING_FILENAME = "mapping.xls" + DATASET_DIR = "Data" + DOCUMENT_PATHS = "*.xml" + SCP_HOST = "djurdja.takelab.fer.hr" + ARCHIVE_TYPE = "zip" + NAME = "EuroVocDataset" + + def __init__(self, **kwargs): + """ + Constructor of the EuroVocLoader class. + + The constructor will check if the dataset is already been downloaded in the + LargeResource.BASE_RESOURCE_DIR. If the dataset is not present, it will atempt to + download it. + + kwargs: + SCPLargeResource.SCP_USER_KEY: + Username on the host machine from which the dataset is downloaded. Not + required if the username on the local machine matches the username on the + host. + SCPLargeResource.SCP_PRIVATE_KEY: + Path to the ssh private key eligible to access the host. Not required on + Unix if the private key is stored in the default location. + SCPLargeResource.SCP_PASS_KEY: + Password for the ssh private key (optional). Can be omitted + if the private key is not encrypted. + """ + init_scp_large_resource_from_kwargs( + resource=EuroVocLoader.NAME, + uri=EuroVocLoader.URL, + user_dict=kwargs, + archive=EuroVocLoader.ARCHIVE_TYPE, + scp_host=EuroVocLoader.SCP_HOST, + ) + + def load_dataset(self): + """ + Loads and parses all the necessary files from the dataset folder. 
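A rough usage sketch, assuming SCP access to the host listed in the class constants:

    loader = EuroVocLoader()                 # downloads the resources if missing
    eurovoc_labels, crovoc_labels, mapping, documents = loader.load_dataset()
    print(len(documents), "documents,", len(eurovoc_labels), "EuroVoc labels")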
+ + Returns + ------- + tuple: + (EuroVoc label hierarchy, CroVoc label hierarchy, document mapping, + documents) + + EuroVoc label hierarchy : dict(label_id : Label) + CroVoc label hierarchy : dict(label_id : Label) + document mapping : dict(document_id : list of label ids) + documents : list(Document) + """ + eurovoc_label_hierarchy_path = os.path.join( + LargeResource.BASE_RESOURCE_DIR, + EuroVocLoader.NAME, + EuroVocLoader.EUROVOC_LABELS_FILENAME, + ) + eurovoc_labels = EuroVocLoader._parse_label_hierarchy( + eurovoc_label_hierarchy_path + ) + + crovoc_label_hierarchy_path = os.path.join( + LargeResource.BASE_RESOURCE_DIR, + EuroVocLoader.NAME, + EuroVocLoader.CROVOC_LABELS_FILENAME, + ) + crovoc_labels = EuroVocLoader._parse_label_hierarchy(crovoc_label_hierarchy_path) + + document_mapping_path = os.path.join( + LargeResource.BASE_RESOURCE_DIR, + EuroVocLoader.NAME, + EuroVocLoader.MAPPING_FILENAME, + ) + mapping = EuroVocLoader._parse_mappings(document_mapping_path) + + dataset_path = os.path.join( + LargeResource.BASE_RESOURCE_DIR, + EuroVocLoader.NAME, + EuroVocLoader.DATASET_DIR, + EuroVocLoader.DOCUMENT_PATHS, + ) + documents = EuroVocLoader._parse_documents(dataset_path, mapping) + + return eurovoc_labels, crovoc_labels, mapping, documents + + @staticmethod + def _parse_labels_by_name(label_hierarchy_path): + """ + Does the first pass through the label file that maps label names to + label ids. + + The label hierarchy is parsed from an xml file and returned as a dictionary where + keys are label names and values are instances of Label class. This is done + because in the original xml file labels are connected to other labels (e.g. as + parents or similar terms) using their names. We wish to connect them using unique + label ids instead. + + Parameters + ---------- + label_hierarchy_path : path to xml file containing label hierarchy + + Returns + ------- + tuple: + (terms_by_name, microthesaurus_by_name, thesaurus_by_name, labels_by_id) + + terms_by_name : dict(term_name : term_id) + microthesaurus_by_name : dict(microthesaurus_name : microthesaurus_id) + thesaurus_by_name : dict(thesaurus_name : thesaurus_id) + labels_by_id : dict(label_id : Label) + """ + xml_document = label_hierarchy_path + tree = ET.parse(xml_document) + root = tree.getroot() + + # These dictionaries are used in the second pass for replacing string names with + # label ids. Keys are string names and their values are ids. + # Sometimes a term and a thesaurus or a microthesaurus may share the same name, + # that's the reason for separate dictionaries for every label category. + terms_by_name = {} + microthesaurus_by_name = {} + thesaurus_by_name = {} + + # This is the final label list that will eventually be used in the dataset. + labels_by_id = {} + + for child in root: + # If tag 'Podrucje' does not exist, it means this record is a thesaurus. + if child.find("Podrucje") is None and child.find("Potpojmovnik") is None: + rank = LabelRank.THESAURUS + thesaurus = int(_get_text(child, "ID")) + micro_thesaurus = None + + elif ( + child.find("Podrucje") is not None and child.find("Potpojmovnik") is None + ): + # If tag 'Podrucje' exists, but there is not 'Potpojmovnik' tag, it means + # this record is a microthesaurus. 
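For orientation, a purely illustrative construction of the three label ranks parsed here; all ids and names are invented, and only the Label dataclass and LabelRank enum defined above are assumed:

    example_thesaurus = Label(name="politics", id=4, direct_parents=[],
                              similar_terms=[], rank=LabelRank.THESAURUS, thesaurus=4)
    example_micro = Label(name="political framework", id=431, direct_parents=[4],
                          similar_terms=[], rank=LabelRank.MICRO_THESAURUS,
                          thesaurus=4, micro_thesaurus=431)
    example_term = Label(name="political power", id=1001, direct_parents=[431],
                         similar_terms=[], rank=LabelRank.TERM,
                         thesaurus=4, micro_thesaurus=431)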
+ rank = LabelRank.MICRO_THESAURUS + thesaurus = _get_text(child, "Podrucje").split(";")[1] + micro_thesaurus = int(_get_text(child, "ID")) + + elif ( + child.find("Podrucje") is not None + and child.find("Potpojmovnik") is not None + ): + # If both 'Podrucje' and 'Potpojmovnik' tags exist, it means this record + # is a term. + rank = LabelRank.TERM + thesaurus = _get_text(child, "Podrucje").split(";")[1] + micro_thesaurus = _get_text(child, "Potpojmovnik") + + else: + raise ValueError( + "Invalid label record. The record contains tag" + " but lacks the tag." + ) + + name = child.find("Odrednica").text.lower().strip() + label_id = int(_get_text(child, "ID")) + + parents = [ + broader_term.text.lower().strip() + for broader_term in child.findall("SiriPojam") + ] + + similar_terms = [ + similar_term.text.lower().strip() + for similar_term in child.findall("SrodniPojam") + ] + + # Here parents, similar terms, thesaurus and micro-thesaurus are all stored + # using string names. In the second pass, these fields will be replaces by + # matching ids. + label = Label( + name=name, + id=label_id, + direct_parents=parents, + similar_terms=similar_terms, + rank=rank, + thesaurus=thesaurus, + micro_thesaurus=micro_thesaurus, + ) + labels_by_id[label_id] = label + + if rank == LabelRank.THESAURUS: + thesaurus_by_name[name] = label + + elif rank == LabelRank.MICRO_THESAURUS: + microthesaurus_by_name[name] = label + + elif rank == LabelRank.TERM: + terms_by_name[name] = label + + return terms_by_name, microthesaurus_by_name, thesaurus_by_name, labels_by_id + + @staticmethod + def _parse_label_hierarchy(label_hierarchy_path): + """ + Parses the label hierarchy. + + The label hierarchy is parsed from an xml file and returned as a dictionary where + keys are label ids and values are instances of Label class. + + Parameters + ---------- + label_hierarchy_path : path to xml file containing label hierarchy + + Returns + ------- + dict: + Dictionary of (key, value) = (label_id, Label) + """ + + ( + terms_by_name, + microthesaurus_by_name, + thesaurus_by_name, + labels_by_id, + ) = EuroVocLoader._parse_labels_by_name(label_hierarchy_path) + + for label_id in labels_by_id: + label = labels_by_id[label_id] + + # Names of the parents are replaced by their ids. + # Parents can only be terms here, never thesaurus of microthesaurus. + label.direct_parents = [ + terms_by_name[parent].id for parent in label.direct_parents + ] + + # Names of the similar terms are replaced by their ids. + # Similar terms can only be terms, never thesaurus of microthesaurus. 
+ label.similar_terms = [ + terms_by_name[similar_term].id for similar_term in label.similar_terms + ] + + # If label is not thesaurus, replace its thesaurus name by thesaurus id + if label.rank != LabelRank.THESAURUS: + if label.thesaurus not in thesaurus_by_name: + # Error: thesaurus name does not exist (this shouldn't happen) + warnings.warn( + f"Label {label.id} has a non-existing thesaurus name " + f"assigned: {label.thesaurus}", + RuntimeWarning, + ) + label.thesaurus = None + else: + label.thesaurus = thesaurus_by_name[label.thesaurus].id + + # If label is microthesaurus, then its thesaurus is listed as its parent + if label.rank == LabelRank.MICRO_THESAURUS: + if label.thesaurus: + label.direct_parents.append(label.thesaurus) + + # If label is term, replace its microthesaurus name by its id + if label.rank == LabelRank.TERM: + if label.micro_thesaurus not in microthesaurus_by_name: + # Error: microthesaurus name does not exist (this shouldn't happen) + warnings.warn( + f"Label {label.id} has a non-existing microthesaurus name " + f"assigned: {label.micro_thesaurus}", + RuntimeWarning, + ) + label.micro_thesaurus = None + else: + label.micro_thesaurus = microthesaurus_by_name[ + label.micro_thesaurus + ].id + # if term has no parent term then its microthesaurus is listed as + # its parent + if not label.direct_parents and label.micro_thesaurus: + label.direct_parents.append(label.micro_thesaurus) + + labels_by_id = EuroVocLoader._collect_all_ancestors(labels_by_id) + return labels_by_id + + @staticmethod + def _collect_all_ancestors(label_hierarchy): + """ + Finds and stores the ancestors of all the labels in the label hierarchy. + + Parameters + ---------- + label_hierarchy : dict(int, Label) + Dictionary that maps label_id to Label. + + Returns + ------- + dict: + Dictionary of (key, value) = (label_id, Label) + """ + + new_label_hierarchy = {} + + for label_id in label_hierarchy: + ancestors = EuroVocLoader._get_all_ancestors(label_id, label_hierarchy) + label = label_hierarchy[label_id] + new_label_hierarchy[label_id] = Label( + name=label.name, + id=label.id, + rank=label.rank, + direct_parents=label.direct_parents, + similar_terms=label.similar_terms, + thesaurus=label.thesaurus, + micro_thesaurus=label.micro_thesaurus, + all_ancestors=ancestors, + ) + return new_label_hierarchy + + @staticmethod + def _get_all_ancestors(label_id, label_hierarchy): + """ + Finds and returns the ancestors of the label with the given label_id. + + Parameters + ---------- + label_id : int + + label_hierarchy : dict(int, Label) + Dictionary that maps label_id to Label. + """ + direct_parents = label_hierarchy[label_id].direct_parents + parents = set(direct_parents) + # while the iterations of the loop find new, untraversed parents + while direct_parents: + new_parents = set() + # for each parent, add all its parents to next iteration parents + for label in direct_parents: + label_direct_parents = label_hierarchy[label].direct_parents + new_parents.update( + parent for parent in label_direct_parents if parent not in parents + ) + parents.update(new_parents) + direct_parents = new_parents + return parents + + @staticmethod + def _parse_mappings(mappings_path): + """ + Parses the mappings of documents to labels from a xls file. 
+ + Parameters + ---------- + mappings_path : path to mappings in xls format + + Returns + ------- + dict + Dictionary of (key, value) = (document_id, list of label ids) + """ + wb = xlrd.open_workbook(mappings_path) + sheet_0 = wb.sheet_by_index(0) + + mappings = {} + # row zero is table header, data starts from row 1 + row = 1 + while row < sheet_0.nrows: + # cokumn 0 contains document id + document_id = int(sheet_0.cell_value(row, 0)) + label_ids = [] + + while True: + # column 2 contains label id and label name split by semicolon character + # (sometimes this field can be an empty string, in that case we simply + # skip the row) + label = sheet_0.cell_value(row, 2) + if label: + label_id = label.split(";")[0] + label_id = label_id + label_ids.append(int(label_id)) + row += 1 + # If a row has an empty first column, then the row contains another label + # for the previously seen document. When the first column is not empty, + # we have read all the labels for the previous document and need to + # switch to a new document. + if row >= sheet_0.nrows or sheet_0.cell_value(row, 0): + break + + mappings[document_id] = label_ids + return mappings + + @staticmethod + def _parse_documents(path, document_mapping): + """ + Parses xml documents from the given path. + + If the document_id is not present in the given document_mapping dictionary, the + document is not parsed. + + Parameters + ---------- + path : path that specifies all documents to be parsed + document_mapping : dictionary of (key, value) = (document_id, list of label ids) + + Returns + ------- + list + List of parsed documents as Document type objects. + """ + xml_documents = glob.glob(path) + parsed_documents = [] + for doc in xml_documents: + # if there is no mapping to labels for the document, the document can't be + # used in the dataset and it's therefore not parsed + # this happens often because certain categoried of documents are not maped to + # labels + filename = os.path.basename(doc) + document_id = int(os.path.splitext(filename)[0].replace("NN", "")) + if document_id not in document_mapping: + warnings.warn( + f"{document_id} document id not found in document mappings.", + RuntimeWarning, + ) + continue + parsed_doc = EuroVocLoader._parse_document(doc) + # parsed_doc is None if there's been an error on document text extraction + if parsed_doc: + parsed_documents.append(parsed_doc) + return parsed_documents + + @staticmethod + def _parse_document(doc): + """ + Parses the given document from xml. + + Parameters + ---------- + doc : path to document in xml format + + Returns + ------- + Document + Parsed document as instance of Document named tuple. + """ + tree = ET.parse(doc) + root = tree.getroot() + root_children = list(root) + head = root_children[0] + body = root_children[1] + + filename = os.path.basename(doc) + title_text = " ".join([t.text for t in head.iter() if t.text]).strip() + # Proper header should begin with a digit have the following format: + # "document_number date title" + # This is true for 99% of the documents and excpetions are ignored + if title_text and title_text[0].isdigit(): + title_text = " ".join(title_text.split(" ")[2:]) + else: + warnings.warn( + f"{filename} file contains invalid document title: {title_text}", + RuntimeWarning, + ) + title_text = title_text.lower().replace("\r", "").replace("\n", "") + + body_text = [] + for b in body.iter(): + if b.text: + body_text.append(b.text) + # everything after the
tag will end up in tail + if b.tail and b.tail.strip(): + body_text.append(b.tail) + body_text = "\n".join(body_text).lower() + + # If a document is stored as pdf in the database, the extraction process + # generates an xml contaning the following string. Text for these documents is + # not available and they are therefore ignored. + if "postupak ekstrakcije teksta" in body_text: + warnings.warn( + f"{filename} XML file does not contain a valid text", RuntimeWarning + ) + return + + return Document(title=title_text, text=body_text, filename=filename) + + +def _get_text(child, filed_name): + """ + Extracts and returns lowercase striped text from field with the given name. + + Parameters + ---------- + child : Element + Element contaning label record from XML file. + + field_name : str + Name of the field to be extracted. + + Returns + ------- + str : Lowercase striped contents of the field. + """ + return child.find(filed_name).text.lower().strip() + + +def dill_dataset(output_path): + """ + Downloads the EuroVoc dataset (if not already present) and stores the + dataset in a dill file. + + Parameters + ---------- + output_path : str + Path to the file where the dataset instance will be stored. + """ + loader = EuroVocLoader() + eurovoc_labels, crovoc_labels, mapping, documents = loader.load_dataset() + dataset = EuroVocDataset( + documents=documents, + mappings=mapping, + eurovoc_labels=eurovoc_labels, + crovoc_labels=crovoc_labels, + ) + dataset.finalize_fields() + + with open(output_path, "wb") as output_file: + dill.dump(dataset, output_file) diff --git a/podium/datasets/impl/imdb_sentiment_dataset.py b/podium/datasets/impl/imdb_sentiment_dataset.py index 6292cb35..3d7498c4 100644 --- a/podium/datasets/impl/imdb_sentiment_dataset.py +++ b/podium/datasets/impl/imdb_sentiment_dataset.py @@ -21,10 +21,10 @@ import os from podium.datasets.dataset import Dataset -from podium.storage.example_factory import ExampleFactory -from podium.storage.field import Field, LabelField +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field, LabelField from podium.storage.resources.large_resource import LargeResource -from podium.storage.vocab import Vocab +from podium.vocab import Vocab class IMDB(Dataset): diff --git a/podium/datasets/impl/iris_dataset.py b/podium/datasets/impl/iris_dataset.py index b45056e0..f1228aac 100644 --- a/podium/datasets/impl/iris_dataset.py +++ b/podium/datasets/impl/iris_dataset.py @@ -1,7 +1,8 @@ from sklearn.datasets import load_iris from podium.datasets import Dataset -from podium.storage import ExampleFactory, Field +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field class IrisDataset(Dataset): diff --git a/podium/datasets/impl/pandora_reddit_dataset.py b/podium/datasets/impl/pandora_reddit_dataset.py index c10d4adf..baba31bc 100644 --- a/podium/datasets/impl/pandora_reddit_dataset.py +++ b/podium/datasets/impl/pandora_reddit_dataset.py @@ -1,5 +1,6 @@ -from podium.arrow import ArrowDataset -from podium.storage import Field, Vocab +from podium.datasets import ArrowDataset +from podium.field import Field +from podium.vocab import Vocab class PandoraDataset(ArrowDataset): diff --git a/podium/datasets/impl/pauza_dataset.py b/podium/datasets/impl/pauza_dataset.py index 7bca7a1b..e720e0be 100644 --- a/podium/datasets/impl/pauza_dataset.py +++ b/podium/datasets/impl/pauza_dataset.py @@ -4,10 +4,10 @@ import os from podium.datasets.dataset import Dataset -from podium.storage.example_factory import 
ExampleFactory -from podium.storage.field import Field +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field from podium.storage.resources.large_resource import LargeResource -from podium.storage.vocab import Vocab +from podium.vocab import Vocab class PauzaHRDataset(Dataset): diff --git a/podium/datasets/impl/snli_dataset.py b/podium/datasets/impl/snli_dataset.py index 0ccd1382..ac91807a 100644 --- a/podium/datasets/impl/snli_dataset.py +++ b/podium/datasets/impl/snli_dataset.py @@ -6,7 +6,10 @@ import os from podium.datasets import Dataset -from podium.storage import ExampleFactory, Field, LabelField, LargeResource, Vocab +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field, LabelField +from podium.storage import LargeResource +from podium.vocab import Vocab class SNLISimple(Dataset): diff --git a/podium/datasets/impl/sst_sentiment_dataset.py b/podium/datasets/impl/sst_sentiment_dataset.py index 774ec7b6..39cd27c5 100644 --- a/podium/datasets/impl/sst_sentiment_dataset.py +++ b/podium/datasets/impl/sst_sentiment_dataset.py @@ -1,10 +1,10 @@ import os from podium.datasets.dataset import Dataset -from podium.storage.example_factory import ExampleFactory -from podium.storage.field import Field, LabelField +from podium.datasets.example_factory import ExampleFactory +from podium.field import Field, LabelField from podium.storage.resources.large_resource import LargeResource -from podium.storage.vocab import Vocab +from podium.vocab import Vocab class SST(Dataset): diff --git a/podium/datasets/iterator.py b/podium/datasets/iterator.py index 4059ca48..325ea38b 100644 --- a/podium/datasets/iterator.py +++ b/podium/datasets/iterator.py @@ -11,17 +11,17 @@ import numpy as np -from podium.datasets.dataset import Dataset, DatasetABC +from podium.datasets.dataset import Dataset, DatasetBase from podium.datasets.hierarhical_dataset import HierarchicalDataset -class IteratorABC(ABC): +class IteratorBase(ABC): """ Abstract base class for all Iterators in Podium. """ def __call__( - self, dataset: DatasetABC + self, dataset: DatasetBase ) -> PythonIterator[Tuple[NamedTuple, NamedTuple]]: """ Sets the dataset for this Iterator and returns an iterable over the @@ -41,14 +41,14 @@ def __call__( return iter(self) @abstractmethod - def set_dataset(self, dataset: DatasetABC) -> None: + def set_dataset(self, dataset: DatasetBase) -> None: """ Sets the dataset for this Iterator to iterate over. Resets the epoch count. Parameters ---------- - dataset: DatasetABC + dataset: DatasetBase Dataset to iterate over. """ pass @@ -82,7 +82,7 @@ def __len__(self) -> int: pass -class Iterator(IteratorABC): +class Iterator(IteratorBase): """ An iterator that batches data from a dataset after numericalization. """ @@ -101,7 +101,7 @@ def __init__( Parameters ---------- - dataset : DatasetABC + dataset : DatasetBase The dataset whose examples the iterator will iterate over. batch_size : int The size of the batches that the iterator will return. If the @@ -192,14 +192,14 @@ def iterations(self) -> int: """ return self._iterations - def set_dataset(self, dataset: DatasetABC) -> None: + def set_dataset(self, dataset: DatasetBase) -> None: """ Sets the dataset for this Iterator to iterate over. Resets the epoch count. Parameters ---------- - dataset: DatasetABC + dataset: DatasetBase Dataset to iterate over. 
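A rough sketch of the typical iteration loop, assuming `dataset` is a finalized Podium dataset; only the constructor arguments documented above are used, everything else is an assumption:

    iterator = Iterator(dataset=dataset, batch_size=32)
    for input_batch, target_batch in iterator:
        # each step yields a pair of namedtuples holding numericalized fields
        print(input_batch._fields, target_batch._fields)
        break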
""" self._epoch = 0 @@ -263,7 +263,7 @@ def __iter__(self) -> PythonIterator[Tuple[NamedTuple, NamedTuple]]: self._iterations = 0 self._epoch += 1 - def _create_batch(self, dataset: DatasetABC) -> Tuple[NamedTuple, NamedTuple]: + def _create_batch(self, dataset: DatasetBase) -> Tuple[NamedTuple, NamedTuple]: examples = dataset.examples @@ -407,14 +407,14 @@ class SingleBatchIterator(Iterator): dataset. """ - def __init__(self, dataset: DatasetABC = None, shuffle=True): + def __init__(self, dataset: DatasetBase = None, shuffle=True): """ Creates an Iterator that creates one batch per epoch containing all examples in the dataset. Parameters ---------- - dataset : DatasetABC + dataset : DatasetBase The dataset whose examples the iterator will iterate over. shuffle : bool @@ -428,7 +428,7 @@ def __init__(self, dataset: DatasetABC = None, shuffle=True): """ super().__init__(dataset=dataset, batch_size=len(dataset), shuffle=shuffle) - def set_dataset(self, dataset: DatasetABC) -> None: + def set_dataset(self, dataset: DatasetBase) -> None: super().set_dataset(dataset) self._batch_size = len(dataset) @@ -562,7 +562,7 @@ def __init__( Parameters ---------- - dataset : DatasetABC + dataset : DatasetBase The dataset whose examples the iterator will iterate over. batch_size : int The size of the batches that the iterator will return. If the diff --git a/podium/datasets/tabular_dataset.py b/podium/datasets/tabular_dataset.py index 7293afa8..c031e6dc 100644 --- a/podium/datasets/tabular_dataset.py +++ b/podium/datasets/tabular_dataset.py @@ -2,7 +2,8 @@ import os from podium.datasets.dataset import Dataset -from podium.storage.example_factory import ExampleFactory + +from .example_factory import ExampleFactory class TabularDataset(Dataset): diff --git a/podium/model_selection/__init__.py b/podium/experimental/model_selection/__init__.py similarity index 100% rename from podium/model_selection/__init__.py rename to podium/experimental/model_selection/__init__.py diff --git a/podium/model_selection/model_selection.py b/podium/experimental/model_selection/model_selection.py similarity index 97% rename from podium/model_selection/model_selection.py rename to podium/experimental/model_selection/model_selection.py index 9aa24c3b..fda6ad26 100644 --- a/podium/model_selection/model_selection.py +++ b/podium/experimental/model_selection/model_selection.py @@ -5,8 +5,8 @@ from tqdm import tqdm from podium.datasets import Dataset -from podium.models import Experiment -from podium.validation import k_fold_validation +from podium.experimental.models import Experiment +from podium.experimental.validation import k_fold_validation def grid_search( diff --git a/podium/models/__init__.py b/podium/experimental/models/__init__.py similarity index 100% rename from podium/models/__init__.py rename to podium/experimental/models/__init__.py diff --git a/podium/models/batch_transform_functions.py b/podium/experimental/models/batch_transform_functions.py similarity index 100% rename from podium/models/batch_transform_functions.py rename to podium/experimental/models/batch_transform_functions.py diff --git a/podium/models/experiment.py b/podium/experimental/models/experiment.py similarity index 100% rename from podium/models/experiment.py rename to podium/experimental/models/experiment.py diff --git a/podium/models/impl/__init__.py b/podium/experimental/models/impl/__init__.py similarity index 100% rename from podium/models/impl/__init__.py rename to podium/experimental/models/impl/__init__.py diff --git 
a/podium/models/impl/blcc/__init__.py b/podium/experimental/models/impl/blcc/__init__.py similarity index 100% rename from podium/models/impl/blcc/__init__.py rename to podium/experimental/models/impl/blcc/__init__.py diff --git a/podium/models/impl/blcc/chain_crf.py b/podium/experimental/models/impl/blcc/chain_crf.py similarity index 100% rename from podium/models/impl/blcc/chain_crf.py rename to podium/experimental/models/impl/blcc/chain_crf.py diff --git a/podium/models/impl/blcc_model.py b/podium/experimental/models/impl/blcc_model.py similarity index 98% rename from podium/models/impl/blcc_model.py rename to podium/experimental/models/impl/blcc_model.py index 3957fbcb..8392ebed 100644 --- a/podium/models/impl/blcc_model.py +++ b/podium/experimental/models/impl/blcc_model.py @@ -5,8 +5,8 @@ import numpy as np -from podium.models import AbstractSupervisedModel -from podium.models.impl.blcc.chain_crf import ChainCRF, create_custom_objects +from podium.experimental.models import AbstractSupervisedModel +from podium.experimental.models.impl.blcc.chain_crf import ChainCRF, create_custom_objects try: diff --git a/podium/models/impl/eurovoc_models/__init__.py b/podium/experimental/models/impl/eurovoc_models/__init__.py similarity index 100% rename from podium/models/impl/eurovoc_models/__init__.py rename to podium/experimental/models/impl/eurovoc_models/__init__.py diff --git a/podium/models/impl/eurovoc_models/multilabel_svm.py b/podium/experimental/models/impl/eurovoc_models/multilabel_svm.py similarity index 98% rename from podium/models/impl/eurovoc_models/multilabel_svm.py rename to podium/experimental/models/impl/eurovoc_models/multilabel_svm.py index 7cdf4474..3edb5fe8 100644 --- a/podium/models/impl/eurovoc_models/multilabel_svm.py +++ b/podium/experimental/models/impl/eurovoc_models/multilabel_svm.py @@ -33,9 +33,9 @@ from sklearn.model_selection import GridSearchCV from podium.datasets.iterator import Iterator -from podium.models import AbstractSupervisedModel -from podium.storage.vectorizers.tfidf import TfIdfVectorizer -from podium.validation.validation import KFold +from podium.experimental.models import AbstractSupervisedModel +from podium.experimental.validation.validation import KFold +from podium.vectorizers.tfidf import TfIdfVectorizer class MultilabelSVM(AbstractSupervisedModel): diff --git a/podium/models/impl/fc_model.py b/podium/experimental/models/impl/fc_model.py similarity index 94% rename from podium/models/impl/fc_model.py rename to podium/experimental/models/impl/fc_model.py index 92928fe9..d22ef6d4 100644 --- a/podium/models/impl/fc_model.py +++ b/podium/experimental/models/impl/fc_model.py @@ -1,7 +1,7 @@ """ Module contains fully connected neural network models. 
""" -from podium.models.model import AbstractSupervisedModel +from podium.experimental.models.model import AbstractSupervisedModel try: diff --git a/podium/models/impl/pytorch/__init__.py b/podium/experimental/models/impl/pytorch/__init__.py similarity index 100% rename from podium/models/impl/pytorch/__init__.py rename to podium/experimental/models/impl/pytorch/__init__.py diff --git a/podium/models/impl/pytorch/models.py b/podium/experimental/models/impl/pytorch/models.py similarity index 98% rename from podium/models/impl/pytorch/models.py rename to podium/experimental/models/impl/pytorch/models.py index 10c8c4ac..9dc45ed1 100644 --- a/podium/models/impl/pytorch/models.py +++ b/podium/experimental/models/impl/pytorch/models.py @@ -1,6 +1,6 @@ import torch -from podium.models import AbstractSupervisedModel +from podium.experimental.models import AbstractSupervisedModel class TorchModel(AbstractSupervisedModel): diff --git a/podium/models/impl/pytorch/sequence_classification.py b/podium/experimental/models/impl/pytorch/sequence_classification.py similarity index 100% rename from podium/models/impl/pytorch/sequence_classification.py rename to podium/experimental/models/impl/pytorch/sequence_classification.py diff --git a/podium/models/impl/pytorch/trainers.py b/podium/experimental/models/impl/pytorch/trainers.py similarity index 96% rename from podium/models/impl/pytorch/trainers.py rename to podium/experimental/models/impl/pytorch/trainers.py index 02cdee2b..e246cbcf 100644 --- a/podium/models/impl/pytorch/trainers.py +++ b/podium/experimental/models/impl/pytorch/trainers.py @@ -2,7 +2,7 @@ import torch -from podium.models.trainer import AbstractTrainer +from podium.experimental.models.trainer import AbstractTrainer class TorchTrainer(AbstractTrainer): diff --git a/podium/models/impl/simple_trainers.py b/podium/experimental/models/impl/simple_trainers.py similarity index 95% rename from podium/models/impl/simple_trainers.py rename to podium/experimental/models/impl/simple_trainers.py index a1968475..d31876a5 100644 --- a/podium/models/impl/simple_trainers.py +++ b/podium/experimental/models/impl/simple_trainers.py @@ -2,7 +2,7 @@ Module contains simple trainer classes. """ from podium.datasets import Iterator -from podium.models.trainer import AbstractTrainer +from podium.experimental.models.trainer import AbstractTrainer class SimpleTrainer(AbstractTrainer): diff --git a/podium/models/impl/svm_model.py b/podium/experimental/models/impl/svm_model.py similarity index 93% rename from podium/models/impl/svm_model.py rename to podium/experimental/models/impl/svm_model.py index bf03c261..7f2d0948 100644 --- a/podium/models/impl/svm_model.py +++ b/podium/experimental/models/impl/svm_model.py @@ -1,7 +1,7 @@ """ Module contains svm models. 
""" -from podium.models.model import AbstractSupervisedModel +from podium.experimental.models.model import AbstractSupervisedModel try: diff --git a/podium/models/model.py b/podium/experimental/models/model.py similarity index 100% rename from podium/models/model.py rename to podium/experimental/models/model.py diff --git a/podium/models/trainer.py b/podium/experimental/models/trainer.py similarity index 100% rename from podium/models/trainer.py rename to podium/experimental/models/trainer.py diff --git a/podium/models/transformers.py b/podium/experimental/models/transformers.py similarity index 100% rename from podium/models/transformers.py rename to podium/experimental/models/transformers.py diff --git a/podium/pipeline/__init__.py b/podium/experimental/pipeline/__init__.py similarity index 100% rename from podium/pipeline/__init__.py rename to podium/experimental/pipeline/__init__.py diff --git a/podium/pipeline/pipeline.py b/podium/experimental/pipeline/pipeline.py similarity index 98% rename from podium/pipeline/pipeline.py rename to podium/experimental/pipeline/pipeline.py index 693ff126..09212378 100644 --- a/podium/pipeline/pipeline.py +++ b/podium/experimental/pipeline/pipeline.py @@ -2,15 +2,14 @@ import numpy as np -from podium.datasets import Dataset -from podium.models import ( +from podium.datasets import Dataset, ExampleFactory, ExampleFormat +from podium.experimental.models import ( AbstractSupervisedModel, AbstractTrainer, Experiment, FeatureTransformer, ) -from podium.storage import ExampleFactory, ExampleFormat -from podium.storage.field import Field, MultioutputField +from podium.field import Field, MultioutputField class Pipeline(Experiment): diff --git a/podium/validation/__init__.py b/podium/experimental/validation/__init__.py similarity index 100% rename from podium/validation/__init__.py rename to podium/experimental/validation/__init__.py diff --git a/podium/validation/validation.py b/podium/experimental/validation/validation.py similarity index 99% rename from podium/validation/validation.py rename to podium/experimental/validation/validation.py index 1ecdf1ac..8f0dbd75 100644 --- a/podium/validation/validation.py +++ b/podium/experimental/validation/validation.py @@ -5,7 +5,7 @@ from sklearn.model_selection import KFold from podium.datasets import Dataset -from podium.models.experiment import Experiment +from podium.experimental.models.experiment import Experiment class _KFold(KFold): diff --git a/podium/storage/field.py b/podium/field.py similarity index 99% rename from podium/storage/field.py rename to podium/field.py index 0988f5d6..e1f259a3 100644 --- a/podium/storage/field.py +++ b/podium/field.py @@ -9,7 +9,7 @@ import numpy as np from podium.preproc.tokenizers import get_tokenizer -from podium.storage.vocab import Vocab +from podium.vocab import Vocab PretokenizationHookType = Callable[[Any], Any] diff --git a/podium/metrics/__init__.py b/podium/metrics/__init__.py deleted file mode 100644 index 312205a4..00000000 --- a/podium/metrics/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Package contains functionalities related to model metrics. -""" - -from .metrics import f1_metric, multiclass_f1_metric diff --git a/podium/metrics/metrics.py b/podium/metrics/metrics.py deleted file mode 100644 index 6b0bc958..00000000 --- a/podium/metrics/metrics.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Module contains custom metrics. -""" -try: - from sklearn.metrics import f1_score -except ImportError: - print( - "Problem occured while trying to import sklearn. 
If the " - "library is not installed visit https://scikit-learn.org" - " for more details." - ) - raise - - -def f1_metric(true, pred): - """ - Function calculates F1 score. - """ - return f1_score(true, pred) - - -def multiclass_f1_metric(true, pred, average="weighted"): - """ - Function calculates F1 score on multiclass classification. - """ - return f1_score(true, pred, average=average) diff --git a/podium/storage/__init__.py b/podium/storage/__init__.py index 2ada8283..ba6daec1 100644 --- a/podium/storage/__init__.py +++ b/podium/storage/__init__.py @@ -2,8 +2,6 @@ Package contains modules for storing and loading datasets and vectors. """ -from .example_factory import Example, ExampleFactory, ExampleFormat -from .field import Field, LabelField, MultilabelField, MultioutputField, unpack_fields from .resources.downloader import ( BaseDownloader, HttpDownloader, @@ -11,7 +9,3 @@ SimpleHttpDownloader, ) from .resources.large_resource import LargeResource, SCPLargeResource -from .vectorizers.impl import GloVe, NlplVectorizer -from .vectorizers.tfidf import TfIdfVectorizer -from .vectorizers.vectorizer import BasicVectorStorage, VectorStorage -from .vocab import SpecialVocabSymbols, Vocab diff --git a/podium/storage/vectorizers/__init__.py b/podium/vectorizers/__init__.py similarity index 100% rename from podium/storage/vectorizers/__init__.py rename to podium/vectorizers/__init__.py diff --git a/podium/storage/vectorizers/impl/__init__.py b/podium/vectorizers/impl/__init__.py similarity index 100% rename from podium/storage/vectorizers/impl/__init__.py rename to podium/vectorizers/impl/__init__.py diff --git a/podium/storage/vectorizers/impl/glove.py b/podium/vectorizers/impl/glove.py similarity index 97% rename from podium/storage/vectorizers/impl/glove.py rename to podium/vectorizers/impl/glove.py index a6db6601..0a1ee1d9 100644 --- a/podium/storage/vectorizers/impl/glove.py +++ b/podium/vectorizers/impl/glove.py @@ -1,7 +1,7 @@ import os -from ...resources import LargeResource -from ..vectorizer import BasicVectorStorage, random_normal_default_vector +from podium.storage import LargeResource +from podium.vectorizers.vectorizer import BasicVectorStorage, random_normal_default_vector class GloVe(BasicVectorStorage): diff --git a/podium/storage/vectorizers/impl/nlpl.py b/podium/vectorizers/impl/nlpl.py similarity index 92% rename from podium/storage/vectorizers/impl/nlpl.py rename to podium/vectorizers/impl/nlpl.py index ddfbb25e..77e96c8a 100644 --- a/podium/storage/vectorizers/impl/nlpl.py +++ b/podium/vectorizers/impl/nlpl.py @@ -1,7 +1,7 @@ import os from podium.storage import LargeResource -from podium.storage.vectorizers.vectorizer import BasicVectorStorage, zeros_default_vector +from podium.vectorizers.vectorizer import BasicVectorStorage, zeros_default_vector class NlplVectorizer(BasicVectorStorage): diff --git a/podium/storage/vectorizers/tfidf.py b/podium/vectorizers/tfidf.py similarity index 100% rename from podium/storage/vectorizers/tfidf.py rename to podium/vectorizers/tfidf.py diff --git a/podium/storage/vectorizers/vectorizer.py b/podium/vectorizers/vectorizer.py similarity index 100% rename from podium/storage/vectorizers/vectorizer.py rename to podium/vectorizers/vectorizer.py diff --git a/podium/storage/vocab.py b/podium/vocab.py similarity index 100% rename from podium/storage/vocab.py rename to podium/vocab.py diff --git a/tests/arrow/__init__.py b/tests/arrow/__init__.py deleted file mode 100644 index 60aac6fd..00000000 --- a/tests/arrow/__init__.py +++ /dev/null @@ 
-1,3 +0,0 @@ -""" -This module contains tests for the arrow podium module. -""" diff --git a/tests/storage/conftest.py b/tests/conftest.py similarity index 96% rename from tests/storage/conftest.py rename to tests/conftest.py index d823f7a4..c0007120 100644 --- a/tests/storage/conftest.py +++ b/tests/conftest.py @@ -4,8 +4,8 @@ import pytest from podium.datasets.tabular_dataset import TabularDataset -from podium.storage.field import Field, LabelField -from podium.storage.vocab import Vocab +from podium.field import Field, LabelField +from podium.vocab import Vocab @pytest.fixture diff --git a/tests/dataload/__init__.py b/tests/datasets/impl/__init__.py similarity index 100% rename from tests/dataload/__init__.py rename to tests/datasets/impl/__init__.py diff --git a/tests/dataload/mock_mapping.xls b/tests/datasets/impl/mock_mapping.xls similarity index 100% rename from tests/dataload/mock_mapping.xls rename to tests/datasets/impl/mock_mapping.xls diff --git a/tests/datasets/test_catacx_comments_dataset.py b/tests/datasets/impl/test_catacx_comments_dataset.py similarity index 100% rename from tests/datasets/test_catacx_comments_dataset.py rename to tests/datasets/impl/test_catacx_comments_dataset.py diff --git a/tests/datasets/test_catacx_dataset.py b/tests/datasets/impl/test_catacx_dataset.py similarity index 100% rename from tests/datasets/test_catacx_dataset.py rename to tests/datasets/impl/test_catacx_dataset.py diff --git a/tests/datasets/test_conllu_dataset.py b/tests/datasets/impl/test_conllu_dataset.py similarity index 100% rename from tests/datasets/test_conllu_dataset.py rename to tests/datasets/impl/test_conllu_dataset.py diff --git a/tests/dataload/test_cornel_movie_dialogs.py b/tests/datasets/impl/test_cornell_movie_dialogs.py similarity index 98% rename from tests/dataload/test_cornel_movie_dialogs.py rename to tests/datasets/impl/test_cornell_movie_dialogs.py index 83e5d85b..a9f4974c 100644 --- a/tests/dataload/test_cornel_movie_dialogs.py +++ b/tests/datasets/impl/test_cornell_movie_dialogs.py @@ -3,7 +3,7 @@ import pytest -from podium.dataload.cornell_movie_dialogs import CornellMovieDialogsLoader +from podium.datasets.impl.cornell_movie_dialogs_dataset import CornellMovieDialogsLoader from podium.storage import LargeResource diff --git a/tests/datasets/test_cornell_movie_dialogs_dataset.py b/tests/datasets/impl/test_cornell_movie_dialogs_dataset.py similarity index 95% rename from tests/datasets/test_cornell_movie_dialogs_dataset.py rename to tests/datasets/impl/test_cornell_movie_dialogs_dataset.py index b39ac5b8..e8a025db 100644 --- a/tests/datasets/test_cornell_movie_dialogs_dataset.py +++ b/tests/datasets/impl/test_cornell_movie_dialogs_dataset.py @@ -1,7 +1,9 @@ import pytest -from podium.dataload.cornell_movie_dialogs import CornellMovieDialogsNamedTuple from podium.datasets import CornellMovieDialogsConversationalDataset +from podium.datasets.impl.cornell_movie_dialogs_dataset import ( + CornellMovieDialogsNamedTuple, +) EXPECTED_EXAMPLES = [ diff --git a/tests/datasets/test_croatian_ner_dataset.py b/tests/datasets/impl/test_croatian_ner_dataset.py similarity index 100% rename from tests/datasets/test_croatian_ner_dataset.py rename to tests/datasets/impl/test_croatian_ner_dataset.py diff --git a/tests/dataload/test_eurovoc.py b/tests/datasets/impl/test_eurovoc.py similarity index 98% rename from tests/dataload/test_eurovoc.py rename to tests/datasets/impl/test_eurovoc.py index a58102ca..d820c669 100644 --- a/tests/dataload/test_eurovoc.py +++ 
b/tests/datasets/impl/test_eurovoc.py @@ -7,12 +7,18 @@ import dill import pytest -from podium.dataload.eurovoc import EuroVocLoader, Label, LabelRank, dill_dataset -from podium.datasets.impl.eurovoc_dataset import EuroVocDataset -from podium.storage import Field, MultilabelField, Vocab +from podium.datasets.impl.eurovoc_dataset import ( + EuroVocDataset, + EuroVocLoader, + Label, + LabelRank, + dill_dataset, +) +from podium.field import Field, MultilabelField from podium.storage.resources.large_resource import LargeResource, SCPLargeResource +from podium.vocab import Vocab -from ..datasets.test_eurovoc_dataset import ( +from .test_eurovoc_dataset import ( crovoc_label_hierarchy, documents, eurovoc_label_hierarchy, @@ -286,7 +292,7 @@ def create_mock_dataset( mappings_path = os.path.join(base_dataset_dir, EuroVocLoader.MAPPING_FILENAME) - with open("tests/dataload/mock_mapping.xls", mode="rb") as input_file: + with open("tests/datasets/impl/mock_mapping.xls", mode="rb") as input_file: mappings_content = input_file.read() with open(file=mappings_path, mode="wb") as fp: fp.write(mappings_content) diff --git a/tests/datasets/test_eurovoc_dataset.py b/tests/datasets/impl/test_eurovoc_dataset.py similarity index 98% rename from tests/datasets/test_eurovoc_dataset.py rename to tests/datasets/impl/test_eurovoc_dataset.py index 96d771bf..8581bd4b 100644 --- a/tests/datasets/test_eurovoc_dataset.py +++ b/tests/datasets/impl/test_eurovoc_dataset.py @@ -2,8 +2,12 @@ import pytest -from podium.dataload.eurovoc import Document, Label, LabelRank -from podium.datasets.impl.eurovoc_dataset import EuroVocDataset +from podium.datasets.impl.eurovoc_dataset import ( + Document, + EuroVocDataset, + Label, + LabelRank, +) from podium.preproc.lemmatizer.croatian_lemmatizer import CroatianLemmatizer diff --git a/tests/datasets/test_imdb_dataset.py b/tests/datasets/impl/test_imdb_dataset.py similarity index 99% rename from tests/datasets/test_imdb_dataset.py rename to tests/datasets/impl/test_imdb_dataset.py index 56837ba6..0ca6412d 100644 --- a/tests/datasets/test_imdb_dataset.py +++ b/tests/datasets/impl/test_imdb_dataset.py @@ -8,7 +8,7 @@ from podium.datasets.impl.imdb_sentiment_dataset import IMDB from podium.storage.resources.large_resource import LargeResource -from ..util import run_spacy +from ...util import run_spacy TRAIN_EXAMPLES = { diff --git a/tests/datasets/test_iris_dataset.py b/tests/datasets/impl/test_iris_dataset.py similarity index 100% rename from tests/datasets/test_iris_dataset.py rename to tests/datasets/impl/test_iris_dataset.py diff --git a/tests/dataload/test_ner_croatian.py b/tests/datasets/impl/test_ner_croatian.py similarity index 99% rename from tests/dataload/test_ner_croatian.py rename to tests/datasets/impl/test_ner_croatian.py index 752bc5d6..e44a6e20 100644 --- a/tests/dataload/test_ner_croatian.py +++ b/tests/datasets/impl/test_ner_croatian.py @@ -6,7 +6,7 @@ import pytest -from podium.dataload.ner_croatian import ( +from podium.datasets.impl.croatian_ner_dataset import ( NERCroatianXMLLoader, convert_sequence_to_entities, ) diff --git a/tests/datasets/test_pauza_dataset.py b/tests/datasets/impl/test_pauza_dataset.py similarity index 100% rename from tests/datasets/test_pauza_dataset.py rename to tests/datasets/impl/test_pauza_dataset.py diff --git a/tests/datasets/test_snli_dataset.py b/tests/datasets/impl/test_snli_dataset.py similarity index 100% rename from tests/datasets/test_snli_dataset.py rename to tests/datasets/impl/test_snli_dataset.py diff --git 
a/tests/datasets/test_sst_dataset.py b/tests/datasets/impl/test_sst_dataset.py similarity index 100% rename from tests/datasets/test_sst_dataset.py rename to tests/datasets/impl/test_sst_dataset.py diff --git a/tests/storage/test_dataset.py b/tests/datasets/test_dataset.py similarity index 99% rename from tests/storage/test_dataset.py rename to tests/datasets/test_dataset.py index ef55a0f7..2153b974 100644 --- a/tests/storage/test_dataset.py +++ b/tests/datasets/test_dataset.py @@ -7,13 +7,13 @@ import numpy as np import pytest +from podium.datasets import ExampleFactory from podium.datasets.dataset import Dataset from podium.datasets.hierarhical_dataset import HierarchicalDataset from podium.datasets.iterator import Iterator from podium.datasets.tabular_dataset import TabularDataset -from podium.storage.example_factory import ExampleFactory -from podium.storage.field import Field, MultioutputField, unpack_fields -from podium.storage.vocab import Vocab +from podium.field import Field, MultioutputField, unpack_fields +from podium.vocab import Vocab FORMAT_USE_DICT_COMBINATIONS = ( diff --git a/tests/storage/test_example_factory.py b/tests/datasets/test_example_factory.py similarity index 99% rename from tests/storage/test_example_factory.py rename to tests/datasets/test_example_factory.py index 2fb78ca0..204a8ff9 100644 --- a/tests/storage/test_example_factory.py +++ b/tests/datasets/test_example_factory.py @@ -1,6 +1,7 @@ import pytest -from podium.storage import Example, ExampleFactory, ExampleFormat, Field +from podium.datasets import Example, ExampleFactory, ExampleFormat +from podium.field import Field name_field = Field("Name", keep_raw=True, tokenizer="split") diff --git a/tests/dataload/test_hf.py b/tests/datasets/test_hf.py similarity index 98% rename from tests/dataload/test_hf.py rename to tests/datasets/test_hf.py index d24ec6ca..323f9566 100644 --- a/tests/dataload/test_hf.py +++ b/tests/datasets/test_hf.py @@ -4,8 +4,8 @@ datasets = pytest.importorskip("datasets") from datasets import ClassLabel, Features, Translation -from podium.dataload.hf import HFDatasetConverter, convert_features_to_fields from podium.datasets import Dataset +from podium.datasets.hf import HFDatasetConverter, convert_features_to_fields SIMPLE_DATA = { diff --git a/tests/storage/test_iterator.py b/tests/datasets/test_iterator.py similarity index 99% rename from tests/storage/test_iterator.py rename to tests/datasets/test_iterator.py index c7ec3fd0..67e01e81 100644 --- a/tests/storage/test_iterator.py +++ b/tests/datasets/test_iterator.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from podium.datasets import Dataset +from podium.datasets import Dataset, ExampleFactory from podium.datasets.hierarhical_dataset import HierarchicalDataset from podium.datasets.iterator import ( BucketIterator, @@ -11,9 +11,10 @@ Iterator, SingleBatchIterator, ) -from podium.storage import ExampleFactory, Field, Vocab +from podium.field import Field +from podium.vocab import Vocab -from .conftest import ( +from ..conftest import ( TABULAR_TEXT, create_tabular_dataset_from_json, tabular_dataset_fields, diff --git a/tests/arrow/test_pyarrow_tabular_dataset.py b/tests/datasets/test_pyarrow_tabular_dataset.py similarity index 98% rename from tests/arrow/test_pyarrow_tabular_dataset.py rename to tests/datasets/test_pyarrow_tabular_dataset.py index 1b7e661a..cb899a50 100644 --- a/tests/arrow/test_pyarrow_tabular_dataset.py +++ b/tests/datasets/test_pyarrow_tabular_dataset.py @@ -10,9 +10,9 @@ pa = pytest.importorskip("pyarrow") 
-from podium.arrow import ArrowDataset -from podium.datasets import Dataset -from podium.storage import ExampleFactory, Field, Vocab +from podium.datasets import ArrowDataset, Dataset, ExampleFactory +from podium.field import Field +from podium.vocab import Vocab @pytest.fixture diff --git a/tests/metrics/__init__.py b/tests/experimental/__init__.py similarity index 100% rename from tests/metrics/__init__.py rename to tests/experimental/__init__.py diff --git a/tests/models/__init__.py b/tests/experimental/models/__init__.py similarity index 100% rename from tests/models/__init__.py rename to tests/experimental/models/__init__.py diff --git a/tests/models/eurovoc_models/test_multilabel_svm.py b/tests/experimental/models/eurovoc_models/test_multilabel_svm.py similarity index 96% rename from tests/models/eurovoc_models/test_multilabel_svm.py rename to tests/experimental/models/eurovoc_models/test_multilabel_svm.py index 262df612..c50b1711 100644 --- a/tests/models/eurovoc_models/test_multilabel_svm.py +++ b/tests/experimental/models/eurovoc_models/test_multilabel_svm.py @@ -4,7 +4,7 @@ import pytest from numpy.testing import assert_array_equal -from podium.models.impl.eurovoc_models import multilabel_svm as ms +from podium.experimental.models.impl.eurovoc_models import multilabel_svm as ms X = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]]) @@ -23,6 +23,7 @@ def test_get_label_matrix(): @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.filterwarnings("ignore::sklearn.exceptions.FitFailedWarning") +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.UndefinedMetricWarning") def test_fitting_multilable_svm(): clf = ms.MultilabelSVM() parameter_grid = {"C": [1]} diff --git a/tests/models/test_default_batch_transform_functions.py b/tests/experimental/models/test_default_batch_transform_functions.py similarity index 93% rename from tests/models/test_default_batch_transform_functions.py rename to tests/experimental/models/test_default_batch_transform_functions.py index 41f0c667..07a4e886 100644 --- a/tests/models/test_default_batch_transform_functions.py +++ b/tests/experimental/models/test_default_batch_transform_functions.py @@ -2,7 +2,7 @@ import pytest -from podium.models.batch_transform_functions import ( +from podium.experimental.models.batch_transform_functions import ( default_feature_transform, default_label_transform, ) diff --git a/tests/models/test_experiment.py b/tests/experimental/models/test_experiment.py similarity index 94% rename from tests/models/test_experiment.py rename to tests/experimental/models/test_experiment.py index cdab4941..6f64bd19 100644 --- a/tests/models/test_experiment.py +++ b/tests/experimental/models/test_experiment.py @@ -1,8 +1,9 @@ import pytest -from podium.datasets import Dataset -from podium.models import Experiment, FeatureTransformer -from podium.storage import ExampleFactory, Field, Vocab +from podium.datasets import Dataset, ExampleFactory +from podium.experimental.models import Experiment, FeatureTransformer +from podium.field import Field +from podium.vocab import Vocab def get_dataset(): diff --git a/tests/models/test_fc_model.py b/tests/experimental/models/test_fc_model.py similarity index 84% rename from tests/models/test_fc_model.py rename to tests/experimental/models/test_fc_model.py index 00a22b45..f1d30793 100644 --- a/tests/models/test_fc_model.py +++ b/tests/experimental/models/test_fc_model.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from podium.models.impl.fc_model import 
ScikitMLPClassifier +from podium.experimental.models.impl.fc_model import ScikitMLPClassifier X = np.array([[1, 0, 1], [1, 1, 1], [0, 0, 1]]) diff --git a/tests/models/test_simple_trainers.py b/tests/experimental/models/test_simple_trainers.py similarity index 80% rename from tests/models/test_simple_trainers.py rename to tests/experimental/models/test_simple_trainers.py index 0344b6ac..e8d7fe52 100644 --- a/tests/models/test_simple_trainers.py +++ b/tests/experimental/models/test_simple_trainers.py @@ -2,11 +2,11 @@ import pytest_mock # noqa from podium.datasets.iterator import Iterator -from podium.models import FeatureTransformer -from podium.models.impl.simple_trainers import SimpleTrainer -from podium.models.model import AbstractSupervisedModel +from podium.experimental.models import FeatureTransformer +from podium.experimental.models.impl.simple_trainers import SimpleTrainer +from podium.experimental.models.model import AbstractSupervisedModel -from ..storage.conftest import json_file_path, tabular_dataset # noqa +from ...conftest import json_file_path, tabular_dataset # noqa @pytest.fixture @@ -45,11 +45,11 @@ def test_simple_trainer_batch_transform_call(tabular_dataset, mocker, model): # iterator = Iterator(tabular_dataset, batch_size=len(tabular_dataset)) mocker.patch( - "tests.models.test_simple_trainers.mock_feature_transform_fun", + "tests.experimental.models.test_simple_trainers.mock_feature_transform_fun", return_value=next(iter(iterator))[0], ) mocker.patch( - "tests.models.test_simple_trainers.mock_label_transform_fun", + "tests.experimental.models.test_simple_trainers.mock_label_transform_fun", return_value=next(iter(iterator))[1], ) diff --git a/tests/models/test_svm_model.py b/tests/experimental/models/test_svm_model.py similarity index 84% rename from tests/models/test_svm_model.py rename to tests/experimental/models/test_svm_model.py index 3d9bb080..8500f945 100644 --- a/tests/models/test_svm_model.py +++ b/tests/experimental/models/test_svm_model.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from podium.models.impl.svm_model import ScikitSVCModel +from podium.experimental.models.impl.svm_model import ScikitSVCModel X = np.array([[1, 0, 1], [1, 1, 1], [0, 0, 1]]) diff --git a/tests/models/test_transformers.py b/tests/experimental/models/test_transformers.py similarity index 98% rename from tests/models/test_transformers.py rename to tests/experimental/models/test_transformers.py index cf190ef5..c7b1171f 100644 --- a/tests/models/test_transformers.py +++ b/tests/experimental/models/test_transformers.py @@ -2,7 +2,7 @@ import numpy as np -from podium.models import ( +from podium.experimental.models import ( FeatureTransformer, SklearnTensorTransformerWrapper, TensorTransformer, diff --git a/tests/pipeline/__init__.py b/tests/experimental/pipeline/__init__.py similarity index 100% rename from tests/pipeline/__init__.py rename to tests/experimental/pipeline/__init__.py diff --git a/tests/pipeline/test_pipeline.py b/tests/experimental/pipeline/test_pipeline.py similarity index 97% rename from tests/pipeline/test_pipeline.py rename to tests/experimental/pipeline/test_pipeline.py index 6c7d43b5..7a6ab115 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/experimental/pipeline/test_pipeline.py @@ -2,9 +2,10 @@ import numpy as np -from podium.models import AbstractSupervisedModel, FeatureTransformer -from podium.pipeline import Pipeline -from podium.storage import ExampleFormat, Field, LabelField, MultioutputField +from podium.datasets import ExampleFormat +from 
podium.experimental.models import AbstractSupervisedModel, FeatureTransformer +from podium.experimental.pipeline import Pipeline +from podium.field import Field, LabelField, MultioutputField name_dict = {"Marko": 1, "Darko": 2, "Ivana": 3} diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py deleted file mode 100644 index b08317ec..00000000 --- a/tests/metrics/test_metrics.py +++ /dev/null @@ -1,10 +0,0 @@ -import pytest - -from podium.metrics.metrics import f1_metric - - -def test_f1(): - pytest.importorskip("sklearn") - true = [1, 1] - pred = [0, 1] - assert pytest.approx(f1_metric(true, pred), 0.001) == 0.666 diff --git a/tests/preproc/test_hooks.py b/tests/preproc/test_hooks.py index 512c1fa7..50078167 100644 --- a/tests/preproc/test_hooks.py +++ b/tests/preproc/test_hooks.py @@ -2,6 +2,8 @@ import pytest +from podium.datasets import ExampleFactory +from podium.field import Field from podium.preproc.functional import remove_stopwords, truecase from podium.preproc.hooks import ( MosesNormalizer, @@ -10,7 +12,6 @@ SpacyLemmatizer, TextCleanUp, ) -from podium.storage import ExampleFactory, Field from ..util import run_spacy diff --git a/tests/storage/test_field.py b/tests/test_field.py similarity index 98% rename from tests/storage/test_field.py rename to tests/test_field.py index bb58cc00..a83bad73 100644 --- a/tests/storage/test_field.py +++ b/tests/test_field.py @@ -5,14 +5,8 @@ import numpy as np import pytest -from podium.storage import ( - Field, - LabelField, - MultilabelField, - MultioutputField, - SpecialVocabSymbols, - Vocab, -) +from podium.field import Field, LabelField, MultilabelField, MultioutputField +from podium.vocab import SpecialVocabSymbols, Vocab ONE_TO_FIVE = [1, 2, 3, 4, 5] @@ -268,14 +262,20 @@ def test_field_get_tokenizer_exception(): def test_field_get_tokenizer_spacy_ok(): - patch.dict("sys.modules", spacy=MockSpacy()).start() + mp = patch.dict("sys.modules", spacy=MockSpacy()) + mp.start() + f = Field(name="F", numericalizer=MockVocab(), tokenizer="spacy") _, data = f.preprocess("bla blu")[0] assert data == (None, ["bla", "blu"]) + mp.stop() + def test_field_pickle_spacy_tokenizer(tmpdir): - patch.dict("sys.modules", spacy=MockSpacy()).start() + mp = patch.dict("sys.modules", spacy=MockSpacy()) + mp.start() + fld = Field(name="F", numericalizer=None, tokenizer="spacy") _, data = fld.preprocess("bla blu")[0] assert data == (None, ["bla", "blu"]) @@ -293,6 +293,8 @@ def test_field_pickle_spacy_tokenizer(tmpdir): _, data = loaded_fld.preprocess("bla blu")[0] assert data == (None, ["bla", "blu"]) + mp.stop() + def test_field_pretokenize_hooks(): f = Field(name="F", tokenizer="split", keep_raw=True) diff --git a/tests/storage/test_vocab.py b/tests/test_vocab.py similarity index 99% rename from tests/storage/test_vocab.py rename to tests/test_vocab.py index c39caf3a..1fef65af 100644 --- a/tests/storage/test_vocab.py +++ b/tests/test_vocab.py @@ -3,7 +3,7 @@ import dill import pytest -from podium.storage import vocab +from podium import vocab def test_default_vocab_add_set_words(): diff --git a/tests/vectorizers/__init__.py b/tests/vectorizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/storage/test_tfidf.py b/tests/vectorizers/test_tfidf.py similarity index 97% rename from tests/storage/test_tfidf.py rename to tests/vectorizers/test_tfidf.py index cfe1e1ac..945559e7 100644 --- a/tests/storage/test_tfidf.py +++ b/tests/vectorizers/test_tfidf.py @@ -2,12 +2,12 @@ import pytest from sklearn.feature_extraction 
import text -from podium.storage.field import Field -from podium.storage.vectorizers.tfidf import CountVectorizer, TfIdfVectorizer -from podium.storage.vocab import SpecialVocabSymbols, Vocab +from podium.field import Field +from podium.vectorizers.tfidf import CountVectorizer, TfIdfVectorizer +from podium.vocab import SpecialVocabSymbols, Vocab -from .conftest import TABULAR_TEXT +TABULAR_TEXT = ("a b c", "a", "a b c d", "a", "d b", "d c g", "b b b b b b") DATA = [ "this is the first document", diff --git a/tests/storage/test_vectorizer.py b/tests/vectorizers/test_vectorizer.py similarity index 99% rename from tests/storage/test_vectorizer.py rename to tests/vectorizers/test_vectorizer.py index 36db2abf..38454b65 100644 --- a/tests/storage/test_vectorizer.py +++ b/tests/vectorizers/test_vectorizer.py @@ -6,8 +6,8 @@ import numpy as np import pytest -from podium.storage.vectorizers import vectorizer -from podium.storage.vectorizers.impl import GloVe +from podium.vectorizers import vectorizer +from podium.vectorizers.impl import GloVe BASIC_VECT_HEADING = b"251518 300"
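Migration note (illustrative, not part of the patch): the test updates above all follow the same import-path moves out of podium.storage and podium.models. Below is a minimal before/after sketch assembled only from paths that appear in this diff; the final Field construction mirrors the fixture in tests/datasets/test_example_factory.py and is a usage illustration, not a prescribed API beyond what the diff shows.

    # Old layout, as removed from the tests by this diff:
    #   from podium.storage import ExampleFactory, Field, LabelField, Vocab
    #   from podium.storage.vectorizers.tfidf import TfIdfVectorizer
    #   from podium.storage.vectorizers.impl import GloVe
    #   from podium.models import Experiment, FeatureTransformer

    # New layout, as added to the tests by this diff:
    from podium.datasets import Dataset, ExampleFactory, ExampleFormat
    from podium.field import Field, LabelField, MultioutputField
    from podium.vocab import SpecialVocabSymbols, Vocab
    from podium.vectorizers.tfidf import TfIdfVectorizer
    from podium.vectorizers.impl import GloVe
    from podium.experimental.models import Experiment, FeatureTransformer

    # Field construction itself is unchanged by the move; this line mirrors
    # the fixture in tests/datasets/test_example_factory.py:
    name_field = Field("Name", keep_raw=True, tokenizer="split")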