Skip to content

Commit

Permalink
feat: Streamline the addition of new standards
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Nov 9, 2024
1 parent 8217711 commit b294360
Show file tree
Hide file tree
Showing 17 changed files with 20,160 additions and 161,395 deletions.
17 changes: 15 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,25 @@ Contributors can add new methods for schema and value matching by following thes

2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).

3. Add a new entry in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Make sure to add the correct import path for your
3. Add a new entry to the Enum class (e.g. `ValueMatchers`) in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`).
Make sure to add the correct import path for your module to ensure it can be accessed without errors.


Adding New Standards
--------------------

Contributors can extend bdi-kit to support additional standards by following these steps:

1. Create a Python module inside the "standards" folder (`bdikit/standards`).

2. Define a class in the module that implements `BaseStandard`.

3. Add a new entry to the class `Standards(Enum)` in `bdikit/standards/standard_factory.py`. Make sure to add the correct import path for your
module to ensure it can be accessed without errors.


Code of Conduct
---------------

We abide by the principles of openness, respect, and consideration of others
of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
69 changes: 34 additions & 35 deletions bdikit/api.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,18 @@
from __future__ import annotations
import logging

from collections import defaultdict
from os.path import join, dirname
from typing import (
Union,
List,
Dict,
TypedDict,
Optional,
Tuple,
Callable,
Any,
)
import itertools
import pandas as pd
import numpy as np
import panel as pn
from IPython.display import display, Markdown
from bdikit.utils import get_gdc_data, get_gdc_metadata

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.matcher_factory import SchemaMatchers
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
from bdikit.value_matching.matcher_factory import ValueMatchers
from bdikit.standards.standard_factory import Standards

from bdikit.mapping_functions import (
ValueMapper,
Expand All @@ -34,11 +21,21 @@
IdentityValueMapper,
)

from typing import (
Union,
List,
Dict,
TypedDict,
Optional,
Tuple,
Callable,
Any,
)

from bdikit.config import DEFAULT_SCHEMA_MATCHING_METHOD, DEFAULT_VALUE_MATCHING_METHOD

pn.extension("tabulator")

GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")
DEFAULT_VALUE_MATCHING_METHOD = "tfidf"
DEFAULT_SCHEMA_MATCHING_METHOD = "coma"
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -92,10 +89,10 @@ def _load_table_for_standard(name: str) -> pd.DataFrame:
Load the table for the given standard data vocabulary. Any standard
registered in `Standards` (e.g., GDC) is supported.
"""
if name == "gdc":
return pd.read_csv(GDC_DATA_PATH)
else:
raise ValueError(f"The {name} standard is not supported")
standard = Standards.get_standard(name)
df = standard.get_dataframe_rep()

return df


def top_matches(
Expand Down Expand Up @@ -200,11 +197,11 @@ def match_values(
if method_args is None:
method_args = {}

if "top_k" in method_args and method_args["top_k"] > 1:
if "top_n" in method_args and method_args["top_n"] > 1:
logger.warning(
f"Ignoring 'top_k' argument, use the 'top_value_matches()' method to get top-k value matches."
f"Ignoring 'top_n' argument, use the 'top_value_matches()' method to get top-k value matches."
)
method_args["top_k"] = 1
method_args["top_n"] = 1

matches = _match_values(source, target, column_mapping, method, method_args)

Expand Down Expand Up @@ -269,12 +266,12 @@ def top_value_matches(
if method_args is None:
method_args = {}

if "top_k" in method_args:
if "top_n" in method_args:
logger.warning(
f"Ignoring 'top_k' argument, using top_k argument instead (top_k={top_k})"
f"Ignoring 'top_n' argument, using top_k argument instead (top_k={top_k})"
)

method_args["top_k"] = top_k
method_args["top_n"] = top_k

matches = _match_values(source, target, column_mapping, method, method_args)

Expand All @@ -291,7 +288,7 @@ def top_value_matches(


def view_value_matches(
matches: Union[pd.DataFrame, List[pd.DataFrame]], edit: bool = False
matches: Union[pd.DataFrame, List[pd.DataFrame]], edit: bool = True
):
"""
Shows the value match results in a DataFrame fashion.
Expand Down Expand Up @@ -439,9 +436,10 @@ def _format_value_matching_input(
f"The source column '{source_column}' is not present in the source dataset."
)

if isinstance(target, str) and target == "gdc":
if isinstance(target, str):
column_names = mapping_df["target"].unique().tolist()
target_domain = get_gdc_data(column_names)
standard = Standards.get_standard(target)
target_domain = standard.get_column_values(column_names)
elif isinstance(target, pd.DataFrame):
target_domain = {
column_name: target[column_name].unique().tolist()
Expand Down Expand Up @@ -518,11 +516,12 @@ def preview_domain(
(if applicable).
"""

if isinstance(dataset, str) and dataset == "gdc":
gdc_metadata = get_gdc_metadata()
value_names = gdc_metadata[column]["value_names"]
value_descriptions = gdc_metadata[column]["value_descriptions"]
column_description = gdc_metadata[column]["description"]
if isinstance(dataset, str):
standard = Standards.get_standard(dataset)
column_metadata = standard.get_column_metadata([column])
value_names = column_metadata[column]["value_names"]
value_descriptions = column_metadata[column]["value_descriptions"]
column_description = column_metadata[column]["description"]
assert len(value_names) == len(value_descriptions)
elif isinstance(dataset, pd.DataFrame):
value_names = dataset[column].unique()
Expand Down
2 changes: 2 additions & 0 deletions bdikit/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

BDIKIT_DEVICE: str = os.getenv("BDIKIT_DEVICE", default="cpu")
VALUE_MATCHING_THRESHOLD = 0.3
DEFAULT_VALUE_MATCHING_METHOD = "tfidf"
DEFAULT_SCHEMA_MATCHING_METHOD = "coma"


def get_device() -> str:
Expand Down
Loading

0 comments on commit b294360

Please sign in to comment.