stair-lab · minhtrung23 · Sep 1, 2024 · Sep 1, 2024 · Sep 1, 2024 · Sep 1, 2024
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -10,53 +10,43 @@
 import sys
 from datetime import datetime
 
-# -- Path setup --------------------------------------------------------------
-
-# Add the path to your source code here.
+# Path setup
 sys.path.insert(0, os.path.abspath("../../src"))
 
-# -- Project information -----------------------------------------------------
-
+# Project information
 PROJECT = "MELTs"
 AUTHOR = "Thu Nguyen Hoang Anh"
-COPYRIGHT = f"{datetime.datetime.now().year}, {AUTHOR}"
+COPYRIGHT = f"{datetime.now().year}, {AUTHOR}"
 
-# The version info for the project
-VERSION = "0.1"  # Short version (e.g., '0.1')
-RELEASE = "0.1"  # Full version (e.g., '0.1.0')
+# The full version, including alpha/beta/rc tags
+RELEASE = "0.1"
 
-# -- General configuration ---------------------------------------------------
+# General configuration
+MASTER_DOC = "index"
 
-MASTER_DOC = "index"  # The name of the master document
-
-# Sphinx extensions to use
+# Sphinx extension modules as strings, can be built-in or custom
 EXTENSIONS = [
-    "sphinx.ext.duration",  # Measure build time
-    "sphinx.ext.autodoc",   # Include documentation from docstrings
-    "sphinx.ext.coverage",  # Check for documentation coverage
-    "sphinx.ext.doctest",   # Test embedded doctests
-    "sphinx_rtd_theme",     # Read the Docs theme
+    "sphinx.ext.duration",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.coverage",
+    "sphinx_rtd_theme",
+    "sphinx.ext.doctest",
 ]
 
-# Mock import for autodoc
+# List of modules to mock during autodoc generation
 AUTODOC_MOCK_IMPORTS = ["pyemd"]
 
 # Paths that contain templates
 TEMPLATES_PATH = ["_templates"]
 
-# Patterns to ignore when looking for source files
+# List of patterns to ignore when looking for source files
 EXCLUDE_PATTERNS = []
 
 # Sort members alphabetically in the autodoc
 AUTODOC_MEMBER_ORDER = "alphabetical"
 
-# Theme to use for HTML and HTML Help pages
+# Options for HTML output
 HTML_THEME = "sphinx_rtd_theme"
 
-# Theme options for customizing the appearance of the theme
-HTML_THEME_OPTIONS = {
-    # You can add theme-specific options here
-}
-
-# Paths that contain custom static files (e.g., style sheets)
+# Paths for custom static files (like style sheets)
 HTML_STATIC_PATH = ["_static"]
diff --git a/src/melt/__main__.py b/src/melt/__main__.py
@@ -1,94 +1,18 @@
-"""
-This script initializes NLP models and runs the main function from the 'cli' module.
-
-The script performs the following tasks:
-1. Downloads the 'punkt' tokenizer models using nltk.
-2. Loads the spaCy 'en_core_web_sm' model, downloading it if necessary.
-3. Imports and executes the 'main' function from the 'cli' module.
-
-If any module or function cannot be imported, appropriate error messages are displayed.
-"""
-
-import logging
+"""Fetch NLTK punkt_tab, ensure spaCy en_core_web_sm is installed, then run the CLI."""
 import spacy
 import nltk
-from spacy.cli import download as spacy_download
-from typing import NoReturn
-
-# Configure logging with a descriptive name for the logger
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(message)s",
-    level=logging.INFO
-)
-logger = logging.getLogger("nlp_utils")
-
-def download_nltk_resources() -> NoReturn:
-    """Download the necessary NLTK resources.
-
-    Logs success or failure messages.
-    """
-    try:
-        with nltk.download('punkt'):
-            logger.info("Successfully downloaded NLTK 'punkt' resource.")
-    except Exception as error:
-        logger.error("Failed to download NLTK resources: %s", error)
-        raise
-
-
-def load_spacy_model(model_name: str = "en_core_web_sm") -> spacy.language.Language:
-    """Load and return the spaCy model, downloading it if necessary.
-
-    Logs success or failure messages during the model loading process.
-
-    Args:
-        model_name (str): The name of the spaCy model to load.
-
-    Returns:
-        spacy.language.Language: The loaded spaCy model.
-    """
-    try:
-        model = spacy.load(model_name)
-        logger.info("Successfully loaded spaCy model: %s", model_name)
-    except OSError:
-        logger.warning("spaCy model '%s' not found. Downloading...", model_name)
-        spacy_download(model_name)
-        model = spacy.load(model_name)
-        logger.info("Successfully downloaded and loaded spaCy model: %s", model_name)
-    except Exception as error:
-        logger.error("Failed to load spaCy model: %s", error)
-        raise
-    return model
-
-
-def execute_cli_main() -> None:
-    """Execute the 'main' function from the CLI module.
-
-    Logs success or failure messages about the import process and execution.
-    """
-    try:
-        from cli import main as cli_main
-        logger.info("Successfully imported 'main' from 'cli' module.")
-    except ImportError as import_error:
-        logger.error("ImportError: %s", import_error)
-        try:
-            import cli
-            cli_main = cli.main
-            logger.info("Successfully imported 'cli' module directly.")
-        except ImportError as inner_import_error:
-            logger.critical("Failed to import 'cli' module: %s", inner_import_error)
-            raise
-    cli_main()
-
-
-def main() -> None:
-    """Main function to set up resources and execute the CLI.
+from melt.cli import main
 
-    Ensures proper logging and execution flow.
-    """
-    download_nltk_resources()
-    load_spacy_model()
-    execute_cli_main()
+nltk.download('punkt_tab')
+try:
+    spacy.load("en_core_web_sm")
+except OSError:
+    print(
+        "Downloading the spacy en_core_web_sm model\n"
+        "(don't worry, this will only happen once)"
+    )
+    from spacy.cli import download
 
+    download("en_core_web_sm")
 
-if __name__ == "__main__":
-    main()
+main()
diff --git a/src/melt/cli.py b/src/melt/cli.py
@@ -1,75 +1,28 @@
-"""
-This script initializes and runs the text generation pipeline using spaCy, 
-transformers, and dotenv. It also handles downloading the spaCy 'en_core_web_sm' 
-model if it is not already present.
-
-The main function is responsible for:
-1. Loading environment variables.
-2. Parsing script arguments.
-3. Running the generation process with the parsed arguments.
-"""
-try:
-    import spacy
-except ImportError as e:
-    print(f"Failed to import 'spacy': {e}")
-
+"""Command-line entry point: parse ScriptArguments and run generation."""
+import spacy
+from transformers import HfArgumentParser
+from dotenv import load_dotenv
+from melt.script_arguments import ScriptArguments
+from melt.generation import generation
 try:
     spacy.load("en_core_web_sm")
 except OSError:
     print(
         "Downloading the spacy en_core_web_sm model\n"
         "(don't worry, this will only happen once)"
     )
-    try:
-        from spacy.cli import download
-        download("en_core_web_sm")
+    from spacy.cli import download
 
-    except ImportError as e:
-        print(f"Failed to import 'spacy.cli': {e}")
-try:
-    from transformers import HfArgumentParser
-except ImportError as e:
-    print(f"Failed to import 'transformers': {e}")
+    download("en_core_web_sm")
 
-try:
-    from dotenv import load_dotenv
-except ImportError as e:
-    print(f"Failed to import 'dotenv': {e}")
-
-try:
-    from .script_arguments import ScriptArguments
-except ImportError as e:
-    print(f"Failed to import 'ScriptArguments' from 'script_arguments': {e}")
-try:
-    from .generation import generation
-except ImportError as e:
-    print(f"Failed to import 'generation' from 'generation': {e}")
 
-def main():
-    """
-    The main function that initializes the environment, parses script arguments,
-    and triggers the text generation process.
 
-    This function performs the following steps:
-    1. Loads environment variables using `load_dotenv()`.
-    2. Creates an argument parser for `ScriptArguments` using `HfArgumentParser`.
-    3. Parses the arguments into data classes.
-    4. Calls the `generation` function with the parsed arguments to perform the text generation.
+# from .to_sheet import to_sheet
+# from .to_sheet_std import to_sheet_std
 
-    Returns:
-        None
-    """
+def main():
+    """Load .env variables, parse CLI arguments, and run generation."""
     load_dotenv()
-
-    # Ensure spaCy model is available
-    ensure_spacy_model()
-
-    # Parse command-line arguments
     parser = HfArgumentParser(ScriptArguments)
     args = parser.parse_args_into_dataclasses()[0]
-
-    # Execute the generation function with parsed arguments
     generation(args)
-
-if __name__ == "__main__":
-    main()
diff --git a/src/melt/generation.py b/src/melt/generation.py
@@ -1,69 +1,14 @@
-"""
-This module provides functionality for evaluating and 
-generating data using specified pipelines and datasets.
-
-The `generation` function is the main entry point of this script. It performs the following tasks:
-1. Initializes the seed for reproducibility.
-2. Loads and processes the dataset using `DatasetWrapper`.
-3. Sets up directories for saving results if they don't already exist.
-4. Handles continuation of inference from a previous run if specified.
-5. Creates a DataLoader for batching dataset examples.
-6. Initializes the evaluation pipeline (`EvalPipeline`).
-7. Runs the evaluation pipeline and saves the results to JSON files.
-
-The script is designed to work with various configurations 
-specified in the `script_args` parameter, including options for 
-few-shot prompting and continuing from previous results.
-
-Modules used:
-- `os`: For file and directory operations.
-- `.tools.data`: Contains `DatasetWrapper` for 
-dataset management.
-- `.tools.pipelines`: Contains `EvalPipeline` for 
-evaluation processes.
-- `.tools.utils.utils`: Provides utility functions such as 
-`save_to_json`, `set_seed`, and `read_json`.
-- `torch.utils.data`: For data loading with `DataLoader`.
-"""
+"""Run the evaluation pipeline on a dataset and save the results to JSON files."""
 import os
+import sys
 from torch.utils.data import DataLoader
-from .tools.data import DatasetWrapper
-from .tools.pipelines import EvalPipeline
-from .tools.utils.utils import save_to_json, set_seed, read_json
-
-
+from melt.tools.data import DatasetWrapper
+from melt.tools.pipelines import EvalPipeline
+from melt.tools.utils.utils import save_to_json, set_seed, read_json
 
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 def generation(script_args):
-    """
-    Executes the data generation process based on the provided script arguments.
-
-    This function performs the following steps:
-    1. Sets the random seed for reproducibility using `set_seed`.
-    2. Loads and optionally processes the dataset using `DatasetWrapper`.
-    3. Constructs filenames for saving generation results and metrics based on the script arguments.
-    4. Creates necessary directories for saving results if they don't already exist.
-    5. Determines the starting index and results to continue 
-    inference from a previous run if specified.
-    6. Initializes a `DataLoader` for batching the dataset examples.
-    7. Initializes an `EvalPipeline` for evaluating the data.
-    8. Runs the evaluation pipeline and saves the results using the `save_results` function.
-    Args:
-        script_args (ScriptArguments): An object containing the configuration 
-        and parameters for the data generation process.
-            - seed (int): Random seed for reproducibility.
-            - smoke_test (bool): Flag to indicate if a smaller subset 
-            of data should be used for testing.
-            - dataset_name (str): Name of the dataset.
-            - model_name (str): Name of the model.
-            - output_dir (str): Directory to save generation results.
-            - output_eval_dir (str): Directory to save evaluation metrics.
-            - continue_infer (bool): Flag to continue inference from a previous run.
-            - per_device_eval_batch_size (int): Batch size for evaluation.
-            - fewshot_prompting (bool): Flag for few-shot prompting.
-
-    Returns:
-        None
-    """
+    "Generation"
     set_seed(script_args.seed)
 
     # Load dataset (you can process it here)
@@ -76,19 +21,29 @@ def generation(script_args):
             dataset_wrapper.dataset_testing.select(range(n_examples))
         )
     ds_exact_name = (
-        script_args.dataset_name.split("/")[-1]
+        script_args.lang
         + "_"
-        + script_args.model_name.split("/")[-1]
-        + f"_pt{dataset_wrapper.prompting_strategy}"
-        + ("_fewshot" if script_args.fewshot_prompting else "")
+        + dataset_wrapper.dataset_info.task
+        + "_"
+        + script_args.dataset_name.split("/")[-1].replace("_", "-")
+        + "_"
+        + script_args.model_name.split("/")[-1].replace("_", "-")
+        + "_"
+        + script_args.prompt_type
+        + "_"
+        + script_args.category
+        + "_"
+        + str(script_args.num_fs_shot)
+        + f"_pt{dataset_wrapper.prompting_strategy}"  # tolerates non-str values
         + f"_seed{script_args.seed}"
-    )
+)
+
 
     json_file = os.path.join(
         script_args.output_dir, f"generations_{ds_exact_name}.json"
     )
     metric_file = os.path.join(
-        script_args.output_eval_dir, f"metrics_{ds_exact_name}.json"
+        script_args.output_eval_dir, f"{ds_exact_name}.json"
     )
 
     # Save results