From 31da69e0f91f2d99b395b73e6f54cc9143d1e3a6 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sat, 7 Dec 2024 15:48:11 +0100 Subject: [PATCH 01/17] starting documentation --- docs/advanced/PreTrainedModelsHF.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index d3a9a18..02e65ba 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -1 +1,20 @@ -# Use pre-trained models from HuggingFace +# Use pre-trained models from HuggingFace in the Melusine framework + + +> The Hugging Face library has become an invaluable resource in the data science field, offering easy-to-use models that excel across a variety of natural language processing (NLP) tasks. + +# How to leverage these models within the Melusine framework to build: + +* A powerful processor using model embeddings. +* An intelligent detector utilizing fine-tuned model layers. + +Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction results. + +## How to Choose and Use Models + +The selection of a model depends on the specific detection task. For example: + + **Sentiment detection in French text/emails:** + Suitable models include: camembert-base, distil-camembert-base, or distil-camembert-base. + These models can be seamlessly integrated into your workflow to enhance predictions and optimize detection outcomes. 
+ From 5c86cae3b590df191373a7709982916afe151bef Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sat, 7 Dec 2024 21:48:16 +0100 Subject: [PATCH 02/17] :sparkles: --- docs/advanced/PreTrainedModelsHF.md | 360 ++++++++++++++++++ hugging_face/__init__.py | 0 hugging_face/detectors.py | 187 +++++++++ hugging_face/models/__init__.py | 0 hugging_face/models/model.py | 85 +++++ hugging_face/models/model_1.onnx | 0 melusine/base.py | 42 +- melusine/regex/__init__.py | 3 +- melusine/regex/dissatisfaction_regex.py | 61 +++ pyproject.toml | 8 +- tests/conftest.py | 1 + tests/fixtures/models.py | 58 +++ .../test_dissatisfaction_detector.py | 147 +++++++ 13 files changed, 947 insertions(+), 5 deletions(-) create mode 100644 hugging_face/__init__.py create mode 100644 hugging_face/detectors.py create mode 100644 hugging_face/models/__init__.py create mode 100644 hugging_face/models/model.py create mode 100644 hugging_face/models/model_1.onnx create mode 100644 melusine/regex/dissatisfaction_regex.py create mode 100644 tests/fixtures/models.py create mode 100644 tests/huggingface/test_dissatisfaction_detector.py diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index 02e65ba..eaf3e3b 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -18,3 +18,363 @@ The selection of a model depends on the specific detection task. For example: Suitable models include: camembert-base, distil-camembert-base, or distil-camembert-base. These models can be seamlessly integrated into your workflow to enhance predictions and optimize detection outcomes. + + +# Implementing solution : distil-camembert Models +As usual , the detector can be implemented this way , inheriting the MelusineTransformerDetector detector base class + +```python +class DissatisfactionDetector(MelusineTransformerDetector): + """ + Class to detect emails containing only dissatisfaction text. 
+ + Ex: + Merci à vous, + Cordialement + """ + + # Class constants + BODY_PART: str = "BODY" + DISSATISFACTION_PART: str = "DISSATISFACTION" + + # Intermediate columns + THANKS_TEXT_COL: str = "thanks_text" + THANKS_PARTS_COL: str = "thanks_parts" + HAS_BODY: str = "has_body" + THANKS_MATCH_COL: str = "thanks_match" + + def __init__( + self, + messages_column: str = "messages", + name: str = "dissatisfaction", + ) -> None: + """ + Attributes initialization. + + Parameters + ---------- + messages_column: str + Name of the column containing the messages. + + name: str + Name of the detector. + """ + + # Input columns + self.messages_column = messages_column + input_columns: List[str] = [self.messages_column] + + # Output columns + self.result_column = f"{name}_result" + output_columns: List[str] = [self.result_column] + + # Detection regex + self.thanks_regex: MelusineRegex = ThanksRegex() + + super().__init__( + name=name, + input_columns=input_columns, + output_columns=output_columns, + ) + self.complex_regex_key: str +``` + +> The pre_detect method allows to preprocess the data into the type of model inputs . + +```python +def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Extract text to analyse. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. 
+ """ + # Check if a BODY part is present in the last message + has_body: bool = row[self.messages_column][0].has_tags( + target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART} + ) + + # Extract the DISSATISFACTION part in the last message + dissatisfaction_parts: List[Tuple[str, str]] = row[self.messages_column][ + 0 + ].extract_parts(target_tags={self.DISSATISFACTION_PART}) + + # Compute DISSATISFACTION text + if not dissatisfaction_parts: + dissatisfaction_text: str = "" + else: + dissatisfaction_text = "\n".join(x[1] for x in dissatisfaction_parts) + + # Save debug data + if debug_mode: + debug_dict = { + self.DISSATISFACTION_PARTS_COL: dissatisfaction_parts, + self.DISSATISFACTION_TEXT_COL: dissatisfaction_text, + self.HAS_BODY: has_body, + } + row[self.debug_dict_col].update(debug_dict) + + # Create new columns + row[self.DISSATISFACTION_TEXT_COL] = dissatisfaction_text + row[self.HAS_BODY] = has_body + + return row +``` +> The detection method can be one of the following three : + * deterministic only : using Melusine_regex : + * Machine learning based only : using HF models + * both are combined to one final output : the detection result + + +* A dissatisfaction_regex MUST BE CREATED WITH DIFFERENT REGEX USEFUL TO DETECT DISSATISFACTION + +```python +from typing import Dict, List, Optional, Union +from melusine.base import MelusineRegex + + +class DissatisfactionRegex(MelusineRegex): + """ + Detect thanks patterns such as "merci". + """ + + @property + def positive(self) -> Union[str, Dict[str, str]]: + """ + Define regex patterns required to activate the MelusineRegex. + + Returns: + _: Regex pattern or dict of regex patterns. + """ + + return r"\b(j'en ai marre|c'est nul|trop déçu|décevant|inadmissible|insupportable|intolérable|honteux|lamentable|catastrophe)\b" + + @property + def neutral(self) -> Optional[Union[str, Dict[str, str]]]: + """ + Define regex patterns to be ignored when running detection. 
+ + Returns: + _: Regex pattern or dict of regex patterns. + """ + return None + + @property + def negative(self) -> Optional[Union[str, Dict[str, str]]]: + """ + Define regex patterns prohibited to activate the MelusineRegex. + + Returns: + _: Regex pattern or dict of regex patterns. + """ + return None + + @property + def match_list(self) -> List[str]: + """ + List of texts that should activate the MelusineRegex. + + Returns: + _: List of texts. + """ + return [ + "complétement insatisfait de ce que vous faites", + ] + + @property + def no_match_list(self) -> List[str]: + """ + List of texts that should NOT activate the MelusineRegex. + + Returns: + _: List of texts. + """ + return [] +``` +After constructing the DissatisfactionRegex class , the by_regex_detect method could be defined +```python +def by_regex_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Use regex to detect dissatisfaction. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + debug_info: Dict[str, Any] = {} + + text: str = row[self.DISSATISFACTION_TEXT_COL] + + detection_data = self.dissatisfaction_regex(text) + detection_result = detection_data[self.dissatisfaction_regex.MATCH_RESULT] + + # Save debug data + if debug_mode: + debug_info[self.dissatisfaction_regex.regex_name] = detection_data + row[self.debug_dict_col].update(debug_info) + + # Create new columns + row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] = detection_result + + return row +``` + +## The Machine Learning Approach to Detect Dissatisfaction: Two Methods + * Using a Pre-trained Model Directly + The distil-camembert-base model can be loaded directly from the Hugging Face platform, along with its tokenizer, for immediate use in detecting dissatisfaction. 
+ + * Fine-tuning the Model + A pre-trained model can be fine-tuned using various methods, including: + + The Hugging Face Trainer API or PyTorch Lightning. + +> Fine-tuning approaches: + 1- Full Fine-tuning: Updates all layers of the model in an autoregressive manner. + 2- LoRA's PEFT (Parameter-Efficient Fine-Tuning): A more efficient and optimized method that reduces computational cost while achieving excellent results. + +Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. + +> Why distil-camembert-base? +Numerous studies and practical implementations have demonstrated that distil-camembert-base is a highly effective model for sentiment analysis and detecting dissatisfaction, particularly in tasks involving French text. +The model + +```python +from transformers import AutoTokenizer, AutoModelForSequenceClassification +from torch.nn.functional import softmax + + +def load_hfmodel(self, model_name="distilcamembert-base") -> None: + """ + GET Distil-camembert-base from HF + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name, num_labels=5 + ) # Adjust num_labels for your classification task + + def predict_fn(self, text) -> List: + """ + Apply model and get prediction + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. 
+ """ + + inputs = self.tokenizer( + text, padding=True, truncation=True, return_tensors="pt" + ) + # Forward pass through the model + outputs = self.model(**inputs) + + # Convert logits to probabilities using softmax + probs = softmax(logits, dim=1) + + # Get predictions (the class index with the highest probability) + predictions = probs.argmax(dim=1).tolist() + + # Get confidence scores for the predicted classes + scores = probs.max(dim=1).values.tolist() + return predictions, scores + + def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Use machine learning model to detect dissatisfaction. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + debug_info: Dict[str, Any] = {} + ( + row[self.DISSATISFACTION_ML_MATCH_COL], + row[self.DISSATISFACTION_ML_SCORE_COL], + ) = self.predict_fn(row[self.DISSATISFACTION_TEXT_COL]) + # Save debug data + if debug_mode: + debug_info[self.DISSATISFACTION_ML_MATCH_COL] = row[ + self.DISSATISFACTION_ML_MATCH_COL + ] + debug_info[self.DISSATISFACTION_ML_SCORE_COL] = row[ + self.DISSATISFACTION_ML_SCORE_COL + ] + row[self.debug_dict_col].update(debug_info) + return row +``` + + +> The final detection result could be defined in the **post_detect** method using a predefined condition. +> [! Example ] +> condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) + + +```python +def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Apply final eligibility rules. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. 
+ """ + + # Match on thanks regex & Does not contain a body + row[self.result_column] = ( + row[self.DISSATISFACTION_ML_SCORE_COL] > 0.9 + and row[self.DISSATISFACTION_ML_MATCH_COL] + ) or row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] + + return row +``` \ No newline at end of file diff --git a/hugging_face/__init__.py b/hugging_face/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hugging_face/detectors.py b/hugging_face/detectors.py new file mode 100644 index 0000000..41303df --- /dev/null +++ b/hugging_face/detectors.py @@ -0,0 +1,187 @@ +""" +Classes of detectors. + +Implemented classes: [ThanksDetector, VacationReplyDetector, ExpeditorDetector, +ReplyDetector, TransferDetector, RecipientsDetector] + +""" + +from typing import Any, Dict, List, Optional + +from hugging_face.models.model import TextClassifier +from melusine.base import MelusineItem, MelusineRegex, MelusineTransformerDetector +from melusine.regex import DissatisfactionRegex + + +class DissatisfactionDetector(MelusineTransformerDetector): + """ + Class to detect emails containing only dissatisfaction text. + + Ex: + Merci à vous, + Cordialement + """ + + # Intermediate columns + CONST_TEXT_COL_NAME: str = "effective_text" + DISSATISFACTION_TEXT_COL: str = "dissatisfaction_text" + CONST_DEBUG_TEXT_KEY: str = "text" + CONST_DEBUG_PARTS_KEY: str = "parts" + + # Results columns + DISSATISFACTION_ML_SCORE_COL: str = "dissatisfaction_ml_score" + DISSATISFACTION_ML_MATCH_COL: str = "dissatisfaction_ml_result" + DISSATISFACTION_BY_REGEX_MATCH_COL: str = "dissatisfaction_regex_result" + + def __init__( + self, + text_column: str, + name: str, + tokenizer_name_or_path: str, + model_name_or_path: str, + token: Optional[str] = None, + ) -> None: + """ + Attributes initialization. + + Parameters + ---------- + messages_column: str + Name of the column containing the messages. + + name: str + Name of the detector. 
+ """ + + # Input columns + self.text_column = text_column + input_columns: List[str] = [text_column] + + # Output columns + self.result_column = f"{name}_result" + output_columns: List[str] = [self.result_column] + + # Detection regex + self.dissatisfaction_regex: MelusineRegex = DissatisfactionRegex() + self.token = token + + super().__init__( + name=name, + input_columns=input_columns, + output_columns=output_columns, + ) + self.melusine_model = TextClassifier( + tokenizer_name_or_path=tokenizer_name_or_path, model_name_or_path=model_name_or_path, token=self.token + ) + + def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Extract text to analyse. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + + # Last message body + message_text: str = row[self.text_column] + + row[self.CONST_TEXT_COL_NAME] = "\n".join([message_text]) + + # Prepare and save debug data + if debug_mode: + debug_dict: Dict[str, Any] = { + self.CONST_DEBUG_TEXT_KEY: row[self.CONST_TEXT_COL_NAME], + } + row[self.debug_dict_col] = debug_dict + + return row + + def by_regex_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Use regex to detect dissatisfaction. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. 
+ """ + debug_info: Dict[str, Any] = {} + text: str = row[self.CONST_TEXT_COL_NAME] + detection_data = self.dissatisfaction_regex(text) + detection_result = detection_data[self.dissatisfaction_regex.MATCH_RESULT] + + # Save debug data + if debug_mode: + debug_info[self.dissatisfaction_regex.regex_name] = detection_data + row[self.debug_dict_col].update(debug_info) + + # Create new columns + row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] = detection_result + return row + + def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Use machine learning model to detect dissatisfaction. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + + predictions, scores = self.melusine_model.predict(row[self.CONST_TEXT_COL_NAME]) + debug_info: Dict[str, Any] = {} + + row[self.DISSATISFACTION_ML_MATCH_COL], row[self.DISSATISFACTION_ML_SCORE_COL] = bool(predictions[0]), scores[0] + # Save debug data + if debug_mode: + debug_info[self.DISSATISFACTION_ML_MATCH_COL] = row[self.DISSATISFACTION_ML_MATCH_COL] + debug_info[self.DISSATISFACTION_ML_SCORE_COL] = row[self.DISSATISFACTION_ML_SCORE_COL] + row[self.debug_dict_col].update(debug_info) + return row + + def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Apply final eligibility rules. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. 
+ """ + + # Match on thanks regex & Does not contain a body + ml_result = (row[self.DISSATISFACTION_ML_SCORE_COL] > 0.9) and row[self.DISSATISFACTION_ML_MATCH_COL] + deterministic_result = row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] + row[self.result_column] = deterministic_result or ml_result + return row diff --git a/hugging_face/models/__init__.py b/hugging_face/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hugging_face/models/model.py b/hugging_face/models/model.py new file mode 100644 index 0000000..cab37e4 --- /dev/null +++ b/hugging_face/models/model.py @@ -0,0 +1,85 @@ +from typing import Optional + +import torch +from transformers import AutoModelForSequenceClassification, AutoTokenizer + + +class TextClassifier: + """ """ + + def __init__(self, tokenizer_name_or_path: str, model_name_or_path: str, token: Optional[str]): + """ + Apply model and get prediction + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + self.tokenizer_name_or_path = tokenizer_name_or_path + self.model_name_or_path = model_name_or_path + self.hf_token = token + self.load_model() + + def load_model(self): + """ + Apply model and get prediction + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. 
+ """ + if self.hf_token: + self.tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=self.tokenizer_name_or_path, use_auth_token=self.hf_token + ) + self.model = AutoModelForSequenceClassification.from_pretrained( + pretrained_model_name_or_path=self.model_name_or_path, num_labels=2, use_auth_token=self.hf_token + ) + else: + self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=self.tokenizer_name_or_path) + self.model = AutoModelForSequenceClassification.from_pretrained( + pretrained_model_name_or_path=self.model_name_or_path, num_labels=2 + ) + + def predict(self, text) -> tuple[list, list]: + """ + Apply model and get prediction + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ + + inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") + # Forward pass through the model + outputs = self.model(**inputs) + # Extract logits + self.logits = outputs.logits + # Convert logits to probabilities using softmax + probs = torch.nn.functional.softmax(self.logits, dim=-1) + probs = probs.detach().cpu().numpy() + # Convert predictions and scores to lists + predictions = probs.argmax(axis=1).tolist() + scores = probs.max(axis=1).tolist() + return predictions, scores diff --git a/hugging_face/models/model_1.onnx b/hugging_face/models/model_1.onnx new file mode 100644 index 0000000..e69de29 diff --git a/melusine/base.py b/melusine/base.py index 36c347a..03ca017 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -8,7 +8,8 @@ BaseLabelProcessor, MissingModelInputFieldError, MissingFieldError, - MelusineFeatureEncoder + MelusineFeatureEncoder, + MelusineTransformerDetector ] """ @@ -302,6 +303,45 @@ def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIt """What needs to be done after detection (e.g., mapping columns).""" +class 
MelusineTransformerDetector(BaseMelusineDetector, ABC): + """ + Defines an interface for detectors. + All detectors used in a MelusinePipeline should inherit from the MelusineDetector class and + implement the abstract methods. + This ensures homogeneous coding style throughout the application. + Alternatively, melusine user's can define their own Interface (inheriting from the BaseMelusineDetector) + to suit their needs. + """ + + @property + def transform_methods(self) -> list[Callable]: + """ + Specify the sequence of methods to be called by the transform method. + + Returns + ------- + _: list[Callable] + List of methods to be called by the transform method. + """ + return [self.pre_detect, self.by_regex_detect, self.by_ml_detect, self.post_detect] + + @abstractmethod + def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """What needs to be done before detection.""" + + @abstractmethod + def by_regex_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """Run detection.""" + + @abstractmethod + def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """Run detection.""" + + @abstractmethod + def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """What needs to be done after detection (e.g., mapping columns).""" + + class MissingFieldError(Exception): """ Exception raised when a missing field is encountered by a MelusineTransformer diff --git a/melusine/regex/__init__.py b/melusine/regex/__init__.py index f27b8a8..59ab848 100644 --- a/melusine/regex/__init__.py +++ b/melusine/regex/__init__.py @@ -2,10 +2,11 @@ The melusine.regex module includes tools for handling regexes. 
""" +from melusine.regex.dissatisfaction_regex import DissatisfactionRegex from melusine.regex.emergency_regex import EmergencyRegex from melusine.regex.reply_regex import ReplyRegex from melusine.regex.thanks_regex import ThanksRegex from melusine.regex.transfer_regex import TransferRegex from melusine.regex.vacation_reply_regex import VacationReplyRegex -__all__ = ["EmergencyRegex", "ReplyRegex", "ThanksRegex", "TransferRegex", "VacationReplyRegex"] +__all__ = ["EmergencyRegex", "ReplyRegex", "ThanksRegex", "TransferRegex", "VacationReplyRegex", "DissatisfactionRegex"] diff --git a/melusine/regex/dissatisfaction_regex.py b/melusine/regex/dissatisfaction_regex.py new file mode 100644 index 0000000..056049d --- /dev/null +++ b/melusine/regex/dissatisfaction_regex.py @@ -0,0 +1,61 @@ +from typing import Dict, List, Optional, Union + +from melusine.base import MelusineRegex + + +class DissatisfactionRegex(MelusineRegex): + """ + Detect thanks patterns such as "merci". + """ + + @property + def positive(self) -> Union[str, Dict[str, str]]: + """ + Define regex patterns required to activate the MelusineRegex. + + Returns: + _: Regex pattern or dict of regex patterns. + """ + return r"\b(j'en ai marre|insatisfait|c'est nul|trop déçu|décevant|inadmissible|insupportable|intolérable|honteux|lamentable|catastrophe)\b" + + @property + def neutral(self) -> Optional[Union[str, Dict[str, str]]]: + """ + Define regex patterns to be ignored when running detection. + + Returns: + _: Regex pattern or dict of regex patterns. + """ + return None + + @property + def negative(self) -> Optional[Union[str, Dict[str, str]]]: + """ + Define regex patterns prohibited to activate the MelusineRegex. + + Returns: + _: Regex pattern or dict of regex patterns. + """ + return None + + @property + def match_list(self) -> List[str]: + """ + List of texts that should activate the MelusineRegex. + + Returns: + _: List of texts. 
+ """ + return [ + "complétement insatisfait de ce que vous faites", + ] + + @property + def no_match_list(self) -> List[str]: + """ + List of texts that should NOT activate the MelusineRegex. + + Returns: + _: List of texts. + """ + return [] diff --git a/pyproject.toml b/pyproject.toml index 085cb93..020e268 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] description = "Melusine is a high-level library for emails processing" readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10,<3.11" keywords = ["nlp", "email", "courriel", "text", "data-science", "machine-learning", "natural-language-processing"] license = {text = "Apache Software License 2.0"} classifiers = [ @@ -42,12 +42,14 @@ dependencies = [ "tqdm>=4.34", "omegaconf>=2.0", ] + dynamic = ["version"] [project.optional-dependencies] # Optional -dev = ["tox", "pre-commit", "black", "flake8", "isort", "mypy", "pytest", "coverage", "build", "ruff"] +dev = ["tox", "pre-commit", "black", "flake8", "isort", "mypy", "pytest", "coverage", "build", "ruff" ] test = ["pytest", "coverage", "pytest-cov", "google-auth-oauthlib", "google-api-python-client"] -transformers = ["transformers>4"] +transformers = ["transformers>4" ] +torch-cpu = ["torch>=2.0.0"] connectors = ["exchangelib", "google-auth-oauthlib", "google-api-python-client"] docs = ["mkdocs", "markdown", "mkdocs-material", "mdx-include"] diff --git a/tests/conftest.py b/tests/conftest.py index 678606f..6359e87 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,6 +15,7 @@ "tests.fixtures.docs", "tests.fixtures.pipelines", "tests.fixtures.processors", + "tests.fixtures.models", ] diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py new file mode 100644 index 0000000..cf0666b --- /dev/null +++ b/tests/fixtures/models.py @@ -0,0 +1,58 @@ +from unittest.mock import MagicMock, patch + +import pytest +import torch +from google.oauth2.credentials import Credentials +from googleapiclient.http 
import HttpRequestMock + +from hugging_face.detectors import DissatisfactionDetector +from hugging_face.models.model import TextClassifier +from melusine.connectors.gmail import GmailConnector + + +def return_value(resp, content): + return content + + +@pytest.fixture +def mock_tokenizer(): + tokenizer = MagicMock() + tokenizer.return_value = {"input_ids": [[101, 102]], "attention_mask": [[1, 1]]} + return tokenizer + + +@pytest.fixture +def mock_model(): + model = MagicMock() + model.return_value.logits = torch.tensor([[0.1, 0.9]]) # Simulated logits + return model + + +@pytest.fixture +def mock_detector(mock_tokenizer, mock_model): + with patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer): + with patch("transformers.AutoModelForSequenceClassification.from_pretrained", return_value=mock_model): + # Create a TextClassifier instance + classifier = TextClassifier( + tokenizer_name_or_path="mock_tokenizer", + model_name_or_path="mock_model", + token=None, + ) + + # Create the DissatisfactionDetector using the mock classifier + detector = DissatisfactionDetector( + name="dissatisfaction", + text_column="det_normalized_last_body", + model_name_or_path="mock_model_path", + tokenizer_name_or_path="mock_tokenizer_path", + token=None, + ) + detector.melusine_model = classifier + return detector + + +# Example test using the mock_detector fixture +def test_mock_detector_instantiation(mock_detector): + assert isinstance(mock_detector, DissatisfactionDetector) + assert mock_detector.name == "dissatisfaction" + assert mock_detector.text_column == "det_normalized_last_body" diff --git a/tests/huggingface/test_dissatisfaction_detector.py b/tests/huggingface/test_dissatisfaction_detector.py new file mode 100644 index 0000000..94cdcc5 --- /dev/null +++ b/tests/huggingface/test_dissatisfaction_detector.py @@ -0,0 +1,147 @@ +""" +Unit tests of the DissatisfactionDetector +The model used inside of the detector is mocked in the fixtures tests +""" + +from 
unittest.mock import MagicMock, patch + +import pytest +from pandas import DataFrame + +from hugging_face.detectors import DissatisfactionDetector +from hugging_face.models.model import TextClassifier + + +@pytest.mark.usefixtures("mock_detector") +def test_instantiation(mock_detector): + """Test that the mock detector is instantiated correctly.""" + assert isinstance(mock_detector, DissatisfactionDetector) + assert mock_detector.name == "dissatisfaction" + assert mock_detector.text_column == "det_normalized_last_body" + + +@pytest.mark.usefixtures("mock_detector") +@pytest.mark.parametrize( + "row, good_deterministic_result", + [ + ( + {"det_normalized_last_body": "je suis content de votre service."}, + False, + ), + ( + {"det_normalized_last_body": "je suis complètement insatisfait de votre service."}, + True, + ), + ( + { + "det_normalized_last_body": "Franchement, j'en ai marre de ce genre de service qui ne respecte pas ses engagements." + }, + True, + ), + ( + {"det_normalized_last_body": "Je suis trop déçu par la qualité, je m'attendais à bien mieux pour ce prix."}, + True, + ), + ( + {"det_normalized_last_body": "C'est vraiment décevant de voir un tel manque de professionnalisme."}, + True, + ), + ], +) +def test_by_regex_detect(row, good_deterministic_result, mock_detector): + """Unit test of the transform() method.""" + df_copy = row.copy() + df_copy = mock_detector.pre_detect(df_copy, debug_mode=True) + df_copy = mock_detector.by_regex_detect(df_copy, debug_mode=True) + + deterministic_result = mock_detector.DISSATISFACTION_BY_REGEX_MATCH_COL + deterministic_debug_result = mock_detector.debug_dict_col + + assert deterministic_result in df_copy.keys() + assert deterministic_debug_result in df_copy.keys() + assert df_copy[deterministic_result] == good_deterministic_result + + +@pytest.mark.usefixtures("mock_detector") +@pytest.mark.parametrize( + "row, good_ml_result", + [ + ( + {"det_normalized_last_body": "je suis complètement insatisfait de votre 
service."}, + True, + ), + ( + { + "det_normalized_last_body": "Un service médiocre, avec des frais cachés qui ont presque doublé le coût final. Je ne ferai plus appel à eux." + }, + True, + ), + ( + { + "det_normalized_last_body": "Très déçu. L’article ne correspond pas du tout à la description, et la qualité laisse à désirer." + }, + True, + ), + ], +) +def test_by_ml_detection(row, good_ml_result, mock_detector): + """Unit test of the transform() method.""" + df_copy = row.copy() + # Test result + df_copy = mock_detector.pre_detect(df_copy, debug_mode=True) + df_copy = mock_detector.by_ml_detect(df_copy, debug_mode=True) + + # Test result + ml_result_col = mock_detector.DISSATISFACTION_ML_MATCH_COL + ml_score_col = mock_detector.DISSATISFACTION_ML_SCORE_COL + + assert ml_result_col in df_copy.keys() + assert ml_score_col in df_copy.keys() + assert df_copy[ml_result_col] == good_ml_result + assert isinstance(df_copy[ml_score_col], float) + assert df_copy[ml_score_col] > 0.5 + + +@pytest.mark.usefixtures("mock_detector") +@pytest.mark.parametrize( + "df, good_result", + [ + ( + DataFrame( + { + "det_normalized_last_body": ["je suis complètement insatisfait de votre service."], + } + ), + True, + ), + ( + DataFrame( + { + "det_normalized_last_body": [ + "Ce retard est une véritable catastrophe, cela m'a causé beaucoup de problèmes." + ], + } + ), + True, + ), + ( + DataFrame( + { + "det_normalized_last_body": [ + "Le traitement que j'ai reçu est honteux, surtout venant d'une entreprise comme la vôtre." 
+ ], + } + ), + True, + ), + ], +) +def test_by_transform_detection(df, good_result, mock_detector): + """Unit test of the transform() method.""" + df_copy = df.copy() + # Test result + df_copy = mock_detector.transform(df_copy) + # Test result + result_col = mock_detector.result_column + assert result_col in df_copy.keys() + assert bool(df_copy[result_col][0]) == good_result From ca356cbc67effef32d786be8379fdab6fc66f11a Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 11:15:47 +0100 Subject: [PATCH 03/17] correct tests using tox --- pyproject.toml | 2 +- tests/conftest.py | 2 +- tests/{fixtures => huggingface}/models.py | 3 --- tox.ini | 6 +++++- 4 files changed, 7 insertions(+), 6 deletions(-) rename tests/{fixtures => huggingface}/models.py (92%) diff --git a/pyproject.toml b/pyproject.toml index 020e268..3e85098 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] description = "Melusine is a high-level library for emails processing" readme = "README.md" -requires-python = ">=3.10,<3.11" +requires-python = ">=3.10,<3.13" keywords = ["nlp", "email", "courriel", "text", "data-science", "machine-learning", "natural-language-processing"] license = {text = "Apache Software License 2.0"} classifiers = [ diff --git a/tests/conftest.py b/tests/conftest.py index 6359e87..0ae7903 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,7 @@ "tests.fixtures.docs", "tests.fixtures.pipelines", "tests.fixtures.processors", - "tests.fixtures.models", + "tests.huggingface.models", ] diff --git a/tests/fixtures/models.py b/tests/huggingface/models.py similarity index 92% rename from tests/fixtures/models.py rename to tests/huggingface/models.py index cf0666b..f8fa80c 100644 --- a/tests/fixtures/models.py +++ b/tests/huggingface/models.py @@ -2,12 +2,9 @@ import pytest import torch -from google.oauth2.credentials import Credentials -from googleapiclient.http import HttpRequestMock from hugging_face.detectors import 
DissatisfactionDetector from hugging_face.models.model import TextClassifier -from melusine.connectors.gmail import GmailConnector def return_value(resp, content): diff --git a/tox.ini b/tox.ini index a94c341..1dc21c9 100644 --- a/tox.ini +++ b/tox.ini @@ -1,12 +1,14 @@ [tox] requires = tox>=4 -env_list = clean, core38, core310, transformers, report +env_list = clean, core38, core310, core311, transformers, report [gh-actions] python = 3.8: clean, core38, transformers 3.10: core310 + 3.11: core311, transformers + [testenv] commands = pytest --cov --cov-append --cov-report xml @@ -15,9 +17,11 @@ deps = pytest-cov google-auth-oauthlib google-api-python-client + torch depends = {core38,transformers}: clean report: core38,transformers +extras = transformers [testenv:core38] deps={[testenv]deps} From 69ee29a43f55e1ca47f3e0a938d5b0ea547b9ac9 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 11:18:37 +0100 Subject: [PATCH 04/17] =?UTF-8?q?correct=20python=20version=20=C3=A9=20git?= =?UTF-8?q?=20add=20.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clear quit() xxxxx$ XXX --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3e85098..a499caf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] description = "Melusine is a high-level library for emails processing" readme = "README.md" -requires-python = ">=3.10,<3.13" +requires-python = ">=3.8,<3.13" keywords = ["nlp", "email", "courriel", "text", "data-science", "machine-learning", "natural-language-processing"] license = {text = "Apache Software License 2.0"} classifiers = [ From 892fe929a257201650a5c3e4843e23fc77c2173d Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 11:30:20 +0100 Subject: [PATCH 05/17] correcting tox 38 version --- hugging_face/models/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/hugging_face/models/model.py b/hugging_face/models/model.py index cab37e4..f096daf 100644 --- a/hugging_face/models/model.py +++ b/hugging_face/models/model.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import List, Optional, Tuple import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer @@ -55,7 +55,7 @@ def load_model(self): pretrained_model_name_or_path=self.model_name_or_path, num_labels=2 ) - def predict(self, text) -> tuple[list, list]: + def predict(self, text) -> Tuple[List, List]: """ Apply model and get prediction Parameters From a4dc4c0f83ffd924494b8970da6e19d9ff36676a Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 17:43:24 +0100 Subject: [PATCH 06/17] :memo: --- docs/advanced/PreTrainedModelsHF.md | 479 ++++++++++++---------------- hugging_face/detectors.py | 17 +- hugging_face/models/model.py | 39 +-- 3 files changed, 238 insertions(+), 297 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index eaf3e3b..d1b43cc 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -1,318 +1,253 @@ -# Use pre-trained models from HuggingFace in the Melusine framework +# HuggingFace's pre-trained models in the Melusine framework + -> The Hugging Face library has become an invaluable resource in the data science field, offering easy-to-use models that excel across a variety of natural language processing (NLP) tasks. +[HuggingFace](https://huggingface.co/) -# How to leverage these models within the Melusine framework to build: -* A powerful processor using model embeddings. -* An intelligent detector utilizing fine-tuned model layers. +The Hugging Face library has revolutionized the landscape of natural language processing (NLP) and beyond, redefining the boundaries of what's possible in NLP and other domains and establishing itself as an indispensable tool for researchers, data scientists, and developers. 
By bridging the gap between cutting-edge research and practical implementation, Hugging Face not only simplifies the complexities of model deployment but also fosters innovation across industries, enabling applications that were once considered out of reach. -Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction results. + -## How to Choose and Use Models +Renowned for its user-friendly interface and extensive collection of pre-trained models, Hugging Face empowers users to tackle a diverse range of tasks from text classification and sentiment analysis to machine translation and question answering. The library's versatility and adaptability make it a cornerstone in modern AI development, providing accurate and efficient models. -The selection of a model depends on the specific detection task. For example: + - **Sentiment detection in French text/emails:** - Suitable models include: camembert-base, distil-camembert-base, or distil-camembert-base. - These models can be seamlessly integrated into your workflow to enhance predictions and optimize detection outcomes. +**Melusine** provides an exceptional framework for streamlining and optimizing email workflows with remarkable efficiency. Its flexible architecture allows seamless integration of machine learning models into its detectors, as demonstrated in the Hugging Face folder, enabling users to harness advanced AI capabilities for enhanced performance. + +## Tutorial : Dissatisfaction detection using Hugging-face models -# Implementing solution : distil-camembert Models -As usual , the detector can be implemented this way , inheriting the MelusineTransformerDetector detector base class +### How to leverage these models within the Melusine framework to build: -```python -class DissatisfactionDetector(MelusineTransformerDetector): - """ - Class to detect emails containing only dissatisfaction text. 
+ - Ex: - Merci à vous, - Cordialement - """ +1. **Custom Email Classifiers**: - # Class constants - BODY_PART: str = "BODY" - DISSATISFACTION_PART: str = "DISSATISFACTION" - - # Intermediate columns - THANKS_TEXT_COL: str = "thanks_text" - THANKS_PARTS_COL: str = "thanks_parts" - HAS_BODY: str = "has_body" - THANKS_MATCH_COL: str = "thanks_match" - - def __init__( - self, - messages_column: str = "messages", - name: str = "dissatisfaction", - ) -> None: - """ - Attributes initialization. +Use pre-trained models from Hugging Face, such as BERT or DistilBERT, to classify emails into custom categories. Integrate these models into Melusine's workflow to improve sorting, spam detection, or customer inquiry prioritization. - Parameters - ---------- - messages_column: str - Name of the column containing the messages. +2. **Named Entity Recognition (NER) for Emails**: - name: str - Name of the detector. - """ +Incorporate Hugging Face's NER models to extract key information such as names, dates, or invoice numbers from email bodies. This integration can automate data extraction, reducing manual effort and errors. - # Input columns - self.messages_column = messages_column - input_columns: List[str] = [self.messages_column] +3. **Sentiment Analysis for Customer Feedback**: - # Output columns - self.result_column = f"{name}_result" - output_columns: List[str] = [self.result_column] +Implement sentiment analysis models to assess the tone of customer emails. - # Detection regex - self.thanks_regex: MelusineRegex = ThanksRegex() +* Classifications such as dissatisfaction or happiness could be assessed and integrated into specialised melusine detectors. - super().__init__( - name=name, - input_columns=input_columns, - output_columns=output_columns, - ) - self.complex_regex_key: str -``` +* Prioritizing urgent issues or to monitor overall customer satisfaction trends. -> The pre_detect method allows to preprocess the data into the type of model inputs . +4. 
**Topic Modeling for Email Segmentation**: -```python -def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: - """ - Extract text to analyse. - - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - - Returns - ------- - row: MelusineItem - Updated row. - """ - # Check if a BODY part is present in the last message - has_body: bool = row[self.messages_column][0].has_tags( - target_tags={self.BODY_PART}, stop_at={self.GREETINGS_PART} - ) - - # Extract the DISSATISFACTION part in the last message - dissatisfaction_parts: List[Tuple[str, str]] = row[self.messages_column][ - 0 - ].extract_parts(target_tags={self.DISSATISFACTION_PART}) - - # Compute DISSATISFACTION text - if not dissatisfaction_parts: - dissatisfaction_text: str = "" - else: - dissatisfaction_text = "\n".join(x[1] for x in dissatisfaction_parts) - - # Save debug data - if debug_mode: - debug_dict = { - self.DISSATISFACTION_PARTS_COL: dissatisfaction_parts, - self.DISSATISFACTION_TEXT_COL: dissatisfaction_text, - self.HAS_BODY: has_body, - } - row[self.debug_dict_col].update(debug_dict) - - # Create new columns - row[self.DISSATISFACTION_TEXT_COL] = dissatisfaction_text - row[self.HAS_BODY] = has_body - - return row -``` -> The detection method can be one of the following three : - * deterministic only : using Melusine_regex : - * Machine learning based only : using HF models - * both are combined to one final output : the detection result +Leverage pre-trained topic modeling transformers to group emails by subject or theme. This enables businesses to analyze email traffic patterns and identify frequently discussed topics. +5. **Automated Email Responses or Automated summaires**: -* A dissatisfaction_regex MUST BE CREATED WITH DIFFERENT REGEX USEFUL TO DETECT DISSATISFACTION +Utilize text generation models like GPT-2 or GPT-3 to draft automated, context-aware email replies. 
Integrate these models into Melusine to improve response times and maintain professional communication. -```python -from typing import Dict, List, Optional, Union -from melusine.base import MelusineRegex +6. **Language Translation for Multilingual Support**: +Enhance Melusine's capabilities by adding Hugging Face's translation models to convert emails into multiple languages. This feature is invaluable for global teams handling diverse customers. -class DissatisfactionRegex(MelusineRegex): - """ - Detect thanks patterns such as "merci". - """ + - @property - def positive(self) -> Union[str, Dict[str, str]]: - """ - Define regex patterns required to activate the MelusineRegex. +By seamlessly integrating these models into the Melusine framework, businesses can unlock advanced email processing capabilities, streamline workflows, and enhance productivity across their operations. Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction results which is the goal of this tutorial : - Returns: - _: Regex pattern or dict of regex patterns. - """ + - return r"\b(j'en ai marre|c'est nul|trop déçu|décevant|inadmissible|insupportable|intolérable|honteux|lamentable|catastrophe)\b" +> model selection - @property - def neutral(self) -> Optional[Union[str, Dict[str, str]]]: - """ - Define regex patterns to be ignored when running detection. + +The selection of a model depends on the specific detection task. For example, **Sentiment detection in French text** +Suitable models include: camembert and distil-camembert. - Returns: - _: Regex pattern or dict of regex patterns. - """ - return None + - @property - def negative(self) -> Optional[Union[str, Dict[str, str]]]: - """ - Define regex patterns prohibited to activate the MelusineRegex. +> Implementing solution : distil-camembert Models - Returns: - _: Regex pattern or dict of regex patterns. 
- """ - return None + + +As usual , the detector can be implemented this way , inheriting from a **MelusineTransformerDetector** base class. + +The detector adheres to the standard structure of a Melusine detector, with the addition of a method enabling machine learning-based detection. + +The MelusineTransformerDetector class has multiple defined methods as demonstrated below + + + +``` python +class MelusineTransformerDetector(BaseMelusineDetector, ABC): + """ + Defines an interface for detectors. + All detectors used in a MelusinePipeline should inherit from the MelusineDetector class and + implement the abstract methods. + This ensures homogeneous coding style throughout the application. + Alternatively, melusine user's can define their own Interface (inheriting from the BaseMelusineDetector) + to suit their needs. + """ @property - def match_list(self) -> List[str]: + def transform_methods(self) -> list[Callable]: """ - List of texts that should activate the MelusineRegex. + Specify the sequence of methods to be called by the transform method. - Returns: - _: List of texts. + Returns + ------- + _: list[Callable] + List of methods to be called by the transform method. """ return [ - "complétement insatisfait de ce que vous faites", + self.pre_detect, + self.by_regex_detect, + self.by_ml_detect, + self.post_detect, ] - @property - def no_match_list(self) -> List[str]: - """ - List of texts that should NOT activate the MelusineRegex. + @abstractmethod + def pre_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """What needs to be done before detection.""" - Returns: - _: List of texts. 
- """ - return [] + @abstractmethod + def by_regex_detect( + self, row: MelusineItem, debug_mode: bool = False + ) -> MelusineItem: + """Run detection.""" + + @abstractmethod + def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """Run detection.""" + + @abstractmethod + def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """What needs to be done after detection (e.g., mapping columns).""" ``` -After constructing the DissatisfactionRegex class , the by_regex_detect method could be defined -```python -def by_regex_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: - """ - Use regex to detect dissatisfaction. - - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - - Returns - ------- - row: MelusineItem - Updated row. - """ - debug_info: Dict[str, Any] = {} - text: str = row[self.DISSATISFACTION_TEXT_COL] +> The detection method can be one of the following three : + + * purely deterministic : using the Melusine_regex fonctionality + * Machine learning-based detection : using Hugging-Face models + * Combining deterministic and machine-learning based methods + + + +```mermaid + + graph LR + + A[PRE-DETECT] -- deterministic --> B(by_regex_detect) - detection_data = self.dissatisfaction_regex(text) - detection_result = detection_data[self.dissatisfaction_regex.MATCH_RESULT] + A -- machine-learning based --> C( by_ml_detect) - # Save debug data - if debug_mode: - debug_info[self.dissatisfaction_regex.regex_name] = detection_data - row[self.debug_dict_col].update(debug_info) + A -- combined methods --> D( by_regex_detect & by_ml_detect) - # Create new columns - row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] = detection_result + B --> E[POST-DETECT] + C --> E + D --> E - return row ``` + + + +* In order to detect dissatisfaction emotions by regex, a DissatisfactionRegex class inheriting from melusineregex is required. 
+ + The implemntation can be found in here ! (melusine/regex/dissatisfaction_regex.py) + + After constructing the DissatisfactionRegex class , the by_regex_detect method could be implemented as demonstrated in the DissatisfactionDetector + + + + ## The Machine Learning Approach to Detect Dissatisfaction: Two Methods + * Using a Pre-trained Model Directly - The distil-camembert-base model can be loaded directly from the Hugging Face platform, along with its tokenizer, for immediate use in detecting dissatisfaction. - * Fine-tuning the Model - A pre-trained model can be fine-tuned using various methods, including: + In rhis case a hf-token is required as menshioned in the model class. - The Hugging Face Trainer API or PyTorch Lightning. - -> Fine-tuning approaches: - 1- Full Fine-tuning: Updates all layers of the model in an autoregressive manner. - 2- LoRA's PEFT (Parameter-Efficient Fine-Tuning): A more efficient and optimized method that reduces computational cost while achieving excellent results. + The model can be loaded directly from the Hugging Face platform, along with its tokenizer, for immediate use in detecting dissatisfaction. -Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. + + * Fine-tuning the Model : A pre-trained model can be fine-tuned using various methods, including: -> Why distil-camembert-base? -Numerous studies and practical implementations have demonstrated that distil-camembert-base is a highly effective model for sentiment analysis and detecting dissatisfaction, particularly in tasks involving French text. 
-The model + * The Hugging Face Trainer API -```python -from transformers import AutoTokenizer, AutoModelForSequenceClassification -from torch.nn.functional import softmax + * PyTorch Lightning (https://lightning.ai/docs/pytorch/stable/) + > Fine-tuning approaches: -def load_hfmodel(self, model_name="distilcamembert-base") -> None: - """ - GET Distil-camembert-base from HF - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - - Returns - ------- - row: MelusineItem - Updated row. - """ + 1- Full Fine-tuning: Updates all layers of the model in an autoregressive manner. + + 2- LoRA's PEFT (Parameter-Efficient Fine-Tuning): A more efficient and optimized method that reduces computational cost while achieving excellent results. + + + + Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. + + A fine-tuned model could be then locally stored and loaded from path. - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSequenceClassification.from_pretrained( - model_name, num_labels=5 - ) # Adjust num_labels for your classification task + + - def predict_fn(self, text) -> List: + ```python + def load_hfmodel(self, model_name="distilcamembert-base") -> None: """ - Apply model and get prediction + GET Distil-camembert-base from HF Parameters + ---------- + + row: MelusineItem Content of an email. + debug_mode: bool Debug mode activation flag. + Returns + + ------- + row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. + Updated row. 
+ + """ + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name, num_labels=2 + ) + + def predict(self, text: str) -> Tuple[List, List]: + """ + Apply model and get prediction + Parameters + ---------- + text: str + Email text Returns ------- row: MelusineItem Updated row. """ - inputs = self.tokenizer( - text, padding=True, truncation=True, return_tensors="pt" - ) + inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") # Forward pass through the model outputs = self.model(**inputs) - + # Extract logits + self.logits = outputs.logits # Convert logits to probabilities using softmax - probs = softmax(logits, dim=1) + probs = torch.nn.functional.softmax(self.logits, dim=-1) + probs = probs.detach().cpu().numpy() + # Convert predictions and scores to lists + predictions = probs.argmax(axis=1).tolist() + scores = probs.max(axis=1).tolist() + return predictions, scores + ``` - # Get predictions (the class index with the highest probability) - predictions = probs.argmax(dim=1).tolist() + + The by_ml_detect function applies the model on a dataset that provides the model tokenized inputs and returns both the predictions outputs and the scores outputs. A certain threshold could be then defined in the detector configuration. The resulting prediction based on the score's validity and its threshold-crossing. - # Get confidence scores for the predicted classes - scores = probs.max(dim=1).values.tolist() - return predictions, scores + + + ```python def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: """ Use machine learning model to detect dissatisfaction. @@ -329,11 +264,14 @@ def load_hfmodel(self, model_name="distilcamembert-base") -> None: row: MelusineItem Updated row. 
""" + + predictions, scores = self.melusine_model.predict(row[self.CONST_TEXT_COL_NAME]) debug_info: Dict[str, Any] = {} - ( - row[self.DISSATISFACTION_ML_MATCH_COL], - row[self.DISSATISFACTION_ML_SCORE_COL], - ) = self.predict_fn(row[self.DISSATISFACTION_TEXT_COL]) + + row[self.DISSATISFACTION_ML_MATCH_COL], row[self.DISSATISFACTION_ML_SCORE_COL] = ( + bool(predictions[0]), + scores[0], + ) # Save debug data if debug_mode: debug_info[self.DISSATISFACTION_ML_MATCH_COL] = row[ @@ -344,37 +282,40 @@ def load_hfmodel(self, model_name="distilcamembert-base") -> None: ] row[self.debug_dict_col].update(debug_info) return row -``` + ``` + + -> The final detection result could be defined in the **post_detect** method using a predefined condition. -> [! Example ] -> condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) - + > The final detection result could be defined in the **post_detect** method using a predefined condition. + > [! Example ] + > condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) -```python -def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: - """ - Apply final eligibility rules. - - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - - Returns - ------- - row: MelusineItem - Updated row. - """ + + + ```python + def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Apply final eligibility rules. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. - # Match on thanks regex & Does not contain a body - row[self.result_column] = ( - row[self.DISSATISFACTION_ML_SCORE_COL] > 0.9 - and row[self.DISSATISFACTION_ML_MATCH_COL] - ) or row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] + Returns + ------- + row: MelusineItem + Updated row. 
+ """ - return row -``` \ No newline at end of file + # Match on thanks regex & Does not contain a body + ml_result = (row[self.DISSATISFACTION_ML_SCORE_COL] > 0.9) and row[ + self.DISSATISFACTION_ML_MATCH_COL + ] + deterministic_result = row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] + row[self.result_column] = deterministic_result or ml_result + return row + ``` \ No newline at end of file diff --git a/hugging_face/detectors.py b/hugging_face/detectors.py index 41303df..66d4104 100644 --- a/hugging_face/detectors.py +++ b/hugging_face/detectors.py @@ -15,10 +15,10 @@ class DissatisfactionDetector(MelusineTransformerDetector): """ - Class to detect emails containing only dissatisfaction text. + Class to detect emails containing dissatisfaction emotion. Ex: - Merci à vous, + je vous deteste, Cordialement """ @@ -46,11 +46,18 @@ def __init__( Parameters ---------- - messages_column: str - Name of the column containing the messages. - + text_column: str + Name of the column containing the email text. name: str Name of the detector. + tokenizer_name_or_path: str + Name of model or path of the tokenizer. + model_name_or_path: str + Name of path of the model. + text_column: str + Name of the column containing the email text. + token: Optional[str] + hugging-face token . """ # Input columns diff --git a/hugging_face/models/model.py b/hugging_face/models/model.py index f096daf..87b529b 100644 --- a/hugging_face/models/model.py +++ b/hugging_face/models/model.py @@ -12,11 +12,12 @@ def __init__(self, tokenizer_name_or_path: str, model_name_or_path: str, token: Apply model and get prediction Parameters ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - + tokenizer_name_or_path: str + tokenizer name or path . + model_name_or_path: str + model name or path. 
+ token: Optional[str] + hugging-face pass Returns ------- row: MelusineItem @@ -27,20 +28,15 @@ def __init__(self, tokenizer_name_or_path: str, model_name_or_path: str, token: self.hf_token = token self.load_model() - def load_model(self): + def load_model(self) -> None: """ Apply model and get prediction Parameters ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. Returns ------- - row: MelusineItem - Updated row. + None """ if self.hf_token: self.tokenizer = AutoTokenizer.from_pretrained( @@ -58,17 +54,14 @@ def load_model(self): def predict(self, text) -> Tuple[List, List]: """ Apply model and get prediction - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - - Returns - ------- - row: MelusineItem - Updated row. + Parameters + ---------- + text: str + Email text + Returns + ------- + predictions, scores: Tuple[List, List] + Model output post softmax appliance """ inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") From 0e9b484bedff47ef3d0bf89e5f8e6a264aa13dd1 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 17:47:24 +0100 Subject: [PATCH 07/17] :memo: --- docs/advanced/PreTrainedModelsHF.md | 261 +++++++++++++--------------- 1 file changed, 125 insertions(+), 136 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index d1b43cc..b0989bd 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -25,31 +25,31 @@ Renowned for its user-friendly interface and extensive collection of pre-trained 1. **Custom Email Classifiers**: -Use pre-trained models from Hugging Face, such as BERT or DistilBERT, to classify emails into custom categories. Integrate these models into Melusine's workflow to improve sorting, spam detection, or customer inquiry prioritization. 
+ Use pre-trained models from Hugging Face, such as BERT or DistilBERT, to classify emails into custom categories. Integrate these models into Melusine's workflow to improve sorting, spam detection, or customer inquiry prioritization. 2. **Named Entity Recognition (NER) for Emails**: -Incorporate Hugging Face's NER models to extract key information such as names, dates, or invoice numbers from email bodies. This integration can automate data extraction, reducing manual effort and errors. + Incorporate Hugging Face's NER models to extract key information such as names, dates, or invoice numbers from email bodies. This integration can automate data extraction, reducing manual effort and errors. 3. **Sentiment Analysis for Customer Feedback**: -Implement sentiment analysis models to assess the tone of customer emails. + Implement sentiment analysis models to assess the tone of customer emails. -* Classifications such as dissatisfaction or happiness could be assessed and integrated into specialised melusine detectors. + * Classifications such as dissatisfaction or happiness could be assessed and integrated into specialised melusine detectors. -* Prioritizing urgent issues or to monitor overall customer satisfaction trends. + * Prioritizing urgent issues or to monitor overall customer satisfaction trends. 4. **Topic Modeling for Email Segmentation**: -Leverage pre-trained topic modeling transformers to group emails by subject or theme. This enables businesses to analyze email traffic patterns and identify frequently discussed topics. + Leverage pre-trained topic modeling transformers to group emails by subject or theme. This enables businesses to analyze email traffic patterns and identify frequently discussed topics. 5. **Automated Email Responses or Automated summaires**: -Utilize text generation models like GPT-2 or GPT-3 to draft automated, context-aware email replies. 
Integrate these models into Melusine to improve response times and maintain professional communication. + Utilize text generation models like GPT-2 or GPT-3 to draft automated, context-aware email replies. Integrate these models into Melusine to improve response times and maintain professional communication. 6. **Language Translation for Multilingual Support**: -Enhance Melusine's capabilities by adding Hugging Face's translation models to convert emails into multiple languages. This feature is invaluable for global teams handling diverse customers. + Enhance Melusine's capabilities by adding Hugging Face's translation models to convert emails into multiple languages. This feature is invaluable for global teams handling diverse customers. @@ -59,7 +59,6 @@ By seamlessly integrating these models into the Melusine framework, businesses c > model selection - The selection of a model depends on the specific detection task. For example, **Sentiment detection in French text** Suitable models include: camembert and distil-camembert. @@ -67,12 +66,8 @@ Suitable models include: camembert and distil-camembert. > Implementing solution : distil-camembert Models - - As usual , the detector can be implemented this way , inheriting from a **MelusineTransformerDetector** base class. - The detector adheres to the standard structure of a Melusine detector, with the addition of a method enabling machine learning-based detection. 
- The MelusineTransformerDetector class has multiple defined methods as demonstrated below @@ -124,9 +119,10 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): """What needs to be done after detection (e.g., mapping columns).""" ``` + > The detection method can be one of the following three : - * purely deterministic : using the Melusine_regex fonctionality + * Purely deterministic : using the Melusine_regex fonctionality * Machine learning-based detection : using Hugging-Face models * Combining deterministic and machine-learning based methods @@ -149,12 +145,9 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): ``` - - * In order to detect dissatisfaction emotions by regex, a DissatisfactionRegex class inheriting from melusineregex is required. The implemntation can be found in here ! (melusine/regex/dissatisfaction_regex.py) - After constructing the DissatisfactionRegex class , the by_regex_detect method could be implemented as demonstrated in the DissatisfactionDetector @@ -162,160 +155,156 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): ## The Machine Learning Approach to Detect Dissatisfaction: Two Methods - * Using a Pre-trained Model Directly - - In rhis case a hf-token is required as menshioned in the model class. +* Using a Pre-trained Model Directly + In this case a hf-token is required as menshioned in the model class. The model can be loaded directly from the Hugging Face platform, along with its tokenizer, for immediate use in detecting dissatisfaction. 
- - * Fine-tuning the Model : A pre-trained model can be fine-tuned using various methods, including: +* Fine-tuning the Model : A pre-trained model can be fine-tuned using various methods, including: - * The Hugging Face Trainer API + * The Hugging Face Trainer API - * PyTorch Lightning (https://lightning.ai/docs/pytorch/stable/) + * PyTorch Lightning (https://lightning.ai/docs/pytorch/stable/) - > Fine-tuning approaches: + > Fine-tuning approaches: - 1- Full Fine-tuning: Updates all layers of the model in an autoregressive manner. + 1- Full Fine-tuning: Updates all layers of the model in an autoregressive manner. - 2- LoRA's PEFT (Parameter-Efficient Fine-Tuning): A more efficient and optimized method that reduces computational cost while achieving excellent results. + 2- LoRA's PEFT (Parameter-Efficient Fine-Tuning): A more efficient and optimized method that reduces computational cost while achieving excellent results. - - Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. - - A fine-tuned model could be then locally stored and loaded from path. +Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. +A fine-tuned model could be then locally stored and loaded from path. - ```python - def load_hfmodel(self, model_name="distilcamembert-base") -> None: - """ - GET Distil-camembert-base from HF - Parameters +```python +def load_hfmodel(self, model_name="distilcamembert-base") -> None: + """ + GET Distil-camembert-base from HF + Parameters - ---------- + ---------- - row: MelusineItem Content of an email. - debug_mode: bool Debug mode activation flag. - Returns + row: MelusineItem Content of an email. + debug_mode: bool Debug mode activation flag. + Returns - ------- + ------- - row: MelusineItem - Updated row. + row: MelusineItem + Updated row. 
- """ - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSequenceClassification.from_pretrained( - model_name, num_labels=2 - ) + """ + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name, num_labels=2 + ) - def predict(self, text: str) -> Tuple[List, List]: - """ - Apply model and get prediction - Parameters - ---------- - text: str - Email text - Returns - ------- - row: MelusineItem - Updated row. - """ +def predict(self, text: str) -> Tuple[List, List]: + """ + Apply model and get prediction + Parameters + ---------- + text: str + Email text + Returns + ------- + row: MelusineItem + Updated row. + """ - inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") - # Forward pass through the model - outputs = self.model(**inputs) - # Extract logits - self.logits = outputs.logits - # Convert logits to probabilities using softmax - probs = torch.nn.functional.softmax(self.logits, dim=-1) - probs = probs.detach().cpu().numpy() - # Convert predictions and scores to lists - predictions = probs.argmax(axis=1).tolist() - scores = probs.max(axis=1).tolist() - return predictions, scores - ``` + inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") + # Forward pass through the model + outputs = self.model(**inputs) + # Extract logits + self.logits = outputs.logits + # Convert logits to probabilities using softmax + probs = torch.nn.functional.softmax(self.logits, dim=-1) + probs = probs.detach().cpu().numpy() + # Convert predictions and scores to lists + predictions = probs.argmax(axis=1).tolist() + scores = probs.max(axis=1).tolist() + return predictions, scores +``` - The by_ml_detect function applies the model on a dataset that provides the model tokenized inputs and returns both the predictions outputs and the scores outputs. A certain threshold could be then defined in the detector configuration. 
The resulting prediction based on the score's validity and its threshold-crossing. + +The by_ml_detect function applies the model on a dataset that provides the model tokenized inputs and returns both the predictions outputs and the scores outputs. A certain threshold could be then defined in the detector configuration. The resulting prediction based on the score's validity and its threshold-crossing. - ```python - def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: - """ - Use machine learning model to detect dissatisfaction. - - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. +```python +def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Use machine learning model to detect dissatisfaction. + + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ - Returns - ------- - row: MelusineItem - Updated row. 
- """ + predictions, scores = self.melusine_model.predict(row[self.CONST_TEXT_COL_NAME]) + debug_info: Dict[str, Any] = {} - predictions, scores = self.melusine_model.predict(row[self.CONST_TEXT_COL_NAME]) - debug_info: Dict[str, Any] = {} - - row[self.DISSATISFACTION_ML_MATCH_COL], row[self.DISSATISFACTION_ML_SCORE_COL] = ( - bool(predictions[0]), - scores[0], - ) - # Save debug data - if debug_mode: - debug_info[self.DISSATISFACTION_ML_MATCH_COL] = row[ - self.DISSATISFACTION_ML_MATCH_COL - ] - debug_info[self.DISSATISFACTION_ML_SCORE_COL] = row[ - self.DISSATISFACTION_ML_SCORE_COL - ] - row[self.debug_dict_col].update(debug_info) - return row - ``` + row[self.DISSATISFACTION_ML_MATCH_COL], row[self.DISSATISFACTION_ML_SCORE_COL] = ( + bool(predictions[0]), + scores[0], + ) + # Save debug data + if debug_mode: + debug_info[self.DISSATISFACTION_ML_MATCH_COL] = row[ + self.DISSATISFACTION_ML_MATCH_COL + ] + debug_info[self.DISSATISFACTION_ML_SCORE_COL] = row[ + self.DISSATISFACTION_ML_SCORE_COL + ] + row[self.debug_dict_col].update(debug_info) + return row +``` - > The final detection result could be defined in the **post_detect** method using a predefined condition. - > [! Example ] - > condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) +> The final detection result could be defined in the **post_detect** method using a predefined condition. +> [! Example ] : condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) - ```python - def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: - """ - Apply final eligibility rules. - - Parameters - ---------- - row: MelusineItem - Content of an email. - debug_mode: bool - Debug mode activation flag. - - Returns - ------- - row: MelusineItem - Updated row. - """ +```python +def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: + """ + Apply final eligibility rules. 
+ + Parameters + ---------- + row: MelusineItem + Content of an email. + debug_mode: bool + Debug mode activation flag. + + Returns + ------- + row: MelusineItem + Updated row. + """ - # Match on thanks regex & Does not contain a body - ml_result = (row[self.DISSATISFACTION_ML_SCORE_COL] > 0.9) and row[ - self.DISSATISFACTION_ML_MATCH_COL - ] - deterministic_result = row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] - row[self.result_column] = deterministic_result or ml_result - return row - ``` \ No newline at end of file + # Final rule: regex match OR (ML match with a score above 0.9) + ml_result = (row[self.DISSATISFACTION_ML_SCORE_COL] > 0.9) and row[ + self.DISSATISFACTION_ML_MATCH_COL + ] + deterministic_result = row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] + row[self.result_column] = deterministic_result or ml_result + return row +``` \ No newline at end of file From 9e6f00a3363c9b6a4045570db119345a0bad2246 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 17:49:30 +0100 Subject: [PATCH 08/17] :memo: --- docs/advanced/PreTrainedModelsHF.md | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index b0989bd..66d0ee9 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -7,8 +7,6 @@ The Hugging Face library has revolutionized the landscape of natural language processing (NLP) and beyond, redefining the boundaries of what's possible in NLP and other domains and establishing itself as an indispensable tool for researchers, data scientists, and developers. By bridging the gap between cutting-edge research and practical implementation, Hugging Face not only simplifies the complexities of model deployment but also fosters innovation across industries, enabling applications that were once considered out of reach.
- - Renowned for its user-friendly interface and extensive collection of pre-trained models, Hugging Face empowers users to tackle a diverse range of tasks from text classification and sentiment analysis to machine translation and question answering. The library's versatility and adaptability make it a cornerstone in modern AI development, providing accurate and efficient models. @@ -19,9 +17,9 @@ Renowned for its user-friendly interface and extensive collection of pre-trained ## Tutorial : Dissatisfaction detection using Hugging-face models + ### How to leverage these models within the Melusine framework to build: - 1. **Custom Email Classifiers**: @@ -150,8 +148,6 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): The implemntation can be found in here ! (melusine/regex/dissatisfaction_regex.py) After constructing the DissatisfactionRegex class , the by_regex_detect method could be implemented as demonstrated in the DissatisfactionDetector - - ## The Machine Learning Approach to Detect Dissatisfaction: Two Methods @@ -163,14 +159,12 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): * Fine-tuning the Model : A pre-trained model can be fine-tuned using various methods, including: * The Hugging Face Trainer API - * PyTorch Lightning (https://lightning.ai/docs/pytorch/stable/) > Fine-tuning approaches: - 1- Full Fine-tuning: Updates all layers of the model in an autoregressive manner. - - 2- LoRA's PEFT (Parameter-Efficient Fine-Tuning): A more efficient and optimized method that reduces computational cost while achieving excellent results. + 1- **Full Fine-tuning** : Updates all layers of the model in an autoregressive manner. + 2- **LoRA's PEFT (Parameter-Efficient Fine-Tuning)** : A more efficient and optimized method that reduces computational cost while achieving excellent results. Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. 
@@ -274,14 +268,10 @@ def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineI return row ``` - - - -> The final detection result could be defined in the **post_detect** method using a predefined condition. -> [! Example ] : condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) +The final detection result could be defined in the **post_detect** method using a predefined condition. +[! Example ] : condition : by_regex_detect OR (by_ml_detect and by_ml_detect.score > .9) - ```python def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: """ From 2433a32e59464703a821effdc1bda5ac6fa05222 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 17:50:41 +0100 Subject: [PATCH 09/17] :memo: --- docs/advanced/PreTrainedModelsHF.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index 66d0ee9..149dc4f 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -55,14 +55,14 @@ By seamlessly integrating these models into the Melusine framework, businesses c -> model selection +**model selection** The selection of a model depends on the specific detection task. For example, **Sentiment detection in French text** Suitable models include: camembert and distil-camembert. -> Implementing solution : distil-camembert Models +**Implementing solution : distil-camembert Models** As usual , the detector can be implemented this way , inheriting from a **MelusineTransformerDetector** base class. The detector adheres to the standard structure of a Melusine detector, with the addition of a method enabling machine learning-based detection. 
@@ -117,12 +117,10 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): """What needs to be done after detection (e.g., mapping columns).""" ``` - -> The detection method can be one of the following three : - - * Purely deterministic : using the Melusine_regex fonctionality - * Machine learning-based detection : using Hugging-Face models - * Combining deterministic and machine-learning based methods +**The detection method can be one of the following three** + * Purely deterministic : using the Melusine_regex fonctionality + * Machine learning-based detection : using Hugging-Face models + * Combining deterministic and machine-learning based methods From bdbd5dea33f0e7c28219083cfa9c65b52411438d Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 17:53:47 +0100 Subject: [PATCH 10/17] :memo: --- docs/advanced/PreTrainedModelsHF.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index 149dc4f..dbc8054 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -62,13 +62,11 @@ Suitable models include: camembert and distil-camembert. -**Implementing solution : distil-camembert Models** +**Implementing solution** -As usual , the detector can be implemented this way , inheriting from a **MelusineTransformerDetector** base class. -The detector adheres to the standard structure of a Melusine detector, with the addition of a method enabling machine learning-based detection. -The MelusineTransformerDetector class has multiple defined methods as demonstrated below +As usual , the detector inherites from a **MelusineTransformerDetector** base class, adheres to the standard structure of a Melusine detector, with the addition of a method enabling machine learning-based detection. 
+The MelusineTransformerDetector class has one additional defined method **by_ml_detect** as demonstrated below - ``` python class MelusineTransformerDetector(BaseMelusineDetector, ABC): @@ -159,7 +157,7 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): * The Hugging Face Trainer API * PyTorch Lightning (https://lightning.ai/docs/pytorch/stable/) - > Fine-tuning approaches: + Fine-tuning approaches: 1- **Full Fine-tuning** : Updates all layers of the model in an autoregressive manner. 2- **LoRA's PEFT (Parameter-Efficient Fine-Tuning)** : A more efficient and optimized method that reduces computational cost while achieving excellent results. @@ -225,9 +223,7 @@ def predict(self, text: str) -> Tuple[List, List]: The by_ml_detect function applies the model on a dataset that provides the model tokenized inputs and returns both the predictions outputs and the scores outputs. A certain threshold could be then defined in the detector configuration. The resulting prediction based on the score's validity and its threshold-crossing. - - - + ```python def by_ml_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineItem: From 740bd410274b1b09fbb35a87dab4589b4f95cd92 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 17:55:28 +0100 Subject: [PATCH 11/17] :memo: --- docs/advanced/PreTrainedModelsHF.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index dbc8054..82edafb 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -141,16 +141,16 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): * In order to detect dissatisfaction emotions by regex, a DissatisfactionRegex class inheriting from melusineregex is required. - The implemntation can be found in here ! 
(melusine/regex/dissatisfaction_regex.py) - After constructing the DissatisfactionRegex class , the by_regex_detect method could be implemented as demonstrated in the DissatisfactionDetector +The implemntation can be found in here ! (melusine/regex/dissatisfaction_regex.py) +After constructing the DissatisfactionRegex class , the by_regex_detect method could be implemented as demonstrated in the DissatisfactionDetector ## The Machine Learning Approach to Detect Dissatisfaction: Two Methods * Using a Pre-trained Model Directly - In this case a hf-token is required as menshioned in the model class. - The model can be loaded directly from the Hugging Face platform, along with its tokenizer, for immediate use in detecting dissatisfaction. +In this case a hf-token is required as mentioned in the model class. +The model can be loaded directly from the Hugging Face platform, along with its tokenizer, for immediate use in detecting dissatisfaction. * Fine-tuning the Model : A pre-trained model can be fine-tuned using various methods, including: From a23d09b4544863381082e322ecdf00b3cb0d8906 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 18:09:30 +0100 Subject: [PATCH 12/17] test python 3.13 --- docs/advanced/PreTrainedModelsHF.md | 49 ++++------------------------- pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 44 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index 82edafb..1be7626 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -13,55 +13,18 @@ Renowned for its user-friendly interface and extensive collection of pre-trained **Melusine** provides an exceptional framework for streamlining and optimizing email workflows with remarkable efficiency.
Its flexible architecture allows seamless integration of machine learning models into its detectors, as demonstrated in the Hugging Face folder, enabling users to harness advanced AI capabilities for enhanced performance. - - -## Tutorial : Dissatisfaction detection using Hugging-face models - - -### How to leverage these models within the Melusine framework to build: - - -1. **Custom Email Classifiers**: - - Use pre-trained models from Hugging Face, such as BERT or DistilBERT, to classify emails into custom categories. Integrate these models into Melusine's workflow to improve sorting, spam detection, or customer inquiry prioritization. - -2. **Named Entity Recognition (NER) for Emails**: - - Incorporate Hugging Face's NER models to extract key information such as names, dates, or invoice numbers from email bodies. This integration can automate data extraction, reducing manual effort and errors. - -3. **Sentiment Analysis for Customer Feedback**: - - Implement sentiment analysis models to assess the tone of customer emails. - - * Classifications such as dissatisfaction or happiness could be assessed and integrated into specialised melusine detectors. +### Tutorial : Dissatisfaction detection using Hugging-face models - * Prioritizing urgent issues or to monitor overall customer satisfaction trends. - -4. **Topic Modeling for Email Segmentation**: - - Leverage pre-trained topic modeling transformers to group emails by subject or theme. This enables businesses to analyze email traffic patterns and identify frequently discussed topics. - -5. **Automated Email Responses or Automated summaires**: - - Utilize text generation models like GPT-2 or GPT-3 to draft automated, context-aware email replies. Integrate these models into Melusine to improve response times and maintain professional communication. - -6. 
**Language Translation for Multilingual Support**: - - Enhance Melusine's capabilities by adding Hugging Face's translation models to convert emails into multiple languages. This feature is invaluable for global teams handling diverse customers. - - - -By seamlessly integrating these models into the Melusine framework, businesses can unlock advanced email processing capabilities, streamline workflows, and enhance productivity across their operations. Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction results which is the goal of this tutorial : - - +Whether it's utilizing pre-trained models from Hugging Face, such as BERT or DistilBERT, for email classification, integrating Named Entity Recognition (NER) models to extract key information, leveraging topic modeling transformers to organize emails by themes, or using language translation models to convert emails into multiple languages, all of these capabilities are seamlessly achievable through the Melusine framework. + +By seamlessly integrating these models into the Melusine framework, businesses can unlock advanced email processing capabilities, streamline workflows, and enhance productivity across their operations. Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction. +The integration of these advanced transformations is primarily facilitated through **Melusine detectors**. **model selection** -The selection of a model depends on the specific detection task. For example, **Sentiment detection in French text** -Suitable models include: camembert and distil-camembert. +The selection of a model depends on the specific detection task. For example, for **sentiment detection in French text**, suitable models include camembert and distil-camembert.
- **Implementing solution** As usual , the detector inherites from a **MelusineTransformerDetector** base class, adheres to the standard structure of a Melusine detector, with the addition of a method enabling machine learning-based detection. diff --git a/pyproject.toml b/pyproject.toml index a499caf..bf18bc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] description = "Melusine is a high-level library for emails processing" readme = "README.md" -requires-python = ">=3.8,<3.13" +requires-python = ">=3.8" keywords = ["nlp", "email", "courriel", "text", "data-science", "machine-learning", "natural-language-processing"] license = {text = "Apache Software License 2.0"} classifiers = [ From a33f9caf3b7d7405d5fce0393adfb1b8631672ed Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Sun, 8 Dec 2024 18:14:20 +0100 Subject: [PATCH 13/17] test python 3.13 --- docs/advanced/PreTrainedModelsHF.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index 1be7626..ad66a81 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -79,9 +79,10 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): ``` **The detection method can be one of the following three** - * Purely deterministic : using the Melusine_regex fonctionality - * Machine learning-based detection : using Hugging-Face models - * Combining deterministic and machine-learning based methods + +* Purely deterministic : using the Melusine_regex functionality +* Machine learning-based detection : using Hugging-Face models +* Combining deterministic and machine-learning based methods @@ -105,7 +106,7 @@ class MelusineTransformerDetector(BaseMelusineDetector, ABC): * In order to detect dissatisfaction emotions by regex, a DissatisfactionRegex class inheriting from melusineregex is required. The implemntation can be found in here !
(melusine/regex/dissatisfaction_regex.py) -After constructing the DissatisfactionRegex class , the by_regex_detect method could be implemented as demonstrated in the DissatisfactionDetector +After constructing the DissatisfactionRegex class, the by_regex_detect method can be implemented as demonstrated in the DissatisfactionDetector (hugging_face/detectors.py) ## The Machine Learning Approach to Detect Dissatisfaction: Two Methods @@ -122,8 +123,8 @@ The model can be loaded directly from the Hugging Face platform, along with its Fine-tuning approaches: - 1- **Full Fine-tuning** : Updates all layers of the model in an autoregressive manner. - 2- **LoRA's PEFT (Parameter-Efficient Fine-Tuning)** : A more efficient and optimized method that reduces computational cost while achieving excellent results. + 1- **Full Fine-tuning** : Updates all layers of the model in an autoregressive manner. + 2- **LoRA's PEFT (Parameter-Efficient Fine-Tuning)** : A more efficient and optimized method that reduces computational cost while achieving excellent results. Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. @@ -185,7 +186,7 @@ def predict(self, text: str) -> Tuple[List, List]: -The by_ml_detect function applies the model on a dataset that provides the model tokenized inputs and returns both the predictions outputs and the scores outputs. A certain threshold could be then defined in the detector configuration. The resulting prediction based on the score's validity and its threshold-crossing. +The by_ml_detect function applies the model to the input text. It returns both the predictions and the scores. A threshold can then be defined in the detector configuration so that the resulting prediction is based on the score's validity and on whether it crosses that threshold.
```python @@ -254,4 +255,6 @@ def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIt deterministic_result = row[self.DISSATISFACTION_BY_REGEX_MATCH_COL] row[self.result_column] = deterministic_result or ml_result return row -``` \ No newline at end of file +``` + + From 0aa0495f403d769ea9f3e4c90899c71ae7ee50e7 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Wed, 11 Dec 2024 15:44:00 +0100 Subject: [PATCH 14/17] enhancing docs --- docs/advanced/PreTrainedModelsHF.md | 14 +++++++------- hugging_face/models/model.py | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/docs/advanced/PreTrainedModelsHF.md b/docs/advanced/PreTrainedModelsHF.md index ad66a81..0b4d099 100644 --- a/docs/advanced/PreTrainedModelsHF.md +++ b/docs/advanced/PreTrainedModelsHF.md @@ -17,7 +17,7 @@ Renowned for its user-friendly interface and extensive collection of pre-trained Whether it's utilizing pre-trained models from Hugging Face, such as BERT or DistilBERT, for email classification, integrating Named Entity Recognition (NER) models to extract key information, leveraging topic modeling transformers to organize emails by themes, or using language translation models to convert emails into multiple languages, all of these capabilities are seamlessly achievable through the Melusine framework. -By seamlessly integrating these models into the Melusine framework, businesses can unlock advanced email processing capabilities, streamline workflows, and enhance productivity across their operations. Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction. +By integrating these models into the Melusine framework, businesses can unlock advanced email processing capabilities, streamline workflows, and enhance productivity across their operations. 
Transformers-based models from Hugging Face can significantly enhance detection capabilities and act as a complementary approach to strengthen prediction. The integration of these advanced transformations is primarily facilitated through **Melusine detectors**. **model selection** @@ -123,15 +123,12 @@ The model can be loaded directly from the Hugging Face platform, along with its Fine-tuning approaches: - 1- **Full Fine-tuning** : Updates all layers of the model in an autoregressive manner. - 2- **LoRA's PEFT (Parameter-Efficient Fine-Tuning)** : A more efficient and optimized method that reduces computational cost while achieving excellent results. - - + 1- **Full Fine-Tuning**: Involves updating all layers of the model during training, typically used for adapting the model to a specific task. + 2- **LoRA (Low-Rank Adaptation)** in Parameter-Efficient Fine-Tuning (PEFT): A method designed to reduce computational and memory costs by only fine-tuning a small subset of parameters, while maintaining high performance. + Fine-tuning allows customization of the model for specific tasks, improving its performance on datasets relevant to dissatisfaction detection. A fine-tuned model could be then locally stored and loaded from path. - - ```python def load_hfmodel(self, model_name="distilcamembert-base") -> None: @@ -258,3 +255,6 @@ def post_detect(self, row: MelusineItem, debug_mode: bool = False) -> MelusineIt ``` +**Melusine already automates email workflows using deterministic regex-based methods. However, the rapid growth and evolution of artificial intelligence applications in the NLP landscape remain largely untapped. This tutorial offers a glimpse into integrating state-of-the-art models into your workflows. Feel free to experiment with different model types, preprocessing methods, and use cases while maintaining the general structure of the detector. 
The core purpose of Melusine lies in its modularity and versatility, enabling it to handle a wide range of applications and modeling tools effectively.** + + diff --git a/hugging_face/models/model.py b/hugging_face/models/model.py index 87b529b..598e177 100644 --- a/hugging_face/models/model.py +++ b/hugging_face/models/model.py @@ -5,7 +5,10 @@ class TextClassifier: - """ """ + """ + The modeling class + + """ def __init__(self, tokenizer_name_or_path: str, model_name_or_path: str, token: Optional[str]): """ @@ -54,14 +57,14 @@ def load_model(self) -> None: def predict(self, text) -> Tuple[List, List]: """ Apply model and get prediction - Parameters - ---------- - text: str - Email text - Returns - ------- - predictions, scores: Tuple[List, List] - Model output post softmax appliance + Parameters + ---------- + text: str + Email text + Returns + ------- + predictions, scores: Tuple[List, List] + Model output after softmax application """ inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt") From 75b41c00793ab543b4cde334208af6a04e2b4fce Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Wed, 11 Dec 2024 16:31:36 +0100 Subject: [PATCH 15/17] change scikit-learn version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bf18bc9..cc8ef01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ classifiers = [ dependencies = [ "arrow", "pandas>2", - "scikit-learn>=1", + "scikit-learn<1.6", "tqdm>=4.34", "omegaconf>=2.0", ] From 349158081f5b2837a67012161c2ca8b7c85251f9 Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Wed, 11 Dec 2024 17:21:17 +0100 Subject: [PATCH 16/17] reorganize tox --- tox.ini | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index 1dc21c9..beb5612 100644 --- a/tox.ini +++ b/tox.ini @@ -17,11 +17,13 @@ deps = pytest-cov google-auth-oauthlib google-api-python-client - torch + torch depends =
{core38,transformers}: clean - report: core38,transformers -extras = transformers + report: core38 +extras = transformers + + [testenv:core38] deps={[testenv]deps} @@ -31,6 +33,10 @@ commands={[testenv]commands} deps={[testenv]deps} commands=pytest tests +[testenv:core311] +deps={[testenv]deps} +commands=pytest tests + [testenv:clean] deps = coverage[toml] skip_install = true @@ -39,7 +45,7 @@ commands = coverage erase [testenv:transformers] description = run unit tests with the transformers dependency deps={[testenv]deps} -commands = pytest tests/huggingface --cov --cov-append --cov-report xml +commands = pytest tests/huggingface extras = transformers [testenv:report] From 5e637a41406c3e384e600577989942c0f6ec70bf Mon Sep 17 00:00:00 2001 From: Meriem JEBALI Date: Wed, 11 Dec 2024 17:39:05 +0100 Subject: [PATCH 17/17] pre commit apply --- tox.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index beb5612..8d43513 100644 --- a/tox.ini +++ b/tox.ini @@ -17,11 +17,11 @@ deps = pytest-cov google-auth-oauthlib google-api-python-client - torch + torch depends = {core38,transformers}: clean - report: core38 -extras = transformers + report: core38 +extras = transformers @@ -45,7 +45,7 @@ commands = coverage erase [testenv:transformers] description = run unit tests with the transformers dependency deps={[testenv]deps} -commands = pytest tests/huggingface +commands = pytest tests/huggingface extras = transformers [testenv:report]