From de071a7ed1fa6c1a064ff5d5f536e69bab885884 Mon Sep 17 00:00:00 2001
From: Oscar Barrios
Date: Tue, 26 Nov 2024 11:35:09 +0100
Subject: [PATCH] Evaluate test report

---
 testsuite/Rakefile                             |   4 +-
 .../cucumber_report_history.rb                 |  21 +-
 .../cucumber_report_parser.rb                  |  15 +-
 .../cucumber_report_review.py                  |   0
 .../evaluate_cucumber_report.py                |  34 ++++
 .../machine_learning/gh_issues_parser.rb       |  45 +++---
 .../machine_learning/gh_issues_train_model.py  | 152 ------------------
 .../machine_learning/preprocess_datasets.py    | 110 +++++++++++++
 .../ext-tools/machine_learning/train_model.py  | 103 ++++++++++++
 9 files changed, 284 insertions(+), 200 deletions(-)
 delete mode 100644 testsuite/ext-tools/machine_learning/cucumber_report_review.py
 create mode 100644 testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py
 delete mode 100644 testsuite/ext-tools/machine_learning/gh_issues_train_model.py
 create mode 100644 testsuite/ext-tools/machine_learning/preprocess_datasets.py
 create mode 100644 testsuite/ext-tools/machine_learning/train_model.py

diff --git a/testsuite/Rakefile b/testsuite/Rakefile
index 81545bed06a8..534bf6a586d2 100644
--- a/testsuite/Rakefile
+++ b/testsuite/Rakefile
@@ -187,12 +187,12 @@ namespace :utils do
 
   desc 'Collect and tag flaky tests'
   task :collect_and_tag_flaky_tests do
-    `ruby ext-tools/machine_learning/gh_issues_parser.rb --collect-and-tag --directory-path features`
+    `ruby ext-tools/machine_learning/gh_issues_parser.rb --collect_and_tag --directory_path features`
   end
 
   desc 'Generate dataset from GH issues'
   task :generate_dataset_gh_issues do
-    `ruby ext-tools/machine_learning/gh_issues_parser.rb --generate-dataset --file-path gh_issues_dataset.json`
+    `ruby ext-tools/machine_learning/gh_issues_parser.rb --generate_dataset --output_path gh_issues_dataset.json`
   end
 
   desc 'Generate dataset from JSON Cucumber Test Report'
diff --git a/testsuite/ext-tools/machine_learning/cucumber_report_history.rb b/testsuite/ext-tools/machine_learning/cucumber_report_history.rb
index ccb05ee309c8..fd2b39593511 100755
--- a/testsuite/ext-tools/machine_learning/cucumber_report_history.rb
+++ b/testsuite/ext-tools/machine_learning/cucumber_report_history.rb
@@ -2,7 +2,6 @@
 # Copyright (c) 2024 SUSE LLC.
 # Licensed under the terms of the MIT license.
-require 'csv' require 'json' require 'net/http' require 'optparse' @@ -17,7 +16,7 @@ options[:server] = server end - opts.on('-o', '--output_path FILEPATH', 'Output file path (CSV format)') do |filepath| + opts.on('-o', '--output_path FILEPATH', 'Output file path (JSON format)') do |filepath| options[:output_path] = filepath end @@ -44,26 +43,28 @@ response = Net::HTTP.get_response(uri) if response.is_a?(Net::HTTPSuccess) data = JSON.parse(response.body) + label_mapping = { + 'PASSED' => 0, + 'SKIPPED' => 1, + 'FIXED' => 2, + 'REGRESSION' => 3, + 'FAILED' => 4 + } dataset = data['data']['result'].map do |result| metric = result['metric'] { - label: metric['status'].downcase, + label: label_mapping[metric['status']], description: { - jobname: metric['jobname'], scenario: metric['case'], feature: metric['suite'], + # jobname: metric['jobname'], failedsince: metric['failedsince'].to_i, age: result['value'][1].to_i } } end - CSV.open(options[:output_path], 'w') do |csv| - csv << dataset.first.keys - dataset.each do |entry| - csv << [entry[:label], entry[:description].to_json] - end - end + File.write(options[:output_path], dataset.to_json) else puts "Failed to fetch data from Prometheus: #{response.code} #{response.message}" end diff --git a/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb b/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb index 2c51039a1c07..00ae452174ca 100644 --- a/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb +++ b/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb @@ -1,7 +1,7 @@ # Copyright (c) 2024 SUSE LLC. # Licensed under the terms of the MIT license. -require 'csv' +require 'base64' require 'json' require 'nokogiri' require 'optparse' @@ -43,9 +43,9 @@ def extract_dataset_from_json(json_report_path) time: (scenario['steps'].sum { |step| step['result']['duration'] || 0 } / 1_000_000_000.0).round } - scenario_data[:error_message] = scenario['steps'].last['result']['error_message'] if scenario['steps'].last['result'].key?('error_message') + scenario_data[:error_message] = Base64.encode64(scenario['steps'].last['result']['error_message']) if scenario['steps'].last['result'].key?('error_message') scenario_data[:tags] = scenario['tags'].map { |tag| tag['name'][1..] } if scenario.key?('tags') - scenario_data[:logs] = logs unless logs.empty? + scenario_data[:logs] = Base64.encode64(logs.to_s) unless logs.empty? scenario_data[:screenshots] = screenshots unless screenshots.empty? 
       if scenario['before'] && scenario['before'].size > 3 && scenario['before'][3].key?('output')
@@ -79,7 +79,7 @@ def extract_dataset_from_json(json_report_path)
     options[:report_path] = f
   end
 
-  opts.on('-o', '--output_path PATH', 'Path to the processed report file (CSV format)') do |f|
+  opts.on('-o', '--output_path PATH', 'Path to the processed report file (JSON format)') do |f|
     options[:output_path] = f
   end
 
@@ -98,9 +98,4 @@ def extract_dataset_from_json(json_report_path)
 end
 
 dataset = extract_dataset_from_json(options[:report_path])
-CSV.open(options[:output_path], 'w') do |csv|
-  csv << dataset.first.keys
-  dataset.each do |entry|
-    csv << [entry[:label], entry[:description].to_json]
-  end
-end
+File.write(options[:output_path], dataset.to_json)
diff --git a/testsuite/ext-tools/machine_learning/cucumber_report_review.py b/testsuite/ext-tools/machine_learning/cucumber_report_review.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py b/testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py
new file mode 100644
index 000000000000..9b50f8225674
--- /dev/null
+++ b/testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py
@@ -0,0 +1,34 @@
+import pandas as pd
+from joblib import load
+import json
+
+def evaluate_current_report(current_report_path, model_path, vectorizer_path, output_path):
+    # Load data
+    with open(current_report_path, 'r') as file:
+        current_report = json.load(file)
+    df = pd.DataFrame(current_report)
+
+    # Load model and vectorizer
+    model = load(model_path)
+    vectorizer = load(vectorizer_path)
+
+    # Preprocess and predict
+    X = vectorizer.transform(df['text'])
+    df['predicted_root_cause'] = model.predict(X)
+
+    # Save predictions
+    df.to_csv(output_path, index=False)
+    print(f"Predictions saved to {output_path}")
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) != 5:
+        print("Usage: python evaluate_cucumber_report.py <current_report_path> <model_path> <vectorizer_path> <output_path>")
+        sys.exit(1)
+
+    current_report_path = sys.argv[1]
+    model_path = sys.argv[2]
+    vectorizer_path = sys.argv[3]
+    output_path = sys.argv[4]
+
+    evaluate_current_report(current_report_path, model_path, vectorizer_path, output_path)
diff --git a/testsuite/ext-tools/machine_learning/gh_issues_parser.rb b/testsuite/ext-tools/machine_learning/gh_issues_parser.rb
index b32e04c2c362..90dd01a073ba 100755
--- a/testsuite/ext-tools/machine_learning/gh_issues_parser.rb
+++ b/testsuite/ext-tools/machine_learning/gh_issues_parser.rb
@@ -4,6 +4,7 @@
 # Collect all the issues from a GitHub project board column
 # and tag the corresponding Cucumber feature files with a given tag
 
+require 'base64'
 require 'csv'
 require 'find'
 require 'json'
@@ -146,18 +147,15 @@ def self.generate_dataset(organization, project_number, headers)
       # end
       if status_field && status_field['item'] && status_field['item']['content']
         title = clean_text(status_field['item']['content']['title'])
-        description = clean_text(status_field['item']['content']['bodyText'])
-        comments = status_field['item']['content']['comments']['nodes'].map { |node| clean_text(node['body']) }
+        description = Base64.encode64(clean_text(status_field['item']['content']['bodyText']))
+        comments = Base64.encode64(status_field['item']['content']['comments']['nodes'].map { |node| clean_text(node['body']) }.to_s)
         matches = title.match(/Feature:(.*)\s*\|\s*Scenario:(.*)/)
-        gh_issue_content = {}
-        if matches.nil?
- gh_issue_content[:title] = title - else - gh_issue_content[:feature] = matches[1].strip - gh_issue_content[:scenario] = matches[2].strip - end - gh_issue_content[:description] = description - gh_issue_content[:comments] = comments + gh_issue_content = { + feature: matches.nil? ? title : matches[1].strip, + scenario: matches.nil? ? title : matches[2].strip, + description: description, + comments: comments + } dataset.push({ label: label_mapping[label], description: gh_issue_content }) puts "\e[36mCard found\e[0m => #{title}" else @@ -227,20 +225,20 @@ def main OptionParser.new do |opts| opts.banner = 'Usage: ruby gh_issues_parser.rb [options]' - opts.on('-g', '--generate-dataset', 'Generate a dataset from GitHub project board issues') do + opts.on('-g', '--generate_dataset', 'Generate a dataset from GitHub project board issues') do options[:generate_dataset] = true end - opts.on('-c', '--collect-and-tag', 'Collect flaky tests and tag Cucumber features') do + opts.on('-c', '--collect_and_tag', 'Collect flaky tests and tag Cucumber features') do options[:collect_and_tag] = true end - opts.on('-d', '--directory-path PATH', 'Directory path to search for Cucumber feature files') do |path| + opts.on('-d', '--directory_path PATH', 'Directory path to search for Cucumber feature files') do |path| options[:directory_path] = path end - opts.on('-f', '--file-path PATH', 'File path to store the dataset (CSV format)') do |path| - options[:file_path] = path + opts.on('-o', '--output_path PATH', 'File path to store the dataset (JSON format)') do |path| + options[:output_path] = path end opts.on('-h', '--help', 'Show this help message') do @@ -252,17 +250,17 @@ def main parser.parse! unless options[:generate_dataset] || options[:collect_and_tag] - puts 'Please specify either --generate-dataset or --collect-and-tag' + puts 'Please specify either --generate_dataset or --collect_and_tag' exit 1 end - if options[:generate_dataset] && !options[:file_path] - puts 'Please specify the file path using --file-path' + if options[:generate_dataset] && !options[:output_path] + puts 'Please specify the file path using --output_path' exit 1 end if options[:collect_and_tag] && !options[:directory_path] - puts 'Please specify the file path using --directory-path' + puts 'Please specify the file path using --directory_path' exit 1 end @@ -287,12 +285,7 @@ def main if options[:generate_dataset] dataset = GithubProjectBoard.generate_dataset(organization, project_number, headers) - CSV.open(options[:file_path], 'w') do |csv| - csv << dataset.first.keys - dataset.each do |entry| - csv << [entry[:label], entry[:description].to_json] - end - end + File.write(options[:output_path], dataset.to_json) elsif options[:collect_and_tag] columns = { 'New' => 'new_issue', diff --git a/testsuite/ext-tools/machine_learning/gh_issues_train_model.py b/testsuite/ext-tools/machine_learning/gh_issues_train_model.py deleted file mode 100644 index 6963fd745876..000000000000 --- a/testsuite/ext-tools/machine_learning/gh_issues_train_model.py +++ /dev/null @@ -1,152 +0,0 @@ -import sys -import json -import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import classification_report -from sklearn.preprocessing import LabelEncoder -from joblib import dump -from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments -from datasets import Dataset -import torch - 
-def load_dataset(filepath): - """Load JSON dataset from a file.""" - with open(filepath, 'r') as file: - data = json.load(file) - return data - -def preprocess_data(data): - """Combine and clean title, description, and comments.""" - texts, labels = [], [] - for entry in data: - combined_text = " ".join([ - entry.get('title', ''), - entry.get('description', ''), - " ".join(entry.get('comments', [])) - ]) - texts.append(combined_text) - labels.append(entry['label']) - return texts, labels - -def train_random_forest(texts, labels): - # Encode labels - label_encoder = LabelEncoder() - encoded_labels = label_encoder.fit_transform(labels) - - # Split the dataset - X_train, X_test, y_train, y_test = train_test_split( - texts, encoded_labels, test_size=0.2, random_state=42 - ) - - # Convert text data to TF-IDF features - vectorizer = TfidfVectorizer(max_features=5000) - X_train_vec = vectorizer.fit_transform(X_train) - X_test_vec = vectorizer.transform(X_test) - - # Train a Random Forest classifier - model = RandomForestClassifier(n_estimators=100, random_state=42) - model.fit(X_train_vec, y_train) - - # Evaluate the model - y_pred = model.predict(X_test_vec) - unique_labels = np.unique(y_test) - target_names = [str(label) for label in label_encoder.inverse_transform(unique_labels)] - - print("Classification Report (Random Forest):") - print(classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names, zero_division=0)) - - # Save the model and vectorizer - dump(model, 'random_forest_model.joblib') - dump(vectorizer, 'random_forest_vectorizer.joblib') - dump(label_encoder, 'random_forest_label_encoder.joblib') - print("Random Forest model, vectorizer, and label encoder saved successfully!") - -def train_bert(texts, labels): - # Split into train and test sets - train_texts, test_texts, train_labels, test_labels = train_test_split( - texts, labels, test_size=0.2, random_state=42 - ) - - # Convert to Hugging Face Dataset - train_data = Dataset.from_dict({"text": train_texts, "label": train_labels}) - test_data = Dataset.from_dict({"text": test_texts, "label": test_labels}) - - # Tokenize the data - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - def tokenize_function(example): - return tokenizer(example["text"], padding="max_length", truncation=True) - - train_data = train_data.map(tokenize_function, batched=True) - test_data = test_data.map(tokenize_function, batched=True) - - train_data = train_data.remove_columns(["text"]) - test_data = test_data.remove_columns(["text"]) - train_data.set_format("torch") - test_data.set_format("torch") - - # Load BERT model - num_labels = len(set(labels)) - model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels) - - # Define Training Arguments - training_args = TrainingArguments( - output_dir="./bert_results", - eval_strategy="steps", - save_strategy="steps", # Align save strategy with evaluation - save_steps=500, # Frequency of saving (if step-based) - logging_dir="./bert_logs", - num_train_epochs=3, - per_device_train_batch_size=8, - per_device_eval_batch_size=8, - save_total_limit=2, - learning_rate=2e-5, - weight_decay=0.01, - logging_steps=100, - load_best_model_at_end=True - ) - - # Define the Trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_data, - eval_dataset=test_data, - processing_class=tokenizer - ) - - # Train the Model - trainer.train() - - # Evaluate the Model - metrics = trainer.evaluate() - print("BERT Model Evaluation 
Metrics:") - print(metrics) - - # Save the Model - model.save_pretrained("bert_model") - tokenizer.save_pretrained("bert_tokenizer") - print("BERT model and tokenizer saved successfully!") - -def main(model_type, filepath): - # Load and preprocess dataset - data = load_dataset(filepath) - texts, labels = preprocess_data(data) - - if model_type == "random_forest": - train_random_forest(texts, labels) - elif model_type == "bert": - train_bert(texts, labels) - else: - print("Invalid model type. Please choose 'random_forest' or 'bert'.") - sys.exit(1) - -if __name__ == "__main__": - if len(sys.argv) != 3: - print("Usage: python train_model.py ") - sys.exit(1) - - model_type = sys.argv[1].lower() - dataset_path = sys.argv[2].lower() - main(model_type, dataset_path) diff --git a/testsuite/ext-tools/machine_learning/preprocess_datasets.py b/testsuite/ext-tools/machine_learning/preprocess_datasets.py new file mode 100644 index 000000000000..8fd27f2601b7 --- /dev/null +++ b/testsuite/ext-tools/machine_learning/preprocess_datasets.py @@ -0,0 +1,110 @@ +import simplejson as json +import pandas as pd + +def load_json(dataset_path): + """Load JSON dataset from a file using json library.""" + with open(dataset_path, 'r', encoding='utf-8') as file: + data = json.load(file) + return data + + +def preprocess_github_issues(gh_issues_data): + gh_issues_df = pd.DataFrame(gh_issues_data) + gh_issues_df['test_case'] = gh_issues_df['description'].apply( + lambda d: f"{d.get('feature', '')} | {d.get('scenario', '')}" if isinstance(d, dict) else '' + ) + + # Combine 'description', 'logs', and 'comments' fields for log-based matching + gh_issues_df['description'] = ( + 'description:' + gh_issues_df['description'].apply(lambda d: d.get('description', '') if isinstance(d, dict) else '') + + 'logs:' + gh_issues_df['description'].apply(lambda d: ''.join(d.get('logs', [])) if isinstance(d, dict) else '') + + 'comments:' + gh_issues_df['description'].apply(lambda d: ''.join(d.get('comments', [])) if isinstance(d, dict) else '') + ) + + gh_issues_df['label'] = gh_issues_df['label'].astype(int) + return gh_issues_df[['test_case', 'description', 'label']] + + +def preprocess_cucumber_history(cucumber_history_data): + cucumber_history_df = pd.DataFrame(cucumber_history_data) + cucumber_history_df['test_case'] = cucumber_history_df['description'].apply( + lambda d: f"{d.get('feature', '')} | {d.get('scenario', '')}" if isinstance(d, dict) else '' + ) + + # Combine 'age' and 'failedsince' fields for historical tracking + cucumber_history_df['description'] = cucumber_history_df['description'].apply( + lambda d: f"age:{d.get('age', 0)} failed_since:{d.get('failedsince', 0)}" if isinstance(d, dict) else '' + ) + + cucumber_history_df['label'] = cucumber_history_df['label'].astype(int) + return cucumber_history_df[['test_case', 'description', 'label']] + + +def preprocess_current_report(current_report_data): + cucumber_report_df = pd.DataFrame(current_report_data) + cucumber_report_df['test_case'] = cucumber_report_df['description'].apply( + lambda d: f"{d.get('feature', '')} | {d.get('scenario', '')}" if isinstance(d, dict) else '' + ) + + # Combine 'description', 'logs', and 'comments' fields for log-based matching + cucumber_report_df['description'] = ( + 'error:' + cucumber_report_df['description'].apply(lambda d: d.get('error_message', '') if isinstance(d, dict) else '') + + 'logs:' + cucumber_report_df['description'].apply(lambda d: ''.join(d.get('logs', [])) if isinstance(d, dict) else '') + ) + + 
cucumber_report_df['label'] = cucumber_report_df['label'].astype(int)
+    return cucumber_report_df[['test_case', 'description', 'label']]
+
+
+def merge_datasets(gh_issues_df, cucumber_history_df, current_report_df):
+    """Merge all datasets into a unified format without duplicates."""
+
+    # Merge on feature-scenario for log-based matching
+    combined_logs_df = pd.merge(
+        current_report_df,
+        gh_issues_df,
+        on=['test_case', 'description'],
+        suffixes=('_current', '_gh')
+    )
+
+    # Concatenate GitHub Issues data for log-based matching
+    combined_df = pd.concat([combined_logs_df, cucumber_history_df], axis=0, ignore_index=True)
+
+    # Convert unhashable types (e.g., dict) to strings for deduplication
+    for col in ['test_case', 'description', 'label']:
+        if col in combined_df.columns:
+            combined_df[col] = combined_df[col].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
+
+    # Remove duplicates based on key columns
+    combined_df = combined_df.drop_duplicates(subset=['test_case', 'description', 'label'], keep='first')
+
+    return combined_df
+
+
+def main(gh_issues_path, cucumber_history_path, current_report_path, output_path):
+    gh_issues_data = load_json(gh_issues_path)
+    cucumber_history_data = load_json(cucumber_history_path)
+    current_report_data = load_json(current_report_path)
+
+    gh_issues_df = preprocess_github_issues(gh_issues_data)
+    cucumber_history_df = preprocess_cucumber_history(cucumber_history_data)
+    current_report_df = preprocess_current_report(current_report_data)
+
+    combined_df = merge_datasets(gh_issues_df, cucumber_history_df, current_report_df)
+
+    # Save the combined dataset to a JSON file
+    combined_df.to_json(output_path, orient='records')
+    print(f"Combined dataset saved to {output_path}")
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) != 5:
+        print("Usage: python preprocess_datasets.py <gh_issues_path> <cucumber_history_path> <current_report_path> <output_path>")
+        sys.exit(1)
+
+    gh_issues_path = sys.argv[1]
+    cucumber_history_path = sys.argv[2]
+    current_report_path = sys.argv[3]
+    output_path = sys.argv[4]
+
+    main(gh_issues_path, cucumber_history_path, current_report_path, output_path)
diff --git a/testsuite/ext-tools/machine_learning/train_model.py b/testsuite/ext-tools/machine_learning/train_model.py
new file mode 100644
index 000000000000..46b7c8277314
--- /dev/null
+++ b/testsuite/ext-tools/machine_learning/train_model.py
@@ -0,0 +1,103 @@
+import pandas as pd
+from sklearn.linear_model import SGDClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from joblib import dump, load
+import base64
+import sys
+import numpy as np
+import json
+
+def decode_base64(encoded_str):
+    """Decode Base64 encoded string to its original form."""
+    try:
+        if encoded_str:
+            return base64.b64decode(encoded_str).decode('utf-8')
+        return ''
+    except Exception as e:
+        print(f"Error decoding string: {e}")
+        return encoded_str
+
+def decode_description(df):
+    """Parse and decode Base64-encoded fields nested in the 'description' column."""
+    def decode_nested_fields(description):
+        try:
+            # Parse the JSON object embedded in the 'description' column
+            data = json.loads(description)
+            # Decode individual fields if they exist
+            if 'description' in data:
+                data['description'] = decode_base64(data['description'])
+            if 'comments' in data:
+                data['comments'] = decode_base64(data['comments'])
+            if 'error_message' in data:
+                data['error_message'] = decode_base64(data['error_message'])
+            if 'logs' in data:
+                data['logs'] = decode_base64(data['logs'])
+            return data
+        except Exception as e:
+            print(f"Error processing description: {e}")
+            return description  # Return original description on failure
+
+    # Apply decoding logic to the 'description' column
+    df['description'] = df['description'].apply(decode_nested_fields)
+    return df
+
+
+def train_model(dataset_path, model_path, vectorizer_path, output_model_path, output_vectorizer_path):
+    # Load previous model and vectorizer
+    try:
+        model = load(model_path)
+        print(f"Loaded existing model from {model_path}")
+    except FileNotFoundError:
+        # Initialize SGDClassifier if no previous model exists
+        model = SGDClassifier(loss="log", random_state=42, warm_start=True)
+        print("No previous model found, initializing a new model.")
+
+    try:
+        vectorizer = load(vectorizer_path)
+        print(f"Loaded existing vectorizer from {vectorizer_path}")
+    except FileNotFoundError:
+        # Initialize a new vectorizer if no previous one exists
+        vectorizer = TfidfVectorizer(max_features=5000)
+        print("No previous vectorizer found, initializing a new vectorizer.")
+
+    # Load and preprocess the new dataset
+    df = pd.read_json(dataset_path)
+
+    # Decode the Base64 encoded fields
+    print("Decoding Base64-encoded fields...")
+    df = decode_description(df)
+
+    # Vectorize the decoded 'description' field
+    X = vectorizer.fit_transform(df['description'])
+    y = df['label'].tolist()
+
+    # Incrementally update the model
+    print("Updating model incrementally with new data...")
+    model.partial_fit(X, y, classes=np.unique(y))  # Incremental fit with new data
+
+    # Evaluate the updated model (optional, for reporting)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    y_pred = model.predict(X_test)
+    print("Classification Report (Root Cause):")
+    print(classification_report(y_test, y_pred))
+
+    # Save the updated model and vectorizer
+    dump(model, output_model_path)
+    dump(vectorizer, output_vectorizer_path)
+    print(f"Updated model saved to {output_model_path}")
+    print(f"Updated vectorizer saved to {output_vectorizer_path}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 6:
+        print("Usage: python train_model.py <dataset_path> <model_path> <vectorizer_path> <output_model_path> <output_vectorizer_path>")
+        sys.exit(1)
+
+    dataset_path = sys.argv[1]
+    model_path = sys.argv[2]
+    vectorizer_path = sys.argv[3]
+    output_model_path = sys.argv[4]
+    output_vectorizer_path = sys.argv[5]
+
+    train_model(dataset_path, model_path, vectorizer_path, output_model_path, output_vectorizer_path)
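
Note: the patch itself does not show how the new scripts are meant to be chained, so the following is a minimal driver sketch based only on the positional command-line arguments defined above. The file names (gh_issues_dataset.json, cucumber_history.json, current_report.json, combined_dataset.json, model.joblib, vectorizer.joblib, predictions.csv) are illustrative placeholders, not paths introduced by this patch; the three input JSON files are assumed to have been produced beforehand by gh_issues_parser.rb, cucumber_report_history.rb and cucumber_report_parser.rb.

# pipeline_sketch.py -- hypothetical wiring of the new scripts, not part of the patch
import subprocess

ML_DIR = "testsuite/ext-tools/machine_learning"

def run(cmd):
    """Echo one pipeline step and stop on the first failure."""
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True)

# Merge the three collected datasets into one training dataset.
run(["python3", f"{ML_DIR}/preprocess_datasets.py",
     "gh_issues_dataset.json", "cucumber_history.json",
     "current_report.json", "combined_dataset.json"])

# Create or incrementally update the classifier and TF-IDF vectorizer.
run(["python3", f"{ML_DIR}/train_model.py",
     "combined_dataset.json", "model.joblib", "vectorizer.joblib",
     "model.joblib", "vectorizer.joblib"])

# Score the scenarios of the current Cucumber report with the updated model.
run(["python3", f"{ML_DIR}/evaluate_cucumber_report.py",
     "current_report.json", "model.joblib", "vectorizer.joblib",
     "predictions.csv"])

Passing the same paths for the input and output model/vectorizer keeps the incremental partial_fit updates in train_model.py accumulating in place between runs.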