From de071a7ed1fa6c1a064ff5d5f536e69bab885884 Mon Sep 17 00:00:00 2001
From: Oscar Barrios
Date: Tue, 26 Nov 2024 11:35:09 +0100
Subject: [PATCH] Evaluate test report

---
 testsuite/Rakefile                             |   4 +-
 .../cucumber_report_history.rb                 |  21 +-
 .../cucumber_report_parser.rb                  |  15 +-
 .../cucumber_report_review.py                  |   0
 .../evaluate_cucumber_report.py                |  34 ++++
 .../machine_learning/gh_issues_parser.rb       |  45 +++---
 .../machine_learning/gh_issues_train_model.py  | 152 ------------------
 .../machine_learning/preprocess_datasets.py    | 110 +++++++++++++
 .../ext-tools/machine_learning/train_model.py  | 103 ++++++++++++
 9 files changed, 284 insertions(+), 200 deletions(-)
 delete mode 100644 testsuite/ext-tools/machine_learning/cucumber_report_review.py
 create mode 100644 testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py
 delete mode 100644 testsuite/ext-tools/machine_learning/gh_issues_train_model.py
 create mode 100644 testsuite/ext-tools/machine_learning/preprocess_datasets.py
 create mode 100644 testsuite/ext-tools/machine_learning/train_model.py

diff --git a/testsuite/Rakefile b/testsuite/Rakefile
index 81545bed06a8..534bf6a586d2 100644
--- a/testsuite/Rakefile
+++ b/testsuite/Rakefile
@@ -187,12 +187,12 @@ namespace :utils do
 
   desc 'Collect and tag flaky tests'
   task :collect_and_tag_flaky_tests do
-    `ruby ext-tools/machine_learning/gh_issues_parser.rb --collect-and-tag --directory-path features`
+    `ruby ext-tools/machine_learning/gh_issues_parser.rb --collect_and_tag --directory_path features`
   end
 
   desc 'Generate dataset from GH issues'
   task :generate_dataset_gh_issues do
-    `ruby ext-tools/machine_learning/gh_issues_parser.rb --generate-dataset --file-path gh_issues_dataset.json`
+    `ruby ext-tools/machine_learning/gh_issues_parser.rb --generate_dataset --output_path gh_issues_dataset.json`
   end
 
   desc 'Generate dataset from JSON Cucumber Test Report'
diff --git a/testsuite/ext-tools/machine_learning/cucumber_report_history.rb b/testsuite/ext-tools/machine_learning/cucumber_report_history.rb
index ccb05ee309c8..fd2b39593511 100755
--- a/testsuite/ext-tools/machine_learning/cucumber_report_history.rb
+++ b/testsuite/ext-tools/machine_learning/cucumber_report_history.rb
@@ -2,7 +2,6 @@
 # Copyright (c) 2024 SUSE LLC.
 # Licensed under the terms of the MIT license.
-require 'csv' require 'json' require 'net/http' require 'optparse' @@ -17,7 +16,7 @@ options[:server] = server end - opts.on('-o', '--output_path FILEPATH', 'Output file path (CSV format)') do |filepath| + opts.on('-o', '--output_path FILEPATH', 'Output file path (JSON format)') do |filepath| options[:output_path] = filepath end @@ -44,26 +43,28 @@ response = Net::HTTP.get_response(uri) if response.is_a?(Net::HTTPSuccess) data = JSON.parse(response.body) + label_mapping = { + 'PASSED' => 0, + 'SKIPPED' => 1, + 'FIXED' => 2, + 'REGRESSION' => 3, + 'FAILED' => 4 + } dataset = data['data']['result'].map do |result| metric = result['metric'] { - label: metric['status'].downcase, + label: label_mapping[metric['status']], description: { - jobname: metric['jobname'], scenario: metric['case'], feature: metric['suite'], + # jobname: metric['jobname'], failedsince: metric['failedsince'].to_i, age: result['value'][1].to_i } } end - CSV.open(options[:output_path], 'w') do |csv| - csv << dataset.first.keys - dataset.each do |entry| - csv << [entry[:label], entry[:description].to_json] - end - end + File.write(options[:output_path], dataset.to_json) else puts "Failed to fetch data from Prometheus: #{response.code} #{response.message}" end diff --git a/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb b/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb index 2c51039a1c07..00ae452174ca 100644 --- a/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb +++ b/testsuite/ext-tools/machine_learning/cucumber_report_parser.rb @@ -1,7 +1,7 @@ # Copyright (c) 2024 SUSE LLC. # Licensed under the terms of the MIT license. -require 'csv' +require 'base64' require 'json' require 'nokogiri' require 'optparse' @@ -43,9 +43,9 @@ def extract_dataset_from_json(json_report_path) time: (scenario['steps'].sum { |step| step['result']['duration'] || 0 } / 1_000_000_000.0).round } - scenario_data[:error_message] = scenario['steps'].last['result']['error_message'] if scenario['steps'].last['result'].key?('error_message') + scenario_data[:error_message] = Base64.encode64(scenario['steps'].last['result']['error_message']) if scenario['steps'].last['result'].key?('error_message') scenario_data[:tags] = scenario['tags'].map { |tag| tag['name'][1..] } if scenario.key?('tags') - scenario_data[:logs] = logs unless logs.empty? + scenario_data[:logs] = Base64.encode64(logs.to_s) unless logs.empty? scenario_data[:screenshots] = screenshots unless screenshots.empty? 
       if scenario['before'] && scenario['before'].size > 3 && scenario['before'][3].key?('output')
@@ -79,7 +79,7 @@ def extract_dataset_from_json(json_report_path)
     options[:report_path] = f
   end
 
-  opts.on('-o', '--output_path PATH', 'Path to the processed report file (CSV format)') do |f|
+  opts.on('-o', '--output_path PATH', 'Path to the processed report file (JSON format)') do |f|
     options[:output_path] = f
   end
 
@@ -98,9 +98,4 @@ def extract_dataset_from_json(json_report_path)
 end
 
 dataset = extract_dataset_from_json(options[:report_path])
-CSV.open(options[:output_path], 'w') do |csv|
-  csv << dataset.first.keys
-  dataset.each do |entry|
-    csv << [entry[:label], entry[:description].to_json]
-  end
-end
+File.write(options[:output_path], dataset.to_json)
diff --git a/testsuite/ext-tools/machine_learning/cucumber_report_review.py b/testsuite/ext-tools/machine_learning/cucumber_report_review.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py b/testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py
new file mode 100644
index 000000000000..9b50f8225674
--- /dev/null
+++ b/testsuite/ext-tools/machine_learning/evaluate_cucumber_report.py
@@ -0,0 +1,34 @@
+import pandas as pd
+from joblib import load
+import json
+
+def evaluate_current_report(current_report_path, model_path, vectorizer_path, output_path):
+    # Load data
+    with open(current_report_path, 'r') as file:
+        current_report = json.load(file)
+    df = pd.DataFrame(current_report)
+
+    # Load model and vectorizer
+    model = load(model_path)
+    vectorizer = load(vectorizer_path)
+
+    # Preprocess and predict
+    X = vectorizer.transform(df['text'])
+    df['predicted_root_cause'] = model.predict(X)
+
+    # Save predictions
+    df.to_csv(output_path, index=False)
+    print(f"Predictions saved to {output_path}")
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) != 5:
+        print("Usage: python evaluate_cucumber_report.py <current_report_path> <model_path> <vectorizer_path> <output_path>")
+        sys.exit(1)
+
+    current_report_path = sys.argv[1]
+    model_path = sys.argv[2]
+    vectorizer_path = sys.argv[3]
+    output_path = sys.argv[4]
+
+    evaluate_current_report(current_report_path, model_path, vectorizer_path, output_path)
diff --git a/testsuite/ext-tools/machine_learning/gh_issues_parser.rb b/testsuite/ext-tools/machine_learning/gh_issues_parser.rb
index b32e04c2c362..90dd01a073ba 100755
--- a/testsuite/ext-tools/machine_learning/gh_issues_parser.rb
+++ b/testsuite/ext-tools/machine_learning/gh_issues_parser.rb
@@ -4,6 +4,7 @@
 # Collect all the issues from a GitHub project board column
 # and tag the corresponding Cucumber feature files with a given tag
 
+require 'base64'
 require 'csv'
 require 'find'
 require 'json'
@@ -146,18 +147,15 @@ def self.generate_dataset(organization, project_number, headers)
       # end
       if status_field && status_field['item'] && status_field['item']['content']
         title = clean_text(status_field['item']['content']['title'])
-        description = clean_text(status_field['item']['content']['bodyText'])
-        comments = status_field['item']['content']['comments']['nodes'].map { |node| clean_text(node['body']) }
+        description = Base64.encode64(clean_text(status_field['item']['content']['bodyText']))
+        comments = Base64.encode64(status_field['item']['content']['comments']['nodes'].map { |node| clean_text(node['body']) }.to_s)
         matches = title.match(/Feature:(.*)\s*\|\s*Scenario:(.*)/)
-        gh_issue_content = {}
-        if matches.nil?
- gh_issue_content[:title] = title - else - gh_issue_content[:feature] = matches[1].strip - gh_issue_content[:scenario] = matches[2].strip - end - gh_issue_content[:description] = description - gh_issue_content[:comments] = comments + gh_issue_content = { + feature: matches.nil? ? title : matches[1].strip, + scenario: matches.nil? ? title : matches[2].strip, + description: description, + comments: comments + } dataset.push({ label: label_mapping[label], description: gh_issue_content }) puts "\e[36mCard found\e[0m => #{title}" else @@ -227,20 +225,20 @@ def main OptionParser.new do |opts| opts.banner = 'Usage: ruby gh_issues_parser.rb [options]' - opts.on('-g', '--generate-dataset', 'Generate a dataset from GitHub project board issues') do + opts.on('-g', '--generate_dataset', 'Generate a dataset from GitHub project board issues') do options[:generate_dataset] = true end - opts.on('-c', '--collect-and-tag', 'Collect flaky tests and tag Cucumber features') do + opts.on('-c', '--collect_and_tag', 'Collect flaky tests and tag Cucumber features') do options[:collect_and_tag] = true end - opts.on('-d', '--directory-path PATH', 'Directory path to search for Cucumber feature files') do |path| + opts.on('-d', '--directory_path PATH', 'Directory path to search for Cucumber feature files') do |path| options[:directory_path] = path end - opts.on('-f', '--file-path PATH', 'File path to store the dataset (CSV format)') do |path| - options[:file_path] = path + opts.on('-o', '--output_path PATH', 'File path to store the dataset (JSON format)') do |path| + options[:output_path] = path end opts.on('-h', '--help', 'Show this help message') do @@ -252,17 +250,17 @@ def main parser.parse! unless options[:generate_dataset] || options[:collect_and_tag] - puts 'Please specify either --generate-dataset or --collect-and-tag' + puts 'Please specify either --generate_dataset or --collect_and_tag' exit 1 end - if options[:generate_dataset] && !options[:file_path] - puts 'Please specify the file path using --file-path' + if options[:generate_dataset] && !options[:output_path] + puts 'Please specify the file path using --output_path' exit 1 end if options[:collect_and_tag] && !options[:directory_path] - puts 'Please specify the file path using --directory-path' + puts 'Please specify the file path using --directory_path' exit 1 end @@ -287,12 +285,7 @@ def main if options[:generate_dataset] dataset = GithubProjectBoard.generate_dataset(organization, project_number, headers) - CSV.open(options[:file_path], 'w') do |csv| - csv << dataset.first.keys - dataset.each do |entry| - csv << [entry[:label], entry[:description].to_json] - end - end + File.write(options[:output_path], dataset.to_json) elsif options[:collect_and_tag] columns = { 'New' => 'new_issue', diff --git a/testsuite/ext-tools/machine_learning/gh_issues_train_model.py b/testsuite/ext-tools/machine_learning/gh_issues_train_model.py deleted file mode 100644 index 6963fd745876..000000000000 --- a/testsuite/ext-tools/machine_learning/gh_issues_train_model.py +++ /dev/null @@ -1,152 +0,0 @@ -import sys -import json -import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import classification_report -from sklearn.preprocessing import LabelEncoder -from joblib import dump -from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments -from datasets import Dataset -import torch - 
-def load_dataset(filepath): - """Load JSON dataset from a file.""" - with open(filepath, 'r') as file: - data = json.load(file) - return data - -def preprocess_data(data): - """Combine and clean title, description, and comments.""" - texts, labels = [], [] - for entry in data: - combined_text = " ".join([ - entry.get('title', ''), - entry.get('description', ''), - " ".join(entry.get('comments', [])) - ]) - texts.append(combined_text) - labels.append(entry['label']) - return texts, labels - -def train_random_forest(texts, labels): - # Encode labels - label_encoder = LabelEncoder() - encoded_labels = label_encoder.fit_transform(labels) - - # Split the dataset - X_train, X_test, y_train, y_test = train_test_split( - texts, encoded_labels, test_size=0.2, random_state=42 - ) - - # Convert text data to TF-IDF features - vectorizer = TfidfVectorizer(max_features=5000) - X_train_vec = vectorizer.fit_transform(X_train) - X_test_vec = vectorizer.transform(X_test) - - # Train a Random Forest classifier - model = RandomForestClassifier(n_estimators=100, random_state=42) - model.fit(X_train_vec, y_train) - - # Evaluate the model - y_pred = model.predict(X_test_vec) - unique_labels = np.unique(y_test) - target_names = [str(label) for label in label_encoder.inverse_transform(unique_labels)] - - print("Classification Report (Random Forest):") - print(classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names, zero_division=0)) - - # Save the model and vectorizer - dump(model, 'random_forest_model.joblib') - dump(vectorizer, 'random_forest_vectorizer.joblib') - dump(label_encoder, 'random_forest_label_encoder.joblib') - print("Random Forest model, vectorizer, and label encoder saved successfully!") - -def train_bert(texts, labels): - # Split into train and test sets - train_texts, test_texts, train_labels, test_labels = train_test_split( - texts, labels, test_size=0.2, random_state=42 - ) - - # Convert to Hugging Face Dataset - train_data = Dataset.from_dict({"text": train_texts, "label": train_labels}) - test_data = Dataset.from_dict({"text": test_texts, "label": test_labels}) - - # Tokenize the data - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - def tokenize_function(example): - return tokenizer(example["text"], padding="max_length", truncation=True) - - train_data = train_data.map(tokenize_function, batched=True) - test_data = test_data.map(tokenize_function, batched=True) - - train_data = train_data.remove_columns(["text"]) - test_data = test_data.remove_columns(["text"]) - train_data.set_format("torch") - test_data.set_format("torch") - - # Load BERT model - num_labels = len(set(labels)) - model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels) - - # Define Training Arguments - training_args = TrainingArguments( - output_dir="./bert_results", - eval_strategy="steps", - save_strategy="steps", # Align save strategy with evaluation - save_steps=500, # Frequency of saving (if step-based) - logging_dir="./bert_logs", - num_train_epochs=3, - per_device_train_batch_size=8, - per_device_eval_batch_size=8, - save_total_limit=2, - learning_rate=2e-5, - weight_decay=0.01, - logging_steps=100, - load_best_model_at_end=True - ) - - # Define the Trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_data, - eval_dataset=test_data, - processing_class=tokenizer - ) - - # Train the Model - trainer.train() - - # Evaluate the Model - metrics = trainer.evaluate() - print("BERT Model Evaluation 
Metrics:") - print(metrics) - - # Save the Model - model.save_pretrained("bert_model") - tokenizer.save_pretrained("bert_tokenizer") - print("BERT model and tokenizer saved successfully!") - -def main(model_type, filepath): - # Load and preprocess dataset - data = load_dataset(filepath) - texts, labels = preprocess_data(data) - - if model_type == "random_forest": - train_random_forest(texts, labels) - elif model_type == "bert": - train_bert(texts, labels) - else: - print("Invalid model type. Please choose 'random_forest' or 'bert'.") - sys.exit(1) - -if __name__ == "__main__": - if len(sys.argv) != 3: - print("Usage: python train_model.py ") - sys.exit(1) - - model_type = sys.argv[1].lower() - dataset_path = sys.argv[2].lower() - main(model_type, dataset_path) diff --git a/testsuite/ext-tools/machine_learning/preprocess_datasets.py b/testsuite/ext-tools/machine_learning/preprocess_datasets.py new file mode 100644 index 000000000000..8fd27f2601b7 --- /dev/null +++ b/testsuite/ext-tools/machine_learning/preprocess_datasets.py @@ -0,0 +1,110 @@ +import simplejson as json +import pandas as pd + +def load_json(dataset_path): + """Load JSON dataset from a file using json library.""" + with open(dataset_path, 'r', encoding='utf-8') as file: + data = json.load(file) + return data + + +def preprocess_github_issues(gh_issues_data): + gh_issues_df = pd.DataFrame(gh_issues_data) + gh_issues_df['test_case'] = gh_issues_df['description'].apply( + lambda d: f"{d.get('feature', '')} | {d.get('scenario', '')}" if isinstance(d, dict) else '' + ) + + # Combine 'description', 'logs', and 'comments' fields for log-based matching + gh_issues_df['description'] = ( + 'description:' + gh_issues_df['description'].apply(lambda d: d.get('description', '') if isinstance(d, dict) else '') + + 'logs:' + gh_issues_df['description'].apply(lambda d: ''.join(d.get('logs', [])) if isinstance(d, dict) else '') + + 'comments:' + gh_issues_df['description'].apply(lambda d: ''.join(d.get('comments', [])) if isinstance(d, dict) else '') + ) + + gh_issues_df['label'] = gh_issues_df['label'].astype(int) + return gh_issues_df[['test_case', 'description', 'label']] + + +def preprocess_cucumber_history(cucumber_history_data): + cucumber_history_df = pd.DataFrame(cucumber_history_data) + cucumber_history_df['test_case'] = cucumber_history_df['description'].apply( + lambda d: f"{d.get('feature', '')} | {d.get('scenario', '')}" if isinstance(d, dict) else '' + ) + + # Combine 'age' and 'failedsince' fields for historical tracking + cucumber_history_df['description'] = cucumber_history_df['description'].apply( + lambda d: f"age:{d.get('age', 0)} failed_since:{d.get('failedsince', 0)}" if isinstance(d, dict) else '' + ) + + cucumber_history_df['label'] = cucumber_history_df['label'].astype(int) + return cucumber_history_df[['test_case', 'description', 'label']] + + +def preprocess_current_report(current_report_data): + cucumber_report_df = pd.DataFrame(current_report_data) + cucumber_report_df['test_case'] = cucumber_report_df['description'].apply( + lambda d: f"{d.get('feature', '')} | {d.get('scenario', '')}" if isinstance(d, dict) else '' + ) + + # Combine 'description', 'logs', and 'comments' fields for log-based matching + cucumber_report_df['description'] = ( + 'error:' + cucumber_report_df['description'].apply(lambda d: d.get('error_message', '') if isinstance(d, dict) else '') + + 'logs:' + cucumber_report_df['description'].apply(lambda d: ''.join(d.get('logs', [])) if isinstance(d, dict) else '') + ) + + 
cucumber_report_df['label'] = cucumber_report_df['label'].astype(int)
+    return cucumber_report_df[['test_case', 'description', 'label']]
+
+
+def merge_datasets(gh_issues_df, cucumber_history_df, current_report_df):
+    """Merge all datasets into a unified format without duplicates."""
+
+    # Merge on feature-scenario for log-based matching
+    combined_logs_df = pd.merge(
+        current_report_df,
+        gh_issues_df,
+        on=['test_case', 'description'],
+        suffixes=('_current', '_gh')
+    )
+
+    # Concatenate GitHub Issues data for log-based matching
+    combined_df = pd.concat([combined_logs_df, cucumber_history_df], axis=0, ignore_index=True)
+
+    # Convert unhashable types (e.g., dict) to strings for deduplication
+    for col in ['test_case', 'description', 'label']:
+        if col in combined_df.columns:
+            combined_df[col] = combined_df[col].apply(lambda x: json.dumps(x) if isinstance(x, dict) else x)
+
+    # Remove duplicates based on key columns
+    combined_df = combined_df.drop_duplicates(subset=['test_case', 'description', 'label'], keep='first')
+
+    return combined_df
+
+
+def main(gh_issues_path, cucumber_history_path, current_report_path, output_path):
+    gh_issues_data = load_json(gh_issues_path)
+    cucumber_history_data = load_json(cucumber_history_path)
+    current_report_data = load_json(current_report_path)
+
+    gh_issues_df = preprocess_github_issues(gh_issues_data)
+    cucumber_history_df = preprocess_cucumber_history(cucumber_history_data)
+    current_report_df = preprocess_current_report(current_report_data)
+
+    combined_df = merge_datasets(gh_issues_df, cucumber_history_df, current_report_df)
+
+    # Save the combined dataset to a JSON file
+    combined_df.to_json(output_path, orient='records')
+    print(f"Combined dataset saved to {output_path}")
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) != 5:
+        print("Usage: python preprocess_datasets.py <gh_issues_path> <cucumber_history_path> <current_report_path> <output_path>")
+        sys.exit(1)
+
+    gh_issues_path = sys.argv[1]
+    cucumber_history_path = sys.argv[2]
+    current_report_path = sys.argv[3]
+    output_path = sys.argv[4]
+
+    main(gh_issues_path, cucumber_history_path, current_report_path, output_path)
diff --git a/testsuite/ext-tools/machine_learning/train_model.py b/testsuite/ext-tools/machine_learning/train_model.py
new file mode 100644
index 000000000000..46b7c8277314
--- /dev/null
+++ b/testsuite/ext-tools/machine_learning/train_model.py
@@ -0,0 +1,103 @@
+import pandas as pd
+from sklearn.linear_model import SGDClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from joblib import dump, load
+import base64
+import sys
+import numpy as np
+import json
+
+def decode_base64(encoded_str):
+    """Decode Base64 encoded string to its original form."""
+    try:
+        if encoded_str:
+            return base64.b64decode(encoded_str).decode('utf-8')
+        return ''
+    except Exception as e:
+        print(f"Error decoding string: {e}")
+        return encoded_str
+
+def decode_description(df):
+    """Parse and decode Base64-encoded fields nested in the 'description' column."""
+    def decode_nested_fields(description):
+        try:
+            # Parse the JSON object embedded in the 'description' column
+            data = json.loads(description)
+            # Decode individual fields if they exist
+            if 'description' in data:
+                data['description'] = decode_base64(data['description'])
+            if 'comments' in data:
+                data['comments'] = decode_base64(data['comments'])
+            if 'error_message' in data:
+                data['error_message'] = decode_base64(data['error_message'])
+            if 'logs' in data:
+                data['logs'] = decode_base64(data['logs'])
+            return data
+        except Exception as e:
+            print(f"Error processing description: {e}")
+            return description  # Return original description on failure
+
+    # Apply decoding logic to the 'description' column
+    df['description'] = df['description'].apply(decode_nested_fields)
+    return df
+
+
+def train_model(dataset_path, model_path, vectorizer_path, output_model_path, output_vectorizer_path):
+    # Load previous model and vectorizer
+    try:
+        model = load(model_path)
+        print(f"Loaded existing model from {model_path}")
+    except FileNotFoundError:
+        # Initialize SGDClassifier if no previous model exists
+        model = SGDClassifier(loss="log", random_state=42, warm_start=True)
+        print("No previous model found, initializing a new model.")
+
+    try:
+        vectorizer = load(vectorizer_path)
+        print(f"Loaded existing vectorizer from {vectorizer_path}")
+    except FileNotFoundError:
+        # Initialize a new vectorizer if no previous one exists
+        vectorizer = TfidfVectorizer(max_features=5000)
+        print("No previous vectorizer found, initializing a new vectorizer.")
+
+    # Load and preprocess the new dataset
+    df = pd.read_json(dataset_path)
+
+    # Decode the Base64 encoded fields
+    print("Decoding Base64-encoded fields...")
+    df = decode_description(df)
+
+    # Vectorize the decoded 'description' field
+    X = vectorizer.fit_transform(df['description'])
+    y = df['label'].tolist()
+
+    # Incrementally update the model
+    print("Updating model incrementally with new data...")
+    model.partial_fit(X, y, classes=np.unique(y))  # Incremental fit with new data
+
+    # Evaluate the updated model (optional, for reporting)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    y_pred = model.predict(X_test)
+    print("Classification Report (Root Cause):")
+    print(classification_report(y_test, y_pred))
+
+    # Save the updated model and vectorizer
+    dump(model, output_model_path)
+    dump(vectorizer, output_vectorizer_path)
+    print(f"Updated model saved to {output_model_path}")
+    print(f"Updated vectorizer saved to {output_vectorizer_path}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 6:
+        print("Usage: python train_model.py <dataset_path> <model_path> <vectorizer_path> <output_model_path> <output_vectorizer_path>")
+        sys.exit(1)
+
+    dataset_path = sys.argv[1]
+    model_path = sys.argv[2]
+    vectorizer_path = sys.argv[3]
+    output_model_path = sys.argv[4]
+    output_vectorizer_path = sys.argv[5]
+
+    train_model(dataset_path, model_path, vectorizer_path, output_model_path, output_vectorizer_path)
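
Note: the patch itself does not show how the new scripts are meant to be chained, so the following is a minimal driver sketch based only on the positional command-line arguments defined above. The file names (gh_issues_dataset.json, cucumber_history.json, current_report.json, combined_dataset.json, model.joblib, vectorizer.joblib, predictions.csv) are illustrative placeholders, not paths introduced by this patch; the three input JSON files are assumed to have been produced beforehand by gh_issues_parser.rb, cucumber_report_history.rb and cucumber_report_parser.rb.

# pipeline_sketch.py -- hypothetical wiring of the new scripts, not part of the patch
import subprocess

ML_DIR = "testsuite/ext-tools/machine_learning"

def run(cmd):
    """Echo one pipeline step and stop on the first failure."""
    print("+", " ".join(cmd))
    subprocess.run(cmd, check=True)

# Merge the three collected datasets into one training dataset.
run(["python3", f"{ML_DIR}/preprocess_datasets.py",
     "gh_issues_dataset.json", "cucumber_history.json",
     "current_report.json", "combined_dataset.json"])

# Create or incrementally update the classifier and TF-IDF vectorizer.
run(["python3", f"{ML_DIR}/train_model.py",
     "combined_dataset.json", "model.joblib", "vectorizer.joblib",
     "model.joblib", "vectorizer.joblib"])

# Score the scenarios of the current Cucumber report with the updated model.
run(["python3", f"{ML_DIR}/evaluate_cucumber_report.py",
     "current_report.json", "model.joblib", "vectorizer.joblib",
     "predictions.csv"])

Passing the same paths for the input and output model/vectorizer keeps the incremental partial_fit updates in train_model.py accumulating in place between runs.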