diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 441ff70ad..7ec4856f8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,5 +37,4 @@ repos:
     rev: 'v0.1.6'
     hooks:
       - id: ruff
-        args: ['--fix']
       - id: ruff-format
diff --git a/src/lighteval/logging/__init__.py b/src/lighteval/logging/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index e7628f845..0515af461 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -556,7 +556,7 @@ def push_results_to_tensorboard(  # noqa: C901

         tb_context.close()  # flushes the unfinished write operations
         time.sleep(5)
-        files = os.listdir(str(output_dir_tb))
+        files = os.listdir(output_dir_tb)
         for file in files:
             os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))

@@ -566,3 +566,5 @@
             f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
             f" at {output_dir_tb} and global_step {global_step}"
         )
+        # except Exception as e:
+        #     logger.warning(f"Could not push to tensorboard\n{e}")
diff --git a/src/lighteval/models/__init__.py b/src/lighteval/models/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/__init__.py b/src/lighteval/tasks/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 76be14215..e16963a9c 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -1,6 +1,5 @@
 import collections
 import random
-from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Tuple
@@ -40,42 +39,6 @@
 from lighteval.logging.evaluation_tracker import EvaluationTracker


-@dataclass
-class CustomEvaluationTaskConfig:
-    name: str
-    prompt_function: str
-    hf_repo: str
-    hf_subset: str
-    metric: Tuple[Metrics]
-    hf_avail_splits: Optional[Tuple[str]] = None
-    evaluation_splits: Optional[Tuple[str]] = None
-    few_shots_split: Optional[str] = None
-    few_shots_select: Optional[str] = None
-    generation_size: int = -1
-    stop_sequence: Optional[Tuple[str]] = None
-    output_regex: Optional[str] = None
-
-    frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
-
-    def __post_init__(self):
-        if self.suite is None:
-            self.suite = ["custom"]
-        if self.hf_avail_splits is None:
-            self.hf_avail_splits = ["train", "validation", "test"]
-        if self.evaluation_splits is None:
-            self.evaluation_splits = ["validation"]
-        if self.stop_sequence is None:
-            self.stop_sequence = ["\n"]
-
-        # Convert list to tuple for hashing
-        self.metric = tuple(self.metric)
-        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
-        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
-        self.suite = tuple(self.suite) if self.suite else None
-        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
-
-
 class LightevalTask:
     def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
         """
diff --git a/tasks_examples/custom_tasks/custom_evaluation_tasks.py b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
index 296db3720..b0dae200c 100644
--- a/tasks_examples/custom_tasks/custom_evaluation_tasks.py
+++ b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
@@ -6,41 +6,44 @@
 """
 import re
 from dataclasses import asdict
-from typing import Dict, List, Tuple
+from typing import Dict, List

-from lighteval.metrics import MetricCategory, Metrics
-from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

+from .custom_evaluation_utils import *

-_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
-_TASKS: List[CustomEvaluationTaskConfig] = []
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
+_TASKS: List[CustomEvaluationTask] = []

 ## COMMON_SENSE_REASONING_TASKS ##
 COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="hellaswag",
         prompt_function="hellaswag_prompt",
         hf_repo="hellaswag",
         hf_subset="default",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="winogrande",
         prompt_function="winogrande",
         hf_repo="winogrande",
         hf_subset="winogrande_xl",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="piqa",
         prompt_function="piqa_harness",
         hf_repo="piqa",
         hf_subset="plain_text",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="siqa",
         prompt_function="siqa_prompt",
         hf_repo="lighteval/siqa",
@@ -48,14 +51,14 @@
         hf_avail_splits=["train", "validation"],
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="openbookqa",
         prompt_function="openbookqa",
         hf_repo="openbookqa",
         hf_subset="main",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:easy",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -64,7 +67,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:challenge",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -73,7 +76,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="commonsense_qa",
         prompt_function="commonsense_qa_prompt",
         hf_repo="commonsense_qa",
@@ -131,7 +134,7 @@ def preprocess(text):

 ## WORLD_KNOWLEDGE_TASKS ##
 WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="trivia_qa",
         prompt_function="triviaqa",
         hf_repo="trivia_qa",
@@ -140,7 +143,7 @@ def preprocess(text):
         generation_size=20,
         stop_sequence=["\n", ".", ","],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="natural_questions",
         prompt_function="natural_questions_prompt",
         hf_repo="lighteval/natural_questions_clean",
@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):

 ## Reading comprehension ##
 READING_COMP_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="super_glue:boolq",
         prompt_function="boolq_prompt",
         hf_repo="super_glue",
         hf_subset="boolq",
         metric=["target_perplexity"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="quac",
         prompt_function="quac",
         hf_repo="lighteval/quac_helm",
@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):
 ## MATH ##


-class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMathEvaluationTask(CustomEvaluationTask):
     """Custom class for math tasks with all the defaults set"""

     def __init__(
@@ -251,7 +254,7 @@ def __init__(
     CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
     CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
 ]
-GSM8K = CustomEvaluationTaskConfig(
+GSM8K = CustomEvaluationTask(
     name="gsm8k",
     prompt_function="gsm8k",
     hf_repo="gsm8k",
@@ -272,7 +275,7 @@ def __init__(
 ## MMLU ##


-class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMMLUEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
 ## BBH ##


-class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
+class CustomBBHEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):
 ## AGI eval ##


-class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -617,7 +620,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 ## HUMAN EVAL ##


-# human_eval = CustomEvaluationTaskConfig(
+# human_eval = CustomEvaluationTask(
 #     name="human_eval",
 #     prompt_function="human_eval",
 #     hf_repo="lighteval/human_eval",
@@ -625,9 +628,9 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 # ),


-def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
+def has_generative_metrics(task: CustomEvaluationTask) -> bool:
     for metric in task.metric:
-        if metric.category == MetricCategory.GENERATIVE:
+        if metric in NEEDS_GENERATION_ONLY:
             return True
     return False

diff --git a/tasks_examples/custom_tasks/custom_evaluation_utils.py b/tasks_examples/custom_tasks/custom_evaluation_utils.py
new file mode 100644
index 000000000..d3f005db1
--- /dev/null
+++ b/tasks_examples/custom_tasks/custom_evaluation_utils.py
@@ -0,0 +1,159 @@
+"""
+Custom evaluation tasks for lighteval
+"""
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional, Tuple, Union
+
+
+class Metrics(Enum):
+    any_target_loglikelihood_acc = auto()
+    bert_score = auto()
+    bias = auto()
+    bits_per_byte = auto()
+    bleu = auto()
+    bleu_1 = auto()
+    bleu_4 = auto()
+    byte_perplexity = auto()
+    chrf = auto()
+    code_eval_APPS = auto()
+    code_eval_HE = auto()
+    copyright = auto()
+    disinformation = auto()
+    exact_match = auto()
+    exact_set_match = auto()
+    extractiveness = auto()
+    f1_from_bags = auto()
+    f1_quasi = auto()
+    f1_sequence = auto()
+    f1_set_match = auto()
+    faithfulness = auto()
+    iou_set_match = auto()
+    log_prob = auto()
+    loglikelihood_acc = auto()
+    loglikelihood_acc_norm = auto()
+    loglikelihood_acc_norm_nospace = auto()
+    loglikelihood_acc_norm_single_token = auto()
+    loglikelihood_acc_single_token = auto()
+    loglikelihood_f1 = auto()
+    loglikelihood_f1_single_token = auto()
+    math_quasi_exact_match = auto()
+    mc_taco = auto()
+    mcc = auto()
+    mcc_single_token = auto()
+    mrr = auto()
+    mrr_single_token = auto()
+    multi_fi_numeric = auto()
+    one_choice_loglikelihood_acc = auto()
+    perfect_exact_match = auto()
+    prediction_perplexity = auto()
+    prefix_exact_match = auto()
+    prefix_quasi_exact_match = auto()
+    quasi_exact_match = auto()
+    quasi_exact_match2 = auto()
+    ranking = auto()
+    recall_at_1_single_token = auto()
+    recall_at_2_single_token = auto()
+    recall_at_1 = auto()
+    recall_at_2 = auto()
+    rouge = auto()
+    rouge_1 = auto()
+    rouge_2 = auto()
+    rouge_l = auto()
+    target_perplexity = auto()
+    ter = auto()
+    toxicity = auto()
+    truthfulqa_mc_metrics = auto()
+    word_perplexity = auto()
+
+    def __str__(self):
+        return self.name.replace("_at_", "@")
+
+
+NEEDS_GENERATION_ONLY = [
+    "perfect_exact_match",
+    "exact_match",
+    "quasi_exact_match",
+    "quasi_exact_match2",
+    "prefix_exact_match",
+    "prefix_quasi_exact_match",
+    "math_quasi_exact_match",
+    "iou_set_match",
+    "exact_set_match",
+    "f1_sequence",
+    "f1_quasi",
+    "f1_set_match",
+    "f1_from_bags",
+    "chrf",
+    "ter",
+    "rouge",
+    "rouge_1",
+    "rouge_2",
+    "rouge_l",
+    "faithfulness",
+    "extractiveness",
+    "bert_score",
+    "bleu",
+    "bleu_1",
+    "bleu_4",
+    "bias",
+    "toxicity",
+    "code_eval_HE",
+    "code_eval_APPS",
+    "copyright",
+]
+
+
+@dataclass(unsafe_hash=True)
+class CustomEvaluationTask:
+    name: str
+    prompt_function: str
+    hf_repo: str
+    hf_subset: str
+    metric: Tuple[Union[str, Metrics]]
+    hf_avail_splits: Optional[Tuple[str]] = None
+    evaluation_splits: Optional[Tuple[str]] = None
+    few_shots_split: Optional[str] = None
+    few_shots_select: Optional[str] = None
+    generation_size: int = -1
+    stop_sequence: Optional[Tuple[str]] = None
+    output_regex: Optional[str] = None
+
+    frozen: bool = False
+    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
+
+    def __post_init__(self):
+        self.metric = [str(m) for m in self.metric]
+        if self.suite is None:
+            self.suite = ["custom"]
+        if self.hf_avail_splits is None:
+            self.hf_avail_splits = ["train", "validation", "test"]
+        if self.evaluation_splits is None:
+            self.evaluation_splits = ["validation"]
+        if self.stop_sequence is None:
+            self.stop_sequence = ["\n"]
+
+        # Convert list to tuple for hashing
+        self.metric = tuple(self.metric)
+        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
+        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
+        self.suite = tuple(self.suite) if self.suite else None
+        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
+
+
+@dataclass(unsafe_hash=True)
+class BigCodeEvaluationTask:
+    name: str
+    bigcode_task: str
+    bigcode_task_kwargs: Optional[dict] = None
+    n_samples: int = 1
+    prefix: Optional[str] = None
+
+    suite: Tuple[str] = None
+
+    def __post_init__(self):
+        if self.suite is None:
+            self.suite = ("bigcode",)
+
+        # Convert list to tuple for hashing
+        self.suite = tuple(self.suite)
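
Below the patch, a minimal usage sketch (not part of the diff above) of the relocated CustomEvaluationTask dataclass and the NEEDS_GENERATION_ONLY list from tasks_examples/custom_tasks/custom_evaluation_utils.py. The task name, prompt function, and dataset id are hypothetical, and the import assumes tasks_examples/custom_tasks/ is on sys.path.

# Minimal sketch, assuming tasks_examples/custom_tasks/ is importable.
from custom_evaluation_utils import CustomEvaluationTask, NEEDS_GENERATION_ONLY

# Hypothetical task configuration, for illustration only.
example_task = CustomEvaluationTask(
    name="custom:example",
    prompt_function="example_prompt",    # assumed prompt function name
    hf_repo="someuser/example_dataset",  # assumed dataset id
    hf_subset="default",
    metric=["loglikelihood_acc", "quasi_exact_match"],
)

# __post_init__ fills defaults (suite=("custom",), evaluation_splits=("validation",), ...)
# and normalizes metric to a tuple of strings, so membership in NEEDS_GENERATION_ONLY can
# be tested the same way has_generative_metrics does in custom_evaluation_tasks.py.
needs_generation = any(m in NEEDS_GENERATION_ONLY for m in example_task.metric)
print(example_task.suite, example_task.metric, needs_generation)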