Merge branch 'main' into Feature/#517
vkehfdl1 authored Jun 30, 2024
2 parents 9e8ccff + 8850fa4 commit 9183b82
Showing 19 changed files with 151 additions and 12 deletions.
2 changes: 1 addition & 1 deletion autorag/VERSION
@@ -1 +1 @@
0.2.7
0.2.8
9 changes: 8 additions & 1 deletion autorag/evaluation/metric/generation.py
@@ -16,11 +16,12 @@

from autorag import embedding_models
from autorag.evaluation.metric.util import calculate_cosine_similarity
from autorag.utils.util import process_batch, openai_truncate_by_token
from autorag.utils.util import process_batch, openai_truncate_by_token, convert_inputs_to_list


def generation_metric(func):
@functools.wraps(func)
@convert_inputs_to_list
def wrapper(generation_gt: List[List[str]], generations: List[str], **kwargs) -> List[float]:
"""
Compute generation metric.
@@ -39,6 +40,7 @@ def wrapper(generation_gt: List[List[str]], generations: List[str], **kwargs) ->
return wrapper


@convert_inputs_to_list
def huggingface_evaluate(instance, key: str,
generation_gt: List[List[str]], generations: List[str],
**kwargs) -> List[float]:
@@ -83,6 +85,7 @@ def bleu(generation_gt: List[List[str]], generations: [str], tokenize: str|None
return result


@convert_inputs_to_list
def meteor(generation_gt: List[List[str]], generations: List[str],
alpha: float = 0.9,
beta: float = 3.0,
@@ -110,6 +113,7 @@ def meteor(generation_gt: List[List[str]], generations: List[str],
return result


@convert_inputs_to_list
def rouge(generation_gt: List[List[str]], generations: List[str],
rouge_type: Optional[str] = 'rougeL',
use_stemmer: bool = False,
@@ -154,6 +158,7 @@ async def compute(gt: List[str], pred: str) -> float:
return result


@convert_inputs_to_list
def sem_score(generation_gt: List[List[str]], generations: List[str],
embedding_model: Optional[BaseEmbedding] = None,
batch: int = 128) -> List[float]:
@@ -207,6 +212,7 @@ def sem_score(generation_gt: List[List[str]], generations: List[str],
return result


@convert_inputs_to_list
def g_eval(generation_gt: List[List[str]], generations: List[str],
metrics: Optional[List[str]] = None,
model: str = 'gpt-4-0125-preview',
@@ -296,6 +302,7 @@ def get_g_eval_score(responses, max_score: int = 5) -> int:
return sum(g_eval_scores) / len(g_eval_scores)


@convert_inputs_to_list
def bert_score(generation_gt: List[List[str]], generations: List[str],
lang: str = 'en',
batch: int = 128,
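
The decorator is attached in two ways in this file: inside `generation_metric` it wraps the inner `wrapper`, while `huggingface_evaluate`, `meteor`, `rouge`, `sem_score`, `g_eval`, and `bert_score` get it applied directly. Either way, arguments are normalized to plain Python lists before the metric body runs. A stripped-down sketch of the stacking — the wrapper body and the toy `exact_match` metric below are illustrative stand-ins, not the code from this diff:

```python
import functools
from typing import List

import numpy as np

from autorag.utils.util import convert_inputs_to_list


def generation_metric(func):
    @functools.wraps(func)
    @convert_inputs_to_list  # converts numpy arrays / pandas Series to lists at call time
    def wrapper(generation_gt: List[List[str]], generations: List[str], **kwargs) -> List[float]:
        # Simplified stand-in body: score each generation against its ground-truth list.
        return [func(gt, gen, **kwargs) for gt, gen in zip(generation_gt, generations)]
    return wrapper


@generation_metric
def exact_match(gt: List[str], gen: str) -> float:  # toy metric for illustration only
    return float(gen in gt)


# numpy inputs are accepted transparently thanks to convert_inputs_to_list
print(exact_match(generation_gt=np.array([['a'], ['b']]), generations=np.array(['a', 'c'])))  # [1.0, 0.0]
```
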
6 changes: 5 additions & 1 deletion autorag/evaluation/metric/retrieval.py
@@ -1,11 +1,15 @@
import functools
import itertools
import math
from typing import List

import math

from autorag.utils.util import convert_inputs_to_list


def retrieval_metric(func):
@functools.wraps(func)
@convert_inputs_to_list
def wrapper(retrieval_gt: List[List[List[str]]], pred_ids: List[List[str]]) -> List[float]:
results = []
for gt, pred in zip(retrieval_gt, pred_ids):
3 changes: 2 additions & 1 deletion autorag/evaluation/metric/retrieval_contents.py
@@ -9,11 +9,12 @@

import numpy as np

from autorag.utils.util import normalize_string
from autorag.utils.util import normalize_string, convert_inputs_to_list


def retrieval_contents_metric(func):
@functools.wraps(func)
@convert_inputs_to_list
def wrapper(gt_contents: List[List[str]], pred_contents: List[List[str]]) -> List[float]:
results = []
for gt, pred in zip(gt_contents, pred_contents):
4 changes: 2 additions & 2 deletions autorag/nodes/passagereranker/flag_embedding.py
@@ -1,4 +1,4 @@
from typing import List, Tuple
from typing import List, Tuple, Iterable

import pandas as pd
import torch
@@ -58,7 +58,7 @@ def flag_embedding_run_model(input_texts, model, batch_size: int):
for batch_texts in tqdm(batch_input_texts):
with torch.no_grad():
pred_scores = model.compute_score(sentence_pairs=batch_texts)
if batch_size == 1:
if batch_size == 1 or not isinstance(pred_scores, Iterable):
results.append(pred_scores)
else:
results.extend(pred_scores)
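
The new `Iterable` check above handles rerankers whose `compute_score` returns a bare float for a single sentence pair rather than a list of scores. A minimal standalone sketch of the same normalization, under that assumption (the helper name is illustrative):

```python
from typing import Iterable, List, Union


def normalize_scores(pred_scores: Union[float, Iterable[float]]) -> List[float]:
    # Mirrors the guard in the diff: a scalar result becomes a one-element list,
    # while an iterable of scores is kept as a flat list.
    if not isinstance(pred_scores, Iterable):
        return [float(pred_scores)]
    return [float(score) for score in pred_scores]


print(normalize_scores(0.87))          # [0.87]
print(normalize_scores([0.87, 0.12]))  # [0.87, 0.12]
```
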
3 changes: 3 additions & 0 deletions autorag/nodes/queryexpansion/base.py
@@ -42,6 +42,9 @@ def wrapper(
prompt=prompt,
generator_func=generator_callable,
generator_params=generator_param)
# delete empty string in the nested expanded queries list
expanded_queries = [list(map(lambda x: x.strip(), sublist)) for sublist in expanded_queries]
expanded_queries = [list(filter(lambda x: bool(x), sublist)) for sublist in expanded_queries]
return expanded_queries

return wrapper
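
The two comprehensions added above first trim whitespace from every expanded query and then drop the empty strings, so whitespace-only generations never reach retrieval. A small sketch of the same transformation on toy data (the sample queries are illustrative):

```python
expanded_queries = [['  what is AutoRAG?  ', '', 'how is it evaluated?'], ['   ']]

# delete empty strings in the nested expanded queries list (same steps as the diff)
expanded_queries = [list(map(lambda x: x.strip(), sublist)) for sublist in expanded_queries]
expanded_queries = [list(filter(lambda x: bool(x), sublist)) for sublist in expanded_queries]

print(expanded_queries)  # [['what is AutoRAG?', 'how is it evaluated?'], []]
```
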
2 changes: 1 addition & 1 deletion autorag/nodes/queryexpansion/query_decompose.py
@@ -47,7 +47,7 @@
Question: {question}
Decompositions:"
Decompositions:
"""


5 changes: 3 additions & 2 deletions autorag/nodes/queryexpansion/run.py
@@ -68,7 +68,7 @@ def run_query_expansion_node(modules: List[Callable],
# Run evaluation when there are more than one module.
if len(modules) > 1:
# pop general keys from strategies (e.g. metrics, speed_threshold)
general_key = ['metrics', 'speed_threshold']
general_key = ['metrics', 'speed_threshold', 'strategy']
general_strategy = dict(filter(lambda x: x[0] in general_key, strategies.items()))
extra_strategy = dict(filter(lambda x: x[0] not in general_key, strategies.items()))

@@ -93,7 +93,8 @@ def run_query_expansion_node(modules: List[Callable],
# run evaluation
evaluation_results = list(map(lambda result: evaluate_one_query_expansion_node(
retrieval_callables, retrieval_params, result['queries'].tolist(), retrieval_gt,
general_strategy['metrics'], project_dir, previous_result, strategies.get('strategy', 'mean')), results))
general_strategy['metrics'], project_dir, previous_result, general_strategy.get('strategy', 'mean')),
results))

evaluation_df = pd.DataFrame({
'filename': filenames,
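
Adding `'strategy'` to `general_key` keeps the aggregation strategy in `general_strategy`, where it is read with a `'mean'` default, rather than leaving it in `extra_strategy`. A small sketch of that split on a sample strategies dict (the values are illustrative, taken from the tests further down):

```python
strategies = {
    'metrics': ['retrieval_f1', 'retrieval_recall'],
    'speed_threshold': 5,
    'strategy': 'rank',
    'top_k': 4,  # not a general key, so it stays in extra_strategy
}

general_key = ['metrics', 'speed_threshold', 'strategy']
general_strategy = dict(filter(lambda x: x[0] in general_key, strategies.items()))
extra_strategy = dict(filter(lambda x: x[0] not in general_key, strategies.items()))

print(general_strategy.get('strategy', 'mean'))  # rank
print(extra_strategy)                            # {'top_k': 4}
```
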
34 changes: 32 additions & 2 deletions autorag/utils/util.py
@@ -8,13 +8,16 @@
import os
import re
import string
import unicodedata
from copy import deepcopy
from typing import List, Callable, Dict, Optional, Any, Collection
from typing import List, Callable, Dict, Optional, Any, Collection, Iterable

import numpy as np
import pandas as pd
import tiktoken
import unicodedata
from llama_index.embeddings.openai import OpenAIEmbedding
from pydantic import BaseModel as BM
from pydantic.v1 import BaseModel

logger = logging.getLogger("AutoRAG")

@@ -454,3 +457,30 @@ def embedding_query_content(queries: List[str], contents_list: List[List[str]],
content_embeddings_flatten = embedding_model.get_text_embedding_batch(flatten_contents)
content_embeddings = reconstruct_list(content_embeddings_flatten, content_lengths)
return query_embeddings, content_embeddings


def to_list(item):
"""Recursively convert collections to Python lists."""
if isinstance(item, np.ndarray):
# Convert numpy array to list and recursively process each element
return [to_list(sub_item) for sub_item in item.tolist()]
elif isinstance(item, pd.Series):
# Convert pandas Series to list and recursively process each element
return [to_list(sub_item) for sub_item in item.tolist()]
elif isinstance(item, Iterable) and not isinstance(item, (str, bytes, BaseModel, BM)):
# Recursively process each element in other iterables
return [to_list(sub_item) for sub_item in item]
else:
return item


def convert_inputs_to_list(func):
"""Decorator to convert all function inputs to Python lists."""

@functools.wraps(func)
def wrapper(*args, **kwargs):
new_args = [to_list(arg) for arg in args]
new_kwargs = {k: to_list(v) for k, v in kwargs.items()}
return func(*new_args, **new_kwargs)

return wrapper
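
A usage sketch of the new helpers: decorating a function with `convert_inputs_to_list` lets code written against plain lists accept numpy arrays or pandas Series transparently (the toy function below is illustrative, not part of the diff):

```python
import numpy as np
import pandas as pd

from autorag.utils.util import convert_inputs_to_list


@convert_inputs_to_list
def count_items(nested) -> int:
    # By the time the body runs, every array/Series has been converted to a plain list.
    assert isinstance(nested, list)
    return sum(len(sub) for sub in nested)


print(count_items(np.array([[1, 2], [3, 4]])))      # 4
print(count_items([pd.Series([1, 2, 3]), (4, 5)]))  # 5
```
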
52 changes: 52 additions & 0 deletions docs/source/nodes/query_expansion/query_decompose.md
@@ -31,3 +31,55 @@ modules:
llm: openai
model: [ gpt-3.5-turbo-16k, gpt-3.5-turbo-1106 ]
```
## Default Prompt
When a question doesn't need decomposition, the LLM must return "The question needs no decomposition".
Also, each question is substituted into the `{question}` placeholder, so your prompt must include it.

```
Decompose a question in self-contained sub-questions. Use \"The question needs no decomposition\" when no decomposition is needed.

Example 1:

Question: Is Hamlet more common on IMDB than Comedy of Errors?
Decompositions:
1: How many listings of Hamlet are there on IMDB?
2: How many listing of Comedy of Errors is there on IMDB?

Example 2:

Question: Are birds important to badminton?

Decompositions:
The question needs no decomposition

Example 3:

Question: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?

Decompositions:
1: What is the minimum driving age in the US?
2: What is the minimum age for someone to be employed in the US?

Example 4:

Question: Are all cucumbers the same texture?

Decompositions:
The question needs no decomposition

Example 5:

Question: Hydrogen's atomic number squared exceeds number of Spice Girls?

Decompositions:
1: What is the atomic number of hydrogen?
2: How many Spice Girls are there?

Example 6:

Question: {question}

Decompositions:
```
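
A minimal sketch of how the `{question}` placeholder gets filled, assuming plain `str.format` substitution (the shortened prompt string is illustrative):

```python
prompt = ("Decompose a question in self-contained sub-questions. "
          "Use \"The question needs no decomposition\" when no decomposition is needed.\n\n"
          "Question: {question}\n"
          "Decompositions:")

print(prompt.format(question="Are birds important to badminton?"))
```
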
10 changes: 10 additions & 0 deletions tests/autorag/evaluate/metric/test_retrieval_metric.py
@@ -1,3 +1,4 @@
import numpy as np
import pytest

from autorag.evaluation.metric import (retrieval_f1, retrieval_precision, retrieval_recall, retrieval_ndcg,
@@ -34,6 +35,15 @@ def test_retrieval_f1():
assert gt == pytest.approx(res, rel=1e-4)


def test_numpy_retrieval_metric():
retrieval_gt_np = [[np.array(['test-1', 'test-4'])], np.array([['test-2']])]
pred_np = np.array([['test-2', 'test-3', 'test-1'], ['test-5', 'test-6', 'test-8']])
solution = [1.0, 0.0]
result = retrieval_recall(retrieval_gt=retrieval_gt_np, pred_ids=pred_np)
for gt, res in zip(solution, result):
assert gt == pytest.approx(res, rel=1e-4)


def test_retrieval_recall():
solution = [0.5, 1 / 3, 1, 2 / 3, 1, None, None, 1]
result = retrieval_recall(retrieval_gt=retrieval_gt, pred_ids=pred)
@@ -29,6 +29,7 @@ def test_run_passage_augmenter_node(node_line_dir):
module_params = [{'top_k': 2, 'num_passages': 1}]
strategies = {
'metrics': ['retrieval_f1', 'retrieval_recall'],
'strategy': 'rank',
}
best_result = run_passage_augmenter_node(modules, module_params, previous_result, node_line_dir, strategies)
assert os.path.exists(os.path.join(node_line_dir, "passage_augmenter"))
@@ -82,6 +82,7 @@ def test_run_passage_compressor_node(node_line_dir):
{'llm': 'mock', 'model': 'gpt-3.5-turbo'}]
strategies = {
'metrics': ['retrieval_token_f1', 'retrieval_token_precision'],
'strategy': 'normalize_mean',
'speed_threshold': 5,
}
best_result = run_passage_compressor_node(modules, module_params, previous_result, node_line_dir, strategies)
@@ -29,6 +29,7 @@ def test_run_passage_filter_node(node_line_dir):
module_params = [{'threshold': 0.87}]
strategies = {
'metrics': ['retrieval_f1', 'retrieval_recall'],
'strategy': 'rank',
}
best_result = run_passage_filter_node(modules, module_params, previous_result, node_line_dir, strategies)
assert os.path.exists(os.path.join(node_line_dir, "passage_filter"))
@@ -82,6 +82,7 @@ def test_run_passage_reranker_node(node_line_dir):
module_params = [{'top_k': 4, 'model_name': 'castorini_monot5-3b-msmarco-10k'}]
strategies = {
'metrics': ['retrieval_f1', 'retrieval_recall'],
'strategy': 'rank',
}
best_result = run_passage_reranker_node(modules, module_params, previous_result, node_line_dir, strategies)
assert os.path.exists(os.path.join(node_line_dir, "passage_reranker"))
1 change: 1 addition & 0 deletions tests/autorag/nodes/promptmaker/test_prompt_maker_run.py
@@ -105,6 +105,7 @@ def test_run_prompt_maker_node(node_line_dir):
'speed_threshold': 5,
'token_threshold': 25,
'tokenizer': 'gpt-3.5-turbo',
'strategy': 'rank',
'generator_modules': [{
'module_type': 'llama_index_llm',
'llm': 'mock',
@@ -100,6 +100,7 @@ def test_run_query_expansion_node(node_line_dir):
'metrics': metrics,
'speed_threshold': 5,
'top_k': 4,
'strategy': 'rank',
'retrieval_modules': [{'module_type': 'bm25', 'bm25_tokenizer': 'gpt2'}],
}
best_result = run_query_expansion_node(modules, module_params, previous_result, node_line_dir, strategies)
1 change: 1 addition & 0 deletions tests/autorag/nodes/retrieval/test_run_retrieval_node.py
@@ -52,6 +52,7 @@ def test_run_retrieval_node(node_line_dir):
qa_path = os.path.join(project_dir, "data", "qa.parquet")
strategies = {
'metrics': ['retrieval_f1', 'retrieval_recall'],
'strategy': 'normalize_mean',
'speed_threshold': 5,
}
previous_result = pd.read_parquet(qa_path)
26 changes: 25 additions & 1 deletion tests/autorag/utils/test_util.py
@@ -5,16 +5,20 @@
import tempfile
from datetime import datetime, date

import numpy as np
import pandas as pd
import pytest
import tiktoken
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.llms import CompletionResponse
from llama_index.embeddings.openai import OpenAIEmbedding

from autorag.utils import fetch_contents
from autorag.utils.util import load_summary_file, result_to_dataframe, \
make_combinations, explode, replace_value_in_dict, normalize_string, convert_string_to_tuple_in_dict, process_batch, \
convert_env_in_dict, openai_truncate_by_token, convert_datetime_string, split_dataframe, find_trial_dir, \
find_node_summary_files, normalize_unicode, dict_to_markdown, dict_to_markdown_table
find_node_summary_files, normalize_unicode, dict_to_markdown, dict_to_markdown_table, convert_inputs_to_list, \
to_list
from tests.mock import MockLLM

root_dir = pathlib.PurePath(os.path.dirname(os.path.realpath(__file__))).parent.parent
@@ -407,3 +411,23 @@ def test_dict_to_markdown_table():
| key2 | value2 |
"""
assert result == result_text


@convert_inputs_to_list
def convert_inputs_to_list_function(int_type, str_type, iterable_type, iterable_type2):
assert isinstance(int_type, int)
assert isinstance(str_type, str)
assert isinstance(iterable_type, list)
assert isinstance(iterable_type2, list)


def test_convert_inputs_to_list():
convert_inputs_to_list_function(1, 'jax', (2, 3), (5, 6, [4, 66]))
convert_inputs_to_list_function(1, 'jax', np.array([3, 4]), [pd.Series([12, 13]), 14])
convert_inputs_to_list_function(4, 'jax', pd.Series([7, 8, 9]), np.array([[3, 4], [4, 5]]))


def test_to_list():
embedding_model = OpenAIEmbedding()
new_model = to_list(embedding_model)
assert isinstance(new_model, BaseEmbedding)
